v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-28 12:00:09 -07:00

xpath in progress

This commit is contained in:
Alexandr Nesterenko 2017-07-31 21:23:56 +00:00
parent c64602b3a4
commit e846b63c97
4 changed files with 38 additions and 36 deletions

View File

@ -53,10 +53,18 @@ def html2json(el):
def setBaseAndRemoveScriptsAndMore(response, url):
response.selector.remove_namespaces()
tree = response.selector.root.getroottree()
snapshot_time = str(time.time())
# save html for extended selectors
file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
file_path = SNAPSHOT_DIR + '/' + file_name
with open(file_path, 'w') as f:
f.write(url + '\n')
for k, v in response.headers.iteritems():
for vv in v:
f.write('%s: %s\n' % (k, vv))
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
# set base url to html document
head = tree.xpath("//head")
@ -89,11 +97,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
for attr in bad.attrib:
if attr.startswith('on'):
del bad.attrib[attr]
# sanitize forms
if bad.tag == 'form':
bad.attrib['onsubmit'] = "return false"
body = tree.xpath("//body")
if body:
# append html2json js object
@ -101,11 +109,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
script = etree.Element('script', {'type': 'text/javascript'})
script.text = '\n'.join((
'var html2json = ' + json.dumps(jsobj) + ';',
'var snapshot_time = "' + snapshot_time + '";'
'var snapshot_time = "' + file_name + '";'
))
body[0].append(script)
return (etree.tostring(tree, method='html'), snapshot_time)
return (etree.tostring(tree, method='html'), file_name)
def buildScrapyResponse(response, body, url):
status = response.code
@ -132,15 +140,7 @@ def downloadDone(response_str, request, response, feed_config):
response_str = buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml')
else:
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
# import pdb;pdb.set_trace()
with open(file_name, 'w') as f:
f.write(url + '\n')
for k, v in response.headers.iteritems():
for vv in v:
f.write('%s: %s\n' % (k, vv))
f.write('\n\n' + response_str)
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
request.write(response_str)
request.finish()
@ -197,7 +197,7 @@ class Downloader(resource.Resource):
return NOT_DONE_YET
elif self.feed_regexp.match(request.uri) is not None: # feed
feed_id = self.feed_regexp.match(request.uri).groups()[0]
time_left = check_feed_request_time_limit(request.uri)
if time_left:
request.setResponseCode(429)
@ -205,10 +205,10 @@ class Downloader(resource.Resource):
return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
else:
res = getFeedData(request, feed_id)
if isinstance(res, basestring): # error message
return res
url, feed_config = res
self.startRequest(request, url, feed_config)
return NOT_DONE_YET

View File

@ -429,7 +429,7 @@ function createFeed() {
return new Promise(function(resolve, reject){
$.ajax({
type: 'POST',
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
url: ET.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
data: JSON.stringify(ET.active()
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }

View File

@ -29,4 +29,4 @@ urlpatterns = i18n_patterns(
urlpatterns.append(url(r'^setup_get_selected_ids$', views.setup_get_selected_ids, name='setup_get_selected_ids'))
urlpatterns.append(url(r'^setup_create_feed$', views.setup_create_feed, name='setup_create_feed'))
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed, name='setup_create_feed_ext'))
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed_ext, name='setup_create_feed_ext'))

View File

@ -1,5 +1,6 @@
import urllib
import json
import re
from django.views.decorators.csrf import ensure_csrf_cookie
from django.http import HttpResponseRedirect, HttpResponse, HttpResponseBadRequest
@ -119,21 +120,20 @@ def setup_create_feed(request):
if not _validate_html(html_json):
return HttpResponseBadRequest('html is invalid')
xpathes = build_xpathes_for_items(item_names, html_json)
feed_id = _create_feed(url, xpathes)
return HttpResponse(reverse('preview', args=(feed_id,)))
def _validate_selectors(selectors):
if not isinstance(selectors, list) or len(selectors) != 2:
return False
feed_xpath = xpathes[0]
item_xpathes = xpathes[1]
feed_xpath = selectors[0]
item_xpathes = selectors[1]
if not isinstance(feed_xpath, basestring):
return False
if not isinstance(item_xpathes, dict):
return False
@ -143,28 +143,30 @@ def _validate_selectors(selectors):
for field in fields:
if field.name in item_xpathes:
if not isinstance(item_xpath[field.name], basestring):
if not isinstance(item_xpathes[field.name], basestring):
return False
else:
item_xpathes_out[field.name] = item_xpath[field.name]
return [feed_xpath. item_xpathes_out]
item_xpathes_out[field.name] = item_xpathes[field.name]
return [feed_xpath, item_xpathes_out]
def setup_create_feed_ext(request):
if request.method == 'POST':
obj = json.loads(request.body)
if 'selectors' not in obj or 'snapshot_time' not in obj or 'url' not in obj:
return HttpResponseBadRequest('"selectors", "snapshot_time" and "url" parameters are required')
if 'selectors' not in obj or 'snapshot_time' not in obj:
return HttpResponseBadRequest('"selectors" and "snapshot_time" are required')
selectors = obj['selectors']
snapshot_time = obj['snapshot_time']
url = obj['url']
file_name = obj['snapshot_time']
if not re.match('^\d{10}\.\d+_[\da-f]{32}', file_name):
return HttpResponseBadRequest('"snapshot_time" is invalid')
validated_selectors = _validate_selectors(selectors)
if not validated_selectors:
return HttpResponseBadRequest('selectors are invalid')
results = build_xpathes_results(validated_selectors, snapshot_time, url)
results = build_xpath_results(validated_selectors, file_name)
return HttpResponse(json.dumps(results))
@ -175,5 +177,5 @@ def preview(request, feed_id):
'feed_url': FEED_PAGE_URL + feed_id,
'feed1_url': FEED1_PAGE_URL + feed_id,
})
return HttpResponseBadRequest('Only GET method supported')