v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-06-01 14:00:09 -07:00

xpath in progress

This commit is contained in:
Alexandr Nesterenko 2017-07-31 21:23:56 +00:00
parent c64602b3a4
commit e846b63c97
4 changed files with 38 additions and 36 deletions

View File

@ -56,7 +56,15 @@ def setBaseAndRemoveScriptsAndMore(response, url):
tree = response.selector.root.getroottree()
snapshot_time = str(time.time())
# save html for extended selectors
file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
file_path = SNAPSHOT_DIR + '/' + file_name
with open(file_path, 'w') as f:
f.write(url + '\n')
for k, v in response.headers.iteritems():
for vv in v:
f.write('%s: %s\n' % (k, vv))
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
# set base url to html document
head = tree.xpath("//head")
@ -101,11 +109,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
script = etree.Element('script', {'type': 'text/javascript'})
script.text = '\n'.join((
'var html2json = ' + json.dumps(jsobj) + ';',
'var snapshot_time = "' + snapshot_time + '";'
'var snapshot_time = "' + file_name + '";'
))
body[0].append(script)
return (etree.tostring(tree, method='html'), snapshot_time)
return (etree.tostring(tree, method='html'), file_name)
def buildScrapyResponse(response, body, url):
status = response.code
@ -132,15 +140,7 @@ def downloadDone(response_str, request, response, feed_config):
response_str = buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml')
else:
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
# import pdb;pdb.set_trace()
with open(file_name, 'w') as f:
f.write(url + '\n')
for k, v in response.headers.iteritems():
for vv in v:
f.write('%s: %s\n' % (k, vv))
f.write('\n\n' + response_str)
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
request.write(response_str)
request.finish()

View File

@ -429,7 +429,7 @@ function createFeed() {
return new Promise(function(resolve, reject){
$.ajax({
type: 'POST',
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
url: ET.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
data: JSON.stringify(ET.active()
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }

View File

@ -29,4 +29,4 @@ urlpatterns = i18n_patterns(
urlpatterns.append(url(r'^setup_get_selected_ids$', views.setup_get_selected_ids, name='setup_get_selected_ids'))
urlpatterns.append(url(r'^setup_create_feed$', views.setup_create_feed, name='setup_create_feed'))
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed, name='setup_create_feed_ext'))
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed_ext, name='setup_create_feed_ext'))

View File

@ -1,5 +1,6 @@
import urllib
import json
import re
from django.views.decorators.csrf import ensure_csrf_cookie
from django.http import HttpResponseRedirect, HttpResponse, HttpResponseBadRequest
@ -128,12 +129,11 @@ def setup_create_feed(request):
def _validate_selectors(selectors):
if not isinstance(selectors, list) or len(selectors) != 2:
return False
feed_xpath = xpathes[0]
item_xpathes = xpathes[1]
feed_xpath = selectors[0]
item_xpathes = selectors[1]
if not isinstance(feed_xpath, basestring):
return False
if not isinstance(item_xpathes, dict):
return False
@ -143,28 +143,30 @@ def _validate_selectors(selectors):
for field in fields:
if field.name in item_xpathes:
if not isinstance(item_xpath[field.name], basestring):
if not isinstance(item_xpathes[field.name], basestring):
return False
else:
item_xpathes_out[field.name] = item_xpath[field.name]
return [feed_xpath. item_xpathes_out]
item_xpathes_out[field.name] = item_xpathes[field.name]
return [feed_xpath, item_xpathes_out]
def setup_create_feed_ext(request):
if request.method == 'POST':
obj = json.loads(request.body)
if 'selectors' not in obj or 'snapshot_time' not in obj or 'url' not in obj:
return HttpResponseBadRequest('"selectors", "snapshot_time" and "url" parameters are required')
if 'selectors' not in obj or 'snapshot_time' not in obj:
return HttpResponseBadRequest('"selectors" and "snapshot_time" are required')
selectors = obj['selectors']
snapshot_time = obj['snapshot_time']
url = obj['url']
file_name = obj['snapshot_time']
if not re.match('^\d{10}\.\d+_[\da-f]{32}', file_name):
return HttpResponseBadRequest('"snapshot_time" is invalid')
validated_selectors = _validate_selectors(selectors)
if not validated_selectors:
return HttpResponseBadRequest('selectors are invalid')
results = build_xpathes_results(validated_selectors, snapshot_time, url)
results = build_xpath_results(validated_selectors, file_name)
return HttpResponse(json.dumps(results))