mirror of
https://github.com/taroved/pol
synced 2025-06-01 22:10:08 -07:00
xpath in progress
This commit is contained in:
parent
c64602b3a4
commit
e846b63c97
@ -56,7 +56,15 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
||||
|
||||
tree = response.selector.root.getroottree()
|
||||
|
||||
snapshot_time = str(time.time())
|
||||
# save html for extended selectors
|
||||
file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
|
||||
file_path = SNAPSHOT_DIR + '/' + file_name
|
||||
with open(file_path, 'w') as f:
|
||||
f.write(url + '\n')
|
||||
for k, v in response.headers.iteritems():
|
||||
for vv in v:
|
||||
f.write('%s: %s\n' % (k, vv))
|
||||
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
|
||||
|
||||
# set base url to html document
|
||||
head = tree.xpath("//head")
|
||||
@ -101,11 +109,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
||||
script = etree.Element('script', {'type': 'text/javascript'})
|
||||
script.text = '\n'.join((
|
||||
'var html2json = ' + json.dumps(jsobj) + ';',
|
||||
'var snapshot_time = "' + snapshot_time + '";'
|
||||
'var snapshot_time = "' + file_name + '";'
|
||||
))
|
||||
body[0].append(script)
|
||||
|
||||
return (etree.tostring(tree, method='html'), snapshot_time)
|
||||
return (etree.tostring(tree, method='html'), file_name)
|
||||
|
||||
def buildScrapyResponse(response, body, url):
|
||||
status = response.code
|
||||
@ -132,15 +140,7 @@ def downloadDone(response_str, request, response, feed_config):
|
||||
response_str = buildFeed(response, feed_config)
|
||||
request.setHeader(b"Content-Type", b'text/xml')
|
||||
else:
|
||||
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
|
||||
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
|
||||
# import pdb;pdb.set_trace()
|
||||
with open(file_name, 'w') as f:
|
||||
f.write(url + '\n')
|
||||
for k, v in response.headers.iteritems():
|
||||
for vv in v:
|
||||
f.write('%s: %s\n' % (k, vv))
|
||||
f.write('\n\n' + response_str)
|
||||
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
|
||||
|
||||
request.write(response_str)
|
||||
request.finish()
|
||||
|
@ -429,7 +429,7 @@ function createFeed() {
|
||||
return new Promise(function(resolve, reject){
|
||||
$.ajax({
|
||||
type: 'POST',
|
||||
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
||||
url: ET.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
||||
data: JSON.stringify(ET.active()
|
||||
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
|
||||
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
|
||||
|
@ -29,4 +29,4 @@ urlpatterns = i18n_patterns(
|
||||
|
||||
urlpatterns.append(url(r'^setup_get_selected_ids$', views.setup_get_selected_ids, name='setup_get_selected_ids'))
|
||||
urlpatterns.append(url(r'^setup_create_feed$', views.setup_create_feed, name='setup_create_feed'))
|
||||
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed, name='setup_create_feed_ext'))
|
||||
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed_ext, name='setup_create_feed_ext'))
|
||||
|
@ -1,5 +1,6 @@
|
||||
import urllib
|
||||
import json
|
||||
import re
|
||||
|
||||
from django.views.decorators.csrf import ensure_csrf_cookie
|
||||
from django.http import HttpResponseRedirect, HttpResponse, HttpResponseBadRequest
|
||||
@ -128,12 +129,11 @@ def setup_create_feed(request):
|
||||
def _validate_selectors(selectors):
|
||||
if not isinstance(selectors, list) or len(selectors) != 2:
|
||||
return False
|
||||
feed_xpath = xpathes[0]
|
||||
item_xpathes = xpathes[1]
|
||||
feed_xpath = selectors[0]
|
||||
item_xpathes = selectors[1]
|
||||
|
||||
if not isinstance(feed_xpath, basestring):
|
||||
return False
|
||||
|
||||
if not isinstance(item_xpathes, dict):
|
||||
return False
|
||||
|
||||
@ -143,28 +143,30 @@ def _validate_selectors(selectors):
|
||||
|
||||
for field in fields:
|
||||
if field.name in item_xpathes:
|
||||
if not isinstance(item_xpath[field.name], basestring):
|
||||
if not isinstance(item_xpathes[field.name], basestring):
|
||||
return False
|
||||
else:
|
||||
item_xpathes_out[field.name] = item_xpath[field.name]
|
||||
return [feed_xpath. item_xpathes_out]
|
||||
item_xpathes_out[field.name] = item_xpathes[field.name]
|
||||
return [feed_xpath, item_xpathes_out]
|
||||
|
||||
def setup_create_feed_ext(request):
    """Build xpath results for the extended (xpath-based) feed setup.

    Expects a POST whose body is a JSON object with:
      * "selectors"     -- passed to _validate_selectors() for checking;
      * "snapshot_time" -- the snapshot file name issued to the page, of the
                           form "<unix time>.<fraction>_<md5 hex digest>".

    Returns HttpResponseBadRequest when the body is not a JSON object, when a
    required key is missing, or when either value fails validation; otherwise
    an HttpResponse carrying the JSON-encoded output of build_xpath_results().

    Requests with a method other than POST fall through this block without a
    response being built here.
    """
    if request.method == 'POST':
        # A malformed body is a client error, not a server error.
        try:
            obj = json.loads(request.body)
        except ValueError:
            return HttpResponseBadRequest('request body is not valid JSON')

        if not isinstance(obj, dict):
            return HttpResponseBadRequest('request body must be a JSON object')

        if 'selectors' not in obj or 'snapshot_time' not in obj:
            return HttpResponseBadRequest('"selectors" and "snapshot_time" are required')

        selectors = obj['selectors']
        file_name = obj['snapshot_time']

        # The name comes from the client and is handed on to
        # build_xpath_results() (presumably to locate the saved snapshot --
        # confirm against that helper), so the WHOLE value must match.
        # Without the trailing \Z anchor a suffix such as "/../../x" would
        # slip through the check.
        if (not isinstance(file_name, basestring)
                or not re.match(r'^\d{10}\.\d+_[\da-f]{32}\Z', file_name)):
            return HttpResponseBadRequest('"snapshot_time" is invalid')

        validated_selectors = _validate_selectors(selectors)

        if not validated_selectors:
            return HttpResponseBadRequest('selectors are invalid')

        results = build_xpath_results(validated_selectors, file_name)

        return HttpResponse(json.dumps(results))
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user