mirror of
https://github.com/taroved/pol
synced 2025-05-28 12:00:09 -07:00
xpath in progress
This commit is contained in:
parent
c64602b3a4
commit
e846b63c97
@ -53,10 +53,18 @@ def html2json(el):
|
||||
|
||||
def setBaseAndRemoveScriptsAndMore(response, url):
|
||||
response.selector.remove_namespaces()
|
||||
|
||||
|
||||
tree = response.selector.root.getroottree()
|
||||
|
||||
snapshot_time = str(time.time())
|
||||
|
||||
# save html for extended selectors
|
||||
file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
|
||||
file_path = SNAPSHOT_DIR + '/' + file_name
|
||||
with open(file_path, 'w') as f:
|
||||
f.write(url + '\n')
|
||||
for k, v in response.headers.iteritems():
|
||||
for vv in v:
|
||||
f.write('%s: %s\n' % (k, vv))
|
||||
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
|
||||
|
||||
# set base url to html document
|
||||
head = tree.xpath("//head")
|
||||
@ -89,11 +97,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
||||
for attr in bad.attrib:
|
||||
if attr.startswith('on'):
|
||||
del bad.attrib[attr]
|
||||
|
||||
|
||||
# sanitize forms
|
||||
if bad.tag == 'form':
|
||||
bad.attrib['onsubmit'] = "return false"
|
||||
|
||||
|
||||
body = tree.xpath("//body")
|
||||
if body:
|
||||
# append html2json js object
|
||||
@ -101,11 +109,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
||||
script = etree.Element('script', {'type': 'text/javascript'})
|
||||
script.text = '\n'.join((
|
||||
'var html2json = ' + json.dumps(jsobj) + ';',
|
||||
'var snapshot_time = "' + snapshot_time + '";'
|
||||
'var snapshot_time = "' + file_name + '";'
|
||||
))
|
||||
body[0].append(script)
|
||||
|
||||
return (etree.tostring(tree, method='html'), snapshot_time)
|
||||
|
||||
return (etree.tostring(tree, method='html'), file_name)
|
||||
|
||||
def buildScrapyResponse(response, body, url):
|
||||
status = response.code
|
||||
@ -132,15 +140,7 @@ def downloadDone(response_str, request, response, feed_config):
|
||||
response_str = buildFeed(response, feed_config)
|
||||
request.setHeader(b"Content-Type", b'text/xml')
|
||||
else:
|
||||
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
|
||||
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
|
||||
# import pdb;pdb.set_trace()
|
||||
with open(file_name, 'w') as f:
|
||||
f.write(url + '\n')
|
||||
for k, v in response.headers.iteritems():
|
||||
for vv in v:
|
||||
f.write('%s: %s\n' % (k, vv))
|
||||
f.write('\n\n' + response_str)
|
||||
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
|
||||
|
||||
request.write(response_str)
|
||||
request.finish()
|
||||
@ -197,7 +197,7 @@ class Downloader(resource.Resource):
|
||||
return NOT_DONE_YET
|
||||
elif self.feed_regexp.match(request.uri) is not None: # feed
|
||||
feed_id = self.feed_regexp.match(request.uri).groups()[0]
|
||||
|
||||
|
||||
time_left = check_feed_request_time_limit(request.uri)
|
||||
if time_left:
|
||||
request.setResponseCode(429)
|
||||
@ -205,10 +205,10 @@ class Downloader(resource.Resource):
|
||||
return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
|
||||
else:
|
||||
res = getFeedData(request, feed_id)
|
||||
|
||||
|
||||
if isinstance(res, basestring): # error message
|
||||
return res
|
||||
|
||||
|
||||
url, feed_config = res
|
||||
self.startRequest(request, url, feed_config)
|
||||
return NOT_DONE_YET
|
||||
|
@ -429,7 +429,7 @@ function createFeed() {
|
||||
return new Promise(function(resolve, reject){
|
||||
$.ajax({
|
||||
type: 'POST',
|
||||
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
||||
url: ET.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
||||
data: JSON.stringify(ET.active()
|
||||
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
|
||||
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
|
||||
|
@ -29,4 +29,4 @@ urlpatterns = i18n_patterns(
|
||||
|
||||
# AJAX endpoints appended to urlpatterns.
urlpatterns.append(url(r'^setup_get_selected_ids$', views.setup_get_selected_ids, name='setup_get_selected_ids'))
urlpatterns.append(url(r'^setup_create_feed$', views.setup_create_feed, name='setup_create_feed'))
# The extended (selector based) endpoint is registered exactly once and is
# bound to its own view: Django dispatches to the first pattern that matches,
# so a second registration of the same regex would never be reached.
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed_ext, name='setup_create_feed_ext'))
|
||||
|
@ -1,5 +1,6 @@
|
||||
import urllib
|
||||
import json
|
||||
import re
|
||||
|
||||
from django.views.decorators.csrf import ensure_csrf_cookie
|
||||
from django.http import HttpResponseRedirect, HttpResponse, HttpResponseBadRequest
|
||||
@ -119,21 +120,20 @@ def setup_create_feed(request):
|
||||
|
||||
if not _validate_html(html_json):
|
||||
return HttpResponseBadRequest('html is invalid')
|
||||
|
||||
|
||||
xpathes = build_xpathes_for_items(item_names, html_json)
|
||||
feed_id = _create_feed(url, xpathes)
|
||||
|
||||
|
||||
return HttpResponse(reverse('preview', args=(feed_id,)))
|
||||
|
||||
def _validate_selectors(selectors):
|
||||
if not isinstance(selectors, list) or len(selectors) != 2:
|
||||
return False
|
||||
feed_xpath = xpathes[0]
|
||||
item_xpathes = xpathes[1]
|
||||
feed_xpath = selectors[0]
|
||||
item_xpathes = selectors[1]
|
||||
|
||||
if not isinstance(feed_xpath, basestring):
|
||||
return False
|
||||
|
||||
if not isinstance(item_xpathes, dict):
|
||||
return False
|
||||
|
||||
@ -143,28 +143,30 @@ def _validate_selectors(selectors):
|
||||
|
||||
for field in fields:
|
||||
if field.name in item_xpathes:
|
||||
if not isinstance(item_xpath[field.name], basestring):
|
||||
if not isinstance(item_xpathes[field.name], basestring):
|
||||
return False
|
||||
else:
|
||||
item_xpathes_out[field.name] = item_xpath[field.name]
|
||||
return [feed_xpath. item_xpathes_out]
|
||||
item_xpathes_out[field.name] = item_xpathes[field.name]
|
||||
return [feed_xpath, item_xpathes_out]
|
||||
|
||||
def setup_create_feed_ext(request):
    """Evaluate user-supplied XPath selectors against a saved page snapshot.

    Expects a POST request whose body is a JSON object with two keys:

    * "selectors" -- checked and normalised by _validate_selectors();
    * "snapshot_time" -- despite its name, the value is treated as a snapshot
      file name of the form "<10 digits>.<digits>_<32 lowercase hex chars>".

    Returns an HttpResponse holding the JSON-encoded value produced by
    build_xpath_results(), or an HttpResponseBadRequest that describes the
    problem with the request.
    """
    if request.method == 'POST':
        try:
            obj = json.loads(request.body)
        except ValueError:
            # Malformed JSON is a client error, not a server failure.
            return HttpResponseBadRequest('request body is not valid JSON')

        if not isinstance(obj, dict) or 'selectors' not in obj or 'snapshot_time' not in obj:
            return HttpResponseBadRequest('"selectors" and "snapshot_time" are required')

        selectors = obj['selectors']
        file_name = obj['snapshot_time']

        # The value comes from the client, so it must match the expected shape
        # in full. Without the end anchor, re.match() would accept any suffix
        # after a valid prefix (for example "/../../something").
        # NOTE(review): file_name presumably ends up in a filesystem path
        # inside build_xpath_results() -- confirm.
        if not isinstance(file_name, basestring) or \
                not re.match(r'^\d{10}\.\d+_[\da-f]{32}\Z', file_name):
            return HttpResponseBadRequest('"snapshot_time" is invalid')

        validated_selectors = _validate_selectors(selectors)

        if not validated_selectors:
            return HttpResponseBadRequest('selectors are invalid')

        results = build_xpath_results(validated_selectors, file_name)

        return HttpResponse(json.dumps(results))

    # A Django view must always return a response object.
    return HttpResponseBadRequest('Only POST method supported')
|
||||
|
||||
@ -175,5 +177,5 @@ def preview(request, feed_id):
|
||||
'feed_url': FEED_PAGE_URL + feed_id,
|
||||
'feed1_url': FEED1_PAGE_URL + feed_id,
|
||||
})
|
||||
|
||||
|
||||
return HttpResponseBadRequest('Only GET method supported')
|
||||
|
Loading…
x
Reference in New Issue
Block a user