mirror of
https://github.com/taroved/pol
synced 2025-06-01 22:10:08 -07:00
xpath in progress
This commit is contained in:
parent
c64602b3a4
commit
e846b63c97
@ -53,10 +53,18 @@ def html2json(el):
|
|||||||
|
|
||||||
def setBaseAndRemoveScriptsAndMore(response, url):
|
def setBaseAndRemoveScriptsAndMore(response, url):
|
||||||
response.selector.remove_namespaces()
|
response.selector.remove_namespaces()
|
||||||
|
|
||||||
tree = response.selector.root.getroottree()
|
tree = response.selector.root.getroottree()
|
||||||
|
|
||||||
snapshot_time = str(time.time())
|
# save html for extended selectors
|
||||||
|
file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
|
||||||
|
file_path = SNAPSHOT_DIR + '/' + file_name
|
||||||
|
with open(file_path, 'w') as f:
|
||||||
|
f.write(url + '\n')
|
||||||
|
for k, v in response.headers.iteritems():
|
||||||
|
for vv in v:
|
||||||
|
f.write('%s: %s\n' % (k, vv))
|
||||||
|
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
|
||||||
|
|
||||||
# set base url to html document
|
# set base url to html document
|
||||||
head = tree.xpath("//head")
|
head = tree.xpath("//head")
|
||||||
@ -89,11 +97,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
|||||||
for attr in bad.attrib:
|
for attr in bad.attrib:
|
||||||
if attr.startswith('on'):
|
if attr.startswith('on'):
|
||||||
del bad.attrib[attr]
|
del bad.attrib[attr]
|
||||||
|
|
||||||
# sanitize forms
|
# sanitize forms
|
||||||
if bad.tag == 'form':
|
if bad.tag == 'form':
|
||||||
bad.attrib['onsubmit'] = "return false"
|
bad.attrib['onsubmit'] = "return false"
|
||||||
|
|
||||||
body = tree.xpath("//body")
|
body = tree.xpath("//body")
|
||||||
if body:
|
if body:
|
||||||
# append html2json js object
|
# append html2json js object
|
||||||
@ -101,11 +109,11 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
|||||||
script = etree.Element('script', {'type': 'text/javascript'})
|
script = etree.Element('script', {'type': 'text/javascript'})
|
||||||
script.text = '\n'.join((
|
script.text = '\n'.join((
|
||||||
'var html2json = ' + json.dumps(jsobj) + ';',
|
'var html2json = ' + json.dumps(jsobj) + ';',
|
||||||
'var snapshot_time = "' + snapshot_time + '";'
|
'var snapshot_time = "' + file_name + '";'
|
||||||
))
|
))
|
||||||
body[0].append(script)
|
body[0].append(script)
|
||||||
|
|
||||||
return (etree.tostring(tree, method='html'), snapshot_time)
|
return (etree.tostring(tree, method='html'), file_name)
|
||||||
|
|
||||||
def buildScrapyResponse(response, body, url):
|
def buildScrapyResponse(response, body, url):
|
||||||
status = response.code
|
status = response.code
|
||||||
@ -132,15 +140,7 @@ def downloadDone(response_str, request, response, feed_config):
|
|||||||
response_str = buildFeed(response, feed_config)
|
response_str = buildFeed(response, feed_config)
|
||||||
request.setHeader(b"Content-Type", b'text/xml')
|
request.setHeader(b"Content-Type", b'text/xml')
|
||||||
else:
|
else:
|
||||||
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
|
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
|
||||||
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
|
|
||||||
# import pdb;pdb.set_trace()
|
|
||||||
with open(file_name, 'w') as f:
|
|
||||||
f.write(url + '\n')
|
|
||||||
for k, v in response.headers.iteritems():
|
|
||||||
for vv in v:
|
|
||||||
f.write('%s: %s\n' % (k, vv))
|
|
||||||
f.write('\n\n' + response_str)
|
|
||||||
|
|
||||||
request.write(response_str)
|
request.write(response_str)
|
||||||
request.finish()
|
request.finish()
|
||||||
@ -197,7 +197,7 @@ class Downloader(resource.Resource):
|
|||||||
return NOT_DONE_YET
|
return NOT_DONE_YET
|
||||||
elif self.feed_regexp.match(request.uri) is not None: # feed
|
elif self.feed_regexp.match(request.uri) is not None: # feed
|
||||||
feed_id = self.feed_regexp.match(request.uri).groups()[0]
|
feed_id = self.feed_regexp.match(request.uri).groups()[0]
|
||||||
|
|
||||||
time_left = check_feed_request_time_limit(request.uri)
|
time_left = check_feed_request_time_limit(request.uri)
|
||||||
if time_left:
|
if time_left:
|
||||||
request.setResponseCode(429)
|
request.setResponseCode(429)
|
||||||
@ -205,10 +205,10 @@ class Downloader(resource.Resource):
|
|||||||
return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
|
return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
|
||||||
else:
|
else:
|
||||||
res = getFeedData(request, feed_id)
|
res = getFeedData(request, feed_id)
|
||||||
|
|
||||||
if isinstance(res, basestring): # error message
|
if isinstance(res, basestring): # error message
|
||||||
return res
|
return res
|
||||||
|
|
||||||
url, feed_config = res
|
url, feed_config = res
|
||||||
self.startRequest(request, url, feed_config)
|
self.startRequest(request, url, feed_config)
|
||||||
return NOT_DONE_YET
|
return NOT_DONE_YET
|
||||||
|
@ -429,7 +429,7 @@ function createFeed() {
|
|||||||
return new Promise(function(resolve, reject){
|
return new Promise(function(resolve, reject){
|
||||||
$.ajax({
|
$.ajax({
|
||||||
type: 'POST',
|
type: 'POST',
|
||||||
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
url: ET.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
||||||
data: JSON.stringify(ET.active()
|
data: JSON.stringify(ET.active()
|
||||||
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
|
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
|
||||||
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
|
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
|
||||||
|
@ -29,4 +29,4 @@ urlpatterns = i18n_patterns(
|
|||||||
|
|
||||||
urlpatterns.append(url(r'^setup_get_selected_ids$', views.setup_get_selected_ids, name='setup_get_selected_ids'))
|
urlpatterns.append(url(r'^setup_get_selected_ids$', views.setup_get_selected_ids, name='setup_get_selected_ids'))
|
||||||
urlpatterns.append(url(r'^setup_create_feed$', views.setup_create_feed, name='setup_create_feed'))
|
urlpatterns.append(url(r'^setup_create_feed$', views.setup_create_feed, name='setup_create_feed'))
|
||||||
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed, name='setup_create_feed_ext'))
|
urlpatterns.append(url(r'^setup_create_feed_ext$', views.setup_create_feed_ext, name='setup_create_feed_ext'))
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import urllib
|
import urllib
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
from django.views.decorators.csrf import ensure_csrf_cookie
|
from django.views.decorators.csrf import ensure_csrf_cookie
|
||||||
from django.http import HttpResponseRedirect, HttpResponse, HttpResponseBadRequest
|
from django.http import HttpResponseRedirect, HttpResponse, HttpResponseBadRequest
|
||||||
@ -119,21 +120,20 @@ def setup_create_feed(request):
|
|||||||
|
|
||||||
if not _validate_html(html_json):
|
if not _validate_html(html_json):
|
||||||
return HttpResponseBadRequest('html is invalid')
|
return HttpResponseBadRequest('html is invalid')
|
||||||
|
|
||||||
xpathes = build_xpathes_for_items(item_names, html_json)
|
xpathes = build_xpathes_for_items(item_names, html_json)
|
||||||
feed_id = _create_feed(url, xpathes)
|
feed_id = _create_feed(url, xpathes)
|
||||||
|
|
||||||
return HttpResponse(reverse('preview', args=(feed_id,)))
|
return HttpResponse(reverse('preview', args=(feed_id,)))
|
||||||
|
|
||||||
def _validate_selectors(selectors):
|
def _validate_selectors(selectors):
|
||||||
if not isinstance(selectors, list) or len(selectors) != 2:
|
if not isinstance(selectors, list) or len(selectors) != 2:
|
||||||
return False
|
return False
|
||||||
feed_xpath = xpathes[0]
|
feed_xpath = selectors[0]
|
||||||
item_xpathes = xpathes[1]
|
item_xpathes = selectors[1]
|
||||||
|
|
||||||
if not isinstance(feed_xpath, basestring):
|
if not isinstance(feed_xpath, basestring):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if not isinstance(item_xpathes, dict):
|
if not isinstance(item_xpathes, dict):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -143,28 +143,30 @@ def _validate_selectors(selectors):
|
|||||||
|
|
||||||
for field in fields:
|
for field in fields:
|
||||||
if field.name in item_xpathes:
|
if field.name in item_xpathes:
|
||||||
if not isinstance(item_xpath[field.name], basestring):
|
if not isinstance(item_xpathes[field.name], basestring):
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
item_xpathes_out[field.name] = item_xpath[field.name]
|
item_xpathes_out[field.name] = item_xpathes[field.name]
|
||||||
return [feed_xpath. item_xpathes_out]
|
return [feed_xpath, item_xpathes_out]
|
||||||
|
|
||||||
def setup_create_feed_ext(request):
|
def setup_create_feed_ext(request):
|
||||||
if request.method == 'POST':
|
if request.method == 'POST':
|
||||||
obj = json.loads(request.body)
|
obj = json.loads(request.body)
|
||||||
if 'selectors' not in obj or 'snapshot_time' not in obj or 'url' not in obj:
|
if 'selectors' not in obj or 'snapshot_time' not in obj:
|
||||||
return HttpResponseBadRequest('"selectors", "snapshot_time" and "url" parameters are required')
|
return HttpResponseBadRequest('"selectors" and "snapshot_time" are required')
|
||||||
|
|
||||||
selectors = obj['selectors']
|
selectors = obj['selectors']
|
||||||
snapshot_time = obj['snapshot_time']
|
file_name = obj['snapshot_time']
|
||||||
url = obj['url']
|
|
||||||
|
if not re.match('^\d{10}\.\d+_[\da-f]{32}', file_name):
|
||||||
|
return HttpResponseBadRequest('"snapshot_time" is invalid')
|
||||||
|
|
||||||
validated_selectors = _validate_selectors(selectors)
|
validated_selectors = _validate_selectors(selectors)
|
||||||
|
|
||||||
if not validated_selectors:
|
if not validated_selectors:
|
||||||
return HttpResponseBadRequest('selectors are invalid')
|
return HttpResponseBadRequest('selectors are invalid')
|
||||||
|
|
||||||
results = build_xpathes_results(validated_selectors, snapshot_time, url)
|
results = build_xpath_results(validated_selectors, file_name)
|
||||||
|
|
||||||
return HttpResponse(json.dumps(results))
|
return HttpResponse(json.dumps(results))
|
||||||
|
|
||||||
@ -175,5 +177,5 @@ def preview(request, feed_id):
|
|||||||
'feed_url': FEED_PAGE_URL + feed_id,
|
'feed_url': FEED_PAGE_URL + feed_id,
|
||||||
'feed1_url': FEED1_PAGE_URL + feed_id,
|
'feed1_url': FEED1_PAGE_URL + feed_id,
|
||||||
})
|
})
|
||||||
|
|
||||||
return HttpResponseBadRequest('Only GET method supported')
|
return HttpResponseBadRequest('Only GET method supported')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user