v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-06-01 22:10:08 -07:00

XPaths in progress

This commit is contained in:
Alexandr Nesterenko 2017-07-30 15:25:31 +00:00
parent a08a1b3675
commit 0bde3df8a9
4 changed files with 61 additions and 15 deletions

View File

@ -1,9 +1,10 @@
import json import json
import time, sys import time, sys
from hashlib import md5
from datetime import datetime from datetime import datetime
from twisted.web import server, resource from twisted.web import server, resource
from twisted.internet import reactor, endpoints from twisted.internet import reactor, endpoints, defer
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError
from twisted.web.server import NOT_DONE_YET from twisted.web.server import NOT_DONE_YET
from twisted.web.http_headers import Headers from twisted.web.http_headers import Headers
@ -22,7 +23,7 @@ import re
from feed import getFeedData, buildFeed from feed import getFeedData, buildFeed
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR
if FEED_REQUEST_PERIOD_LIMIT: if FEED_REQUEST_PERIOD_LIMIT:
@ -55,6 +56,8 @@ def setBaseAndRemoveScriptsAndMore(response, url):
tree = response.selector.root.getroottree() tree = response.selector.root.getroottree()
snapshot_time = str(time.time())
# set base url to html document # set base url to html document
head = tree.xpath("//head") head = tree.xpath("//head")
if head: if head:
@ -96,10 +99,13 @@ def setBaseAndRemoveScriptsAndMore(response, url):
# append html2json js object # append html2json js object
jsobj = html2json(tree.getroot()) jsobj = html2json(tree.getroot())
script = etree.Element('script', {'type': 'text/javascript'}) script = etree.Element('script', {'type': 'text/javascript'})
script.text = 'var html2json = ' + json.dumps(jsobj) + ';' script.text = '\n'.join((
'var html2json = ' + json.dumps(jsobj) + ';',
'var snapshot_time = "' + snapshot_time + '";'
))
body[0].append(script) body[0].append(script)
return etree.tostring(tree, method='html') return (etree.tostring(tree, method='html'), snapshot_time)
def buildScrapyResponse(response, body, url): def buildScrapyResponse(response, body, url):
status = response.code status = response.code
@ -126,7 +132,15 @@ def downloadDone(response_str, request, response, feed_config):
response_str = buildFeed(response, feed_config) response_str = buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml') request.setHeader(b"Content-Type", b'text/xml')
else: else:
response_str = setBaseAndRemoveScriptsAndMore(response, url) response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
# import pdb;pdb.set_trace()
with open(file_name, 'w') as f:
f.write(url + '\n')
for k, v in response.headers.iteritems():
for vv in v:
f.write('%s: %s\n' % (k, vv))
f.write('\n\n' + response_str)
request.write(response_str) request.write(response_str)
request.finish() request.finish()
@ -137,7 +151,10 @@ def error_html(msg):
def downloadError(error, request=None, url=None, response=None, feed_config=None): def downloadError(error, request=None, url=None, response=None, feed_config=None):
# read for details: https://stackoverflow.com/questions/29423986/twisted-giving-twisted-web-client-partialdownloaderror-200-ok # read for details: https://stackoverflow.com/questions/29423986/twisted-giving-twisted-web-client-partialdownloaderror-200-ok
if error.type is PartialDownloadError and error.value.status == '200': if error.type is PartialDownloadError and error.value.status == '200':
downloadDone(error.value.response, request, response, feed_config) d = defer.Deferred()
reactor.callLater(0, d.callback, error.value.response) # error.value.response is response_str
d.addCallback(downloadDone, request=request, response=response, feed_config=feed_config)
d.addErrback(downloadError, request=request, url=url, response=response, feed_config=feed_config)
return return
if DEBUG: if DEBUG:
@ -166,7 +183,6 @@ class Downloader(resource.Resource):
None None
) )
print 'Request <GET %s> started' % (url,) print 'Request <GET %s> started' % (url,)
response_ref = []
d.addCallback(downloadStarted, request=request, url=url, feed_config=feed_config) d.addCallback(downloadStarted, request=request, url=url, feed_config=feed_config)
d.addErrback(downloadError, request=request, url=url) d.addErrback(downloadError, request=request, url=url)

View File

@ -171,3 +171,5 @@ FEED1_PAGE_URL = '/feed1/'
DOWNLOADER_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36' DOWNLOADER_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
# limit of seconds in which user can access separate feed # limit of seconds in which user can access separate feed
FEED_REQUEST_PERIOD_LIMIT = 0 FEED_REQUEST_PERIOD_LIMIT = 0
SNAPSHOT_DIR = '/tmp'

View File

@ -38,6 +38,23 @@ function check_pathes(pathes) {
var _config = ['', {}]; var _config = ['', {}];
var _active = false; var _active = false;
/**
 * Show the result for one selector next to its input.
 *
 * The control group is looked up as the grandparent of the element with id
 * "ste-<name>". If `messages` has an "error" key, the group is switched from
 * the "info" class to the "error" class and the error text is shown in its
 * ".help-inline" element; otherwise the group is switched from "error" to
 * "info" and `messages['count']` is shown instead.
 *
 * @param {string} name      suffix of the selector element id ("ste-" + name)
 * @param {Object} messages  object carrying either an "error" or a "count" entry
 */
function updateSelector(name, messages) {
    var group = $('#ste-' + name).parent().parent();
    var hasError = 'error' in messages;

    // Swap the state class first, then fill in the matching help text.
    group.removeClass(hasError ? 'info' : 'error')
         .addClass(hasError ? 'error' : 'info');
    group.find('.help-inline').text(messages[hasError ? 'error' : 'count']);
}
// show status and error messages
// NOTE(review): the body is currently empty, so `data` is accepted but
// ignored and nothing is shown yet.
function updateUIMessages(data) {
}
function updateUI(config) { function updateUI(config) {
console.log(config); console.log(config);
_config = config; _config = config;
@ -89,6 +106,7 @@ window.ET = {
'init': init_tool, 'init': init_tool,
'check': check_pathes, 'check': check_pathes,
'updateUI': updateUI, 'updateUI': updateUI,
'getUIConfig': getUIConfig,
'active': active 'active': active
}; };

View File

@ -405,8 +405,11 @@ function onCreateButtonClick() {
if (active) { if (active) {
//freeze UI //freeze UI
loader(true); loader(true);
createFeed().then(function(feed_page_url){ createFeed().then(function(data){
window.location.href = feed_page_url; if (typeof(data) == 'string'))
window.location.href = data; // feed_page_url
else
ET.updateUI(data);
}, function(error){ }, function(error){
//unfreeze UI //unfreeze UI
loader(false); loader(false);
@ -416,16 +419,21 @@ function onCreateButtonClick() {
} }
function createFeed() { function createFeed() {
// gather selected tag-ids if (!ET.active()) {
var name_ids = {}; // gather selected tag-ids
selected_any = gatherSelectedTagIds(name_ids); var name_ids = {};
selected_any = gatherSelectedTagIds(name_ids);
}
if (selected_any) if (selected_any)
return new Promise(function(resolve, reject){ return new Promise(function(resolve, reject){
$.ajax({ $.ajax({
type: 'POST', type: 'POST',
url: "/setup_create_feed", url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
data: JSON.stringify({ html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }), data: JSON.stringify(ET.active
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
),
contentType: "application/json; charset=utf-8", contentType: "application/json; charset=utf-8",
headers: {"X-CSRFToken": getCookie('csrftoken')}, headers: {"X-CSRFToken": getCookie('csrftoken')},
success: function(data){ success: function(data){
@ -471,7 +479,9 @@ $(document).ready(function(){
// attach iframe elements event handlers // attach iframe elements event handlers
$('iframe').contents().on('click', '*[tag-id]', onIframeElementClick); $('iframe').contents().on('click', '*[tag-id]', onIframeElementClick);
$('iframe').contents().on('mouseenter mouseleave', '*[tag-id]', onIframeElementHover); $('iframe').contents().on('mouseenter mouseleave', '*[tag-id]', onIframeElementHover);
iframeHtmlJson = $('iframe')[0].contentWindow.html2json; var iframe_window = $('iframe')[0].contentWindow;
iframeHtmlJson = iframe_window.html2json;
snapshot_time = iframe_window.snapshot_time;
loader(false); loader(false);
}); });