v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-28 12:00:09 -07:00

xpathes in progress

This commit is contained in:
Alexandr Nesterenko 2017-07-30 15:25:31 +00:00
parent a08a1b3675
commit 0bde3df8a9
4 changed files with 61 additions and 15 deletions

View File

@ -1,9 +1,10 @@
import json
import time, sys
from hashlib import md5
from datetime import datetime
from twisted.web import server, resource
from twisted.internet import reactor, endpoints
from twisted.internet import reactor, endpoints, defer
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError
from twisted.web.server import NOT_DONE_YET
from twisted.web.http_headers import Headers
@ -22,7 +23,7 @@ import re
from feed import getFeedData, buildFeed
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR
if FEED_REQUEST_PERIOD_LIMIT:
@ -55,6 +56,8 @@ def setBaseAndRemoveScriptsAndMore(response, url):
tree = response.selector.root.getroottree()
snapshot_time = str(time.time())
# set base url to html document
head = tree.xpath("//head")
if head:
@ -96,10 +99,13 @@ def setBaseAndRemoveScriptsAndMore(response, url):
# append html2json js object
jsobj = html2json(tree.getroot())
script = etree.Element('script', {'type': 'text/javascript'})
script.text = 'var html2json = ' + json.dumps(jsobj) + ';'
script.text = '\n'.join((
'var html2json = ' + json.dumps(jsobj) + ';',
'var snapshot_time = "' + snapshot_time + '";'
))
body[0].append(script)
return etree.tostring(tree, method='html')
return (etree.tostring(tree, method='html'), snapshot_time)
def buildScrapyResponse(response, body, url):
status = response.code
@ -126,7 +132,15 @@ def downloadDone(response_str, request, response, feed_config):
response_str = buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml')
else:
response_str = setBaseAndRemoveScriptsAndMore(response, url)
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
# import pdb;pdb.set_trace()
with open(file_name, 'w') as f:
f.write(url + '\n')
for k, v in response.headers.iteritems():
for vv in v:
f.write('%s: %s\n' % (k, vv))
f.write('\n\n' + response_str)
request.write(response_str)
request.finish()
@ -137,7 +151,10 @@ def error_html(msg):
def downloadError(error, request=None, url=None, response=None, feed_config=None):
# read for details: https://stackoverflow.com/questions/29423986/twisted-giving-twisted-web-client-partialdownloaderror-200-ok
if error.type is PartialDownloadError and error.value.status == '200':
downloadDone(error.value.response, request, response, feed_config)
d = defer.Deferred()
reactor.callLater(0, d.callback, error.value.response) # error.value.response is response_str
d.addCallback(downloadDone, request=request, response=response, feed_config=feed_config)
d.addErrback(downloadError, request=request, url=url, response=response, feed_config=feed_config)
return
if DEBUG:
@ -166,7 +183,6 @@ class Downloader(resource.Resource):
None
)
print 'Request <GET %s> started' % (url,)
response_ref = []
d.addCallback(downloadStarted, request=request, url=url, feed_config=feed_config)
d.addErrback(downloadError, request=request, url=url)

View File

@ -171,3 +171,5 @@ FEED1_PAGE_URL = '/feed1/'
DOWNLOADER_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
# limit of seconds in which user can access separate feed
FEED_REQUEST_PERIOD_LIMIT = 0
SNAPSHOT_DIR = '/tmp'

View File

@ -38,6 +38,23 @@ function check_pathes(pathes) {
var _config = ['', {}];
var _active = false;
/**
 * Refresh the status display of one selector control.
 *
 * Looks up the element with id "ste-<name>", walks two parents up to its
 * control group, and then either marks the group as failed and shows
 * messages['error'], or marks it as informational and shows
 * messages['count'].
 *
 * @param {string} name      suffix of the "ste-" element id
 * @param {Object} messages  holds an 'error' key on failure, otherwise 'count'
 */
function updateSelector(name, messages) {
    var group = $('#ste-' + name).parent().parent();
    var failed = 'error' in messages;

    // The two states are mutually exclusive: drop one class, set the other.
    var staleClass = failed ? 'info' : 'error';
    var freshClass = failed ? 'error' : 'info';
    var helpText = failed ? messages['error'] : messages['count'];

    group.removeClass(staleClass).addClass(freshClass);
    group.find('.help-inline').text(helpText);
}
// show status and error messages
// NOTE: placeholder - the body is still empty, so calling this currently
// does nothing with `data`.
function updateUIMessages(data) {
}
function updateUI(config) {
console.log(config);
_config = config;
@ -89,6 +106,7 @@ window.ET = {
'init': init_tool,
'check': check_pathes,
'updateUI': updateUI,
'getUIConfig': getUIConfig,
'active': active
};

View File

@ -405,8 +405,11 @@ function onCreateButtonClick() {
if (active) {
//freeze UI
loader(true);
createFeed().then(function(feed_page_url){
window.location.href = feed_page_url;
createFeed().then(function(data){
// A string result is treated as the feed page URL and navigated to;
// any other result is handed to the selector tool via ET.updateUI.
// (Fix: removed the stray extra ')' that made this `if` a syntax error.)
if (typeof(data) == 'string')
    window.location.href = data; // feed_page_url
else
    ET.updateUI(data);
}, function(error){
//unfreez UI
loader(false);
@ -416,16 +419,21 @@ function onCreateButtonClick() {
}
function createFeed() {
// gather selected tag-ids
var name_ids = {};
selected_any = gatherSelectedTagIds(name_ids);
// NOTE(review): selected_any is only assigned inside this branch, i.e. when
// the selector tool is NOT active. When ET.active() is true, the
// `if (selected_any)` check that follows reads whatever value the variable
// held before this call (or fails if it was never assigned) - confirm what
// the intended condition is for the active case.
if (!ET.active()) {
// gather selected tag-ids
var name_ids = {};
selected_any = gatherSelectedTagIds(name_ids);
}
if (selected_any)
return new Promise(function(resolve, reject){
$.ajax({
type: 'POST',
url: "/setup_create_feed",
data: JSON.stringify({ html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }),
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
data: JSON.stringify(ET.active
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
),
contentType: "application/json; charset=utf-8",
headers: {"X-CSRFToken": getCookie('csrftoken')},
success: function(data){
@ -471,7 +479,9 @@ $(document).ready(function(){
// attach iframe elements event handlers
$('iframe').contents().on('click', '*[tag-id]', onIframeElementClick);
$('iframe').contents().on('mouseenter mouseleave', '*[tag-id]', onIframeElementHover);
iframeHtmlJson = $('iframe')[0].contentWindow.html2json;
var iframe_window = $('iframe')[0].contentWindow;
iframeHtmlJson = iframe_window.html2json;
snapshot_time = iframe_window.snapshot_time;
loader(false);
});