mirror of
https://github.com/taroved/pol
synced 2025-06-01 22:10:08 -07:00
xpathes in progress
This commit is contained in:
parent
a08a1b3675
commit
0bde3df8a9
@ -1,9 +1,10 @@
|
|||||||
import json
|
import json
|
||||||
import time, sys
|
import time, sys
|
||||||
|
from hashlib import md5
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from twisted.web import server, resource
|
from twisted.web import server, resource
|
||||||
from twisted.internet import reactor, endpoints
|
from twisted.internet import reactor, endpoints, defer
|
||||||
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError
|
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError
|
||||||
from twisted.web.server import NOT_DONE_YET
|
from twisted.web.server import NOT_DONE_YET
|
||||||
from twisted.web.http_headers import Headers
|
from twisted.web.http_headers import Headers
|
||||||
@ -22,7 +23,7 @@ import re
|
|||||||
|
|
||||||
from feed import getFeedData, buildFeed
|
from feed import getFeedData, buildFeed
|
||||||
|
|
||||||
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG
|
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR
|
||||||
|
|
||||||
|
|
||||||
if FEED_REQUEST_PERIOD_LIMIT:
|
if FEED_REQUEST_PERIOD_LIMIT:
|
||||||
@ -55,6 +56,8 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
|||||||
|
|
||||||
tree = response.selector.root.getroottree()
|
tree = response.selector.root.getroottree()
|
||||||
|
|
||||||
|
snapshot_time = str(time.time())
|
||||||
|
|
||||||
# set base url to html document
|
# set base url to html document
|
||||||
head = tree.xpath("//head")
|
head = tree.xpath("//head")
|
||||||
if head:
|
if head:
|
||||||
@ -96,10 +99,13 @@ def setBaseAndRemoveScriptsAndMore(response, url):
|
|||||||
# append html2json js object
|
# append html2json js object
|
||||||
jsobj = html2json(tree.getroot())
|
jsobj = html2json(tree.getroot())
|
||||||
script = etree.Element('script', {'type': 'text/javascript'})
|
script = etree.Element('script', {'type': 'text/javascript'})
|
||||||
script.text = 'var html2json = ' + json.dumps(jsobj) + ';'
|
script.text = '\n'.join((
|
||||||
|
'var html2json = ' + json.dumps(jsobj) + ';',
|
||||||
|
'var snapshot_time = "' + snapshot_time + '";'
|
||||||
|
))
|
||||||
body[0].append(script)
|
body[0].append(script)
|
||||||
|
|
||||||
return etree.tostring(tree, method='html')
|
return (etree.tostring(tree, method='html'), snapshot_time)
|
||||||
|
|
||||||
def buildScrapyResponse(response, body, url):
|
def buildScrapyResponse(response, body, url):
|
||||||
status = response.code
|
status = response.code
|
||||||
@ -126,7 +132,15 @@ def downloadDone(response_str, request, response, feed_config):
|
|||||||
response_str = buildFeed(response, feed_config)
|
response_str = buildFeed(response, feed_config)
|
||||||
request.setHeader(b"Content-Type", b'text/xml')
|
request.setHeader(b"Content-Type", b'text/xml')
|
||||||
else:
|
else:
|
||||||
response_str = setBaseAndRemoveScriptsAndMore(response, url)
|
response_str, snapshot_time = setBaseAndRemoveScriptsAndMore(response, url)
|
||||||
|
file_name = SNAPSHOT_DIR + '/' + snapshot_time + '_' + md5(url).hexdigest()
|
||||||
|
# import pdb;pdb.set_trace()
|
||||||
|
with open(file_name, 'w') as f:
|
||||||
|
f.write(url + '\n')
|
||||||
|
for k, v in response.headers.iteritems():
|
||||||
|
for vv in v:
|
||||||
|
f.write('%s: %s\n' % (k, vv))
|
||||||
|
f.write('\n\n' + response_str)
|
||||||
|
|
||||||
request.write(response_str)
|
request.write(response_str)
|
||||||
request.finish()
|
request.finish()
|
||||||
@ -137,7 +151,10 @@ def error_html(msg):
|
|||||||
def downloadError(error, request=None, url=None, response=None, feed_config=None):
|
def downloadError(error, request=None, url=None, response=None, feed_config=None):
|
||||||
# read for details: https://stackoverflow.com/questions/29423986/twisted-giving-twisted-web-client-partialdownloaderror-200-ok
|
# read for details: https://stackoverflow.com/questions/29423986/twisted-giving-twisted-web-client-partialdownloaderror-200-ok
|
||||||
if error.type is PartialDownloadError and error.value.status == '200':
|
if error.type is PartialDownloadError and error.value.status == '200':
|
||||||
downloadDone(error.value.response, request, response, feed_config)
|
d = defer.Deferred()
|
||||||
|
reactor.callLater(0, d.callback, error.value.response) # error.value.response is response_str
|
||||||
|
d.addCallback(downloadDone, request=request, response=response, feed_config=feed_config)
|
||||||
|
d.addErrback(downloadError, request=request, url=url, response=response, feed_config=feed_config)
|
||||||
return
|
return
|
||||||
|
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
@ -166,7 +183,6 @@ class Downloader(resource.Resource):
|
|||||||
None
|
None
|
||||||
)
|
)
|
||||||
print 'Request <GET %s> started' % (url,)
|
print 'Request <GET %s> started' % (url,)
|
||||||
response_ref = []
|
|
||||||
d.addCallback(downloadStarted, request=request, url=url, feed_config=feed_config)
|
d.addCallback(downloadStarted, request=request, url=url, feed_config=feed_config)
|
||||||
d.addErrback(downloadError, request=request, url=url)
|
d.addErrback(downloadError, request=request, url=url)
|
||||||
|
|
||||||
|
@ -171,3 +171,5 @@ FEED1_PAGE_URL = '/feed1/'
|
|||||||
DOWNLOADER_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
|
DOWNLOADER_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
|
||||||
# limit of seconds in which user can access separate feed
|
# limit of seconds in which user can access separate feed
|
||||||
FEED_REQUEST_PERIOD_LIMIT = 0
|
FEED_REQUEST_PERIOD_LIMIT = 0
|
||||||
|
|
||||||
|
SNAPSHOT_DIR = '/tmp'
|
||||||
|
@ -38,6 +38,23 @@ function check_pathes(pathes) {
|
|||||||
var _config = ['', {}];
|
var _config = ['', {}];
|
||||||
var _active = false;
|
var _active = false;
|
||||||
|
|
||||||
|
function updateSelector(name, messages) {
|
||||||
|
var control_group = $('#ste-'+ name).parent().parent();
|
||||||
|
if ('error' in messages) {
|
||||||
|
control_group.removeClass('info').addClass('error');
|
||||||
|
control_group.find('.help-inline').text(messages['error']);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
control_group.removeClass('error').addClass('info');
|
||||||
|
control_group.find('.help-inline').text(messages['count']);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// show status and error messages
|
||||||
|
function updateUIMessages(data) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
function updateUI(config) {
|
function updateUI(config) {
|
||||||
console.log(config);
|
console.log(config);
|
||||||
_config = config;
|
_config = config;
|
||||||
@ -89,6 +106,7 @@ window.ET = {
|
|||||||
'init': init_tool,
|
'init': init_tool,
|
||||||
'check': check_pathes,
|
'check': check_pathes,
|
||||||
'updateUI': updateUI,
|
'updateUI': updateUI,
|
||||||
|
'getUIConfig': getUIConfig,
|
||||||
'active': active
|
'active': active
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -405,8 +405,11 @@ function onCreateButtonClick() {
|
|||||||
if (active) {
|
if (active) {
|
||||||
//freeze UI
|
//freeze UI
|
||||||
loader(true);
|
loader(true);
|
||||||
createFeed().then(function(feed_page_url){
|
createFeed().then(function(data){
|
||||||
window.location.href = feed_page_url;
|
if (typeof(data) == 'string'))
|
||||||
|
window.location.href = data; // feed_page_url
|
||||||
|
else
|
||||||
|
ET.updateUI(data);
|
||||||
}, function(error){
|
}, function(error){
|
||||||
//unfreez UI
|
//unfreez UI
|
||||||
loader(false);
|
loader(false);
|
||||||
@ -416,16 +419,21 @@ function onCreateButtonClick() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function createFeed() {
|
function createFeed() {
|
||||||
// gather selected tag-ids
|
if (!ET.active()) {
|
||||||
var name_ids = {};
|
// gather selected tag-ids
|
||||||
selected_any = gatherSelectedTagIds(name_ids);
|
var name_ids = {};
|
||||||
|
selected_any = gatherSelectedTagIds(name_ids);
|
||||||
|
}
|
||||||
|
|
||||||
if (selected_any)
|
if (selected_any)
|
||||||
return new Promise(function(resolve, reject){
|
return new Promise(function(resolve, reject){
|
||||||
$.ajax({
|
$.ajax({
|
||||||
type: 'POST',
|
type: 'POST',
|
||||||
url: "/setup_create_feed",
|
url: EI.active() ? "/setup_create_feed_ext" :"/setup_create_feed",
|
||||||
data: JSON.stringify({ html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }),
|
data: JSON.stringify(ET.active
|
||||||
|
? { selectors: ET.getUIConfig(), snapshot_time: snapshot_time, url:$('#create').data('page-url') }
|
||||||
|
: { html: iframeHtmlJson, names: name_ids, url:$('#create').data('page-url') }
|
||||||
|
),
|
||||||
contentType: "application/json; charset=utf-8",
|
contentType: "application/json; charset=utf-8",
|
||||||
headers: {"X-CSRFToken": getCookie('csrftoken')},
|
headers: {"X-CSRFToken": getCookie('csrftoken')},
|
||||||
success: function(data){
|
success: function(data){
|
||||||
@ -471,7 +479,9 @@ $(document).ready(function(){
|
|||||||
// attach iframe elements event handlers
|
// attach iframe elements event handlers
|
||||||
$('iframe').contents().on('click', '*[tag-id]', onIframeElementClick);
|
$('iframe').contents().on('click', '*[tag-id]', onIframeElementClick);
|
||||||
$('iframe').contents().on('mouseenter mouseleave', '*[tag-id]', onIframeElementHover);
|
$('iframe').contents().on('mouseenter mouseleave', '*[tag-id]', onIframeElementHover);
|
||||||
iframeHtmlJson = $('iframe')[0].contentWindow.html2json;
|
var iframe_window = $('iframe')[0].contentWindow;
|
||||||
|
iframeHtmlJson = iframe_window.html2json;
|
||||||
|
snapshot_time = iframe_window.snapshot_time;
|
||||||
loader(false);
|
loader(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user