v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-16 14:20:10 -07:00
This commit is contained in:
Alexandr Nesterenko 2021-04-15 18:19:46 -07:00
parent 01e63bc787
commit 898c38c71f

View File

@ -6,6 +6,7 @@ import pickle
import time, sys, traceback import time, sys, traceback
import re import re
import six
from lxml import etree from lxml import etree
from twisted.web import server, resource from twisted.web import server, resource
@ -61,14 +62,26 @@ class Downloader(object):
def _saveResponse(self, headers, url, tree): def _saveResponse(self, headers, url, tree):
# save html for extended selectors # save html for extended selectors
file_name = '%s_%s' % (time.time(), md5(url).hexdigest()) if six.PY2:
file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
elif six.PY3:
file_name = '%s_%s' % (time.time(), md5(url.encode('utf-8')).hexdigest())
file_path = self.snapshot_dir + '/' + file_name file_path = self.snapshot_dir + '/' + file_name
with open(file_path, 'w') as f: with open(file_path, 'w') as f:
f.write(url + '\n') f.write(url + '\n')
for k, v in headers.iteritems(): if six.PY2:
for vv in v: for k, v in headers.iteritems():
f.write('%s: %s\n' % (k, vv)) for vv in v:
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html')) f.write('%s: %s\n' % (k, vv))
elif six.PY3:
for k, v in headers.items():
for vv in v:
f.write('%s: %s\n' % (k, vv))
if six.PY2:
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
elif six.PY3:
f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html').decode('utf-8'))
return file_name return file_name
def sanitizeAndNumerate(self, selector, numerate=True, sanitize_anchors=True): def sanitizeAndNumerate(self, selector, numerate=True, sanitize_anchors=True):
@ -123,7 +136,10 @@ class Downloader(object):
else: else:
base = etree.Element("base") base = etree.Element("base")
head.insert(0, base) head.insert(0, base)
base.set('href', url.decode('utf-8')) if six.PY2:
base.set('href', url.decode('utf-8'))
elif six.PY3:
base.set('href', url)
self.sanitizeAndNumerate(selector) self.sanitizeAndNumerate(selector)
@ -138,7 +154,10 @@ class Downloader(object):
)) ))
body[0].append(script) body[0].append(script)
return etree.tostring(tree, method='html') if six.PY2:
return etree.tostring(tree, method='html')
elif six.PY3:
return etree.tostring(tree, method='html').decode('utf-8')
def buildScrapyResponse(self, response, body, url): def buildScrapyResponse(self, response, body, url):
status = response.code status = response.code