diff --git a/decompression.py b/decompression.py
deleted file mode 100644
index 35f3147..0000000
--- a/decompression.py
+++ /dev/null
@@ -1,90 +0,0 @@
-""" This module implements the DecompressionMiddleware which tries to recognise
-and extract the potentially compressed responses that may arrive.
-"""
-
-import bz2
-import gzip
-import zipfile
-import tarfile
-import logging
-from tempfile import mktemp
-
-import six
-
-try:
-    from cStringIO import StringIO as BytesIO
-except ImportError:
-    from io import BytesIO
-
-from scrapy.responsetypes import responsetypes
-
-from scrapy.http.response import Response
-
-logger = logging.getLogger(__name__)
-
-
-class DecompressionMiddleware(object):
-    """ This middleware tries to recognise and extract the possibly compressed
-    responses that may arrive. """
-
-    def __init__(self):
-        self._formats = {
-            'tar': self._is_tar,
-            'zip': self._is_zip,
-            'gz': self._is_gzip,
-            'bz2': self._is_bzip2
-        }
-
-    def _is_tar(self, response):
-        archive = BytesIO(response.body)
-        try:
-            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
-        except tarfile.ReadError:
-            return
-
-        body = tar_file.extractfile(tar_file.members[0]).read()
-        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
-        return response.replace(body=body, cls=respcls)
-
-    def _is_zip(self, response):
-        archive = BytesIO(response.body)
-        try:
-            zip_file = zipfile.ZipFile(archive)
-        except zipfile.BadZipfile:
-            return
-
-        namelist = zip_file.namelist()
-        body = zip_file.read(namelist[0])
-        respcls = responsetypes.from_args(filename=namelist[0], body=body)
-        return response.replace(body=body, cls=respcls)
-
-    def _is_gzip(self, response):
-        archive = BytesIO(response.body)
-        try:
-            body = gzip.GzipFile(fileobj=archive).read()
-        except IOError:
-            return
-
-        respcls = responsetypes.from_args(body=body)
-        return response.replace(body=body, cls=respcls)
-
-    def _is_bzip2(self, response):
-        try:
-            body = bz2.decompress(response.body)
-        except IOError:
-            return
-
-        respcls = responsetypes.from_args(body=body)
-        return response.replace(body=body, cls=respcls)
-
-    #modified by me
-    def process_response(self, response_str):
-        response = Response('http://xxx.yyy', body=response_str)
-
-        for fmt, func in six.iteritems(self._formats):
-            new_response = func(response)
-            if new_response:
-                logger.debug('Decompressed response with format: %(responsefmt)s',
-                             {'responsefmt': fmt})
-                return new_response.body
-        return response.body
diff --git a/downloader.py b/downloader.py
index 31e3555..d98b9e4 100644
--- a/downloader.py
+++ b/downloader.py
@@ -5,7 +5,11 @@ from twisted.internet import reactor, endpoints
 from twisted.web.client import HTTPClientFactory, _makeGetterFactory
 from twisted.web.server import NOT_DONE_YET
 
-from decompression import DecompressionMiddleware
+from scrapy.http.response import Response
+from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
+from scrapy.selector import Selector
+
+from lxml import etree
 
 
 def getPageFactory(url, contextFactory=None, *args, **kwargs):
@@ -22,13 +26,44 @@ def getPageFactory(url, contextFactory=None, *args, **kwargs):
         *args, **kwargs)
 
 
-def downloadDone(response, request=None, page_factory=None):
-    response = DecompressionMiddleware().process_response(response)
+def setBaseAndRemoveScripts(selector, url):
+    tree = selector._root.getroottree()
+
+    # set base url to html document
+    head = tree.xpath("//head")
+    if head:
+        head = head[0]
+        base = head.xpath("./base")
+        if base:
+            base = base[0]
+        else:
+            base = etree.Element("base")
+            head.append(base)
+        base.set('href', url)
 
-    request.write(response)
+    for bad in tree.xpath("//*"):
+        # remove scripts
+        if bad.tag == 'script':
+            bad.getparent().remove(bad)
+        # remove html events; iterate over a copy of the keys so the
+        # attribute mapping is not mutated while being traversed
+        for attr in list(bad.attrib):
+            if attr.startswith('on'):
+                del bad.attrib[attr]
+
+    return etree.tostring(tree, pretty_print=True)
+
+def downloadDone(response_str, request=None, page_factory=None, url=None):
+    response = Response(url, body=response_str)
+    response = DecompressionMiddleware().process_response(None, response, None)
+
+    sel = Selector(response)
+    response_str = setBaseAndRemoveScripts(sel, url)
+
+    request.write(response_str)
     request.finish()
 
 
 def downloadError(error, request=None, page_factory=None):
     request.write('Downloader error: ' + error.value)
     request.finish()
@@ -45,9 +80,12 @@ class Counter(resource.Resource):
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
-            })
+            },
+            redirectLimit=13,
+            timeout=5
+        )
 
         d = page_factory.deferred
-        d.addCallback(downloadDone, request=request, page_factory=page_factory)
+        d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
         d.addErrback(downloadError, request=request, page_factory=page_factory)
         return NOT_DONE_YET