From 87bbd2a4b16433b8b2fbe01ac39b18e43dae70ab Mon Sep 17 00:00:00 2001
From: Alexandr Nesterenko
Date: Wed, 28 Oct 2015 18:02:07 +0000
Subject: [PATCH] parse url as json parameter

---
 decompression.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 downloader.py    | 55 +++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 decompression.py
 create mode 100644 downloader.py

diff --git a/decompression.py b/decompression.py
new file mode 100644
index 0000000..35f3147
--- /dev/null
+++ b/decompression.py
@@ -0,0 +1,90 @@
+""" This module implements the DecompressionMiddleware which tries to recognise
+and extract the potentially compressed responses that may arrive.
+"""
+
+import bz2
+import gzip
+import zipfile
+import tarfile
+import logging
+from tempfile import mktemp
+
+import six
+
+try:
+    from cStringIO import StringIO as BytesIO
+except ImportError:
+    from io import BytesIO
+
+from scrapy.responsetypes import responsetypes
+
+from scrapy.http.response import Response
+
+logger = logging.getLogger(__name__)
+
+
+class DecompressionMiddleware(object):
+    """ This middleware tries to recognise and extract the possibly compressed
+    responses that may arrive. """
+
+    def __init__(self):
+        self._formats = {
+            'tar': self._is_tar,
+            'zip': self._is_zip,
+            'gz': self._is_gzip,
+            'bz2': self._is_bzip2
+        }
+
+    def _is_tar(self, response):
+        archive = BytesIO(response.body)
+        try:
+            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
+        except tarfile.ReadError:
+            return
+
+        body = tar_file.extractfile(tar_file.members[0]).read()
+        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
+        return response.replace(body=body, cls=respcls)
+
+    def _is_zip(self, response):
+        archive = BytesIO(response.body)
+        try:
+            zip_file = zipfile.ZipFile(archive)
+        except zipfile.BadZipfile:
+            return
+
+        namelist = zip_file.namelist()
+        body = zip_file.read(namelist[0])
+        respcls = responsetypes.from_args(filename=namelist[0], body=body)
+        return response.replace(body=body, cls=respcls)
+
+    def _is_gzip(self, response):
+        archive = BytesIO(response.body)
+        try:
+            body = gzip.GzipFile(fileobj=archive).read()
+        except IOError:
+            return
+
+        respcls = responsetypes.from_args(body=body)
+        return response.replace(body=body, cls=respcls)
+
+    def _is_bzip2(self, response):
+        try:
+            body = bz2.decompress(response.body)
+        except IOError:
+            return
+
+        respcls = responsetypes.from_args(body=body)
+        return response.replace(body=body, cls=respcls)
+
+    # Unlike Scrapy's middleware API, this takes a raw response body and returns bytes.
+    def process_response(self, response_str):
+        response = Response('http://xxx.yyy', body=response_str)  # placeholder URL; only the body matters
+
+        for fmt, func in six.iteritems(self._formats):
+            new_response = func(response)
+            if new_response:
+                logger.debug('Decompressed response with format: %(responsefmt)s',
+                             {'responsefmt': fmt})
+                return new_response.body
+        return response.body
diff --git a/downloader.py b/downloader.py
new file mode 100644
index 0000000..31e3555
--- /dev/null
+++ b/downloader.py
@@ -0,0 +1,55 @@
+import json
+
+from twisted.web import server, resource
+from twisted.internet import reactor, endpoints
+from twisted.web.client import HTTPClientFactory, _makeGetterFactory
+from twisted.web.server import NOT_DONE_YET
+
+from decompression import DecompressionMiddleware
+
+
+def getPageFactory(url, contextFactory=None, *args, **kwargs):
+    """
+    Download a web page as a string.
+    Return a deferred, which will callback with the page body
+    (as a string) or errback with a description of the error.
+    See L{HTTPClientFactory} to see what extra arguments can be passed.
+    """
+    return _makeGetterFactory(
+        url,
+        HTTPClientFactory,
+        contextFactory=contextFactory,
+        *args, **kwargs)
+
+
+def downloadDone(response, request=None, page_factory=None):
+    response = DecompressionMiddleware().process_response(response)
+
+    request.write(response)
+    request.finish()
+
+def downloadError(error, request=None, page_factory=None):
+    request.write(('Downloader error: ' + str(error.value)).encode('utf-8'))
+    request.finish()
+
+
+class Counter(resource.Resource):
+    isLeaf = True
+
+    def render_POST(self, request):
+        obj = json.load(request.content)  # request body is a JSON array, e.g. ["http://example.com"]
+        url = obj[0].encode('utf-8')  # first element is the URL to download
+
+        page_factory = getPageFactory(url,
+            headers={
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Encoding': 'gzip, deflate, sdch',
+                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
+            })
+        d = page_factory.deferred
+        d.addCallback(downloadDone, request=request, page_factory=page_factory)
+        d.addErrback(downloadError, request=request, page_factory=page_factory)
+        return NOT_DONE_YET
+
+endpoints.serverFromString(reactor, "tcp:8080").listen(server.Site(Counter()))
+reactor.run()
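
For reference, a minimal sketch of how decompression.py could be exercised on its own, assuming Scrapy (which it imports) is installed; the sample payloads are illustrative only:

    import gzip
    from io import BytesIO

    from decompression import DecompressionMiddleware

    # Build a gzip-compressed body in memory.
    buf = BytesIO()
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(b'<html><body>hello</body></html>')

    mw = DecompressionMiddleware()

    # Compressed input comes back decompressed ...
    assert mw.process_response(buf.getvalue()) == b'<html><body>hello</body></html>'
    # ... and input that matches no known format is passed through unchanged.
    assert mw.process_response(b'plain text') == b'plain text'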
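
And a sketch of a client for downloader.py, assuming the service above is running on localhost:8080; the target URL is illustrative and can be any page the server can reach:

    import json
    try:
        from urllib.request import Request, urlopen  # Python 3
    except ImportError:
        from urllib2 import Request, urlopen  # Python 2

    # render_POST expects the request body to be a JSON array whose first
    # element is the URL to download.
    payload = json.dumps(['http://httpbin.org/gzip']).encode('utf-8')
    req = Request('http://localhost:8080/', data=payload,
                  headers={'Content-Type': 'application/json'})
    print(urlopen(req).read()[:200])  # decompressed body of the downloaded page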