v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-16 06:10:09 -07:00

parse url as json parameter

This commit is contained in:
Alexandr Nesterenko 2015-10-28 18:02:07 +00:00
commit 87bbd2a4b1
2 changed files with 145 additions and 0 deletions

90
decompression.py Normal file
View File

@ -0,0 +1,90 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""
import bz2
import gzip
import zipfile
import tarfile
import logging
from tempfile import mktemp
import six
try:
from cStringIO import StringIO as BytesIO
except ImportError:
from io import BytesIO
from scrapy.responsetypes import responsetypes
from scrapy.http.response import Response
logger = logging.getLogger(__name__)
class DecompressionMiddleware(object):
    """Recognise and unpack response bodies that are compressed archives.

    Each ``_is_*`` helper tries exactly one format. It returns a copy of the
    response carrying the decompressed body on success, or ``None`` when the
    body is not usable data in that format (wrong format, truncated or
    corrupt stream, or an archive without any extractable entry).
    """

    def __init__(self):
        # Maps a format label (used only in the debug log) to its detector.
        self._formats = {
            'tar': self._is_tar,
            'zip': self._is_zip,
            'gz': self._is_gzip,
            'bz2': self._is_bzip2,
        }

    def _is_tar(self, response):
        """Return ``response`` with the first tar member as body, else None."""
        archive = BytesIO(response.body)
        try:
            # Reading from ``fileobj`` needs no ``name``, so no temporary
            # file name has to be generated.
            tar_file = tarfile.open(fileobj=archive)
        except tarfile.ReadError:
            return None
        try:
            # An archive may legitimately contain no members at all.
            if not tar_file.members:
                return None
            first = tar_file.members[0]
            extracted = tar_file.extractfile(first)
            # ``extractfile`` yields None for members without file data
            # (for example directories).
            if extracted is None:
                return None
            body = extracted.read()
        finally:
            tar_file.close()
        respcls = responsetypes.from_args(filename=first.name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        """Return ``response`` with the first zip entry as body, else None."""
        archive = BytesIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return None
        try:
            namelist = zip_file.namelist()
            # A valid zip file can be empty; there is nothing to extract then.
            if not namelist:
                return None
            body = zip_file.read(namelist[0])
        finally:
            zip_file.close()
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        """Return ``response`` with the gunzipped body, else None."""
        # Local import: only needed to name the exception type below.
        import zlib

        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except (IOError, EOFError, zlib.error):
            # IOError: not gzip data; EOFError: truncated stream;
            # zlib.error: corrupt compressed payload.
            return None
        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        """Return ``response`` with the bunzipped body, else None."""
        try:
            body = bz2.decompress(response.body)
        except (IOError, ValueError):
            # IOError: not bzip2 data; ValueError: stream ends prematurely.
            return None
        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    # Modified locally: accepts the raw body and returns the raw (possibly
    # decompressed) body.
    def process_response(self, response_str):
        """Return ``response_str`` decompressed if it is a known archive.

        The body is wrapped in a ``Response`` and offered to every detector
        in ``self._formats``; the body produced by the first detector that
        succeeds is returned. If none succeeds the original body is returned.
        """
        # The URL is a placeholder; the detectors only read ``response.body``.
        response = Response('http://xxx.yyy', body=response_str)
        for fmt, func in six.iteritems(self._formats):
            new_response = func(response)
            if new_response is not None:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt})
                return new_response.body
        return response.body

55
downloader.py Normal file
View File

@ -0,0 +1,55 @@
import json
from twisted.web import server, resource
from twisted.internet import reactor, endpoints
from twisted.web.client import HTTPClientFactory, _makeGetterFactory
from twisted.web.server import NOT_DONE_YET
from decompression import DecompressionMiddleware
def getPageFactory(url, contextFactory=None, *args, **kwargs):
    """
    Build the client factory used to fetch C{url}.

    The call is delegated to C{_makeGetterFactory} with
    L{HTTPClientFactory} as the factory class; C{contextFactory} and any
    extra positional or keyword arguments are passed through unchanged.
    See L{HTTPClientFactory} for the extra arguments that can be passed.

    @return: whatever C{_makeGetterFactory} returns. The caller in this
        module reads its C{deferred} attribute to obtain the result of
        the download.
    """
    options = dict(kwargs, contextFactory=contextFactory)
    factory = _makeGetterFactory(url, HTTPClientFactory, *args, **options)
    return factory
def downloadDone(response, request=None, page_factory=None):
    """Send the downloaded page to the waiting client and close the request.

    ``response`` is first passed through
    ``DecompressionMiddleware.process_response``; the value it returns is
    written to ``request``, which is then finished. ``page_factory`` is
    accepted but not used here.
    """
    middleware = DecompressionMiddleware()
    body = middleware.process_response(response)
    request.write(body)
    request.finish()
def downloadError(error, request=None, page_factory=None):
    """Report a failed download to the client and close the request.

    ``error`` is the failure handed to the errback; its ``value`` attribute
    is converted to text and appended to a fixed prefix. The resulting
    message is written to ``request``, which is then finished.
    ``page_factory`` is accepted but not used here.
    """
    # ``error.value`` is an exception object rather than text, so it must be
    # stringified before concatenation (adding it directly raises TypeError
    # and would leave the request unfinished).
    message = 'Downloader error: ' + str(error.value)
    if not isinstance(message, bytes):
        # On Python 3 the message is text; encode it before writing.
        message = message.encode('utf-8')
    request.write(message)
    request.finish()
class Counter(resource.Resource):
    """Leaf resource that downloads the URL named in a POSTed JSON body."""

    isLeaf = True

    def render_POST(self, request):
        """Start downloading the requested URL and answer asynchronously.

        The request body must be a JSON array whose first item is the URL
        string. The download result is delivered later through
        ``downloadDone`` / ``downloadError``, so ``NOT_DONE_YET`` is returned
        once the download has been scheduled. A body that does not match the
        expected shape is answered immediately with status 400.
        """
        try:
            payload = json.load(request.content)
            first = payload[0] if isinstance(payload, list) else None
            url = first.encode('utf-8')
        except (ValueError, IndexError, AttributeError):
            # ValueError: body is not valid JSON; IndexError: empty array;
            # AttributeError: payload is not an array or its first item is
            # not a string.
            request.setResponseCode(400)
            return b'Bad request: expected a JSON array whose first item is a URL string'

        page_factory = getPageFactory(url,
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
            })

        d = page_factory.deferred
        d.addCallback(downloadDone, request=request, page_factory=page_factory)
        d.addErrback(downloadError, request=request, page_factory=page_factory)
        return NOT_DONE_YET
# Serve the Counter resource on TCP port 8080, then hand control to the
# reactor; ``reactor.run()`` is the last statement of the module.
endpoint = endpoints.serverFromString(reactor, "tcp:8080")
site = server.Site(Counter())
endpoint.listen(site)
reactor.run()