Mirror of https://github.com/taroved/pol (synced 2025-05-16 06:10:09 -07:00)
Commit 87bbd2a4b1: parse url as json parameter
decompression.py (new file, 90 lines)
@@ -0,0 +1,90 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""

import bz2
import gzip
import zipfile
import tarfile
import logging
from tempfile import mktemp

import six

try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from scrapy.responsetypes import responsetypes
from scrapy.http.response import Response

logger = logging.getLogger(__name__)


class DecompressionMiddleware(object):
    """ This middleware tries to recognise and extract the possibly compressed
    responses that may arrive. """

    def __init__(self):
        self._formats = {
            'tar': self._is_tar,
            'zip': self._is_zip,
            'gz': self._is_gzip,
            'bz2': self._is_bzip2
        }

    def _is_tar(self, response):
        archive = BytesIO(response.body)
        try:
            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
        except tarfile.ReadError:
            return

        body = tar_file.extractfile(tar_file.members[0]).read()
        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        archive = BytesIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return

        namelist = zip_file.namelist()
        body = zip_file.read(namelist[0])
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        try:
            body = bz2.decompress(response.body)
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    # Modified from the stock Scrapy middleware: takes the raw response body
    # instead of a Request/Response pair and returns the (possibly
    # decompressed) body bytes.
    def process_response(self, response_str):
        response = Response('http://xxx.yyy', body=response_str)

        for fmt, func in six.iteritems(self._formats):
            new_response = func(response)
            if new_response:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt})
                return new_response.body
        return response.body
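For context, here is a minimal sketch (not part of the commit) of exercising this middleware on its own. It assumes Scrapy and six are installed; the gzip payload is built in memory purely for illustration.

import gzip
from io import BytesIO

from decompression import DecompressionMiddleware

# Build a small gzip-compressed body in memory (illustrative content only).
buf = BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as f:
    f.write(b'<html><body>hello</body></html>')

# process_response tries each registered format and returns the decompressed
# bytes of the first one that matches (here: gzip).
print(DecompressionMiddleware().process_response(buf.getvalue()))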
downloader.py (new file, 55 lines)
@@ -0,0 +1,55 @@
import json

from twisted.web import server, resource
from twisted.internet import reactor, endpoints
from twisted.web.client import HTTPClientFactory, _makeGetterFactory
from twisted.web.server import NOT_DONE_YET

from decompression import DecompressionMiddleware


def getPageFactory(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Download a page. Return a deferred, which will callback with a
    page (as a string) or errback with a description of the error.

    See L{HTTPClientFactory} to see what extra arguments can be passed.
    """
    return _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)


def downloadDone(response, request=None, page_factory=None):
    # Run the raw body through the decompression middleware before replying.
    response = DecompressionMiddleware().process_response(response)

    request.write(response)
    request.finish()


def downloadError(error, request=None, page_factory=None):
    # error.value is an exception instance; stringify it before writing.
    request.write('Downloader error: ' + str(error.value))
    request.finish()


class Counter(resource.Resource):
    isLeaf = True

    def render_POST(self, request):
        # The request body is a JSON array whose first element is the URL to fetch.
        obj = json.load(request.content)
        url = obj[0].encode('utf-8')

        page_factory = getPageFactory(url,
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
            })
        d = page_factory.deferred
        d.addCallback(downloadDone, request=request, page_factory=page_factory)
        d.addErrback(downloadError, request=request, page_factory=page_factory)
        return NOT_DONE_YET


endpoints.serverFromString(reactor, "tcp:8080").listen(server.Site(Counter()))
reactor.run()
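With the service running, the commit's "parse url as json parameter" behaviour can be exercised from any HTTP client: POST a JSON array whose first element is the URL to fetch, and the body of that page (decompressed if needed) comes back in the response. Below is a minimal client sketch, not part of the commit, using only the standard library; example.com stands in for a real page.

import json
try:
    from urllib.request import Request, urlopen   # Python 3
except ImportError:
    from urllib2 import Request, urlopen           # Python 2

# The downloader expects the URL as the first element of a JSON array.
payload = json.dumps(['https://example.com/']).encode('utf-8')
req = Request('http://localhost:8080/', data=payload,
              headers={'Content-Type': 'application/json'})
print(urlopen(req).read())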