"""
This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""

import bz2
import gzip
import zipfile
import tarfile
import logging
import zlib
from collections import OrderedDict
from tempfile import mktemp  # noqa: F401 -- unused here now; kept for importers

import six

try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from scrapy.responsetypes import responsetypes
from scrapy.http.response import Response

logger = logging.getLogger(__name__)


class DecompressionMiddleware(object):
    """Try to recognise and extract possibly compressed response bodies.

    Each ``_is_<format>`` probe takes a response and returns either a new
    response holding the decompressed body, or ``None`` when the body cannot
    be decoded as that format.
    """

    def __init__(self):
        # Ordered so probes always run tar -> zip -> gz -> bz2, even on
        # interpreters whose plain dicts do not preserve insertion order.
        # The tar probe should run before the gz/bz2 probes: tarfile's
        # default read mode also handles compressed tarballs, whereas the
        # gz/bz2 probes would only strip the outer compression layer.
        self._formats = OrderedDict([
            ('tar', self._is_tar),
            ('zip', self._is_zip),
            ('gz', self._is_gzip),
            ('bz2', self._is_bzip2),
        ])

    def _is_tar(self, response):
        """Return a response built from the first member of a tar archive.

        Returns ``None`` when the body is not a readable tar archive, when
        the archive has no members, or when the first member is not
        something ``extractfile`` can open (e.g. a directory).
        """
        archive = BytesIO(response.body)
        try:
            # No ``name`` is needed when ``fileobj`` is supplied, so the
            # deprecated, race-prone ``mktemp()`` call is not used.
            tar_file = tarfile.open(fileobj=archive)
        except tarfile.ReadError:
            return None

        try:
            members = tar_file.members
            if not members:
                # A valid but empty archive: nothing to extract.
                return None
            first_member = members[0]
            extracted = tar_file.extractfile(first_member)
            if extracted is None:
                # extractfile() yields None for non-file members.
                return None
            body = extracted.read()
        finally:
            tar_file.close()

        respcls = responsetypes.from_args(filename=first_member.name,
                                          body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        """Return a response built from the first entry of a zip archive.

        Returns ``None`` when the body is not a valid zip archive, when the
        archive has no entries, or when the first entry is corrupt.
        """
        archive = BytesIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return None

        try:
            namelist = zip_file.namelist()
            if not namelist:
                # A valid but empty archive: nothing to extract.
                return None
            first_name = namelist[0]
            body = zip_file.read(first_name)
        except zipfile.BadZipfile:
            # Raised by read() for a damaged entry (e.g. CRC mismatch).
            return None
        finally:
            zip_file.close()

        respcls = responsetypes.from_args(filename=first_name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        """Return a response holding the gunzipped body, or ``None``.

        ``None`` is returned when the body is not gzip data or when the
        stream is truncated or corrupt.
        """
        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except (IOError, EOFError, zlib.error):
            # IOError: not gzip data; EOFError: truncated stream;
            # zlib.error: corrupt deflate payload.
            return None

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        """Return a response holding the bunzipped body, or ``None``.

        ``None`` is returned when the body is not bzip2 data or when the
        stream is truncated.
        """
        try:
            body = bz2.decompress(response.body)
        except (IOError, ValueError):
            # IOError: invalid data stream; ValueError: truncated stream.
            return None

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    # NOTE: locally modified entry point -- it takes the raw body and
    # returns a raw body instead of working with response objects.
    def process_response(self, response_str):
        """Decompress ``response_str`` with the first format that matches.

        :param response_str: raw (possibly compressed) response body.
        :returns: the decompressed body, or the body of the wrapping
            response unchanged when no known format matches.
        """
        # The URL is only a placeholder needed to build a Response object.
        response = Response('http://xxx.yyy', body=response_str)
        for fmt, func in six.iteritems(self._formats):
            new_response = func(response)
            if new_response is not None:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt})
                return new_response.body
        return response.body