v/pol
mirror of https://github.com/taroved/pol synced 2025-05-20 08:00:14 -07:00

Get HTML with a base tag and with scripts removed

Alexandr Nesterenko 2015-11-20 15:47:22 +00:00
parent 87bbd2a4b1
commit 30e80f7bff
2 changed files with 44 additions and 96 deletions

Deleted file: the local decompression middleware module (the `decompression` import dropped in the second file below)

@@ -1,90 +0,0 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""
import bz2
import gzip
import zipfile
import tarfile
import logging
from tempfile import mktemp
import six
try:
from cStringIO import StringIO as BytesIO
except ImportError:
from io import BytesIO
from scrapy.responsetypes import responsetypes
from scrapy.http.response import Response
logger = logging.getLogger(__name__)
class DecompressionMiddleware(object):
""" This middleware tries to recognise and extract the possibly compressed
responses that may arrive. """
def __init__(self):
self._formats = {
'tar': self._is_tar,
'zip': self._is_zip,
'gz': self._is_gzip,
'bz2': self._is_bzip2
}
def _is_tar(self, response):
archive = BytesIO(response.body)
try:
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
except tarfile.ReadError:
return
body = tar_file.extractfile(tar_file.members[0]).read()
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
return response.replace(body=body, cls=respcls)
def _is_zip(self, response):
archive = BytesIO(response.body)
try:
zip_file = zipfile.ZipFile(archive)
except zipfile.BadZipfile:
return
namelist = zip_file.namelist()
body = zip_file.read(namelist[0])
respcls = responsetypes.from_args(filename=namelist[0], body=body)
return response.replace(body=body, cls=respcls)
def _is_gzip(self, response):
archive = BytesIO(response.body)
try:
body = gzip.GzipFile(fileobj=archive).read()
except IOError:
return
respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)
def _is_bzip2(self, response):
try:
body = bz2.decompress(response.body)
except IOError:
return
respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)
#modified by me
def process_response(self, response_str):
response = Response('http://xxx.yyy', body=response_str)
for fmt, func in six.iteritems(self._formats):
new_response = func(response)
if new_response:
logger.debug('Decompressed response with format: %(responsefmt)s',
{'responsefmt': fmt})
return new_response.body
return response.body
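The commit drops this locally modified copy in favour of scrapy's stock DecompressionMiddleware, whose process_response takes (request, response, spider) and returns a Response object rather than a raw body string. A minimal sketch of the equivalent standalone call, mirroring the new downloadDone in the second file (the helper name decompress_body is illustrative, not from the repo):

    from scrapy.http.response import Response
    from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware

    def decompress_body(url, body):
        # the stock middleware works on Response objects, not strings, so wrap
        # the raw bytes first; passing None for request and spider mirrors the
        # process_response(None, response, None) call made in downloadDone below
        response = Response(url, body=body)
        response = DecompressionMiddleware().process_response(None, response, None)
        return response.body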

Changed file: the Twisted downloader/server module (getPageFactory, downloadDone, downloadError and the Counter resource)

@@ -5,7 +5,11 @@ from twisted.internet import reactor, endpoints
 from twisted.web.client import HTTPClientFactory, _makeGetterFactory
 from twisted.web.server import NOT_DONE_YET
-from decompression import DecompressionMiddleware
+from scrapy.http.response import Response
+from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
+from scrapy.selector import Selector
+from lxml import etree
 
 def getPageFactory(url, contextFactory=None, *args, **kwargs):
@@ -22,13 +26,44 @@ def getPageFactory(url, contextFactory=None, *args, **kwargs):
         *args, **kwargs)
 
-def downloadDone(response, request=None, page_factory=None):
-    response = DecompressionMiddleware().process_response(response)
-    request.write(response)
+def setBaseAndRemoveScripts(selector, url):
+    tree = selector._root.getroottree()
+
+    # set base url to html document
+    head = tree.xpath("//head")
+    if head:
+        head = head[0]
+        base = head.xpath("./base")
+        if base:
+            base = base[0]
+        else:
+            base = etree.Element("base")
+            head.append(base)
+        base.set('href', url)
+
+    for bad in tree.xpath("//*"):
+        # remove scripts
+        if bad.tag == 'script':
+            bad.getparent().remove(bad)
+        # remove html events
+        for attr in bad.attrib:
+            if attr.startswith('on'):
+                del bad.attrib[attr]
+
+    return etree.tostring(tree, pretty_print=True)
+
+def downloadDone(response_str, request=None, page_factory=None, url=None):
+    response = Response(url, body=response_str)
+    response = DecompressionMiddleware().process_response(None, response, None)
+
+    sel = Selector(response)
+    response_str = setBaseAndRemoveScripts(sel, url)
+
+    request.write(response_str)
     request.finish()
 
 def downloadError(error, request=None, page_factory=None):
+    import pdb; pdb.set_trace()
     request.write('Downloader error: ' + error.value)
     request.finish()
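The new setBaseAndRemoveScripts ensures <head> carries a <base href> pointing at the original URL, deletes every <script> element, and strips on* event-handler attributes before serialising the tree back to HTML. A standalone sketch of the same idea with plain lxml.html, outside the Twisted/Scrapy plumbing (clean_html and the sample markup are illustrative, not from the repo):

    from lxml import etree, html

    def clean_html(html_str, url):
        tree = html.fromstring(html_str).getroottree()

        # make sure <head> has a <base href="..."> so relative links still resolve
        head = tree.xpath("//head")
        if head:
            head = head[0]
            base = head.xpath("./base")
            base = base[0] if base else etree.SubElement(head, "base")
            base.set('href', url)

        for el in tree.xpath("//*"):
            if el.tag == 'script':           # drop script elements entirely
                el.getparent().remove(el)
                continue
            for attr in list(el.attrib):     # drop inline event handlers (onclick, onload, ...)
                if attr.startswith('on'):
                    del el.attrib[attr]

        return etree.tostring(tree, pretty_print=True)

    # clean_html('<html><head></head><body onload="init()"><script>init()</script>'
    #            '<a href="/p">p</a></body></html>', 'http://example.com/')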
@@ -45,9 +80,12 @@ class Counter(resource.Resource):
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
-            })
+            },
+            redirectLimit=13,
+            timeout=5
+        )
 
         d = page_factory.deferred
-        d.addCallback(downloadDone, request=request, page_factory=page_factory)
+        d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
         d.addErrback(downloadError, request=request, page_factory=page_factory)
 
         return NOT_DONE_YET
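For reference, redirectLimit and timeout added above are ordinary HTTPClientFactory keyword arguments, and the downloaded body arrives through the factory's deferred. A self-contained sketch of the same fetch pattern outside the server (fetch, done and failed are hypothetical helpers; Python 2-era Twisted, as used by this repo):

    from twisted.internet import reactor
    from twisted.web.client import HTTPClientFactory, _makeGetterFactory

    def fetch(url):
        # same options the commit adds: follow at most 13 redirects, 5 second timeout
        factory = _makeGetterFactory(url, HTTPClientFactory,
                                     headers={'Accept-Encoding': 'gzip, deflate, sdch'},
                                     redirectLimit=13,
                                     timeout=5)
        return factory.deferred

    def done(body):
        print('fetched %d bytes' % len(body))
        reactor.stop()

    def failed(err):
        print('download error: %s' % err.value)
        reactor.stop()

    fetch('http://example.com/').addCallbacks(done, failed)
    reactor.run()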