mirror of https://github.com/taroved/pol
synced 2025-05-20 08:00:14 -07:00

getting of html with base and with no scripts

This commit is contained in:
parent 87bbd2a4b1
commit 30e80f7bff
@@ -1,90 +0,0 @@
-""" This module implements the DecompressionMiddleware which tries to recognise
-and extract the potentially compressed responses that may arrive.
-"""
-
-import bz2
-import gzip
-import zipfile
-import tarfile
-import logging
-from tempfile import mktemp
-
-import six
-
-try:
-    from cStringIO import StringIO as BytesIO
-except ImportError:
-    from io import BytesIO
-
-from scrapy.responsetypes import responsetypes
-
-from scrapy.http.response import Response
-
-logger = logging.getLogger(__name__)
-
-
-class DecompressionMiddleware(object):
-    """ This middleware tries to recognise and extract the possibly compressed
-    responses that may arrive. """
-
-    def __init__(self):
-        self._formats = {
-            'tar': self._is_tar,
-            'zip': self._is_zip,
-            'gz': self._is_gzip,
-            'bz2': self._is_bzip2
-        }
-
-    def _is_tar(self, response):
-        archive = BytesIO(response.body)
-        try:
-            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
-        except tarfile.ReadError:
-            return
-
-        body = tar_file.extractfile(tar_file.members[0]).read()
-        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
-        return response.replace(body=body, cls=respcls)
-
-    def _is_zip(self, response):
-        archive = BytesIO(response.body)
-        try:
-            zip_file = zipfile.ZipFile(archive)
-        except zipfile.BadZipfile:
-            return
-
-        namelist = zip_file.namelist()
-        body = zip_file.read(namelist[0])
-        respcls = responsetypes.from_args(filename=namelist[0], body=body)
-        return response.replace(body=body, cls=respcls)
-
-    def _is_gzip(self, response):
-        archive = BytesIO(response.body)
-        try:
-            body = gzip.GzipFile(fileobj=archive).read()
-        except IOError:
-            return
-
-        respcls = responsetypes.from_args(body=body)
-        return response.replace(body=body, cls=respcls)
-
-    def _is_bzip2(self, response):
-        try:
-            body = bz2.decompress(response.body)
-        except IOError:
-            return
-
-        respcls = responsetypes.from_args(body=body)
-        return response.replace(body=body, cls=respcls)
-
-    #modified by me
-    def process_response(self, response_str):
-        response = Response('http://xxx.yyy', body=response_str)
-
-        for fmt, func in six.iteritems(self._formats):
-            new_response = func(response)
-            if new_response:
-                logger.debug('Decompressed response with format: %(responsefmt)s',
-                             {'responsefmt': fmt})
-                return new_response.body
-        return response.body
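Note: the deleted file above was a local copy of Scrapy's DecompressionMiddleware whose process_response() had been rewritten to take a bare body string and fabricate a Response with a placeholder URL. Stock Scrapy keeps the downloader-middleware signature process_response(request, response, spider) and returns a Response object rather than a body, which is why the rewritten downloadDone() below builds the Response itself and passes None for the request and spider arguments. A minimal sketch of that calling pattern (the URL and payload are placeholder values):

from scrapy.http.response import Response
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware

raw_body = b'...gzip/zip/tar/bz2 or plain bytes...'   # placeholder payload
response = Response('http://example.com/', body=raw_body)

# Scrapy's hook signature is process_response(request, response, spider);
# this middleware only inspects response.body (spider is only used for
# debug logging), so passing None for the other two works here.
decompressed = DecompressionMiddleware().process_response(None, response, None)
print(len(decompressed.body))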
@@ -5,7 +5,11 @@ from twisted.internet import reactor, endpoints
 from twisted.web.client import HTTPClientFactory, _makeGetterFactory
 from twisted.web.server import NOT_DONE_YET
 
-from decompression import DecompressionMiddleware
+from scrapy.http.response import Response
+from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
+from scrapy.selector import Selector
+
+from lxml import etree
 
 
 def getPageFactory(url, contextFactory=None, *args, **kwargs):
@@ -22,13 +26,44 @@ def getPageFactory(url, contextFactory=None, *args, **kwargs):
        *args, **kwargs)
 
 
-def downloadDone(response, request=None, page_factory=None):
-    response = DecompressionMiddleware().process_response(response)
+def setBaseAndRemoveScripts(selector, url):
+    tree = selector._root.getroottree()
+
+    # set base url to html document
+    head = tree.xpath("//head")
+    if head:
+        head = head[0]
+        base = head.xpath("./base")
+        if base:
+            base = base[0]
+        else:
+            base = etree.Element("base")
+            head.append(base)
+        base.set('href', url)
 
-    request.write(response)
+    for bad in tree.xpath("//*"):
+        # remove scripts
+        if bad.tag == 'script':
+            bad.getparent().remove(bad)
+        # remove html events
+        for attr in bad.attrib:
+            if attr.startswith('on'):
+                del bad.attrib[attr]
+
+    return etree.tostring(tree, pretty_print=True)
+
+
+def downloadDone(response_str, request=None, page_factory=None, url=None):
+    response = Response(url, body=response_str)
+    response = DecompressionMiddleware().process_response(None, response, None)
+
+    sel = Selector(response)
+    response_str = setBaseAndRemoveScripts(sel, url)
+
+    request.write(response_str)
     request.finish()
 
 def downloadError(error, request=None, page_factory=None):
+    import pdb; pdb.set_trace()
     request.write('Downloader error: ' + error.value)
     request.finish()
 
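For reference, setBaseAndRemoveScripts() injects (or reuses) a <base href=...> element when the document has a <head>, drops every <script> element, strips on* event-handler attributes, and serializes the tree back to a string. A minimal sketch of the effect, using an inline HTML string instead of a downloaded page; TextResponse and the example markup are stand-ins, and this assumes the Scrapy version this commit targets, whose Selector still exposes the private _root attribute used above:

from scrapy.http import TextResponse
from scrapy.selector import Selector

html = ('<html><head><title>t</title></head>'
        '<body onload="boom()"><script>alert(1)</script><p>hi</p></body></html>')
response = TextResponse('http://example.com/', body=html, encoding='utf-8')

print(setBaseAndRemoveScripts(Selector(response), 'http://example.com/'))
# Expected shape: a <base href="http://example.com/"> appears inside <head>,
# the <script> element is gone, and <body> has lost its onload attribute.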
@@ -45,9 +80,12 @@ class Counter(resource.Resource):
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
-            })
+            },
+            redirectLimit=13,
+            timeout=5
+        )
         d = page_factory.deferred
-        d.addCallback(downloadDone, request=request, page_factory=page_factory)
+        d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
         d.addErrback(downloadError, request=request, page_factory=page_factory)
         return NOT_DONE_YET
 
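The two new keyword arguments ride through getPageFactory()'s *args/**kwargs into Twisted's HTTPClientFactory, whose constructor (in the Twisted versions of that era) accepts both redirectLimit (default 20) and timeout (default 0, meaning no timeout). A minimal sketch of the equivalent direct construction, with a placeholder URL:

from twisted.web.client import HTTPClientFactory

factory = HTTPClientFactory(
    'http://example.com/',      # placeholder URL
    headers={'Accept-Encoding': 'gzip, deflate, sdch'},
    redirectLimit=13,           # stop following after 13 redirects
    timeout=5,                  # abort the request after 5 seconds
)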