v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-16 22:30:09 -07:00

fix scrapy response

This commit is contained in:
Alexandr Nesterenko 2015-11-30 16:34:02 +00:00
parent e872513c33
commit 853cf4db17

View File

@ -5,10 +5,13 @@ from twisted.internet import reactor, endpoints
from twisted.web.client import HTTPClientFactory, _makeGetterFactory from twisted.web.client import HTTPClientFactory, _makeGetterFactory
from twisted.web.server import NOT_DONE_YET from twisted.web.server import NOT_DONE_YET
from scrapy.http.response import Response from scrapy.http.response.text import TextResponse
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
from scrapy.selector import Selector from scrapy.selector import Selector
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from lxml import etree from lxml import etree
@ -26,8 +29,8 @@ def getPageFactory(url, contextFactory=None, *args, **kwargs):
*args, **kwargs) *args, **kwargs)
def setBaseAndRemoveScripts(selector, url): def setBaseAndRemoveScriptsAndMore(response, url):
tree = selector._root.getroottree() tree = response.selector._root.getroottree()
# set base url to html document # set base url to html document
head = tree.xpath("//head") head = tree.xpath("//head")
@ -38,13 +41,18 @@ def setBaseAndRemoveScripts(selector, url):
base = base[0] base = base[0]
else: else:
base = etree.Element("base") base = etree.Element("base")
head.append(base) head.insert(0, base)
base.set('href', url) base.set('href', url)
for bad in tree.xpath("//*"): for bad in tree.xpath("//*"):
# remove scripts # remove scripts
if bad.tag == 'script': if bad.tag == 'script':
bad.getparent().remove(bad) bad.getparent().remove(bad)
# sanitize anchors
elif bad.tag == 'a' and 'href' in bad.attrib:
bad.attrib['origin-href'] = bad.attrib['href']
del bad.attrib['href']
# remove html events # remove html events
for attr in bad.attrib: for attr in bad.attrib:
if attr.startswith('on'): if attr.startswith('on'):
@ -52,12 +60,19 @@ def setBaseAndRemoveScripts(selector, url):
return etree.tostring(tree, pretty_print=True) return etree.tostring(tree, pretty_print=True)
def buildScrapyResponse(page_factory, body):
    """Build a Scrapy response from a finished Twisted HTTPClientFactory.

    Scrapy's responsetypes registry picks the concrete response class from
    the received headers and the URL, so the caller can later distinguish
    text responses (e.g. HTML) from binary ones via isinstance checks.
    """
    http_status = int(page_factory.status)
    response_headers = Headers(page_factory.response_headers)
    response_cls = responsetypes.from_args(headers=response_headers,
                                           url=page_factory.url)
    return response_cls(url=page_factory.url,
                        status=http_status,
                        headers=response_headers,
                        body=body)
def downloadDone(response_str, request=None, page_factory=None, url=None):
    """Twisted callback fired when the page download completes.

    Wraps the raw body in a proper Scrapy response (status + headers from
    the page factory), decompresses it if needed, and — only for text
    responses — rewrites the HTML (base href, script stripping, etc.)
    before writing the result back to the waiting HTTP request.

    NOTE(review): source was a side-by-side diff scrape with old/new text
    fused per line; this is the reconstructed post-commit version.
    """
    response = buildScrapyResponse(page_factory, response_str)
    response = DecompressionMiddleware().process_response(None, response, None)

    # Only HTML/text bodies go through the rewriting pass; binary payloads
    # (images, archives, ...) are forwarded untouched.
    if isinstance(response, TextResponse):
        response_str = setBaseAndRemoveScriptsAndMore(response, url)

    request.write(response_str)
    request.finish()
@ -71,43 +86,38 @@ def downloadError(error, request=None, page_factory=None):
class Counter(resource.Resource):
    """Twisted web resource that proxies a remote page for the frontend.

    NOTE(review): source was a side-by-side diff scrape with old/new text
    fused per line; this is the reconstructed post-commit version, in which
    the duplicated download logic of render_POST/render_GET was extracted
    into startRequest.
    """
    isLeaf = True

    def startRequest(self, request, url):
        """Kick off an async download of *url*; the deferred callbacks
        write the (rewritten) page back to *request* when done."""
        page_factory = getPageFactory(url,
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
            },
            redirectLimit=5,
            timeout=10
        )
        d = page_factory.deferred
        d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
        d.addErrback(downloadError, request=request, page_factory=page_factory)

    def render_POST(self, request):
        # Body is a JSON array whose first element is the target URL.
        obj = json.load(request.content)
        url = obj[0].encode('utf-8')
        self.startRequest(request, url)
        return NOT_DONE_YET

    def render_GET(self, request):
        '''
        Render page for frontend
        '''
        if 'url' in request.args:
            url = request.args['url'][0]
            self.startRequest(request, url)
            return NOT_DONE_YET
        else:
            return 'Url is required'