pol (mirror of https://github.com/taroved/pol)

twisted agent

Alexandr Nesterenko 2017-06-21 14:59:02 -07:00
parent 3fe6d189a3
commit 5a15140682
2 changed files with 61 additions and 86 deletions


@@ -4,8 +4,10 @@ from datetime import datetime
from twisted.web import server, resource
from twisted.internet import reactor, endpoints
from twisted.web.client import HTTPClientFactory, _makeGetterFactory
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.server import NOT_DONE_YET
from twisted.web.http_headers import Headers
twisted_headers = Headers
from scrapy.http.response.text import TextResponse
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
@@ -17,7 +19,7 @@ from scrapy.responsetypes import responsetypes
from lxml import etree
import re
from feed import startFeedRequest
from feed import getFeedData, buildFeed
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG
@@ -38,18 +40,7 @@ def check_feed_request_time_limit(url):
r.set(url, int(time.time()))
return 0
def getPageFactory(url, contextFactory=None, *args, **kwargs):
"""
Download a web page as a string.
Download a page. Return a deferred, which will callback with a
page (as a string) or errback with a description of the error.
See L{HTTPClientFactory} to see what extra arguments can be passed.
"""
return _makeGetterFactory(
url,
HTTPClientFactory,
contextFactory=contextFactory,
*args, **kwargs)
agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)
def html2json(el):
return [
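The heart of the commit: the legacy HTTPClientFactory/_makeGetterFactory download path (the old getPage machinery) is replaced by a single module-level Agent, wrapped in BrowserLikeRedirectAgent so redirects are still followed. A minimal sketch of the new download path (agent matches the diff; fetch is a hypothetical helper, and a running reactor is assumed):

from twisted.internet import reactor
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.http_headers import Headers

# One agent for the whole process; BrowserLikeRedirectAgent follows
# redirects (301/302/303/307) up to redirectLimit, like a browser.
agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)

def fetch(url):
    # hypothetical helper, not part of the diff
    d = agent.request('GET', url, Headers({'User-Agent': ['pol']}), None)
    d.addCallback(readBody)  # fires with the full response body as bytes
    return d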
@@ -109,29 +100,42 @@ def setBaseAndRemoveScriptsAndMore(response, url):
return etree.tostring(tree, method='html')
def buildScrapyResponse(page_factory, body):
status = int(page_factory.status)
headers = Headers(page_factory.response_headers)
respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
return respcls(url=page_factory.url, status=status, headers=headers, body=body)
def buildScrapyResponse(response, body, url):
status = response.code
headers = Headers({k:','.join(v) for k,v in response.headers.getAllRawHeaders()})
respcls = responsetypes.from_args(headers=headers, url=url)
return respcls(url=url, status=status, headers=headers, body=body)
def downloadDone(response_str, request=None, page_factory=None, url=None):
response = buildScrapyResponse(page_factory, response_str)
def downloadStarted(response, response_ref):
response_ref.append(response) # save the response reference
return response
def downloadDone(response_str, request, response_ref, feed_config):
response = response_ref.pop() # get the response reference
url = response.request.absoluteURI
print 'Response <%s> ready (%s bytes)' % (url, len(response_str))
response = buildScrapyResponse(response, response_str, url)
response = DecompressionMiddleware().process_response(None, response, None)
if (isinstance(response, TextResponse)):
response_str = setBaseAndRemoveScriptsAndMore(response, url)
if feed_config:
response_str = buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml')
else:
response_str = setBaseAndRemoveScriptsAndMore(response, url)
request.write(response_str)
request.finish()
def downloadError(error, request=None, page_factory=None):
def downloadError(error, request=None):
if DEBUG:
request.write('Downloader error: ' + error.getErrorMessage())
request.write('Traceback: ' + error.getTraceback())
else:
request.write('Something wrong')
request.write('Something went wrong. Geek comment: ' + error.getErrorMessage())
sys.stderr.write(str(datetime.now()) + '\n')
sys.stderr.write('\n'.join(['Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()]) + '\n')
request.finish()
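readBody returns a Deferred that fires with the body alone, so by the time downloadDone runs, the IResponse (status code, headers) would otherwise be gone. The diff threads it through a one-element list, response_ref. The pattern in isolation, as a sketch (rememberResponse/bodyReady are hypothetical names):

def rememberResponse(response, ref):
    ref.append(response)  # stash the IResponse so a later callback can read code/headers
    return response       # return it unchanged so readBody still receives it

def bodyReady(body, ref):
    response = ref.pop()
    print 'status %d, %d bytes' % (response.code, len(body))

ref = []
d = agent.request('GET', 'http://example.com/', None, None)
d.addCallback(rememberResponse, ref)
d.addCallback(readBody)
d.addCallback(bodyReady, ref)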
@@ -142,19 +146,23 @@ class Downloader(resource.Resource):
feed_regexp = re.compile('^/feed1?/(\d{1,10})$')
def startRequest(self, request, url):
page_factory = getPageFactory(url,
headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'User-Agent': DOWNLOADER_USER_AGENT
},
redirectLimit=5,
timeout=10
)
d = page_factory.deferred
d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
d.addErrback(downloadError, request=request, page_factory=page_factory)
def startRequest(self, request, url, feed_config=None):
d = agent.request(
'GET',
url,
twisted_headers({
'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'],
'Accept-Encoding': ['gzip, deflate, sdch'],
'User-Agent': [DOWNLOADER_USER_AGENT]
}),
None
)
print 'Request <GET %s> started' % (url,)
response_ref = []
d.addCallback(downloadStarted, response_ref)
d.addCallback(readBody)
d.addCallback(downloadDone, request=request, response_ref=response_ref, feed_config=feed_config)
d.addErrback(downloadError, request=request)
def render_POST(self, request):
obj = json.load(request.content)
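Note the two header shapes meeting here: Agent.request wants a twisted Headers object whose values are lists (HTTP allows a field to repeat), while scrapy's responsetypes.from_args() expects one string per header, hence the ','.join in the new buildScrapyResponse. The round trip, as a sketch:

from twisted.web.http_headers import Headers as twisted_headers

h = twisted_headers({'Accept-Encoding': ['gzip, deflate, sdch'],
                     'User-Agent': ['pol']})
# getAllRawHeaders() yields (name, [value, ...]) pairs; joining each list
# with ',' collapses repeated fields into the one-string-per-name form.
flat = dict((k, ','.join(v)) for k, v in h.getAllRawHeaders())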
@@ -179,9 +187,15 @@ class Downloader(resource.Resource):
if time_left:
request.setResponseCode(429)
request.setHeader('Retry-After', str(time_left) + ' seconds')
return 'Too Many Requests'
return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
else:
startFeedRequest(request, feed_id)
res = getFeedData(request, feed_id)
if isinstance(res, basestring): # error message
return res
url, feed_config = res
self.startRequest(request, url, feed_config)
return NOT_DONE_YET
else: # neither page nor feed
return 'Url is required'
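Returning NOT_DONE_YET is what lets the render method return before the download finishes: twisted.web keeps the connection open until request.finish() is called from the deferred chain. A self-contained sketch of the pattern (AsyncPage is hypothetical; agent and readBody as in the sketch above):

from twisted.web import resource
from twisted.web.server import NOT_DONE_YET

class AsyncPage(resource.Resource):
    isLeaf = True

    def render_GET(self, request):
        d = agent.request('GET', 'http://example.com/', None, None)
        d.addCallback(readBody)
        def finish(body):
            request.write(body)
            request.finish()   # completes the response promised by NOT_DONE_YET
        d.addCallback(finish)
        return NOT_DONE_YET    # the connection stays open until finish() runs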

feed.py

@@ -38,12 +38,6 @@ def _getPageFactory(url, contextFactory=None, *args, **kwargs):
contextFactory=contextFactory,
*args, **kwargs)
def _buildScrapyResponse(page_factory, body):
status = int(page_factory.status)
headers = Headers(page_factory.response_headers)
respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
return respcls(url=page_factory.url, status=status, headers=headers, body=body)
def element_to_string(element):
s = [element.text] if element.text else []
for sub_element in element:
@@ -56,7 +50,7 @@ def _build_link(html, doc_url, url):
base_url = w3lib.html.get_base_url(html, doc_url)
return w3lib.url.urljoin_rfc(base_url, url)
def _buildFeed(response, feed_config):
def buildFeed(response, feed_config):
response.selector.remove_namespaces()
tree = response.selector._root.getroottree()
@@ -80,13 +74,15 @@ def _buildFeed(response, feed_config):
if len(item) == len(feed_config['fields']): # all fields are required
item['title_link'] = title_link
items.append(item)
title = response.selector.xpath('//title/text()').extract()
#build feed
feed = Rss201rev2Feed(
title='Polite Pol: ' + feed_config['uri'],
title = title[0] if title else 'Polite Pol: ' + feed_config['uri'],
link=feed_config['uri'],
description="Generated by PolitePol.com.\n"+\
"Url: " + feed_config['uri'],
"Source page url: " + feed_config['uri'],
language="en",
)
for item in items:
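The feed title now prefers the page's own <title>, falling back to the old 'Polite Pol: ' + uri form. Rss201rev2Feed builds the channel, and add_item/writeString produce the XML. A sketch (the import path is an assumption; pol appears to use the standalone feedgenerator package, which exposes the same class as django.utils.feedgenerator):

from feedgenerator import Rss201rev2Feed  # assumed import path

feed = Rss201rev2Feed(
    title='Example page title',  # //title/text() when the page has one
    link='http://example.com/page',
    description='Generated by PolitePol.com.\nSource page url: http://example.com/page',
    language='en',
)
feed.add_item(title='First item',
              link='http://example.com/page#item1',
              description='item body')
xml = feed.writeString('utf-8')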
@@ -105,29 +101,7 @@ def _buildFeed(response, feed_config):
)
return feed.writeString('utf-8')
def _downloadDone(response_str, request=None, page_factory=None, feed_config=None):
response = _buildScrapyResponse(page_factory, response_str)
response = DecompressionMiddleware().process_response(None, response, None)
if (isinstance(response, TextResponse)):
response_str = _buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml')
request.write(response_str)
request.finish()
def _downloadError(error, request=None, page_factory=None):
if DEBUG:
request.write('Downloader error: ' + error.getErrorMessage())
request.write('Traceback: ' + error.getTraceback())
else:
request.write('Something wrong')
sys.stderr.write(datetime.datetime.now())
sys.stderr.write('\n'.join('Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()))
request.finish()
def startFeedRequest(request, feed_id):
def getFeedData(request, feed_id):
# get url, xpaths
creds = DATABASES['default']
db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'], passwd=creds['PASSWORD'], db=creds['NAME'])
@@ -148,19 +122,6 @@ def startFeedRequest(request, feed_id):
feed['fields'][row[2]] = row[3]
if feed:
page_factory = _getPageFactory(feed['uri'],
headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'User-Agent': DOWNLOADER_USER_AGENT
},
redirectLimit=5,
timeout=10
)
d = page_factory.deferred
d.addCallback(_downloadDone, request=request, page_factory=page_factory, feed_config=feed)
d.addErrback(_downloadError, request=request, page_factory=page_factory)
return [feed['uri'], feed]
else:
request.write('Feed generator error: config of feed is empty')
request.finish()
return
return 'Feed generator error: config of feed is empty'
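getFeedData is now a pure lookup: it reads the feed config from MySQL and returns either an error string or a [uri, config] pair, leaving all writing to the request to the caller in the downloader. The caller-side contract, restated as a sketch:

res = getFeedData(request, feed_id)
if isinstance(res, basestring):  # Python 2: covers both str and unicode error messages
    return res                   # inside render_*: the message goes to the client as-is
url, feed_config = res           # otherwise the [uri, config] pair for startRequest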