Mirror of https://github.com/taroved/pol, synced 2025-05-16 06:10:09 -07:00

twisted agent

parent 3fe6d189a3, commit 5a15140682
@@ -4,8 +4,10 @@ from datetime import datetime

from twisted.web import server, resource
from twisted.internet import reactor, endpoints
from twisted.web.client import HTTPClientFactory, _makeGetterFactory
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.server import NOT_DONE_YET
from twisted.web.http_headers import Headers
twisted_headers = Headers

from scrapy.http.response.text import TextResponse
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
@@ -17,7 +19,7 @@ from scrapy.responsetypes import responsetypes
from lxml import etree
import re

from feed import startFeedRequest
from feed import getFeedData, buildFeed

from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG

@@ -38,18 +40,7 @@ def check_feed_request_time_limit(url):
        r.set(url, int(time.time()))
    return 0

def getPageFactory(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.
    Download a page. Return a deferred, which will callback with a
    page (as a string) or errback with a description of the error.
    See L{HTTPClientFactory} to see what extra arguments can be passed.
    """
    return _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)
agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)

def html2json(el):
    return [
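The old HTTPClientFactory helper above is replaced by a module-level twisted Agent. For context, a minimal self-contained sketch of that Agent/readBody pattern; the URL and callback names are illustrative, only the Agent construction mirrors the diff:

from twisted.internet import reactor
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.http_headers import Headers

# same construction as in the commit: 10 s connect timeout, follow up to 5 redirects
agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)

def on_response(response):
    print 'HTTP %d' % response.code
    return readBody(response)      # Deferred that fires with the body bytes

def on_body(body):
    print '%d bytes received' % len(body)
    reactor.stop()

d = agent.request('GET', 'http://example.com/', Headers({'User-Agent': ['pol-sketch']}), None)
d.addCallback(on_response)
d.addCallback(on_body)
d.addErrback(lambda failure: (failure.printTraceback(), reactor.stop()))
reactor.run()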
@@ -109,29 +100,42 @@ def setBaseAndRemoveScriptsAndMore(response, url):

    return etree.tostring(tree, method='html')

def buildScrapyResponse(page_factory, body):
    status = int(page_factory.status)
    headers = Headers(page_factory.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
    return respcls(url=page_factory.url, status=status, headers=headers, body=body)
def buildScrapyResponse(response, body, url):
    status = response.code
    headers = Headers({k:','.join(v) for k,v in response.headers.getAllRawHeaders()})
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, status=status, headers=headers, body=body)

def downloadDone(response_str, request=None, page_factory=None, url=None):
    response = buildScrapyResponse(page_factory, response_str)
def downloadStarted(response, response_ref):
    response_ref.append(response)  # save the response reference
    return response

def downloadDone(response_str, request, response_ref, feed_config):
    response = response_ref.pop()  # get the response reference

    url = response.request.absoluteURI

    print 'Response <%s> ready (%s bytes)' % (url, len(response_str))
    response = buildScrapyResponse(response, response_str, url)

    response = DecompressionMiddleware().process_response(None, response, None)

    if (isinstance(response, TextResponse)):
        response_str = setBaseAndRemoveScriptsAndMore(response, url)
        if feed_config:
            response_str = buildFeed(response, feed_config)
            request.setHeader(b"Content-Type", b'text/xml')
        else:
            response_str = setBaseAndRemoveScriptsAndMore(response, url)

    request.write(response_str)
    request.finish()

def downloadError(error, request=None, page_factory=None):
def downloadError(error, request=None):
    if DEBUG:
        request.write('Downloader error: ' + error.getErrorMessage())
        request.write('Traceback: ' + error.getTraceback())
    else:
        request.write('Something wrong')
        request.write('Something wrong. Geek comment: ' + error.getErrorMessage())
        sys.stderr.write(str(datetime.now()))
        sys.stderr.write('\n'.join(['Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()]))
    request.finish()
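The new buildScrapyResponse flattens twisted's raw headers into a Scrapy Headers object and lets scrapy's responsetypes pick the response class. A quick illustration of that selection, assuming Scrapy is installed; the URL and header values are made up:

from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes

headers = Headers({'Content-Type': 'text/html; charset=utf-8'})
respcls = responsetypes.from_args(headers=headers, url='http://example.com/')
print respcls  # scrapy.http.response.html.HtmlResponse, a TextResponse subclass,
               # which is why the isinstance(response, TextResponse) branch runs for HTML pages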
@@ -142,19 +146,23 @@ class Downloader(resource.Resource):

    feed_regexp = re.compile('^/feed1?/(\d{1,10})$')

    def startRequest(self, request, url):
        page_factory = getPageFactory(url,
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'User-Agent': DOWNLOADER_USER_AGENT
            },
            redirectLimit=5,
            timeout=10
        )
        d = page_factory.deferred
        d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
        d.addErrback(downloadError, request=request, page_factory=page_factory)
    def startRequest(self, request, url, feed_config = None):
        d = agent.request(
            'GET',
            url,
            twisted_headers({
                'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'],
                'Accept-Encoding': ['gzip, deflate, sdch'],
                'User-Agent': [DOWNLOADER_USER_AGENT]
            }),
            None
        )
        print 'Request <GET %s> started' % (url,)
        response_ref = []
        d.addCallback(downloadStarted, response_ref)
        d.addCallback(readBody)
        d.addCallback(downloadDone, request=request, response_ref=response_ref, feed_config=feed_config)
        d.addErrback(downloadError, request=request)

    def render_POST(self, request):
        obj = json.load(request.content)
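readBody hands only the body string to the next callback, which is why startRequest threads a one-element response_ref list through the chain: downloadStarted stashes the twisted Response there so downloadDone can still read the status code and the final (post-redirect) URL. Stripped down to the bare pattern, reusing the module-level agent from above with otherwise illustrative names:

def remember(response, ref):
    ref.append(response)              # keep the Response for later callbacks
    return response                   # hand it on to readBody unchanged

def done(body, ref):
    response = ref.pop()
    print 'got %d bytes, status %d, final url %s' % (
        len(body), response.code, response.request.absoluteURI)

url = 'http://example.com/'
ref = []
d = agent.request('GET', url, None, None)
d.addCallback(remember, ref)
d.addCallback(readBody)
d.addCallback(done, ref)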
@@ -179,9 +187,15 @@ class Downloader(resource.Resource):
            if time_left:
                request.setResponseCode(429)
                request.setHeader('Retry-After', str(time_left) + ' seconds')
                return 'Too Many Requests'
                return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
            else:
                startFeedRequest(request, feed_id)
                res = getFeedData(request, feed_id)

                if isinstance(res, basestring):  # error message
                    return res

                url, feed_config = res
                self.startRequest(request, url, feed_config)
                return NOT_DONE_YET
        else:  # neither page nor feed
            return 'Url is required'
feed.py (55 changed lines)
@@ -38,12 +38,6 @@ def _getPageFactory(url, contextFactory=None, *args, **kwargs):
        contextFactory=contextFactory,
        *args, **kwargs)

def _buildScrapyResponse(page_factory, body):
    status = int(page_factory.status)
    headers = Headers(page_factory.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
    return respcls(url=page_factory.url, status=status, headers=headers, body=body)

def element_to_string(element):
    s = [element.text] if element.text else []
    for sub_element in element:
@@ -56,7 +50,7 @@ def _build_link(html, doc_url, url):
    base_url = w3lib.html.get_base_url(html, doc_url)
    return w3lib.url.urljoin_rfc(base_url, url)

def _buildFeed(response, feed_config):
def buildFeed(response, feed_config):
    response.selector.remove_namespaces()

    tree = response.selector._root.getroottree()
@@ -80,13 +74,15 @@ def _buildFeed(response, feed_config):
            if len(item) == len(feed_config['fields']):  # all fields are required
                item['title_link'] = title_link
                items.append(item)

    title = response.selector.xpath('//title/text()').extract()

    #build feed
    feed = Rss201rev2Feed(
        title='Polite Pol: ' + feed_config['uri'],
        title = title[0] if title else 'Polite Pol: ' + feed_config['uri'],
        link=feed_config['uri'],
        description="Generated by PolitePol.com.\n"+\
                    "Url: " + feed_config['uri'],
                    "Source page url: " + feed_config['uri'],
        language="en",
    )
    for item in items:
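For reference, Rss201rev2Feed is the Django-style feed generator API; its import is outside this hunk, so the package name below is an assumption. A minimal standalone usage sketch with placeholder values:

from feedgenerator import Rss201rev2Feed  # assumption: the standalone feedgenerator package

feed = Rss201rev2Feed(
    title='Example page title',
    link='http://example.com/news',
    description='Generated by PolitePol.com.\nSource page url: http://example.com/news',
    language='en',
)
feed.add_item(title='First item', link='http://example.com/news/1', description='Item text')
print feed.writeString('utf-8')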
@@ -105,29 +101,7 @@ def _buildFeed(response, feed_config):
    )
    return feed.writeString('utf-8')

def _downloadDone(response_str, request=None, page_factory=None, feed_config=None):
    response = _buildScrapyResponse(page_factory, response_str)

    response = DecompressionMiddleware().process_response(None, response, None)

    if (isinstance(response, TextResponse)):
        response_str = _buildFeed(response, feed_config)

    request.setHeader(b"Content-Type", b'text/xml')
    request.write(response_str)
    request.finish()

def _downloadError(error, request=None, page_factory=None):
    if DEBUG:
        request.write('Downloader error: ' + error.getErrorMessage())
        request.write('Traceback: ' + error.getTraceback())
    else:
        request.write('Something wrong')
        sys.stderr.write(datetime.datetime.now())
        sys.stderr.write('\n'.join('Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()))
    request.finish()

def startFeedRequest(request, feed_id):
def getFeedData(request, feed_id):
    # get url, xpaths
    creds = DATABASES['default']
    db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'], passwd=creds['PASSWORD'], db=creds['NAME'])
@@ -148,19 +122,6 @@ def startFeedRequest(request, feed_id):
            feed['fields'][row[2]] = row[3]

    if feed:
        page_factory = _getPageFactory(feed['uri'],
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'User-Agent': DOWNLOADER_USER_AGENT
            },
            redirectLimit=5,
            timeout=10
        )
        d = page_factory.deferred
        d.addCallback(_downloadDone, request=request, page_factory=page_factory, feed_config=feed)
        d.addErrback(_downloadError, request=request, page_factory=page_factory)
        return [feed['uri'], feed]
    else:
        request.write('Feed generator error: config of feed is empty')
        request.finish()
        return
        return 'Feed generator error: config of feed is empty'
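getFeedData now returns either an error string or [uri, feed_config]. Judging from how the fields are filled above and consumed by buildFeed, feed_config is roughly a dict of this shape; the example values are invented:

feed_config = {
    'uri': 'http://example.com/news',   # page the feed is built from
    'fields': {                         # field name -> XPath, one entry per DB row
        'title': './/a/text()',
        'description': './/p/text()',
    },
    # plus whatever other columns the query adds (not shown in this hunk)
}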