mirror of https://github.com/taroved/pol (synced 2025-05-16 22:30:09 -07:00)

commit 5a15140682 (parent 3fe6d189a3)
twisted agent
@@ -4,8 +4,10 @@ from datetime import datetime
 from twisted.web import server, resource
 from twisted.internet import reactor, endpoints
-from twisted.web.client import HTTPClientFactory, _makeGetterFactory
+from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
 from twisted.web.server import NOT_DONE_YET
+from twisted.web.http_headers import Headers
+twisted_headers = Headers
 
 from scrapy.http.response.text import TextResponse
 from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
@@ -17,7 +19,7 @@ from scrapy.responsetypes import responsetypes
 from lxml import etree
 import re
 
-from feed import startFeedRequest
+from feed import getFeedData, buildFeed
 
 from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG
 
@@ -38,18 +40,7 @@ def check_feed_request_time_limit(url):
         r.set(url, int(time.time()))
     return 0
 
-def getPageFactory(url, contextFactory=None, *args, **kwargs):
-    """
-    Download a web page as a string.
-    Download a page. Return a deferred, which will callback with a
-    page (as a string) or errback with a description of the error.
-    See L{HTTPClientFactory} to see what extra arguments can be passed.
-    """
-    return _makeGetterFactory(
-        url,
-        HTTPClientFactory,
-        contextFactory=contextFactory,
-        *args, **kwargs)
+agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)
 
 def html2json(el):
     return [
@@ -109,29 +100,42 @@ def setBaseAndRemoveScriptsAndMore(response, url):
 
     return etree.tostring(tree, method='html')
 
-def buildScrapyResponse(page_factory, body):
-    status = int(page_factory.status)
-    headers = Headers(page_factory.response_headers)
-    respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
-    return respcls(url=page_factory.url, status=status, headers=headers, body=body)
+def buildScrapyResponse(response, body, url):
+    status = response.code
+    headers = Headers({k:','.join(v) for k,v in response.headers.getAllRawHeaders()})
+    respcls = responsetypes.from_args(headers=headers, url=url)
+    return respcls(url=url, status=status, headers=headers, body=body)
 
-def downloadDone(response_str, request=None, page_factory=None, url=None):
-    response = buildScrapyResponse(page_factory, response_str)
+def downloadStarted(response, response_ref):
+    response_ref.append(response) # save the response reference
+    return response
 
+def downloadDone(response_str, request, response_ref, feed_config):
+    response = response_ref.pop() # get the response reference
+
+    url = response.request.absoluteURI
+
+    print 'Response <%s> ready (%s bytes)' % (url, len(response_str))
+
+    response = buildScrapyResponse(response, response_str, url)
+
     response = DecompressionMiddleware().process_response(None, response, None)
 
     if (isinstance(response, TextResponse)):
-        response_str = setBaseAndRemoveScriptsAndMore(response, url)
+        if feed_config:
+            response_str = buildFeed(response, feed_config)
+            request.setHeader(b"Content-Type", b'text/xml')
+        else:
+            response_str = setBaseAndRemoveScriptsAndMore(response, url)
 
     request.write(response_str)
     request.finish()
 
-def downloadError(error, request=None, page_factory=None):
+def downloadError(error, request=None):
     if DEBUG:
         request.write('Downloader error: ' + error.getErrorMessage())
         request.write('Traceback: ' + error.getTraceback())
     else:
-        request.write('Something wrong')
+        request.write('Something wrong. Geek comment: ' + error.getErrorMessage())
         sys.stderr.write(datetime.datetime.now())
         sys.stderr.write('\n'.join('Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()))
     request.finish()
@@ -142,19 +146,23 @@ class Downloader(resource.Resource):
 
     feed_regexp = re.compile('^/feed1?/(\d{1,10})$')
 
-    def startRequest(self, request, url):
-        page_factory = getPageFactory(url,
-            headers={
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-                'Accept-Encoding': 'gzip, deflate, sdch',
-                'User-Agent': DOWNLOADER_USER_AGENT
-            },
-            redirectLimit=5,
-            timeout=10
-        )
-        d = page_factory.deferred
-        d.addCallback(downloadDone, request=request, page_factory=page_factory, url=url)
-        d.addErrback(downloadError, request=request, page_factory=page_factory)
+    def startRequest(self, request, url, feed_config = None):
+        d = agent.request(
+            'GET',
+            url,
+            twisted_headers({
+                'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'],
+                'Accept-Encoding': ['gzip, deflate, sdch'],
+                'User-Agent': [DOWNLOADER_USER_AGENT]
+            }),
+            None
+        )
+        print 'Request <GET %s> started' % (url,)
+        response_ref = []
+        d.addCallback(downloadStarted, response_ref)
+        d.addCallback(readBody)
+        d.addCallback(downloadDone, request=request, response_ref=response_ref, feed_config=feed_config)
+        d.addErrback(downloadError, request=request)
 
     def render_POST(self, request):
         obj = json.load(request.content)
@@ -179,9 +187,15 @@ class Downloader(resource.Resource):
             if time_left:
                 request.setResponseCode(429)
                 request.setHeader('Retry-After', str(time_left) + ' seconds')
-                return 'Too Many Requests'
+                return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
             else:
-                startFeedRequest(request, feed_id)
+                res = getFeedData(request, feed_id)
+
+                if isinstance(res, basestring): # error message
+                    return res
+
+                url, feed_config = res
+                self.startRequest(request, url, feed_config)
                 return NOT_DONE_YET
         else: # neither page and feed
             return 'Url is required'
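
For orientation, here is a minimal, self-contained sketch of the Agent-based fetch chain this commit switches to in place of getPageFactory/HTTPClientFactory (Python 2 / Twisted). The URL, User-Agent string and callback names below are illustrative only; Agent, BrowserLikeRedirectAgent, readBody and the response_ref trick mirror the code in the diff above.

from twisted.internet import reactor
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.http_headers import Headers

# same construction as the commit: redirect-following agent with a connect timeout
agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)

def keep_response(response, response_ref):
    response_ref.append(response)  # keep the response so status/headers survive readBody
    return response

def body_ready(body, response_ref):
    response = response_ref.pop()
    print 'Got %s (%s bytes) from %s' % (response.code, len(body), response.request.absoluteURI)
    reactor.stop()

def failed(error):
    error.printTraceback()
    reactor.stop()

response_ref = []
d = agent.request('GET', 'http://example.com/',
                  Headers({'User-Agent': ['pol-sketch']}), None)
d.addCallback(keep_response, response_ref)
d.addCallback(readBody)            # fires with the raw response body as a string
d.addCallback(body_ready, response_ref)
d.addErrback(failed)
reactor.run()

As in the diff, the response object is stashed in a list before readBody takes over the callback chain, so the status code and request URI are still available once the body arrives.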
feed.py (55 changed lines)
@@ -38,12 +38,6 @@ def _getPageFactory(url, contextFactory=None, *args, **kwargs):
         contextFactory=contextFactory,
         *args, **kwargs)
 
-def _buildScrapyResponse(page_factory, body):
-    status = int(page_factory.status)
-    headers = Headers(page_factory.response_headers)
-    respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
-    return respcls(url=page_factory.url, status=status, headers=headers, body=body)
-
 def element_to_string(element):
     s = [element.text] if element.text else []
     for sub_element in element:
@@ -56,7 +50,7 @@ def _build_link(html, doc_url, url):
     base_url = w3lib.html.get_base_url(html, doc_url)
     return w3lib.url.urljoin_rfc(base_url, url)
 
-def _buildFeed(response, feed_config):
+def buildFeed(response, feed_config):
     response.selector.remove_namespaces()
 
     tree = response.selector._root.getroottree()
@@ -80,13 +74,15 @@ def _buildFeed(response, feed_config):
         if len(item) == len(feed_config['fields']): # all fields are required
             item['title_link'] = title_link
             items.append(item)
 
+    title = response.selector.xpath('//title/text()').extract()
+
     #build feed
     feed = Rss201rev2Feed(
-        title='Polite Pol: ' + feed_config['uri'],
+        title = title[0] if title else 'Polite Pol: ' + feed_config['uri'],
         link=feed_config['uri'],
         description="Generated by PolitePol.com.\n"+\
-            "Url: " + feed_config['uri'],
+            "Source page url: " + feed_config['uri'],
         language="en",
     )
     for item in items:
@@ -105,29 +101,7 @@ def _buildFeed(response, feed_config):
         )
     return feed.writeString('utf-8')
 
-def _downloadDone(response_str, request=None, page_factory=None, feed_config=None):
-    response = _buildScrapyResponse(page_factory, response_str)
-
-    response = DecompressionMiddleware().process_response(None, response, None)
-
-    if (isinstance(response, TextResponse)):
-        response_str = _buildFeed(response, feed_config)
-
-    request.setHeader(b"Content-Type", b'text/xml')
-    request.write(response_str)
-    request.finish()
-
-def _downloadError(error, request=None, page_factory=None):
-    if DEBUG:
-        request.write('Downloader error: ' + error.getErrorMessage())
-        request.write('Traceback: ' + error.getTraceback())
-    else:
-        request.write('Something wrong')
-        sys.stderr.write(datetime.datetime.now())
-        sys.stderr.write('\n'.join('Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()))
-    request.finish()
-
-def startFeedRequest(request, feed_id):
+def getFeedData(request, feed_id):
     # get url, xpathes
     creds = DATABASES['default']
     db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'], passwd=creds['PASSWORD'], db=creds['NAME'])
@@ -148,19 +122,6 @@ def startFeedRequest(request, feed_id):
             feed['fields'][row[2]] = row[3]
 
     if feed:
-        page_factory = _getPageFactory(feed['uri'],
-            headers={
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-                'Accept-Encoding': 'gzip, deflate, sdch',
-                'User-Agent': DOWNLOADER_USER_AGENT
-            },
-            redirectLimit=5,
-            timeout=10
-        )
-        d = page_factory.deferred
-        d.addCallback(_downloadDone, request=request, page_factory=page_factory, feed_config=feed)
-        d.addErrback(_downloadError, request=request, page_factory=page_factory)
+        return [feed['uri'], feed]
     else:
-        request.write('Feed generator error: config of feed is empty')
-        request.finish()
-        return
+        return 'Feed generator error: config of feed is empty'
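
As a small illustration of the feed-title fallback this commit adds to buildFeed(), the sketch below builds the same kind of RSS document directly. It assumes Rss201rev2Feed comes from the standalone feedgenerator package (or django.utils.feedgenerator); the sample uri, item values and the empty title list are illustrative only.

from feedgenerator import Rss201rev2Feed

feed_config = {'uri': 'http://example.com/news'}  # hypothetical source page
title = []  # what xpath('//title/text()').extract() returns when the page has no <title>

feed = Rss201rev2Feed(
    # fall back to the old 'Polite Pol: <uri>' title when the page title is missing
    title = title[0] if title else 'Polite Pol: ' + feed_config['uri'],
    link=feed_config['uri'],
    description="Generated by PolitePol.com.\n" + "Source page url: " + feed_config['uri'],
    language="en",
)
feed.add_item(title='Sample item', link=feed_config['uri'], description='illustrative item')
print feed.writeString('utf-8')  # RSS 2.0 document as a utf-8 string, as returned by buildFeed()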