mirror of https://github.com/taroved/pol synced 2025-05-31 21:40:09 -07:00

log only not traced exceptions

Alexandr Nesterenko 2017-11-10 13:01:23 +03:00
parent 8e483314d8
commit e02f4f0d1b
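
The change itself is small: Downloader.downloadError now reports the exception through stat_tool.trace() whenever a stat_tool is configured, and falls back to writing it to stderr only when no tracer is present; previously the stderr line was written regardless of whether the exception was also traced. Below is a condensed, self-contained sketch of that rule (report_download_error is an illustrative helper, not a function from this repository, and the real trace() call also passes the client IP and the resolved feed id):

import sys, traceback
from datetime import datetime

def report_download_error(stat_tool, uri, url, msg, callstack):
    try:
        if stat_tool:
            # traced: the exception goes to the stat backend only
            stat_tool.trace(feed_id=0, post_cnt=0, new_post_cnt=0, url=url,
                            ex_msg=msg, ex_callstack=callstack)
        else:
            # not traced: keep the old stderr logging as the fallback
            sys.stderr.write('\n'.join([str(datetime.utcnow()), uri, url,
                                        'Downloader error: ' + msg,
                                        'Traceback: ' + callstack]))
    except Exception:
        traceback.print_exc(file=sys.stdout)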


@ -1,333 +1,336 @@
from __future__ import print_function
from datetime import datetime
from hashlib import md5
import json
import pickle
import time, sys, traceback
import re
from urlparse import urlparse

from lxml import etree

from twisted.web import server, resource
from twisted.internet import reactor, endpoints, defer
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError, HTTPConnectionPool
from twisted.web.server import NOT_DONE_YET
from twisted.web.http_headers import Headers
from twisted.web.html import escape
twisted_headers = Headers
from twisted.logger import Logger

from scrapy.http.response.text import TextResponse
from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
from scrapy.http.request import Request
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from scrapy.selector import Selector

from pol.log import LogHandler
from .feed import Feed

from twisted.logger import Logger
log = Logger()
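

# Downloader fetches a remote page over Twisted, rebuilds it as a Scrapy response
# and either renders the configured RSS feed from it or returns a sanitized,
# tag-id-annotated HTML snapshot for the frontend selector UI.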
class Downloader(object):
    def __init__(self, feed, debug, snapshot_dir='/tmp', stat_tool=None, memon=None):
        self.feed = feed
        self.debug = debug
        self.snapshot_dir = snapshot_dir
        self.stat_tool = stat_tool
        self.memon = memon

    def html2json(self, el):
        return [
            el.tag,
            {"tag-id": el.attrib["tag-id"]},
            [self.html2json(e) for e in el.getchildren() if isinstance(e, etree.ElementBase)]
        ]

    def _saveResponse(self, headers, url, tree):
        # save html for extended selectors
        file_name = '%s_%s' % (time.time(), md5(url).hexdigest())
        file_path = self.snapshot_dir + '/' + file_name
        with open(file_path, 'w') as f:
            f.write(url + '\n')
            for k, v in headers.iteritems():
                for vv in v:
                    f.write('%s: %s\n' % (k, vv))
            f.write('\n\n' + etree.tostring(tree, encoding='utf-8', method='html'))
        return file_name

    def setBaseAndRemoveScriptsAndMore(self, selector, headers, url):
        selector.remove_namespaces()

        tree = selector.root.getroottree()

        file_name = self._saveResponse(headers, url, tree)

        # set base url to html document
        head = tree.xpath("//head")
        if head:
            head = head[0]
            base = head.xpath("./base")
            if base:
                base = base[0]
            else:
                base = etree.Element("base")
                head.insert(0, base)
            base.set('href', url)

        i = 1
        for bad in tree.xpath("//*"):
            # remove scripts
            if bad.tag == 'script':
                bad.getparent().remove(bad)
            else:
                # set tag-id attribute
                bad.attrib['tag-id'] = str(i)
                i += 1

            # sanitize anchors
            if bad.tag == 'a' and 'href' in bad.attrib:
                bad.attrib['origin-href'] = bad.attrib['href']
                del bad.attrib['href']

            # remove html events
            for attr in bad.attrib:
                if attr.startswith('on'):
                    del bad.attrib[attr]

            # sanitize forms
            if bad.tag == 'form':
                bad.attrib['onsubmit'] = "return false"

        body = tree.xpath("//body")
        if body:
            # append html2json js object
            jsobj = self.html2json(tree.getroot())
            script = etree.Element('script', {'type': 'text/javascript'})
            script.text = '\n'.join((
                'var html2json = ' + json.dumps(jsobj) + ';',
                'var snapshot_time = "' + file_name + '";'
            ))
            body[0].append(script)

        return (etree.tostring(tree, method='html'), file_name)

    def buildScrapyResponse(self, response, body, url):
        status = response.code
        headers = Headers({k: ','.join(v) for k, v in response.headers.getAllRawHeaders()})
        respcls = responsetypes.from_args(headers=headers, url=url)
        return respcls(url=url, status=status, headers=headers, body=body)

    def error_html(self, msg):
        return "<html><body>%s</body></html>" % msg.replace("\n", "<br/>\n")

    def downloadError(self, error, request=None, url=None, response=None, feed_config=None, selector_defer=None):
        # read for details: https://stackoverflow.com/questions/29423986/twisted-giving-twisted-web-client-partialdownloaderror-200-ok
        if error.type is PartialDownloadError and error.value.status == '200':
            d = defer.Deferred()
            reactor.callLater(0, d.callback, error.value.response)  # error.value.response is response_str
            d.addCallback(self.downloadDone, request=request, response=response, feed_config=feed_config, selector_defer=selector_defer)
            d.addErrback(self.downloadError, request=request, url=url, response=response, feed_config=feed_config, selector_defer=selector_defer)
            return

        if selector_defer:
            selector_defer.errback(error)
        else:
            if self.debug:
                request.write('Downloader error: ' + error.getErrorMessage())
                request.write('Traceback: ' + error.getTraceback())
            else:
                request.write(self.error_html('<h1>PolitePol says: "Something wrong"</h1> <p><b>Try to refresh page or contact us by email: <a href="mailto:politepol.com@gmail.com">politepol.com@gmail.com</a></b>\n(Help us to improve our service with your feedback)</p> <p><i>Scary mantra: %s</i></p>' % escape(error.getErrorMessage())))
            request.finish()

            try:
                if self.stat_tool:
                    feed_id = feed_config and feed_config['id']
                    s_url = None
                    if not feed_id:
                        feed_id = 0
                        s_url = url
                    self.stat_tool.trace(
                        ip=request.getHeader('x-real-ip') or request.client.host,
                        feed_id=feed_id,
                        post_cnt=0,
                        new_post_cnt=0,
                        url=s_url,
                        ex_msg=error.getErrorMessage(),
                        ex_callstack=error.getTraceback()
                    )
                else:
                    sys.stderr.write('\n'.join(
                        [str(datetime.utcnow()), request.uri, url, 'Downloader error: ' + error.getErrorMessage(),
                         'Traceback: ' + error.getTraceback()]))
            except:
                traceback.print_exc(file=sys.stdout)

    def downloadStarted(self, response, request, url, feed_config, selector_defer):
        d = readBody(response)
        d.addCallback(self.downloadDone, request=request, response=response, feed_config=feed_config, selector_defer=selector_defer)
        d.addErrback(self.downloadError, request=request, url=url, response=response, feed_config=feed_config, selector_defer=selector_defer)
        return response

    def downloadDone(self, response_str, request, response, feed_config, selector_defer):
        url = response.request.absoluteURI

        print('Response <%s> ready (%s bytes)' % (url, len(response_str)))
        response = self.buildScrapyResponse(response, response_str, url)

        if selector_defer:
            selector_defer.callback(response)
        else:
            self.writeResponse(request, response, feed_config, response_str)
            self.run_memon()

    def writeResponse(self, request, response, feed_config, response_str='PolitePol: Local page processing is failed'):
        response = HttpCompressionMiddleware().process_response(Request(response.url), response, None)
        response = DecompressionMiddleware().process_response(None, response, None)

        if isinstance(response, TextResponse):
            ip = request.getHeader('x-real-ip') or request.client.host
            response_str = self.prepare_response_str(response.selector, response.headers, response.body_as_unicode(), response.url, feed_config, ip)
            if feed_config:
                request.setHeader(b"Content-Type", b'text/xml; charset=utf-8')

        request.write(response_str)
        request.finish()

    def prepare_response_str(self, selector, headers, page_unicode, url, feed_config, ip=None):
        if feed_config:
            [response_str, post_cnt, new_post_cnt] = self.feed.buildFeed(selector, page_unicode, feed_config)
            if self.stat_tool:
                self.stat_tool.trace(ip=ip, feed_id=feed_config['id'], post_cnt=post_cnt, new_post_cnt=new_post_cnt)
        else:
            response_str, file_name = self.setBaseAndRemoveScriptsAndMore(selector, headers, url)
            if self.stat_tool:
                self.stat_tool.trace(ip=ip, feed_id=0, post_cnt=0, new_post_cnt=0, url=url)

        return response_str

    def run_memon(self):
        if self.memon:
            d = defer.Deferred()
            reactor.callLater(0, d.callback, None)
            d.addCallback(self.memon.show_diff)
            d.addErrback(lambda err: print("Memory Monitor error: %s\nPGC traceback: %s" % (err.getErrorMessage(), err.getTraceback())))
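

# Site is the Twisted web resource behind the HTTP endpoint: a ?url=... request
# renders the sanitized page for the frontend, /feed/<id> (optionally rate limited)
# returns the generated feed, and prefetched local snapshots are served when present.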
class Site(resource.Resource):
    isLeaf = True

    feed_regexp = re.compile('^/feed1?/(\d{1,10})$')

    def __init__(self, db_creds, snapshot_dir, user_agent, debug=False, limiter=None, memon=None, stat_tool=None, prefetch_dir=None):
        self.db_creds = db_creds
        self.snapshot_dir = snapshot_dir
        self.user_agent = user_agent
        self.limiter = limiter
        self.prefetch_dir = prefetch_dir

        self.feed = Feed(db_creds)
        self.downloader = Downloader(self.feed, debug, snapshot_dir, stat_tool, memon)

    def startRequest(self, request, url, feed_config=None, selector_defer=None):
        sresponse = self.tryLocalPage(url)
        if sresponse:
            if selector_defer:
                reactor.callLater(0, selector_defer.callback, sresponse)
            else:
                self.downloader.writeResponse(request, sresponse, feed_config)
        else:
            agent = BrowserLikeRedirectAgent(
                Agent(reactor,
                      contextFactory=ScrapyClientContextFactory(),  # skip certificate verification
                      connectTimeout=10),
                      #pool=pool),
                redirectLimit=5
            )

            d = agent.request(
                'GET',
                url,
                twisted_headers({
                    'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'],
                    'Accept-Encoding': ['gzip, deflate, sdch'],
                    'User-Agent': [self.user_agent]
                }),
                None
            )
            print('Request <GET %s> started' % (url,))
            d.addCallback(self.downloader.downloadStarted, request=request, url=url, feed_config=feed_config, selector_defer=selector_defer)
            d.addErrback(self.downloader.downloadError, request=request, url=url, feed_config=feed_config, selector_defer=selector_defer)

    def tryLocalPage(self, url):
        if self.prefetch_dir:
            m = md5(url).hexdigest()
            domain = urlparse(url).netloc
            try:
                with open(self.prefetch_dir + '/' + m + '.' + domain) as f:
                    return pickle.load(f)
            except IOError:
                pass
        return None

    def render_GET(self, request):
        '''
        Render page for frontend or RSS feed
        '''
        if 'url' in request.args:  # page for frontend
            url = request.args['url'][0]

            self.startRequest(request, url)
            return NOT_DONE_YET
        elif self.feed_regexp.match(request.uri) is not None:  # feed
            feed_id = self.feed_regexp.match(request.uri).groups()[0]

            time_left = self.limiter.check_request_time_limit(request.uri) if self.limiter else 0
            if time_left:
                request.setResponseCode(429)
                request.setHeader('Retry-After', str(time_left) + ' seconds')
                return 'Too Many Requests. Retry after %s seconds' % (str(time_left))
            else:
                res = self.feed.getFeedData(feed_id)

                if isinstance(res, basestring):  # error message
                    return res

                url, feed_config = res
                self.startRequest(request, url, feed_config)
                return NOT_DONE_YET
        else:  # neither page nor feed
            return 'Url is required'


class Server(object):

    def __init__(self, port, db_creds, snapshot_dir, user_agent, debug=False, limiter=None, memon=None, stat_tool=None, prefetch_dir=None):
        self.port = port
        self.db_creds = db_creds
        self.snapshot_dir = snapshot_dir
        self.user_agent = user_agent
        self.debug = debug
        self.limiter = limiter
        self.memon = memon
        self.stat_tool = stat_tool
        self.prefetch_dir = prefetch_dir

        self.log_handler = LogHandler()

        self.site = Site(self.db_creds, self.snapshot_dir, self.user_agent, self.debug, self.limiter, self.memon, self.stat_tool, self.prefetch_dir)

    def requestSelector(self, url=None, feed_config=None):
        d = defer.Deferred()
        self.site.startRequest(None, url, feed_config=feed_config, selector_defer=d)
        return d

    def run(self):
        endpoints.serverFromString(reactor, "tcp:%s" % self.port).listen(server.Site(self.site))
        reactor.run()
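
For orientation, a minimal way to start this service might look like the sketch below. It assumes the module is importable as pol.server (the relative .feed import suggests it lives inside the pol package) and that db_creds is whatever mapping Feed expects; the port, snapshot directory and user agent string are placeholder values, not part of this commit.

# Hypothetical bootstrap sketch; all concrete values are placeholders.
from pol.server import Server

db_creds = {'host': 'localhost', 'user': 'pol', 'passwd': 'secret', 'db': 'pol'}  # assumed shape
srv = Server(port=8088,
             db_creds=db_creds,
             snapshot_dir='/tmp/pol-snapshots',
             user_agent='Mozilla/5.0 (compatible; PolitePol)')
srv.run()  # blocks: starts the Twisted reactor and serves on tcp:8088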