v/pol
mirror of https://github.com/taroved/pol synced 2025-06-01 14:00:09 -07:00

small fixes

Alexandr Nesterenko 2018-02-08 22:51:15 +03:00
parent 6f97665b68
commit d90386117b
3 changed files with 156 additions and 8 deletions

145  pol/client.py  Normal file

@@ -0,0 +1,145 @@
from __future__ import division, absolute_import

import warnings

from twisted.python.failure import Failure
from twisted.internet import defer, protocol, reactor
from twisted.web._newclient import (
    HTTP11ClientProtocol,
    PotentialDataLoss,
    Request,
    RequestGenerationFailed,
    RequestNotSent,
    RequestTransmissionFailed,
    Response,
    ResponseDone,
    ResponseFailed,
    ResponseNeverReceived,
    _WrapperException,
    )
from twisted.web.client import PartialDownloadError


IGNORE_SIZE = 0


class _PpReadBodyProtocol(protocol.Protocol):
    """
    Protocol that collects data sent to it.

    This is a helper for L{IResponse.deliverBody}, which collects the body and
    fires a deferred with it.

    @ivar deferred: See L{__init__}.
    @ivar status: See L{__init__}.
    @ivar message: See L{__init__}.

    @ivar dataBuffer: list of byte-strings received
    @type dataBuffer: L{list} of L{bytes}
    """

    def __init__(self, status, message, deferred, max_size):
        """
        @param status: Status of L{IResponse}
        @type status: L{int}

        @param message: Message of L{IResponse}
        @type message: L{bytes}

        @param deferred: deferred to fire when response is complete
        @type deferred: L{Deferred} firing with L{bytes}

        @param max_size: maximum allowed body size in bytes;
            L{IGNORE_SIZE} disables the check
        @type max_size: L{int}
        """
        self.deferred = deferred
        self.status = status
        self.message = message
        self.dataBuffer = []
        self.max_size = max_size
        self.buffer_size = 0

    def dataReceived(self, data):
        """
        Accumulate some more bytes from the response.
        """
        self.dataBuffer.append(data)
        self.buffer_size += len(data)
        if self.max_size != IGNORE_SIZE and self.buffer_size > self.max_size:
            self.transport.stopProducing()  # https://twistedmatrix.com/trac/ticket/8227

    def connectionLost(self, reason):
        """
        Deliver the accumulated response bytes to the waiting L{Deferred}, if
        the response body has been completely received without error.
        """
        if reason.check(ResponseDone):
            self.deferred.callback(b''.join(self.dataBuffer))
        elif reason.check(PotentialDataLoss):
            self.deferred.errback(
                PartialDownloadError(self.status, self.message,
                                     b''.join(self.dataBuffer)))
        else:
            self.deferred.errback(reason)


def ppReadBody(response, max_size):
    """
    Get the body of an L{IResponse} and return it as a byte string.

    This is a helper function for clients that don't want to incrementally
    receive the body of an HTTP response.

    @param response: The HTTP response for which the body will be read.
    @type response: L{IResponse} provider

    @return: A L{Deferred} which will fire with the body of the response.
        Cancelling it will close the connection to the server immediately.
    """
    def cancel(deferred):
        """
        Cancel a L{ppReadBody} call, close the connection to the HTTP server
        immediately, if it is still open.

        @param deferred: The cancelled L{defer.Deferred}.
        """
        abort = getAbort()
        if abort is not None:
            abort()

    d = defer.Deferred(cancel)
    protocol = _PpReadBodyProtocol(response.code, response.phrase, d,
                                   max_size=max_size)

    def getAbort():
        return getattr(protocol.transport, 'abortConnection', None)

    response.deliverBody(protocol)

    if protocol.transport is not None and getAbort() is None:
        warnings.warn(
            'Using readBody with a transport that does not have an '
            'abortConnection method',
            category=DeprecationWarning,
            stacklevel=2)

    def respFailed(fail):
        if fail.type is ResponseFailed and max_size != IGNORE_SIZE and protocol.buffer_size > max_size:
            d = defer.Deferred()
            reactor.callLater(0, d.errback,
                              ResponseIsTooBig('Response is too big', max_size))
            return d
        else:
            return fail

    d.addErrback(respFailed)
    return d


class ResponseIsTooBig(Exception):
    """
    Response is too big

    @ivar max_size: Max length for response in bytes
    """
    def __init__(self, reason, max_size):
        Exception.__init__(self, reason, max_size)
        self.max_size = max_size
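
A minimal usage sketch (not part of this commit): fetching a page with twisted.web.client.Agent and capping the body at 1 MiB via ppReadBody. The URL, the limit, and the callback names are placeholders.

from twisted.internet import reactor
from twisted.web.client import Agent

from pol.client import ppReadBody, ResponseIsTooBig

MAX_SIZE = 1024 * 1024  # 1 MiB; IGNORE_SIZE (0) would disable the cap


def main():
    agent = Agent(reactor)
    d = agent.request(b'GET', b'http://example.com/')
    # ppReadBody takes (response, max_size); the extra argument is
    # forwarded by addCallback.
    d.addCallback(ppReadBody, MAX_SIZE)

    def onBody(body):
        print('got %d bytes' % len(body))

    def onError(fail):
        if fail.check(ResponseIsTooBig):
            print('aborted: body exceeded %d bytes' % fail.value.max_size)
        else:
            print('download failed: %s' % fail.getErrorMessage())

    d.addCallbacks(onBody, onError)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()


if __name__ == '__main__':
    main()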


@@ -117,7 +117,7 @@ class Feed(object):
             title = title[0] if title else 'Polite Pol: ' + feed_config['uri'],
             link=feed_config['uri'],
             description="Generated by PolitePol.com.\n"+\
-                        "Source page url: " + feed_config['uri'],
+                        "Source page: " + feed_config['uri'],
             language="en",
             )
         new_post_cnt = self.fill_time(feed_config['id'], items)


@@ -10,7 +10,7 @@ from lxml import etree
 from twisted.web import server, resource
 from twisted.internet import reactor, endpoints, defer
-from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError, HTTPConnectionPool
+from twisted.web.client import Agent, BrowserLikeRedirectAgent, PartialDownloadError, HTTPConnectionPool
 from twisted.web.server import NOT_DONE_YET
 from twisted.web.http_headers import Headers
 from twisted.web.http import INTERNAL_SERVER_ERROR
@@ -29,6 +29,7 @@ from scrapy.selector import Selector
 from pol.log import LogHandler
 from .feed import Feed
+from .client import ppReadBody, IGNORE_SIZE

 from twisted.logger import Logger
@@ -38,7 +39,7 @@ log = Logger()
 class Downloader(object):
     def __init__(self, feed, debug, snapshot_dir, stat_tool, memon, request,
-                 url, feed_config, selector_defer, sanitize):
+                 url, feed_config, selector_defer, sanitize, max_size):
         self.feed = feed
         self.debug = debug
         self.snapshot_dir = snapshot_dir
@@ -49,6 +50,7 @@ class Downloader(object):
         self.feed_config=feed_config
         self.selector_defer = selector_defer
         self.sanitize = sanitize
+        self.max_size = max_size

     def html2json(self, el):
         return [
@@ -188,7 +190,7 @@ class Downloader(object):
     def downloadStarted(self, response):
         self.response = response
-        d = readBody(response)
+        d = ppReadBody(response, self.max_size)
         d.addCallback(self.downloadDone)
         d.addErrback(self.downloadError)
         return response
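
With this change an oversized body reaches Downloader.downloadError as a ResponseIsTooBig failure. A minimal sketch of how that errback might distinguish the cap from other failures; the handler body below is illustrative, not part of this commit, and assumes self.request is the incoming twisted.web request, as elsewhere in this file.

    def downloadError(self, error):
        # Illustrative only: turn the size-cap failure into a readable
        # message instead of a generic traceback.
        if error.check(ResponseIsTooBig):
            msg = 'Source page is too big (limit: %d bytes)' % error.value.max_size
        else:
            msg = error.getErrorMessage()
        self.request.setResponseCode(INTERNAL_SERVER_ERROR)
        self.request.write(msg.encode('utf-8'))
        self.request.finish()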
@@ -256,7 +258,7 @@ class Site(resource.Resource):
     feed_regexp = re.compile(b'^/feed/(\d{1,10})')

-    def __init__(self, db_creds, snapshot_dir, user_agent, debug=False, limiter=None, memon=None, stat_tool=None, prefetch_dir=None, feed=None, downloadercls=None):
+    def __init__(self, db_creds, snapshot_dir, user_agent, debug=False, limiter=None, memon=None, stat_tool=None, prefetch_dir=None, feed=None, downloadercls=None, max_size=IGNORE_SIZE):
         self.db_creds = db_creds
         self.snapshot_dir = snapshot_dir
         self.user_agent = user_agent
@@ -267,12 +269,13 @@ class Site(resource.Resource):
         self.debug = debug
         self.stat_tool = stat_tool
         self.memon= memon
+        self.max_size = max_size
         self.downloadercls = downloadercls or Downloader

     def startRequest(self, request, url, feed_config = None, selector_defer=None, sanitize=False):
         downloader = self.downloadercls(self.feed, self.debug, self.snapshot_dir, self.stat_tool, self.memon,
                                         request=request, url=url, feed_config=feed_config,
-                                        selector_defer=selector_defer, sanitize=sanitize)
+                                        selector_defer=selector_defer, sanitize=sanitize, max_size=self.max_size)

         sresponse = self.tryLocalPage(url)
         if sresponse:
@@ -348,7 +351,7 @@ class Site(resource.Resource):
 class Server(object):

-    def __init__(self, port, db_creds, snapshot_dir, user_agent, debug=False, limiter=None, memon=None, stat_tool=None, prefetch_dir=None, feed=None, sitecls=None, downloadercls=None):
+    def __init__(self, port, db_creds, snapshot_dir, user_agent, debug=False, limiter=None, memon=None, stat_tool=None, prefetch_dir=None, feed=None, sitecls=None, downloadercls=None, max_size=IGNORE_SIZE):
         self.port = port
         self.db_creds = db_creds
         self.snapshot_dir = snapshot_dir
@@ -364,7 +367,7 @@ class Server(object):
         if not sitecls:
             sitecls = Site
-        self.site = sitecls(self.db_creds, self.snapshot_dir, self.user_agent, self.debug, self.limiter, self.memon, self.stat_tool, self.prefetch_dir, feed, downloadercls=downloadercls)
+        self.site = sitecls(self.db_creds, self.snapshot_dir, self.user_agent, self.debug, self.limiter, self.memon, self.stat_tool, self.prefetch_dir, feed, downloadercls=downloadercls, max_size=max_size)

     def requestSelector(self, url=None, feed_config=None):
         d = defer.Deferred()
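
For completeness, a hypothetical wiring sketch using the Server signature above; db_creds, the port, paths, and the agent string are placeholders, not values from this commit.

# Cap fetched pages at 1 MiB; max_size=IGNORE_SIZE (the default)
# keeps the old unlimited behaviour.
db_creds = {'host': 'localhost', 'user': 'pol', 'passwd': '...', 'db': 'pol'}
srv = Server(8088, db_creds, '/tmp/snapshots', 'PolitePol',
             max_size=1024 * 1024)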