v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-16 06:10:09 -07:00
This commit is contained in:
Alexandr Nesterenko 2017-09-22 22:54:45 -04:00
parent ddd9a79f22
commit 91a33febb9
2 changed files with 59 additions and 7 deletions

View File

@ -26,7 +26,8 @@ from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from lxml import etree
import re
from feed import getFeedData, buildFeed
from feed import getFeedData, buildFeed, get_conn
from contextlib import closing
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR
@ -41,12 +42,45 @@ class bcolors:
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
class RequestStat:
    """Plain record describing one handled request for statistics logging.

    The constructor stores every argument on the instance under the same
    name; no validation or conversion is performed.
    """

    def __init__(self, ip, feed_id, post_cnt, new_post_cnt, url=None, ex_msg=None, ex_callstack=None):
        # Requester address and the feed that was served.
        self.ip, self.feed_id = ip, feed_id
        # Total posts in the response and how many of them were new.
        self.post_cnt, self.new_post_cnt = post_cnt, new_post_cnt
        # Optional requested URL.
        self.url = url
        # Optional error details -- presumably an exception message and its
        # traceback text; TODO confirm against the code that fills them in.
        self.ex_msg, self.ex_callstack = ex_msg, ex_callstack
def get_ip_id(ip, cur):
    """Return the ``ips.id`` value for ``ip``, inserting a row if none exists.

    :param ip: address to look up in the ``address`` column of ``ips``.
    :param cur: DB-API style cursor providing ``execute``, ``fetchone`` and
        ``lastrowid``.
    :returns: the scalar id, both for an existing row and for a freshly
        inserted one.
    """
    cur.execute("""select id from ips where address=%s""", (ip,))
    row = cur.fetchone()
    if row:
        # fetchone() hands back the whole row; unwrap the single selected
        # column so both paths return the same scalar type.
        # NOTE(review): assumes a sequence-style (non-dict) cursor -- confirm
        # against get_conn().
        return row[0]
    cur.execute("insert into ips (address) values (%s)", (ip,))
    return cur.lastrowid
def save_stat(stat):
    """Persist one request statistic record.

    Inserts a row into ``requests`` and, when the record has no feed id
    (falsy ``feed_id``) and carries a URL, a companion row into
    ``request_urls`` pointing at it.

    :param stat: object exposing ``ip``, ``feed_id``, ``post_cnt``,
        ``new_post_cnt`` and ``url`` attributes (see ``RequestStat``).
    """
    with closing(get_conn()) as conn:
        # The connection's context manager yields the object used as cursor
        # below -- presumably MySQLdb-style transaction handling; confirm
        # against get_conn().
        with conn as cur:
            ip_id = get_ip_id(stat.ip, cur)
            cur.execute("""insert into requests (ip_id, feed_id, post_cnt, new_post_cnt)
                           values (%s, %s, %s, %s)""", (ip_id, stat.feed_id, stat.post_cnt, stat.new_post_cnt))
            stat_id = cur.lastrowid

            url = stat.url
            # url defaults to None on RequestStat; skip the URL row instead of
            # failing with AttributeError on None.encode().
            if not stat.feed_id and url is not None:
                # Only text needs encoding. Calling .encode() on a Python 2
                # byte string would first decode it as ASCII and fail on
                # non-ASCII URLs.
                if not isinstance(url, bytes):
                    url = url.encode('utf-8')
                cur.execute("insert into request_urls (url, request_id) values (%s, %s)", (url, stat_id))
def print_log(event):
    """Log observer: echo an event to the console and record request stats.

    Error events (truthy ``isError``) are written to stdout wrapped in the
    ``bcolors.FAIL`` / ``bcolors.ENDC`` colour codes and also, uncoloured, to
    stderr. Any other event is written to stdout; if it carries a truthy
    ``stat`` flag, its ``request`` payload is first handed to ``save_stat``.

    :param event: log event mapping.
    """
    if event.get('isError'):
        text = formatEventAsClassicLogText(event)
        # The formatter may return None when there is nothing to print;
        # concatenating or writing None would raise TypeError in the observer.
        if text is not None:
            sys.stdout.write(bcolors.FAIL + text + bcolors.ENDC)
            # Flush stdout too, so the coloured line is not held back until
            # the next non-error event.
            sys.stdout.flush()
            sys.stderr.write(text)
            sys.stderr.flush()
        return

    if event.get('stat'):
        save_stat(event['request'])

    text = formatEventAsClassicLogText(event)
    if text is not None:
        sys.stdout.write(text)
        sys.stdout.flush()
@ -179,10 +213,27 @@ def downloadDone(response_str, request, response, feed_config):
if (isinstance(response, TextResponse)):
if feed_config:
response_str = buildFeed(response, feed_config)
[response_str, post_cnt, new_post_cnt] = buildFeed(response, feed_config)
request.setHeader(b"Content-Type", b'text/xml; charset=utf-8')
log.debug('Stat: ip={request.ip} feed_id={request.feed_id} new_post_cnt={request.post_cnt} new_post_cnt={request.new_post_cnt}', request=RequestStat(
ip=request.client.host,
feed_id=feed_config['id'],
post_cnt=post_cnt,
new_post_cnt=new_post_cnt
),
stat=True
)
else:
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
log.debug('Stat: ip={request.ip} url={request.url}', request=RequestStat(
ip=request.client.host,
feed_id=0,
post_cnt=0,
new_post_cnt=0,
url=url
),
stat=True
)
request.write(response_str)
request.finish()

11
feed.py
View File

@ -26,7 +26,6 @@ def save_post(conn, created, feed_id, post_fields):
with conn as cur:
cur.execute("""insert into frontend_post (md5sum, created, feed_id)
values (%s, %s, %s)""", (post_fields['md5'], created, feed_id))
post_id = cur._last_executed
post_id = conn.insert_id()
for key in ['title', 'description', 'title_link']:
@ -38,6 +37,8 @@ def save_post(conn, created, feed_id, post_fields):
def fill_time(feed_id, items):
if not items:
return []
new_post_cnt = 0
for item in items:
#create md5
h = md5('')
@ -65,15 +66,15 @@ def fill_time(feed_id, items):
fetched_dates[md5hash] = created
cur_time = datetime.datetime.utcnow()
new_posts = []
for item in items:
if item['md5'] in fetched_dates:
item['time'] = fetched_dates[item['md5']]
else:
item['time'] = cur_time
save_post(conn, cur_time, feed_id, item)
new_post_cnt += 1
cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
return new_post_cnt
def decode(text, encoding): # it's strange but true
if isinstance(text, unicode):
@ -132,7 +133,7 @@ def buildFeed(response, feed_config):
language="en",
)
fill_time(feed_config['id'], items)
new_post_cnt = fill_time(feed_config['id'], items)
for item in items:
title = item['title'] if 'title' in item else ''
@ -150,7 +151,7 @@ def buildFeed(response, feed_config):
#enclosure=Enclosure(fields[4], "32000", "image/jpeg") if 4 in fields else None, #"Image"
pubdate = time
)
return feed.writeString('utf-8')
return [feed.writeString('utf-8'), len(items), new_post_cnt]
def getFeedData(request, feed_id):
# get url, xpathes