mirror of
https://github.com/taroved/pol
synced 2025-05-16 06:10:09 -07:00
stat
This commit is contained in:
parent
ddd9a79f22
commit
91a33febb9
@ -26,7 +26,8 @@ from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
|
||||
from lxml import etree
|
||||
import re
|
||||
|
||||
from feed import getFeedData, buildFeed
|
||||
from feed import getFeedData, buildFeed, get_conn
|
||||
from contextlib import closing
|
||||
|
||||
from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR
|
||||
|
||||
@ -41,12 +42,45 @@ class bcolors:
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
|
||||
class RequestStat:
    """Value object describing one handled request, consumed by save_stat().

    For feed requests, feed_id/post_cnt/new_post_cnt are meaningful; for
    plain page requests the callers pass feed_id=0 with counts of 0 and
    identify the page via url.  ex_msg/ex_callstack appear unused by the
    visible callers — presumably reserved for error reporting (TODO confirm).
    """

    def __init__(self, ip, feed_id, post_cnt, new_post_cnt, url=None, ex_msg=None, ex_callstack=None):
        self.ip = ip                      # client address string (e.g. request.client.host)
        self.feed_id = feed_id            # feed primary key; 0 for non-feed page requests
        self.post_cnt = post_cnt          # number of posts in the built feed
        self.new_post_cnt = new_post_cnt  # posts not previously recorded
        self.url = url                    # page url, set only for non-feed requests
        self.ex_msg = ex_msg
        self.ex_callstack = ex_callstack

    def __repr__(self):
        # Debug-friendly representation (the class previously had none).
        return ('RequestStat(ip=%r, feed_id=%r, post_cnt=%r, new_post_cnt=%r, '
                'url=%r, ex_msg=%r, ex_callstack=%r)' % (
                    self.ip, self.feed_id, self.post_cnt, self.new_post_cnt,
                    self.url, self.ex_msg, self.ex_callstack))
|
||||
|
||||
def get_ip_id(ip, cur):
    """Return the primary key of *ip* in the ips table, inserting it if absent.

    cur is a DB-API cursor.  Bug fix: fetchone() returns a row sequence,
    so an existing address must be unwrapped with row[0]; the original
    returned the whole tuple for existing ips but an int (lastrowid) for
    newly inserted ones, giving callers an inconsistent type.
    """
    cur.execute("""select id from ips where address=%s""", (ip,))
    row = cur.fetchone()
    if row:
        return row[0]  # existing address: unwrap the single-column row
    cur.execute("insert into ips (address) values (%s)", (ip,))
    return cur.lastrowid  # id of the row just inserted
|
||||
|
||||
|
||||
def save_stat(stat):
    """Persist one RequestStat to the database.

    Inserts a row into requests; when the stat carries no feed_id (a plain
    page request) the url is additionally recorded in request_urls,
    linked to the new requests row.
    """
    conn = get_conn()
    with closing(conn):
        # MySQLdb-style context manager: entering the connection yields a cursor.
        with conn as cur:
            ip_id = get_ip_id(stat.ip, cur)
            cur.execute("""insert into requests (ip_id, feed_id, post_cnt, new_post_cnt)
                values (%s, %s, %s, %s)""", (ip_id, stat.feed_id, stat.post_cnt, stat.new_post_cnt))
            stat_id = cur.lastrowid
            if not stat.feed_id:
                cur.execute("insert into request_urls (url, request_id) values (%s, %s)", (stat.url.encode('utf-8'), stat_id))
|
||||
|
||||
|
||||
def print_log(event):
    """Twisted log observer.

    Errors are written to stdout (colorized) and mirrored to stderr;
    ordinary events go to stdout, first persisting the attached
    RequestStat when the event is flagged with 'stat'.
    """
    if event.get('isError'):
        line = formatEventAsClassicLogText(event)
        sys.stdout.write(bcolors.FAIL + line + bcolors.ENDC)
        sys.stderr.write(line)
        sys.stderr.flush()
        return
    if event.get('stat'):
        save_stat(event['request'])
    sys.stdout.write(formatEventAsClassicLogText(event))
    sys.stdout.flush()
|
||||
|
||||
@ -179,10 +213,27 @@ def downloadDone(response_str, request, response, feed_config):
|
||||
|
||||
if (isinstance(response, TextResponse)):
|
||||
if feed_config:
|
||||
response_str = buildFeed(response, feed_config)
|
||||
[response_str, post_cnt, new_post_cnt] = buildFeed(response, feed_config)
|
||||
request.setHeader(b"Content-Type", b'text/xml; charset=utf-8')
|
||||
log.debug('Stat: ip={request.ip} feed_id={request.feed_id} new_post_cnt={request.post_cnt} new_post_cnt={request.new_post_cnt}', request=RequestStat(
|
||||
ip=request.client.host,
|
||||
feed_id=feed_config['id'],
|
||||
post_cnt=post_cnt,
|
||||
new_post_cnt=new_post_cnt
|
||||
),
|
||||
stat=True
|
||||
)
|
||||
else:
|
||||
response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
|
||||
log.debug('Stat: ip={request.ip} url={request.url}', request=RequestStat(
|
||||
ip=request.client.host,
|
||||
feed_id=0,
|
||||
post_cnt=0,
|
||||
new_post_cnt=0,
|
||||
url=url
|
||||
),
|
||||
stat=True
|
||||
)
|
||||
|
||||
request.write(response_str)
|
||||
request.finish()
|
||||
|
11
feed.py
11
feed.py
@ -26,7 +26,6 @@ def save_post(conn, created, feed_id, post_fields):
|
||||
with conn as cur:
|
||||
cur.execute("""insert into frontend_post (md5sum, created, feed_id)
|
||||
values (%s, %s, %s)""", (post_fields['md5'], created, feed_id))
|
||||
post_id = cur._last_executed
|
||||
|
||||
post_id = conn.insert_id()
|
||||
for key in ['title', 'description', 'title_link']:
|
||||
@ -38,6 +37,8 @@ def save_post(conn, created, feed_id, post_fields):
|
||||
def fill_time(feed_id, items):
|
||||
if not items:
|
||||
return []
|
||||
|
||||
new_post_cnt = 0
|
||||
for item in items:
|
||||
#create md5
|
||||
h = md5('')
|
||||
@ -65,15 +66,15 @@ def fill_time(feed_id, items):
|
||||
fetched_dates[md5hash] = created
|
||||
|
||||
cur_time = datetime.datetime.utcnow()
|
||||
new_posts = []
|
||||
for item in items:
|
||||
if item['md5'] in fetched_dates:
|
||||
item['time'] = fetched_dates[item['md5']]
|
||||
else:
|
||||
item['time'] = cur_time
|
||||
save_post(conn, cur_time, feed_id, item)
|
||||
new_post_cnt += 1
|
||||
cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
|
||||
|
||||
return new_post_cnt
|
||||
|
||||
def decode(text, encoding): # it's strange but true
|
||||
if isinstance(text, unicode):
|
||||
@ -132,7 +133,7 @@ def buildFeed(response, feed_config):
|
||||
language="en",
|
||||
)
|
||||
|
||||
fill_time(feed_config['id'], items)
|
||||
new_post_cnt = fill_time(feed_config['id'], items)
|
||||
|
||||
for item in items:
|
||||
title = item['title'] if 'title' in item else ''
|
||||
@ -150,7 +151,7 @@ def buildFeed(response, feed_config):
|
||||
#enclosure=Enclosure(fields[4], "32000", "image/jpeg") if 4 in fields else None, #"Image"
|
||||
pubdate = time
|
||||
)
|
||||
return feed.writeString('utf-8')
|
||||
return [feed.writeString('utf-8'), len(items), new_post_cnt]
|
||||
|
||||
def getFeedData(request, feed_id):
|
||||
# get url, xpathes
|
||||
|
Loading…
x
Reference in New Issue
Block a user