Commit 91a33febb9 (parent ddd9a79f22): "stat"
Mirror of https://github.com/taroved/pol, synced 2025-05-16 14:20:10 -07:00
@@ -26,7 +26,8 @@ from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
 from lxml import etree
 import re
 
-from feed import getFeedData, buildFeed
+from feed import getFeedData, buildFeed, get_conn
+from contextlib import closing
 
 from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR
 
@@ -41,12 +42,45 @@ class bcolors:
     BOLD = '\033[1m'
     UNDERLINE = '\033[4m'
 
+
+class RequestStat:
+    def __init__(self, ip, feed_id, post_cnt, new_post_cnt, url=None, ex_msg=None, ex_callstack=None):
+        self.ip = ip
+        self.feed_id = feed_id
+        self.post_cnt = post_cnt
+        self.new_post_cnt = new_post_cnt
+        self.url = url
+        self.ex_msg = ex_msg
+        self.ex_callstack = ex_callstack
+
+def get_ip_id(ip, cur):
+    #import pdb;pdb.set_trace()
+    cur.execute("""select id from ips where address=%s""", (ip,))
+    ip_id = cur.fetchone()
+    if not ip_id:
+        cur.execute("insert into ips (address) values (%s)", (ip,))
+        ip_id = cur.lastrowid
+    return ip_id
+
+def save_stat(stat):
+    with closing(get_conn()) as conn:
+        with conn as cur:
+            ip_id = get_ip_id(stat.ip, cur)
+            cur.execute("""insert into requests (ip_id, feed_id, post_cnt, new_post_cnt)
+                values (%s, %s, %s, %s)""", (ip_id, stat.feed_id, stat.post_cnt, stat.new_post_cnt))
+            stat_id = cur.lastrowid
+            if not stat.feed_id:
+                cur.execute("insert into request_urls (url, request_id) values (%s, %s)", (stat.url.encode('utf-8'), stat_id))
+
 def print_log(event):
     if 'isError' in event and event['isError']:
         sys.stdout.write(bcolors.FAIL + formatEventAsClassicLogText(event) + bcolors.ENDC)
         sys.stderr.write(formatEventAsClassicLogText(event))
         sys.stderr.flush()
     else:
+        if 'stat' in event and event['stat']:
+            save_stat(event['request'])
         sys.stdout.write(formatEventAsClassicLogText(event))
         sys.stdout.flush()
 
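
Note: get_ip_id() and save_stat() above issue raw SQL against three tables. Below is a minimal sketch of the schema they appear to assume; only the table and column names come from the queries in this diff, the column types are guesses, and the real migrations are not part of this commit.

# Hypothetical DDL inferred from the queries in get_ip_id()/save_stat();
# types and constraints are assumptions, not taken from the repo.
STATS_SCHEMA_SKETCH = """
create table ips (
    id      int auto_increment primary key,
    address varchar(45) not null               -- textual IPv4/IPv6 address
);
create table requests (
    id           int auto_increment primary key,
    ip_id        int not null,                 -- references ips.id
    feed_id      int,                          -- 0 for plain page requests
    post_cnt     int,                          -- items built into the feed
    new_post_cnt int                           -- items not seen before
);
create table request_urls (
    url        varchar(2048),                  -- stored only when feed_id is 0
    request_id int not null                    -- references requests.id
);
"""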
@@ -179,10 +213,27 @@ def downloadDone(response_str, request, response, feed_config):
 
     if (isinstance(response, TextResponse)):
         if feed_config:
-            response_str = buildFeed(response, feed_config)
+            [response_str, post_cnt, new_post_cnt] = buildFeed(response, feed_config)
             request.setHeader(b"Content-Type", b'text/xml; charset=utf-8')
+            log.debug('Stat: ip={request.ip} feed_id={request.feed_id} post_cnt={request.post_cnt} new_post_cnt={request.new_post_cnt}', request=RequestStat(
+                    ip=request.client.host,
+                    feed_id=feed_config['id'],
+                    post_cnt=post_cnt,
+                    new_post_cnt=new_post_cnt
+                ),
+                stat=True
+            )
         else:
             response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url)
+            log.debug('Stat: ip={request.ip} url={request.url}', request=RequestStat(
+                    ip=request.client.host,
+                    feed_id=0,
+                    post_cnt=0,
+                    new_post_cnt=0,
+                    url=url
+                ),
+                stat=True
+            )
 
     request.write(response_str)
     request.finish()
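
Note: the stat=True keyword relies on twisted.logger passing extra keyword arguments straight into the event dict that observers receive, which is how print_log() above sees event['stat'] and event['request']. A minimal standalone illustration of that mechanism (the observer and values here are made up for the example and are not part of the repo):

from twisted.logger import Logger, globalLogPublisher

log = Logger()

def stat_observer(event):
    # Keyword arguments given to log.debug() show up as keys of the event
    # dict; print_log() uses the same fact to route 'stat' events to save_stat().
    if event.get('stat'):
        print('stat event, request=%r' % (event.get('request'),))

globalLogPublisher.addObserver(stat_observer)
log.debug('Stat: {request}', request={'ip': '127.0.0.1'}, stat=True)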
feed.py (11 changed lines)
@@ -26,7 +26,6 @@ def save_post(conn, created, feed_id, post_fields):
     with conn as cur:
         cur.execute("""insert into frontend_post (md5sum, created, feed_id)
                 values (%s, %s, %s)""", (post_fields['md5'], created, feed_id))
-        post_id = cur._last_executed
 
         post_id = conn.insert_id()
         for key in ['title', 'description', 'title_link']:
@@ -38,6 +37,8 @@ def save_post(conn, created, feed_id, post_fields):
 def fill_time(feed_id, items):
     if not items:
         return []
+
+    new_post_cnt = 0
     for item in items:
         #create md5
         h = md5('')
@@ -65,15 +66,15 @@ def fill_time(feed_id, items):
             fetched_dates[md5hash] = created
 
     cur_time = datetime.datetime.utcnow()
-    new_posts = []
     for item in items:
         if item['md5'] in fetched_dates:
             item['time'] = fetched_dates[item['md5']]
         else:
             item['time'] = cur_time
             save_post(conn, cur_time, feed_id, item)
+            new_post_cnt += 1
         cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
+    return new_post_cnt
 
 def decode(text, encoding): # it's strange but true
     if isinstance(text, unicode):
@@ -132,7 +133,7 @@ def buildFeed(response, feed_config):
         language="en",
     )
 
-    fill_time(feed_config['id'], items)
+    new_post_cnt = fill_time(feed_config['id'], items)
 
     for item in items:
         title = item['title'] if 'title' in item else ''
@@ -150,7 +151,7 @@ def buildFeed(response, feed_config):
             #enclosure=Enclosure(fields[4], "32000", "image/jpeg") if 4 in fields else None, #"Image"
             pubdate = time
         )
-    return feed.writeString('utf-8')
+    return [feed.writeString('utf-8'), len(items), new_post_cnt]
 
 def getFeedData(request, feed_id):
     # get url, xpathes
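
Note: buildFeed() now returns a three-element list instead of a bare XML string, so any caller has to unpack it the way downloadDone() does above. A minimal usage sketch (response and feed_config are whatever the caller already holds; the print line is illustrative only):

# Hypothetical caller; mirrors the unpacking done in downloadDone().
response_str, post_cnt, new_post_cnt = buildFeed(response, feed_config)
print('served %d posts (%d new)' % (post_cnt, new_post_cnt))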