diff --git a/downloader.py b/downloader.py index c0929ea..35f5517 100644 --- a/downloader.py +++ b/downloader.py @@ -26,7 +26,8 @@ from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory from lxml import etree import re -from feed import getFeedData, buildFeed +from feed import getFeedData, buildFeed, get_conn +from contextlib import closing from settings import DOWNLOADER_USER_AGENT, FEED_REQUEST_PERIOD_LIMIT, DEBUG, SNAPSHOT_DIR @@ -41,12 +42,45 @@ class bcolors: BOLD = '\033[1m' UNDERLINE = '\033[4m' +class RequestStat: + def __init__(self, ip, feed_id, post_cnt, new_post_cnt, url=None, ex_msg=None, ex_callstack=None): + self.ip = ip + self.feed_id = feed_id + self.post_cnt = post_cnt + self.new_post_cnt = new_post_cnt + self.url = url + self.ex_msg = ex_msg + self.ex_callstack = ex_callstack + +def get_ip_id(ip, cur): + cur.execute("""select id from ips where address=%s""", (ip,)) + # fetchone() yields a row sequence (or None), not the bare id, so unwrap it + row = cur.fetchone() + if row: + return row[0] + cur.execute("insert into ips (address) values (%s)", (ip,)) + return cur.lastrowid + + +def save_stat(stat): + with closing(get_conn()) as conn: + with conn as cur: + ip_id = get_ip_id(stat.ip, cur) + cur.execute("""insert into requests (ip_id, feed_id, post_cnt, new_post_cnt) + values (%s, %s, %s, %s)""", (ip_id, stat.feed_id, stat.post_cnt, stat.new_post_cnt)) + stat_id = cur.lastrowid + if not stat.feed_id: + cur.execute("insert into request_urls (url, request_id) values (%s, %s)", (stat.url.encode('utf-8'), stat_id)) + + def print_log(event): if 'isError' in event and event['isError']: sys.stdout.write(bcolors.FAIL + formatEventAsClassicLogText(event) + bcolors.ENDC) sys.stderr.write(formatEventAsClassicLogText(event)) sys.stderr.flush() else: + if 'stat' in event and event['stat']: + save_stat(event['request']) sys.stdout.write(formatEventAsClassicLogText(event)) sys.stdout.flush() @@ -179,10 +213,27 @@ def downloadDone(response_str, request, response, feed_config): if
(isinstance(response, TextResponse)): if feed_config: - response_str = buildFeed(response, feed_config) + [response_str, post_cnt, new_post_cnt] = buildFeed(response, feed_config) request.setHeader(b"Content-Type", b'text/xml; charset=utf-8') + log.debug('Stat: ip={request.ip} feed_id={request.feed_id} post_cnt={request.post_cnt} new_post_cnt={request.new_post_cnt}', request=RequestStat( + ip=request.client.host, + feed_id=feed_config['id'], + post_cnt=post_cnt, + new_post_cnt=new_post_cnt + ), + stat=True + ) else: response_str, file_name = setBaseAndRemoveScriptsAndMore(response, url) + log.debug('Stat: ip={request.ip} url={request.url}', request=RequestStat( + ip=request.client.host, + feed_id=0, + post_cnt=0, + new_post_cnt=0, + url=url + ), + stat=True + ) request.write(response_str) request.finish() diff --git a/feed.py b/feed.py index 32d04b7..f24c3d4 100644 --- a/feed.py +++ b/feed.py @@ -26,7 +26,6 @@ def save_post(conn, created, feed_id, post_fields): with conn as cur: cur.execute("""insert into frontend_post (md5sum, created, feed_id) values (%s, %s, %s)""", (post_fields['md5'], created, feed_id)) - post_id = cur._last_executed post_id = conn.insert_id() for key in ['title', 'description', 'title_link']: @@ -38,6 +37,8 @@ def save_post(conn, created, feed_id, post_fields): def fill_time(feed_id, items): if not items: - return [] + return 0 + + new_post_cnt = 0 for item in items: #create md5 h = md5('') @@ -65,15 +66,15 @@ def fill_time(feed_id, items): fetched_dates[md5hash] = created cur_time = datetime.datetime.utcnow() - new_posts = [] for item in items: if item['md5'] in fetched_dates: item['time'] = fetched_dates[item['md5']] else: item['time'] = cur_time save_post(conn, cur_time, feed_id, item) + new_post_cnt += 1 cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE) - + return new_post_cnt def decode(text, encoding): # it's strange but true if isinstance(text, unicode): @@ -132,7 +133,7 @@ def buildFeed(response, feed_config): language="en", ) -
fill_time(feed_config['id'], items) + new_post_cnt = fill_time(feed_config['id'], items) for item in items: title = item['title'] if 'title' in item else '' @@ -150,7 +151,7 @@ def buildFeed(response, feed_config): #enclosure=Enclosure(fields[4], "32000", "image/jpeg") if 4 in fields else None, #"Image" pubdate = time ) - return feed.writeString('utf-8') + return [feed.writeString('utf-8'), len(items), new_post_cnt] def getFeedData(request, feed_id): # get url, xpathes