From f2f823974dde1a3e52c3da21aeae6db8eb11a8d4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 18 Jul 2017 16:29:16 -0400 Subject: [PATCH] tmp fix + tests --- feed.py | 78 ++++++++++++++++++++++-------------------- tests.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 38 deletions(-) diff --git a/feed.py b/feed.py index 74c514f..7d6a4db 100644 --- a/feed.py +++ b/feed.py @@ -2,7 +2,7 @@ import w3lib.url import w3lib.html from lxml import etree -import re +import re, sys from hashlib import md5 from feedgenerator import Rss201rev2Feed, Enclosure @@ -31,44 +31,48 @@ def save_post(conn, created, feed_id, post_fields): print(cur._last_executed) def fill_time(feed_id, items): - if not items: - return [] - for item in items: - #create md5 - h = md5('') - for key in ['title', 'description', 'link']: - if key in item: - h.update(item[key].encode('utf-8')) - item['md5'] = h.hexdigest() + try: + if not items: + return [] + for item in items: + #create md5 + h = md5('') + for key in ['title', 'description', 'link']: + if key in item: + h.update(item[key].encode('utf-8')) + item['md5'] = h.hexdigest() - #fetch dates from db - fetched_dates = {} - db = get_conn() - with db: - quoted_hashes = ','.join(["'%s'" % (i['md5']) for i in items]) + #fetch dates from db + fetched_dates = {} + db = get_conn() + with db: + quoted_hashes = ','.join(["'%s'" % (i['md5']) for i in items]) - cur = db.cursor() - cur.execute("""select p.md5sum, p.created, p.id - from frontend_post p - where p.md5sum in (%s) - and p.feed_id=%s""" % (quoted_hashes, feed_id,)) - rows = cur.fetchall() - print(cur._last_executed) - for row in rows: - md5hash = row[0] - created = row[1] - post_id = row[2] - fetched_dates[md5hash] = created - cur_time = datetime.datetime.utcnow() - new_posts = [] - for item in items: - if item['md5'] in fetched_dates: - item['time'] = fetched_dates[item['md5']] - else: - item['time'] = cur_time - save_post(db, cur_time, feed_id, 
item) - cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE) + cur = db.cursor() + cur.execute("""select p.md5sum, p.created, p.id + from frontend_post p + where p.md5sum in (%s) + and p.feed_id=%s""" % (quoted_hashes, feed_id,)) + rows = cur.fetchall() + print(cur._last_executed) + for row in rows: + md5hash = row[0] + created = row[1] + post_id = row[2] + fetched_dates[md5hash] = created + cur_time = datetime.datetime.utcnow() + new_posts = [] + for item in items: + if item['md5'] in fetched_dates: + item['time'] = fetched_dates[item['md5']] + else: + item['time'] = cur_time + save_post(db, cur_time, feed_id, item) + cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE) + except Exception as ex: + sys.stderr.write('\n'.join([str(datetime.datetime.now()), "Feed exception:" +str(ex)])) + def element_to_string(element): if isinstance(element, basestring): # attribute return element @@ -123,7 +127,7 @@ def buildFeed(response, feed_config): for item in items: title = item['title'] if 'title' in item else '' desc = item['description'] if 'description' in item else '' - time = item['time'] + time = item['time'] if 'time' in item else datetime.datetime.now() if 'link' in item: link = item['link'] else: diff --git a/tests.py b/tests.py index 7e225a1..e94e0b4 100644 --- a/tests.py +++ b/tests.py @@ -1,4 +1,6 @@ from lxml import etree +import sys +import requests from feed import element_to_string @@ -6,6 +8,104 @@ def test1_get_inner_html(): root = etree.fromstring('12345') assert element_to_string(root) == '12345' -test1_get_inner_html() +ids = [1,2,3,5,6,54,100,101,113,118,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,166,167] + +def parse_feed(text): + ch = etree.fromstring(text).xpath('/rss/channel') + title = ch[0].xpath('title')[0].text + items = ch[0].xpath('item') + return [title.encode('utf-8'), items] + +def crawl(extention): + number = 0 + for id in ids: + 
print "ID: %s (%s of %s)" % (id, number, len(ids)) + r = requests.get("http://politepol.com/feed/%s" % id) + text = r.text.encode('utf-8') + with open("tests/%s.%s" % (id, extention), 'w') as f: + f.write(text) + title, items = parse_feed(text) + print "Title: %s" % title + print "Items count: %s" % len(items) + number += 1 + +def diff(ext1, ext2): + diff = [] + number = 0 + for id in ids: + print "ID: %s" % (id,) + text1 = None + with open("tests/%s.%s" % (id, ext1), 'r') as f: + text1 = f.read() + text2 = None + with open("tests/%s.%s" % (id, ext2), 'r') as f: + text2 = f.read() + + if text1 == text2: + print "Identical" + else: + diff.append(id) + posts_diff = 0 + with open("tests/%s.diff" % (id,), 'w') as f: + title1, items1 = parse_feed(text1) + title2, items2 = parse_feed(text2) + if title1 != title2: + print "Different titles" + f.write("<<<<<<<<<<<<<<< Different titles >>>>>>>>>>>>>>>\n") + f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1)) + f.write(title1 + "\n") + f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2)) + f.write(title2 + "\n") + posts1 = {} + posts2 = {} + if len(items1) != len(items2): + print "Different post count: %s vs %s" % (len(items1), len(items2)) + f.write("<< Different posts count: %s.%s:%s vs %s.%s:%s >>\n" % (id, ext1, len(items1), id, ext2, len(items2))) + for post in items1: + posts1[element_to_string(post)] = True + for post in items2: + posts2[element_to_string(post)] = True + + for post in items1: + if not (element_to_string(post) in posts2): + posts_diff += 1 + f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff) + f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1)) + f.write(element_to_string(post) + "\n") + f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2)) + f.write("*** Not found ***\n") + + for post in items2: + if not (element_to_string(post) in posts1): + posts_diff += 1 + f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff) + 
f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1)) + f.write("*** Not found ***\n") + f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2)) + f.write(element_to_string(post) + "\n") + print "Content of files %s.%s and %s.%s is different. Diff: %s.diff" % (id, ext1, id, ext2, id) + if posts_diff > 0: + print "Different feeds: %s" % posts_diff + number += 1 + if len(diff) > 0: + print "Different feed ids: %s" % str(diff) +print str(sys.argv) +if len(sys.argv) == 1: + test1_get_inner_html() +elif len(sys.argv) > 2: + if sys.argv[1] == 'crawl': + if len(sys.argv) == 3: + crawl(sys.argv[2]) + else: + raise Exception("Invalid argument count for crawl") + elif sys.argv[1] == 'diff': + if len(sys.argv) == 4: + diff(sys.argv[2], sys.argv[3]) + else: + raise Exception("Invalid argument count for diff") + else: + raise Exception("Unsupported operation %s" % sys.argv[1]) +else: + raise Exception("Invalid argument count") print 'All tests are OK'