diff --git a/feed.py b/feed.py
index 74c514f..7d6a4db 100644
--- a/feed.py
+++ b/feed.py
@@ -2,7 +2,7 @@ import w3lib.url
 import w3lib.html
 from lxml import etree
-import re
+import re, sys
 from hashlib import md5
 from feedgenerator import Rss201rev2Feed, Enclosure
@@ -31,44 +31,48 @@ def save_post(conn, created, feed_id, post_fields):
     print(cur._last_executed)
 
 def fill_time(feed_id, items):
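+    # assign each item a stable pseudo-publication time, saving new posts to the DB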
-    if not items:
-        return []
-    for item in items:
-        #create md5
-        h = md5('')
-        for key in ['title', 'description', 'link']:
-            if key in item:
-                h.update(item[key].encode('utf-8'))
-        item['md5'] = h.hexdigest()
+    try:
+        if not items:
+            return []
+        for item in items:
+            # fingerprint each post by hashing its title, description and link
+            h = md5('')
+            for key in ['title', 'description', 'link']:
+                if key in item:
+                    h.update(item[key].encode('utf-8'))
+            item['md5'] = h.hexdigest()
-    #fetch dates from db
-    fetched_dates = {}
-    db = get_conn()
-    with db:
-        quoted_hashes = ','.join(["'%s'" % (i['md5']) for i in items])
+        #fetch dates from db
+        fetched_dates = {}
+        db = get_conn()
+        with db:
+            quoted_hashes = ','.join(["'%s'" % (i['md5']) for i in items])
-        cur = db.cursor()
-        cur.execute("""select p.md5sum, p.created, p.id
-                       from frontend_post p
-                       where p.md5sum in (%s)
-                       and p.feed_id=%s""" % (quoted_hashes, feed_id,))
-        rows = cur.fetchall()
-        print(cur._last_executed)
-        for row in rows:
-            md5hash = row[0]
-            created = row[1]
-            post_id = row[2]
-            fetched_dates[md5hash] = created
-        cur_time = datetime.datetime.utcnow()
-        new_posts = []
-        for item in items:
-            if item['md5'] in fetched_dates:
-                item['time'] = fetched_dates[item['md5']]
-            else:
-                item['time'] = cur_time
-                save_post(db, cur_time, feed_id, item)
-                cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
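+            # quoted_hashes contains only md5 hex digests, so inlining them is safe here;
+            # a parameterized query would still be the more defensive choice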
+            cur = db.cursor()
+            cur.execute("""select p.md5sum, p.created, p.id
+                           from frontend_post p
+                           where p.md5sum in (%s)
+                           and p.feed_id=%s""" % (quoted_hashes, feed_id,))
+            rows = cur.fetchall()
+            print(cur._last_executed)
+            for row in rows:
+                md5hash = row[0]
+                created = row[1]
+                post_id = row[2]
+                fetched_dates[md5hash] = created
+            cur_time = datetime.datetime.utcnow()
+            new_posts = []
+            for item in items:
+                if item['md5'] in fetched_dates:
+                    item['time'] = fetched_dates[item['md5']]
+                else:
+                    item['time'] = cur_time
+                    save_post(db, cur_time, feed_id, item)
+                    cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
+    except Exception as ex:
+        sys.stderr.write("%s Feed exception: %s\n" % (datetime.datetime.now(), ex))
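+        # items that were not stamped before the failure get a default 'time' in buildFeed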
+
 
 def element_to_string(element):
     if isinstance(element, basestring): # attribute
         return element
@@ -123,7 +127,7 @@ def buildFeed(response, feed_config):
     for item in items:
         title = item['title'] if 'title' in item else ''
         desc = item['description'] if 'description' in item else ''
-        time = item['time']
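+        # fallback for items fill_time could not stamp (e.g. after a DB error)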
+        time = item['time'] if 'time' in item else datetime.datetime.utcnow()
         if 'link' in item:
             link = item['link']
         else:
diff --git a/tests.py b/tests.py
index 7e225a1..e94e0b4 100644
--- a/tests.py
+++ b/tests.py
@@ -1,4 +1,6 @@
 from lxml import etree
+import sys
+import requests
 from feed import element_to_string
@@ -6,6 +8,104 @@ def test1_get_inner_html():
     root = etree.fromstring('<p>12345</p>')
     assert element_to_string(root) == '12345'
-test1_get_inner_html()
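+# feed ids on politepol.com used as live regression fixtures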
+ids = [1,2,3,5,6,54,100,101,113,118,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,166,167]
+
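+# parse an RSS payload and return [channel title, list of <item> elements]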
+def parse_feed(text):
+    ch = etree.fromstring(text).xpath('/rss/channel')
+    title = ch[0].xpath('title')[0].text
+    items = ch[0].xpath('item')
+    return [title.encode('utf-8'), items]
+
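+# fetch each feed and store the raw response body under tests/<id>.<extension>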
+def crawl(extension):
+    number = 0
+    for id in ids:
+        print "ID: %s (%s of %s)" % (id, number + 1, len(ids))
+        r = requests.get("http://politepol.com/feed/%s" % id)
+        text = r.text.encode('utf-8')
+        with open("tests/%s.%s" % (id, extension), 'w') as f:
+            f.write(text)
+        title, items = parse_feed(text)
+        print "Title: %s" % title
+        print "Items count: %s" % len(items)
+        number += 1
+
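+# compare two crawl snapshots and write per-feed differences to tests/<id>.diff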
+def diff(ext1, ext2):
+    diff_ids = []
+    for id in ids:
+        print "ID: %s" % (id,)
+        text1 = None
+        with open("tests/%s.%s" % (id, ext1), 'r') as f:
+            text1 = f.read()
+        text2 = None
+        with open("tests/%s.%s" % (id, ext2), 'r') as f:
+            text2 = f.read()
+
+        if text1 == text2:
+            print "Identical"
+        else:
+            diff_ids.append(id)
+            posts_diff = 0
+            with open("tests/%s.diff" % (id,), 'w') as f:
+                title1, items1 = parse_feed(text1)
+                title2, items2 = parse_feed(text2)
+                if title1 != title2:
+                    print "Different titles"
+                    f.write("<<<<<<<<<<<<<<< Different titles >>>>>>>>>>>>>>>\n")
+                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
+                    f.write(title1 + "\n")
+                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
+                    f.write(title2 + "\n")
+                posts1 = {}
+                posts2 = {}
+                if len(items1) != len(items2):
+                    print "Different post count: %s vs %s" % (len(items1), len(items2))
+                    f.write("<< Different posts count: %s.%s:%s vs %s.%s:%s >>\n" % (id, ext1, len(items1), id, ext2, len(items2)))
+                for post in items1:
+                    posts1[element_to_string(post)] = True
+                for post in items2:
+                    posts2[element_to_string(post)] = True
+
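+                # symmetric comparison: flag posts present in one snapshot but missing from the other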
+                for post in items1:
+                    if element_to_string(post) not in posts2:
+                        posts_diff += 1
+                        f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff)
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
+                        f.write(element_to_string(post) + "\n")
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
+                        f.write("*** Not found ***\n")
+
+                for post in items2:
+                    if element_to_string(post) not in posts1:
+                        posts_diff += 1
+                        f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff)
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
+                        f.write("*** Not found ***\n")
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
+                        f.write(element_to_string(post) + "\n")
+            print "Content of files %s.%s and %s.%s is different. Diff: %s.diff" % (id, ext1, id, ext2, id)
+            if posts_diff > 0:
+                print "Different feeds: %s" % posts_diff
+    if diff_ids:
+        print "Different feed ids: %s" % str(diff_ids)
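+# Usage:
+#   python tests.py                     -> run the unit test
+#   python tests.py crawl <ext>         -> snapshot every feed to tests/<id>.<ext>
+#   python tests.py diff <ext1> <ext2>  -> compare two snapshots, writing tests/<id>.diff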
+print str(sys.argv)
+if len(sys.argv) == 1:
+    test1_get_inner_html()
+elif len(sys.argv) > 2:
+    if sys.argv[1] == 'crawl':
+        if len(sys.argv) == 3:
+            crawl(sys.argv[2])
+        else:
+            raise Exception("Invalid argument count for crawl")
+    elif sys.argv[1] == 'diff':
+        if len(sys.argv) == 4:
+            diff(sys.argv[2], sys.argv[3])
+        else:
+            raise Exception("Invalid argument count for diff")
+    else:
+        raise Exception("Unsupported operation %s" % sys.argv[1])
+else:
+    raise Exception("Invalid argument count")
 print 'All tests are OK'