v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-17 06:40:08 -07:00

tmp fix + tests

This commit is contained in:
Your Name 2017-07-18 16:29:16 -04:00
parent 26780338a8
commit f2f823974d
2 changed files with 142 additions and 38 deletions

78
feed.py
View File

@ -2,7 +2,7 @@ import w3lib.url
import w3lib.html
from lxml import etree
import re
import re, sys
from hashlib import md5
from feedgenerator import Rss201rev2Feed, Enclosure
@ -31,44 +31,48 @@ def save_post(conn, created, feed_id, post_fields):
print(cur._last_executed)
def fill_time(feed_id, items):
    """Assign a publication time to every parsed feed item.

    Items already seen before (matched by an md5 of title/description/link)
    keep the `created` timestamp stored in the frontend_post table; new
    items get the current UTC time, stepped back POST_TIME_DISTANCE minutes
    per item so they keep a stable ordering, and are persisted via
    save_post().

    Returns [] for an empty item list; otherwise returns None (items are
    mutated in place: 'md5' and 'time' keys are added).

    NOTE(review): the blanket except below is the commit's deliberate
    "tmp fix" — any failure is logged to stderr and swallowed so one bad
    feed cannot kill feed generation. Items then lack 'time'; the caller
    (buildFeed) falls back to datetime.now() for that case.
    """
    try:
        if not items:
            return []
        # Fingerprint each item so identical posts map to the same DB row.
        for item in items:
            h = md5()  # fixed: md5('') breaks on Python 3; md5() is equivalent
            for key in ['title', 'description', 'link']:
                if key in item:
                    h.update(item[key].encode('utf-8'))
            item['md5'] = h.hexdigest()
        # Fetch creation dates of already-stored posts from the db.
        fetched_dates = {}
        db = get_conn()
        with db:
            cur = db.cursor()
            # Parameterized query instead of interpolating values into the
            # SQL text (feed_id came from outside; don't risk injection).
            placeholders = ','.join(['%s'] * len(items))
            cur.execute("""select p.md5sum, p.created, p.id
                           from frontend_post p
                           where p.md5sum in (%s)
                           and p.feed_id=%%s""" % placeholders,
                        [item['md5'] for item in items] + [feed_id])
            rows = cur.fetchall()
            print(cur._last_executed)  # debug: actual SQL sent by the driver
            for md5hash, created, post_id in rows:
                fetched_dates[md5hash] = created
            cur_time = datetime.datetime.utcnow()
            for item in items:
                if item['md5'] in fetched_dates:
                    item['time'] = fetched_dates[item['md5']]
                else:
                    item['time'] = cur_time
                    save_post(db, cur_time, feed_id, item)
                    # step back so each new post gets a distinct, ordered time
                    cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
    except Exception as ex:
        # tmp fix: log and swallow so feed generation never crashes
        sys.stderr.write('\n'.join([str(datetime.datetime.now()),
                                    "Feed exception:" + str(ex)]))
def element_to_string(element):
if isinstance(element, basestring): # attribute
return element
@ -123,7 +127,7 @@ def buildFeed(response, feed_config):
for item in items:
title = item['title'] if 'title' in item else ''
desc = item['description'] if 'description' in item else ''
time = item['time']
time = item['time'] if 'time' in item else datetime.datetime.now()
if 'link' in item:
link = item['link']
else:

102
tests.py
View File

@ -1,4 +1,6 @@
from lxml import etree
import sys
import requests
from feed import element_to_string
def test1_get_inner_html():
    # element_to_string on an element must serialize its inner content
    # (text and child markup) without the element's own tags.
    tree = etree.fromstring('<a>1<b>2</b>3<c>4</c>5</a>')
    assert element_to_string(tree) == '1<b>2</b>3<c>4</c>5'

test1_get_inner_html()
# Feed ids on politepol.com used as regression fixtures by crawl() / diff().
ids = [1,2,3,5,6,54,100,101,113,118,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,166,167]
def parse_feed(text):
    """Parse an RSS document and return [utf-8 title, list of item elements]."""
    channel = etree.fromstring(text).xpath('/rss/channel')[0]
    feed_title = channel.xpath('title')[0].text
    entries = channel.xpath('item')
    return [feed_title.encode('utf-8'), entries]
def crawl(extention):
    """Download every feed in `ids` from politepol.com and save the raw
    body to tests/<id>.<extention> for later comparison with diff().

    NOTE(review): `extention` keeps the original (misspelled) parameter
    name so existing callers are unaffected.
    """
    # enumerate replaces the hand-rolled counter; printed values are
    # identical (the original also printed 0-based progress).
    for number, id in enumerate(ids):
        print("ID: %s (%s of %s)" % (id, number, len(ids)))
        r = requests.get("http://politepol.com/feed/%s" % id)
        text = r.text.encode('utf-8')
        with open("tests/%s.%s" % (id, extention), 'w') as f:
            f.write(text)
        title, items = parse_feed(text)
        print("Title: %s" % title)
        print("Items count: %s" % len(items))
def diff(ext1, ext2):
    """Compare feeds previously crawled under two extensions and write a
    per-feed report to tests/<id>.diff for every feed whose content differs.

    Bug fixes vs. the original:
      * `len(item2)` -> `len(items2)` (NameError on the count-mismatch path)
      * `post2` / `post1` -> `posts2` / `posts1` (NameError on every
        non-identical feed)
      * final `if diff > 0` -> `if different_ids:` — in Python 2 a
        list-vs-int comparison is always True, so the summary printed even
        when every feed was identical.
    """
    different_ids = []
    for id in ids:
        print("ID: %s" % (id,))
        with open("tests/%s.%s" % (id, ext1), 'r') as f:
            text1 = f.read()
        with open("tests/%s.%s" % (id, ext2), 'r') as f:
            text2 = f.read()
        if text1 == text2:
            print("Identical")
            continue
        different_ids.append(id)
        posts_diff = 0
        with open("tests/%s.diff" % (id,), 'w') as f:
            title1, items1 = parse_feed(text1)
            title2, items2 = parse_feed(text2)
            if title1 != title2:
                print("Different titles")
                f.write("<<<<<<<<<<<<<<< Different titles >>>>>>>>>>>>>>>\n")
                f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
                f.write(title1 + "\n")
                f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
                f.write(title2 + "\n")
            if len(items1) != len(items2):
                # fixed: both lines referenced undefined `item2`
                print("Different post count: %s vs %s" % (len(items1), len(items2)))
                f.write("<< Different posts count: %s.%s:%s vs %s.%s:%s >>\n"
                        % (id, ext1, len(items1), id, ext2, len(items2)))
            # Serialized-post sets for membership tests (the original used
            # dicts mapping to True; a set is the idiomatic equivalent).
            posts1 = set(element_to_string(post) for post in items1)
            posts2 = set(element_to_string(post) for post in items2)
            for post in items1:
                if element_to_string(post) not in posts2:  # fixed: was `post2`
                    posts_diff += 1
                    f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff)
                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
                    f.write(element_to_string(post) + "\n")
                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
                    f.write("*** Not found ***\n")
            for post in items2:
                if element_to_string(post) not in posts1:  # fixed: was `post1`
                    posts_diff += 1
                    f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff)
                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
                    f.write("*** Not found ***\n")
                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
                    f.write(element_to_string(post) + "\n")
        print("Content of files %s.%s and %s.%s is different. Diff: %s.diff"
              % (id, ext1, id, ext2, id))
        if posts_diff > 0:
            print("Different feeds: %s" % posts_diff)
    if different_ids:  # fixed: `diff > 0` was always True in Python 2
        print("Different feed ids: %s" % str(different_ids))
# Ad-hoc CLI:
#   no args            -> run the unit test only
#   crawl <ext>        -> snapshot all live feeds to tests/<id>.<ext>
#   diff <ext1> <ext2> -> compare two snapshot sets
print(str(sys.argv))
if len(sys.argv) == 1:
    test1_get_inner_html()
elif len(sys.argv) > 2:
    if sys.argv[1] == 'crawl':
        if len(sys.argv) == 3:
            crawl(sys.argv[2])
        else:
            raise Exception("Invalid argument count for crawl")
    elif sys.argv[1] == 'diff':
        if len(sys.argv) == 4:
            diff(sys.argv[2], sys.argv[3])
        else:
            raise Exception("Invalid argument count for diff")
    else:
        raise Exception("Unsupported operation %s" % sys.argv[1])
else:
    # fixed typo in the error message: "Invaid" -> "Invalid"
    raise Exception("Invalid argument count")
print('All tests are OK')