Mirror of https://github.com/taroved/pol (synced 2025-05-17 06:40:08 -07:00)
tmp fix + tests

commit f2f823974d (parent 26780338a8)
feed.py (78 changed lines)
@@ -2,7 +2,7 @@ import w3lib.url
 import w3lib.html

 from lxml import etree
-import re
+import re, sys
 from hashlib import md5

 from feedgenerator import Rss201rev2Feed, Enclosure
@@ -31,44 +31,48 @@ def save_post(conn, created, feed_id, post_fields):
     print(cur._last_executed)

 def fill_time(feed_id, items):
-    if not items:
-        return []
-    for item in items:
-        #create md5
-        h = md5('')
-        for key in ['title', 'description', 'link']:
-            if key in item:
-                h.update(item[key].encode('utf-8'))
-        item['md5'] = h.hexdigest()
-
-    #fetch dates from db
-    fetched_dates = {}
-    db = get_conn()
-    with db:
-        quoted_hashes = ','.join(["'%s'" % (i['md5']) for i in items])
-
-        cur = db.cursor()
-        cur.execute("""select p.md5sum, p.created, p.id
-                       from frontend_post p
-                       where p.md5sum in (%s)
-                       and p.feed_id=%s""" % (quoted_hashes, feed_id,))
-        rows = cur.fetchall()
-        print(cur._last_executed)
-        for row in rows:
-            md5hash = row[0]
-            created = row[1]
-            post_id = row[2]
-            fetched_dates[md5hash] = created
-        cur_time = datetime.datetime.utcnow()
-        new_posts = []
-        for item in items:
-            if item['md5'] in fetched_dates:
-                item['time'] = fetched_dates[item['md5']]
-            else:
-                item['time'] = cur_time
-                save_post(db, cur_time, feed_id, item)
-            cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
+    try:
+        if not items:
+            return []
+        for item in items:
+            #create md5
+            h = md5('')
+            for key in ['title', 'description', 'link']:
+                if key in item:
+                    h.update(item[key].encode('utf-8'))
+            item['md5'] = h.hexdigest()
+
+        #fetch dates from db
+        fetched_dates = {}
+        db = get_conn()
+        with db:
+            quoted_hashes = ','.join(["'%s'" % (i['md5']) for i in items])
+
+            cur = db.cursor()
+            cur.execute("""select p.md5sum, p.created, p.id
+                           from frontend_post p
+                           where p.md5sum in (%s)
+                           and p.feed_id=%s""" % (quoted_hashes, feed_id,))
+            rows = cur.fetchall()
+            print(cur._last_executed)
+            for row in rows:
+                md5hash = row[0]
+                created = row[1]
+                post_id = row[2]
+                fetched_dates[md5hash] = created
+            cur_time = datetime.datetime.utcnow()
+            new_posts = []
+            for item in items:
+                if item['md5'] in fetched_dates:
+                    item['time'] = fetched_dates[item['md5']]
+                else:
+                    item['time'] = cur_time
+                    save_post(db, cur_time, feed_id, item)
+                cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
+    except Exception as ex:
+        sys.stderr.write('\n'.join([str(datetime.datetime.now()), "Feed exception:" + str(ex)]))


 def element_to_string(element):
     if isinstance(element, basestring): # attribute
         return element
@@ -123,7 +127,7 @@ def buildFeed(response, feed_config):
     for item in items:
         title = item['title'] if 'title' in item else ''
         desc = item['description'] if 'description' in item else ''
-        time = item['time']
+        time = item['time'] if 'time' in item else datetime.datetime.now()
        if 'link' in item:
             link = item['link']
         else:
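Note on the change above: fill_time fingerprints each item with an md5 over its title, description and link, and uses that hash to look up the post's original created date in frontend_post, so repeated crawls keep stable timestamps. A minimal standalone sketch of that fingerprint step (Python 2, matching the code above; item_fingerprint and the sample dict are illustrative, not names from the repo):

    # Sketch of fill_time's dedup fingerprint: same fields -> same hash
    # -> same stored 'created' date on the next crawl.
    from hashlib import md5

    def item_fingerprint(item):
        h = md5('')
        for key in ['title', 'description', 'link']:
            if key in item:
                h.update(item[key].encode('utf-8'))
        return h.hexdigest()

    # Hypothetical item dict, for illustration only.
    print item_fingerprint({'title': u'Hello', 'link': u'http://example.com/post'})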
tests.py (102 changed lines)
@@ -1,4 +1,6 @@
 from lxml import etree
+import sys
+import requests

 from feed import element_to_string

@@ -6,6 +8,104 @@ def test1_get_inner_html():
     root = etree.fromstring('<a>1<b>2</b>3<c>4</c>5</a>')
     assert element_to_string(root) == '1<b>2</b>3<c>4</c>5'

-test1_get_inner_html()
+ids = [1,2,3,5,6,54,100,101,113,118,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,166,167]
+
+def parse_feed(text):
+    ch = etree.fromstring(text).xpath('/rss/channel')
+    title = ch[0].xpath('title')[0].text
+    items = ch[0].xpath('item')
+    return [title.encode('utf-8'), items]
+
+def crawl(extension):
+    number = 0
+    for id in ids:
+        print "ID: %s (%s of %s)" % (id, number, len(ids))
+        r = requests.get("http://politepol.com/feed/%s" % id)
+        text = r.text.encode('utf-8')
+        with open("tests/%s.%s" % (id, extension), 'w') as f:
+            f.write(text)
+        title, items = parse_feed(text)
+        print "Title: %s" % title
+        print "Items count: %s" % len(items)
+        number += 1
+
+def diff(ext1, ext2):
+    diff = []
+    number = 0
+    for id in ids:
+        print "ID: %s" % (id,)
+        text1 = None
+        with open("tests/%s.%s" % (id, ext1), 'r') as f:
+            text1 = f.read()
+        text2 = None
+        with open("tests/%s.%s" % (id, ext2), 'r') as f:
+            text2 = f.read()
+
+        if text1 == text2:
+            print "Identical"
+        else:
+            diff.append(id)
+            posts_diff = 0
+            with open("tests/%s.diff" % (id,), 'w') as f:
+                title1, items1 = parse_feed(text1)
+                title2, items2 = parse_feed(text2)
+                if title1 != title2:
+                    print "Different titles"
+                    f.write("<<<<<<<<<<<<<<< Different titles >>>>>>>>>>>>>>>\n")
+                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
+                    f.write(title1 + "\n")
+                    f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
+                    f.write(title2 + "\n")
+                posts1 = {}
+                posts2 = {}
+                if len(items1) != len(items2):
+                    print "Different post count: %s vs %s" % (len(items1), len(items2))
+                    f.write("<< Different posts count: %s.%s:%s vs %s.%s:%s >>\n" % (id, ext1, len(items1), id, ext2, len(items2)))
+                for post in items1:
+                    posts1[element_to_string(post)] = True
+                for post in items2:
+                    posts2[element_to_string(post)] = True
+
+                for post in items1:
+                    if not (element_to_string(post) in posts2):
+                        posts_diff += 1
+                        f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff)
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
+                        f.write(element_to_string(post) + "\n")
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
+                        f.write("*** Not found ***\n")
+
+                for post in items2:
+                    if not (element_to_string(post) in posts1):
+                        posts_diff += 1
+                        f.write("<<<<<<<<<<<<<<< Different posts (%s) >>>>>>>>>>>>>>>\n" % posts_diff)
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext1))
+                        f.write("*** Not found ***\n")
+                        f.write(">>>>>>>>>>>>>>> %s.%s <<<<<<<<<<<<<\n" % (id, ext2))
+                        f.write(element_to_string(post) + "\n")
+            print "Content of files %s.%s and %s.%s is different. Diff: %s.diff" % (id, ext1, id, ext2, id)
+            if posts_diff > 0:
+                print "Different feeds: %s" % posts_diff
+        number += 1
+    if len(diff) > 0:
+        print "Different feed ids: %s" % str(diff)
+
+print str(sys.argv)
+if len(sys.argv) == 1:
+    test1_get_inner_html()
+elif len(sys.argv) > 2:
+    if sys.argv[1] == 'crawl':
+        if len(sys.argv) == 3:
+            crawl(sys.argv[2])
+        else:
+            raise Exception("Invalid argument count for crawl")
+    elif sys.argv[1] == 'diff':
+        if len(sys.argv) == 4:
+            diff(sys.argv[2], sys.argv[3])
+        else:
+            raise Exception("Invalid argument count for diff")
+    else:
+        raise Exception("Unsupported operation %s" % sys.argv[1])
+else:
+    raise Exception("Invalid argument count")
+
+print 'All tests are OK'
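Usage note: the rewritten tests.py doubles as a snapshot-and-compare harness for the live feeds listed in ids. A hedged example session (assumes politepol.com is reachable and a tests/ directory already exists; the extension names "before" and "after" are arbitrary labels, not fixed by the script):

    python tests.py                    # no arguments: run the unit test only
    python tests.py crawl before       # snapshot every feed to tests/<id>.before
    python tests.py crawl after        # second snapshot, e.g. after a deploy
    python tests.py diff before after  # compare; mismatches land in tests/<id>.diff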