diff --git a/downloader.py b/downloader.py
index f29dac8..58ce41e 100644
--- a/downloader.py
+++ b/downloader.py
@@ -136,8 +136,8 @@ def downloadError(error, request=None):
         request.write('Traceback: ' + error.getTraceback())
     else:
         request.write('Something wrong. Geek comment: ' + error.getErrorMessage())
-    sys.stderr.write(datetime.datetime.now())
-    sys.stderr.write('\n'.join('Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()))
+    sys.stderr.write(str(datetime.now()))
+    sys.stderr.write('\n'.join(['Downloader error: ' + error.getErrorMessage(), 'Traceback: ' + error.getTraceback()]))
 
     request.finish()
 
diff --git a/feed.py b/feed.py
index 264d19b..0a9a2ff 100644
--- a/feed.py
+++ b/feed.py
@@ -13,6 +13,67 @@ from settings import DATABASES, DOWNLOADER_USER_AGENT
 
 url_hash_regexp = re.compile('(#.*)?$')
 
+POST_TIME_DISTANCE = 15 # minutes
+
+FIELD_IDS = {'title': 1, 'description': 2, 'title_link': 3}
+
+def save_post(conn, md5sum, created, feed_id, post_fields):
+    cur = conn.cursor()
+    try:
+        #import pdb;pdb.set_trace()
+        cur.execute("""insert into frontend_post (md5sum, created, feed_id)
+                       values (%s, %s, %s)""", (md5sum.hexdigest(), created, feed_id))
+    finally:
+        print(cur._last_executed)
+    post_id = conn.insert_id()
+    for key in ['title', 'description', 'title_link']:
+        if key in post_fields:
+            try:
+                cur.execute("""insert into frontend_postfield (field_id, post_id, `text`)
+                               values (%s, %s, %s)""", (FIELD_IDS[key], post_id, post_fields[key].encode('utf-8')))
+            finally:
+                print(cur._last_executed)
+
+def fill_dates(feed_id, items):
+    if not items:
+        return []
+    hashes = {}
+    for item in items:
+        #create md5
+        h = md5('')
+        for key in ['title', 'description', 'title_link']:
+            if key in item:
+                h.update(item[key].encode('utf-8'))
+        hashes[h] = item
+
+    #fetch dates from db
+    fetched_dates = {}
+    db = get_conn()
+    with db:
+        quoted_hashes = ','.join(["'%s'" % (h.hexdigest()) for h in hashes])
+
+        cur = db.cursor()
+        cur.execute("""select p.md5sum, p.created, p.id
+                       from frontend_post p
+                       where p.md5sum in (%s)
+                       and p.id=%s""" % (quoted_hashes, feed_id,))
+        rows = cur.fetchall()
+        print(cur._last_executed)
+        for row in rows:
+            md5hash = row[0]
+            created = row[1]
+            post_id = row[2]
+            fetched_dates[md5hash] = datetime.datetime.fromtimestamp(int(created))
+    cur_time = datetime.datetime.now()
+    new_posts = []
+    for h in hashes:
+        if h in fetched_dates:
+            hashes[h]['time'] = fetched_dates[h]
+        else:
+            hashes[h]['time'] = cur_time
+            save_post(db, h, cur_time, feed_id, hashes[h])
+            cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
+
 def element_to_string(element):
     s = [element.text] if element.text else []
     for sub_element in element:
@@ -42,11 +103,12 @@ def buildFeed(response, feed_config):
                     anchor = element[0].xpath('ancestor-or-self::node()[name()="a"]')
                     if anchor and anchor[0].get('href'):
                         title_link = _build_link(response.body_as_unicode(), feed_config['uri'], anchor[0].get('href'))
-            
+
             if len(item) == len(feed_config['fields']): # all fields are required
-                item['title_link'] = title_link
+                if title_link:
+                    item['title_link'] = title_link
                 items.append(item)
-    
+
     title = response.selector.xpath('//title/text()').extract()
 
     #build feed
@@ -57,10 +119,13 @@ def buildFeed(response, feed_config):
         "Source page url: " + feed_config['uri'],
         language="en",
         )
+
+    fill_dates(feed_config['id'], items)
+
     for item in items:
         title = item['title'] if 'title' in item else ''
         desc = item['description'] if 'description' in item else ''
-        if item['title_link']:
+        if 'title_link' in item:
             link = item['title_link']
         else:
             link = url_hash_regexp.sub('#' + md5((title+desc).encode('utf-8')).hexdigest(), feed_config['uri'])
@@ -75,9 +140,8 @@ def buildFeed(response, feed_config):
 
 def getFeedData(request, feed_id):
     # get url, xpathes
-    creds = DATABASES['default']
-    db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'], passwd=creds['PASSWORD'], db=creds['NAME'])
     feed = {}
+    db = get_conn()
     with db:
         cur = db.cursor()
         cur.execute("""select f.uri, f.xpath, fi.name, ff.xpath from frontend_feed f
@@ -88,6 +152,7 @@ def getFeedData(request, feed_id):
 
         for row in rows:
             if not feed:
+                feed['id'] = feed_id
                 feed['uri'] = row[0]
                 feed['xpath'] = row[1]
                 feed['fields'] = {}
@@ -97,3 +162,9 @@ def getFeedData(request, feed_id):
         return [feed['uri'], feed]
     else:
         return 'Feed generator error: config of feed is empty'
+
+def get_conn():
+    creds = DATABASES['default']
+    db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'], passwd=creds['PASSWORD'], db=creds['NAME'])
+    db.autocommit(True)
+    return db
diff --git a/frontend/frontend/fixtures/fields.json b/frontend/frontend/fixtures/fields.json
index 8b7fefe..e54b7b0 100644
--- a/frontend/frontend/fixtures/fields.json
+++ b/frontend/frontend/fixtures/fields.json
@@ -12,5 +12,12 @@
     },
     "model": "frontend.field",
     "pk": 2
+},
+{
+    "fields": {
+        "name": "link"
+    },
+    "model": "frontend.field",
+    "pk": 3
 }
 ]
diff --git a/frontend/frontend/migrations/0002_auto_20170711_2119.py b/frontend/frontend/migrations/0002_auto_20170711_2119.py
new file mode 100644
index 0000000..ac96343
--- /dev/null
+++ b/frontend/frontend/migrations/0002_auto_20170711_2119.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import datetime
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('frontend', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Post',
+            fields=[
+                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
+                ('md5sum', models.CharField(max_length=32)),
+                ('created', models.DateTimeField(auto_now_add=True)),
+            ],
+        ),
+        migrations.CreateModel(
+            name='PostField',
+            fields=[
+                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
+                ('text', models.TextField(max_length=65535)),
+                ('field', models.ForeignKey(to='frontend.Field')),
+                ('post', models.ForeignKey(to='frontend.Post')),
+            ],
+        ),
+        migrations.AddField(
+            model_name='feed',
+            name='created',
+            field=models.DateTimeField(default=datetime.datetime(2017, 7, 11, 21, 19, 23, 580569), auto_now_add=True),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+            model_name='post',
+            name='feed',
+            field=models.ForeignKey(to='frontend.Feed'),
+        ),
+        migrations.AlterIndexTogether(
+            name='post',
+            index_together=set([('feed', 'md5sum')]),
+        ),
+    ]
diff --git a/frontend/frontend/models.py b/frontend/frontend/models.py
index ecaabbd..d3cf173 100644
--- a/frontend/frontend/models.py
+++ b/frontend/frontend/models.py
@@ -1,9 +1,9 @@
 from django.db import models
 
-
 class Feed(models.Model):
     uri = models.CharField(max_length=2000)
     xpath = models.CharField(max_length=2000)
+    created = models.DateTimeField(auto_now_add=True)
 
 class Field(models.Model):
     name = models.CharField(max_length=200)
@@ -12,3 +12,16 @@ class FeedField(models.Model):
     feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
     field = models.ForeignKey(Field, on_delete=models.CASCADE)
     xpath = models.CharField(max_length=2000)
+
+class Post(models.Model):
+    feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
+    md5sum = models.CharField(max_length=32)
+    created = models.DateTimeField(auto_now_add=True)
+
+    class Meta:
+        index_together = ['feed', 'md5sum']
+
+class PostField(models.Model):
+    post = models.ForeignKey(Post, on_delete=models.CASCADE)
+    field = models.ForeignKey(Field, on_delete=models.CASCADE)
+    text = models.TextField(max_length=64*1024-1)
diff --git a/requirements.txt b/requirements.txt
index 48e7599..2cd0b04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ Django==1.8.6
 lxml==3.3.3
 Scrapy==1.0.3
 django-pipeline==1.5.4
+django-unixtimestampfield==0.3.9
 mysqlclient==1.3.7
 w3lib==1.12.0
 feedgenerator==1.8
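Note on the intent of the new fill_dates flow (a sketch, not part of the diff): items whose MD5 hash of title/description/title_link is already stored in frontend_post keep their stored created time, while unseen items are saved and assigned synthetic timestamps counting back from "now" in POST_TIME_DISTANCE-minute steps, so repeated scrapes of an undated page yield stable, ordered pubDates. The snippet below illustrates only that date-assignment logic, with an in-memory dict standing in for the database; assign_dates and seen_dates are hypothetical names introduced for this illustration and do not exist in the repository.

    import datetime
    from hashlib import md5

    POST_TIME_DISTANCE = 15  # minutes between synthetic dates, mirroring feed.py

    def assign_dates(items, seen_dates):
        # Reuse the stored date for items seen before; otherwise assign
        # synthetic dates walking back from "now" in POST_TIME_DISTANCE steps.
        cur_time = datetime.datetime.now()
        for item in items:
            h = md5()
            for key in ('title', 'description', 'title_link'):
                if key in item:
                    h.update(item[key].encode('utf-8'))
            digest = h.hexdigest()
            if digest in seen_dates:
                item['time'] = seen_dates[digest]   # previously stored post keeps its date
            else:
                item['time'] = cur_time             # new post gets a synthetic date
                seen_dates[digest] = cur_time       # feed.py would INSERT into frontend_post here
                cur_time -= datetime.timedelta(minutes=POST_TIME_DISTANCE)
        return items

    # Example: two unseen items end up 15 minutes apart; a second run reuses the stored dates.
    store = {}
    first = assign_dates([{'title': u'a'}, {'title': u'b'}], store)
    again = assign_dates([{'title': u'a'}, {'title': u'b'}], store)
    assert first[0]['time'] == again[0]['time']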