v/pol
Mirror of https://github.com/taroved/pol
Alexandr Nesterenko 2017-09-22 17:04:05 +03:00
parent 35c382553c
commit 6e5cb836cd
2 changed files with 12 additions and 9 deletions


@@ -4,7 +4,7 @@ import time, sys
 from hashlib import md5
 from datetime import datetime
-from twisted.logger import globalLogBeginner, formatEventAsClassicLogText
+from twisted.logger import globalLogBeginner, formatEventAsClassicLogText, Logger
 from twisted.web import server, resource
 from twisted.internet import reactor, endpoints, defer
 from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError, HTTPConnectionPool
@@ -52,6 +52,7 @@ def print_log(event):
 globalLogBeginner.beginLoggingTo([print_log], discardBuffer=True, redirectStandardIO=False) # required; discardBuffer gets rid of the LimitedHistoryLogObserver, redirectStandardIO would loop the print action
+log = Logger()
 if FEED_REQUEST_PERIOD_LIMIT:
     import redis
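
As context for the hunks above: twisted.logger routes structured events to observers registered once at startup. A minimal standalone sketch of the pattern this file uses (the print_log body is assumed from the @@ -52 header; the demo call under __main__ is illustrative):

import sys
from twisted.logger import Logger, globalLogBeginner, formatEventAsClassicLogText

def print_log(event):
    # Render each structured event in classic Twisted log text.
    text = formatEventAsClassicLogText(event)
    if text is not None:
        sys.stdout.write(text)

log = Logger()

if __name__ == '__main__':
    # discardBuffer drops the default LimitedHistoryLogObserver;
    # redirectStandardIO=False keeps print() from looping back into the log.
    globalLogBeginner.beginLoggingTo([print_log], discardBuffer=True,
                                     redirectStandardIO=False)
    log.info('monitor period: {p!r}s', p=3600)  # fields are formatted lazily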
@@ -189,9 +190,9 @@ def downloadDone(response_str, request, response, feed_config):
 from pympler import tracker
 import gc
 #sum = None
 tr = tracker.SummaryTracker()
-MON_PERIOD_SECONDS = 5#3 * 60 * 60 # 3 hours
+MON_PERIOD_SECONDS = 1 * 60 * 60 # 1 hour
 mon_time = None

 def mon(none):
     global mon_time
@@ -201,7 +202,8 @@ def mon(none):
     #pool.closeCachedConnections()
     gc.collect()
     global tr
-    tr.print_diff()
+    for line in tr.format_diff():
+        log.info(line)
     mon_time = tm

 def run_pgc():
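
The substantive change in these two hunks: pympler's print_diff() writes the memory summary straight to stdout, while format_diff() returns the same lines, so they can be pushed through the logger at a chosen level. A small sketch of the pattern, assuming pympler is installed; the allocation is a stand-in for real memory growth:

from pympler import tracker

tr = tracker.SummaryTracker()
grown = [{'i': i} for i in range(10000)]  # allocate something measurable
for line in tr.format_diff():             # same rows print_diff() would emit
    print('MEM ' + line)                  # stand-in for log.info(line)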

feed.py

@@ -11,8 +11,11 @@ import datetime
 import MySQLdb
 from contextlib import closing
 from settings import DATABASES, DOWNLOADER_USER_AGENT
+from twisted.logger import Logger
+
+log = Logger()

 url_hash_regexp = re.compile('(#.*)?$')

 POST_TIME_DISTANCE = 15 # minutes; RSS Feed Reader skips same titles created in a 10 min interval
@@ -23,15 +26,14 @@ def save_post(conn, created, feed_id, post_fields):
     with conn as cur:
         cur.execute("""insert into frontend_post (md5sum, created, feed_id)
             values (%s, %s, %s)""", (post_fields['md5'], created, feed_id))
-        print(cur._last_executed)
-        post_id = cur._last_executed
+        post_id = conn.insert_id()
         for key in ['title', 'description', 'title_link']:
             if key in post_fields:
                 #import pdb;pdb.set_trace()
                 cur.execute("""insert into frontend_postfield (field_id, post_id, `text`)
                     values (%s, %s, %s)""", (FIELD_IDS[key], post_id, post_fields[key].encode('utf-8')))
-                print(cur._last_executed)
+    log.info('Post saved id:{id!r}', id=post_id)

 def fill_time(feed_id, items):
     if not items:
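
Besides dropping the prints, this hunk appears to correct how the new row id is read back: cur._last_executed is the raw SQL string MySQLdb last sent (a debugging attribute), whereas conn.insert_id() returns the AUTO_INCREMENT id of the row just inserted. A hedged sketch of that pattern; host, credentials, and values are placeholders:

import MySQLdb

conn = MySQLdb.connect(host='localhost', user='pol', passwd='secret', db='pol')
cur = conn.cursor()
cur.execute("insert into frontend_post (md5sum, created, feed_id) values (%s, %s, %s)",
            ('d41d8cd98f00b204e9800998ecf8427e', '2017-09-22 17:04:05', 1))
post_id = conn.insert_id()  # server-side LAST_INSERT_ID(); cur.lastrowid would also work
conn.commit()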
@@ -55,7 +57,7 @@ def fill_time(feed_id, items):
             where p.md5sum in (%s)
             and p.feed_id=%s""" % (quoted_hashes, feed_id,))
         rows = cur.fetchall()
-        print(cur._last_executed)
+        log.debug('Selected {count!r} posts', count=len(rows))
         for row in rows:
             md5hash = row[0]
             created = row[1]
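
Note the shape of the replacement call: a twisted.logger format string keeps its fields as key/value pairs on the event and only renders them when an observer asks, so a filtered-out debug call stays cheap. A tiny illustration using formatEvent directly; the namespace is invented:

from twisted.logger import Logger, formatEvent

log = Logger(namespace='feed')
# An event is a plain dict; log_format is rendered on demand.
event = {'log_format': 'Selected {count!r} posts', 'count': 3}
print(formatEvent(event))  # -> Selected 3 posts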
@@ -99,7 +101,6 @@ def buildFeed(response, feed_config):
     tree = selector.root.getroottree()
     # get data from html
     items = []
-    #import pdb;pdb.set_trace()
     for node in selector.xpath(feed_config['xpath']):
         item = {}
         required_count = 0
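
For orientation, the loop this last hunk tidies walks user-configured XPath matches and builds one feed item per node. A rough sketch of that flow, assuming a parsel/Scrapy-style Selector; the HTML, config, and field XPaths are invented:

from parsel import Selector

html = '<div><p class="post"><a href="/a">First</a></p>' \
       '<p class="post"><a href="/b">Second</a></p></div>'
selector = Selector(text=html)
feed_config = {'xpath': '//p[@class="post"]'}  # hypothetical per-feed config

items = []
for node in selector.xpath(feed_config['xpath']):
    item = {
        'title': node.xpath('.//a/text()').get(),
        'title_link': node.xpath('.//a/@href').get(),
    }
    items.append(item)
print(items)  # [{'title': 'First', 'title_link': '/a'}, ...]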