v/pol (mirror of https://github.com/taroved/pol)
commit 6e5cb836cd (parent 35c382553c)
Author: Alexandr Nesterenko
Date: 2017-09-22 17:04:05 +03:00
2 changed files with 12 additions and 9 deletions


@@ -4,7 +4,7 @@ import time, sys
 from hashlib import md5
 from datetime import datetime
-from twisted.logger import globalLogBeginner, formatEventAsClassicLogText
+from twisted.logger import globalLogBeginner, formatEventAsClassicLogText, Logger
 from twisted.web import server, resource
 from twisted.internet import reactor, endpoints, defer
 from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody, PartialDownloadError, HTTPConnectionPool
@@ -52,6 +52,7 @@ def print_log(event):
 globalLogBeginner.beginLoggingTo([print_log], discardBuffer=True, redirectStandardIO=False) # required; discardBuffer gets rid of the LimitedHistoryLogObserver, redirectStandardIO would loop the print action
+log = Logger()

 if FEED_REQUEST_PERIOD_LIMIT:
     import redis
@@ -189,9 +190,9 @@ def downloadDone(response_str, request, response, feed_config):
 from pympler import tracker
 import gc
-#sum = None
 tr = tracker.SummaryTracker()
-MON_PERIOD_SECONDS = 5#3 * 60 * 60 # 3 hours
+MON_PERIOD_SECONDS = 1 * 60 * 60 # 1 hour
 mon_time = None

 def mon(none):
     global mon_time
@@ -201,7 +202,8 @@ def mon(none):
         #pool.closeCachedConnections()
         gc.collect()
         global tr
-        tr.print_diff()
+        for line in tr.format_diff():
+            log.info(line)
         mon_time = tm

 def run_pgc():
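
The change to mon() above swaps pympler's print_diff() for format_diff(), so each line of the allocation summary goes through twisted.logger instead of raw stdout. A minimal standalone sketch of that pattern, assuming pympler and Twisted are installed (the helper name log_memory_diff is illustrative; the reactor scheduling and run_pgc() wiring are omitted):

    import gc

    from pympler import tracker
    from twisted.logger import Logger

    log = Logger()
    tr = tracker.SummaryTracker()

    def log_memory_diff():
        # Hypothetical helper mirroring the mon() change above.
        gc.collect()  # collect garbage first so the diff reflects real growth
        # format_diff() yields the same lines print_diff() writes to stdout,
        # so they can be routed through the registered log observers.
        for line in tr.format_diff():
            log.info(line)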

feed.py

@@ -11,8 +11,11 @@ import datetime
 import MySQLdb
 from contextlib import closing
 from settings import DATABASES, DOWNLOADER_USER_AGENT
+from twisted.logger import Logger
+
+log = Logger()

 url_hash_regexp = re.compile('(#.*)?$')

 POST_TIME_DISTANCE = 15 # minutes, RSS Feed Reader skips same titles created within a 10 min interval
@@ -23,15 +26,14 @@ def save_post(conn, created, feed_id, post_fields):
     with conn as cur:
         cur.execute("""insert into frontend_post (md5sum, created, feed_id)
                     values (%s, %s, %s)""", (post_fields['md5'], created, feed_id))
-        print(cur._last_executed)
+        post_id = cur._last_executed
         post_id = conn.insert_id()

         for key in ['title', 'description', 'title_link']:
             if key in post_fields:
-                #import pdb;pdb.set_trace()
                 cur.execute("""insert into frontend_postfield (field_id, post_id, `text`)
                             values (%s, %s, %s)""", (FIELD_IDS[key], post_id, post_fields[key].encode('utf-8')))
-                print(cur._last_executed)
+    log.info('Post saved id:{id!r}', id=post_id)
def fill_time(feed_id, items): def fill_time(feed_id, items):
if not items: if not items:
@@ -55,7 +57,7 @@ def fill_time(feed_id, items):
                 where p.md5sum in (%s)
                 and p.feed_id=%s""" % (quoted_hashes, feed_id,))
         rows = cur.fetchall()
-        print(cur._last_executed)
+        log.debug('Selected {count!r} posts', count=len(rows))
         for row in rows:
             md5hash = row[0]
             created = row[1]
@@ -99,7 +101,6 @@ def buildFeed(response, feed_config):
     tree = selector.root.getroottree()
     # get data from html
     items = []
-    #import pdb;pdb.set_trace()
     for node in selector.xpath(feed_config['xpath']):
         item = {}
         required_count = 0
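
The new log calls in feed.py use twisted.logger's structured events: the message is a PEP-3101-style format string, and fields such as {id!r} are filled from the call's keyword arguments only when an observer formats the event. A minimal sketch of that behavior, assuming Twisted is installed (print_observer and the id value are illustrative):

    from twisted.logger import Logger, formatEvent, globalLogBeginner

    log = Logger()

    def print_observer(event):
        # formatEvent() renders '{id!r}'-style fields from the event's kwargs.
        print(formatEvent(event))

    globalLogBeginner.beginLoggingTo(
        [print_observer], discardBuffer=True, redirectStandardIO=False)

    log.info('Post saved id:{id!r}', id=42)  # prints: Post saved id:42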