Mirror of https://github.com/taroved/pol

feed generator is working

Alexandr Nesterenko 2016-07-10 15:13:17 -07:00
parent 54e6dd0462
commit ce8432fb55
9 changed files with 240 additions and 15 deletions

downloader.py

@@ -13,7 +13,9 @@ from scrapy.http import Headers
 from scrapy.responsetypes import responsetypes
 from lxml import etree
+import re
+from feed import startFeedRequest

 def getPageFactory(url, contextFactory=None, *args, **kwargs):
     """
@@ -87,14 +89,15 @@ def downloadDone(response_str, request=None, page_factory=None, url=None):
     request.finish()

 def downloadError(error, request=None, page_factory=None):
-    #import pdb; pdb.set_trace()
     request.write('Downloader error: ' + error.value)
     request.finish()

-class Counter(resource.Resource):
+class Downloader(resource.Resource):
     isLeaf = True
+    feed_regexp = re.compile('^/feed/(\d+)$')

     def startRequest(self, request, url):
         page_factory = getPageFactory(url,
             headers={
@@ -118,18 +121,21 @@ class Counter(resource.Resource):
     def render_GET(self, request):
         '''
-        Render page for frontend
+        Render page for frontend or RSS feed
         '''
         if 'url' in request.args:
             url = request.args['url'][0]
             self.startRequest(request, url)
             return NOT_DONE_YET
+        elif self.feed_regexp.match(request.uri) is not None:
+            feed_id = self.feed_regexp.match(request.uri).groups()[0]
+            startFeedRequest(request, feed_id)
+            return NOT_DONE_YET
         else:
             return 'Url is required'

-endpoints.serverFromString(reactor, "tcp:1234").listen(server.Site(Counter()))
+endpoints.serverFromString(reactor, "tcp:1234").listen(server.Site(Downloader()))
 print 'Server starting at http://localhost:1234'
 reactor.run()
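With this change the same Twisted site serves both the page proxy and the generated feeds. A minimal smoke test of the two routes, assuming the server above is running locally and that a feed exists (the feed id and target URL below are invented):

import urllib2  # the module targets Python 2, as the print statement above shows

# page proxy route, unchanged:
html = urllib2.urlopen('http://localhost:1234/?url=http://example.com').read()

# new feed route, handled by startFeedRequest() in feed.py:
rss_xml = urllib2.urlopen('http://localhost:1234/feed/1').read()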

feed.py (new file, 134 lines)

@@ -0,0 +1,134 @@
from twisted.web import server, resource
from twisted.internet import reactor, endpoints
from twisted.web.client import HTTPClientFactory, _makeGetterFactory
from twisted.web.server import NOT_DONE_YET
from scrapy.http.response.text import TextResponse
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
from scrapy.selector import Selector
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from lxml import etree
from feedgenerator import Rss201rev2Feed, Enclosure
import datetime
import MySQLdb

from settings import DATABASES


def _getPageFactory(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Download a page. Return a deferred, which will callback with a
    page (as a string) or errback with a description of the error.

    See L{HTTPClientFactory} to see what extra arguments can be passed.
    """
    return _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)


def _buildScrapyResponse(page_factory, body):
    status = int(page_factory.status)
    headers = Headers(page_factory.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=page_factory.url)
    return respcls(url=page_factory.url, status=status, headers=headers, body=body)


def element_to_string(element):
    # serialize an element's inner markup: leading text, then each child
    # (etree.tostring includes the child's tail text)
    s = [element.text] if element.text else []
    for sub_element in element:
        s.append(etree.tostring(sub_element))
    if element.tail:  # tail may be None, which ''.join() would reject
        s.append(element.tail)
    return ''.join(s)
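For illustration (input invented, not part of the commit), element_to_string keeps an element's inner markup but drops the outer tag:

from lxml import etree

el = etree.fromstring('<div>Hello <b>world</b>!</div>')
element_to_string(el)  # -> 'Hello <b>world</b>!'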
def _buildFeed(response, feed_config):
    tree = response.selector._root.getroottree()

    # get data from html
    items = []
    for node in tree.xpath(feed_config['xpath']):
        item = {}
        for field_name in ['title', 'description']:
            if field_name in feed_config['fields']:
                element = node.xpath(feed_config['fields'][field_name])
                if element:
                    item[field_name] = element_to_string(element[0])
        items.append(item)

    # build feed
    feed = Rss201rev2Feed(
        title='Polite Pol: ' + feed_config['uri'],
        link=feed_config['uri'],
        description="Generated by PolitePol.com.\n"
                    "Url: " + feed_config['uri'],
        language="en",
    )
    for item in items:
        feed.add_item(
            title=item['title'] if 'title' in item else '',
            link=feed_config['uri'],
            description=item['description'] if 'description' in item else '',
            #enclosure=Enclosure(fields[4], "32000", "image/jpeg") if 4 in fields else None, #"Image"
            pubdate=datetime.datetime.now()
        )
    return feed.writeString('utf-8')
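The feed_config dict consumed here is assembled from the database by startFeedRequest below; a hypothetical instance (all values invented) looks like this:

feed_config = {
    'uri': 'http://example.com/news',        # source page, also used as the item link
    'xpath': "//div[@class='news-item']",    # one match per RSS item
    'fields': {                              # xpaths evaluated relative to each item node
        'title': './/h2',
        'description': ".//p[@class='summary']",
    },
}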
def _downloadDone(response_str, request=None, page_factory=None, feed_config=None):
    response = _buildScrapyResponse(page_factory, response_str)
    response = DecompressionMiddleware().process_response(None, response, None)

    if isinstance(response, TextResponse):
        response_str = _buildFeed(response, feed_config)
        request.setHeader(b"Content-Type", b'text/xml')

    request.write(response_str)
    request.finish()


def _downloadError(error, request=None, page_factory=None):
    request.write('Downloader error: ' + error.value)
    request.finish()


def startFeedRequest(request, feed_id):
    # fetch the feed's url and xpaths from the database
    creds = DATABASES['default']
    db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'],
                         passwd=creds['PASSWORD'], db=creds['NAME'])
    feed = {}
    with db:
        cur = db.cursor()
        cur.execute("""select f.uri, f.xpath, fi.name, ff.xpath from frontend_feed f
                       right join frontend_feedfield ff on ff.feed_id=f.id
                       left join frontend_field fi on fi.id=ff.field_id
                       where f.id=%s""", (feed_id,))
        rows = cur.fetchall()
        for row in rows:  # one row per configured field
            if not feed:
                feed['uri'] = row[0]
                feed['xpath'] = row[1]
                feed['fields'] = {}
            feed['fields'][row[2]] = row[3]

    if feed:
        page_factory = _getPageFactory(feed['uri'],
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36'
            },
            redirectLimit=5,
            timeout=10
        )
        d = page_factory.deferred
        d.addCallback(_downloadDone, request=request, page_factory=page_factory, feed_config=feed)
        d.addErrback(_downloadError, request=request, page_factory=page_factory)
    else:
        request.write('Feed generator error: config of feed is empty')
        request.finish()
    return
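The query yields one row per configured field, so for the hypothetical config shown earlier the cursor would return something like:

rows = (
    ('http://example.com/news', "//div[@class='news-item']", 'title', './/h2'),
    ('http://example.com/news', "//div[@class='news-item']", 'description', ".//p[@class='summary']"),
)

The loop takes the uri and item xpath from the first row and folds the (name, xpath) pairs into feed['fields'].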

frontend/frontend/settings.py

@@ -38,7 +38,9 @@ INSTALLED_APPS = (
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
+    'django.contrib.sites',
     'pipeline',
+    'frontend',
 )

 MIDDLEWARE_CLASSES = (
@@ -153,9 +155,11 @@ PIPELINE_JS = {
             'frontend/assets/js/bootstrap.js',
             'frontend/assets/js/bootstrap_and_overrides.js',
             'frontend/assets/js/setup-tool.js',
+            'frontend/assets/js/jquery.jfeed.js',
         ),
         'output_filename': 'frontend/js/app.js',
     }
 }

 DOWNLOADER_PAGE_URL = 'http://politepol.com/downloader?url='
+FEED_PAGE_URL = 'http://politepol.com/feed/'

frontend/assets/js/setup-tool.js

@@ -324,7 +324,6 @@ function requestSelection() {
                 reject(errMsg);
             }
         });
-        console.log(JSON.stringify(htmlJson));
     });
     else {
         return new Promise(function(resolve, reject){
@@ -418,8 +417,7 @@ function onCreateButtonClick() {
     if (active)
         //todo: freeze UI
         createFeed().then(function(feed_page_url){
-            alert(feed_page_url);
-            //window.location.href = feed_page_url;
+            window.location.href = feed_page_url;
         }, function(error){
             //todo: unfreez UI
             console.log('Server error: '+ error);
@@ -440,7 +438,6 @@ function createFeed() {
         url: "/setup_create_feed",
         data: JSON.stringify({ html: htmlJson, names: name_ids, url: $('#create').data('page-url') }),
         contentType: "application/json; charset=utf-8",
-        dataType: "json",
         headers: {"X-CSRFToken": getCookie('csrftoken')},
         success: function(data){
             resolve(data)
@@ -449,7 +446,6 @@ function createFeed() {
             reject(errMsg);
         }
     });
-    console.log(JSON.stringify(htmlJson));
 });
 else {
     return new Promise(function(resolve, reject){

frontend/templates/frontend/preview.html (new file, 73 lines)

@@ -0,0 +1,73 @@
{% extends "base.html" %}
{% load staticfiles %}
{% load i18n %}

{% block content %}
<div class="page-header">
    <h1>Feed is ready</h1>
</div>
<div class="media">
    <a class="pull-left" href="{{ feed_url }}">
        <img src="{% static 'frontend/images/rss-640.png' %}" width="64" height="64" class="media-object" />
    </a>
    <div class="media-body">
        <h4 class="media-heading">{% trans "Your feed" %} <a href="?" target="_blank">{% trans "What is RSS?" %}</a>:</h4>
        <h3>
            <a href="{{ feed_url }}" target="_blank">{{ feed_url }}</a>
        </h3>
    </div>

    <div id="preview" class="well" style="margin-bottom: 60px">
        {% trans "Loading" %}
    </div>

    <script type="text/javascript">
        function tryGetFeed() {
            $.getFeed({
                url: '{{ feed_url }}',
                success: function (feed) {
                    if ('items' in feed) {
                        $('#preview').empty()
                            .append($('<h3 style="margin-top: 0;"></h3>').append("{% trans 'Preview' %}: "));
                        /*.append($('<div></div>').append($('<a/>').attr('href', feed.link).attr('target', '_blank').text(feed.title)))*/
                        var html = '';
                        for (var i = 0; i < feed.items.length; i++) {
                            var item = feed.items[i];
                            $('#preview').append('<h4>'
                                + '<a href="'
                                + item.link
                                + '">'
                                + item.title
                                + '</a>'
                                + '</h4>');
                            $('#preview').append('<div>'
                                + item.description
                                + '</div>');
                        }
                    }
                    else {
                        // feed not ready yet: append a dot and poll again in 2s
                        $('#preview').text($('#preview').text().trim() + ' .');
                        setTimeout(tryGetFeed, 2000);
                    }
                },
                failure: function () {
                    $('#preview').text($('#preview').text().trim() + ' .');
                    setTimeout(tryGetFeed, 2000);
                },
                error: function () {
                    $('#preview').text($('#preview').text().trim() + ' .');
                    setTimeout(tryGetFeed, 2000);
                }
            });
        }
        tryGetFeed();
    </script>
</div>
{% endblock %}

frontend/frontend/urls.py

@@ -22,6 +22,7 @@ from . import views
 urlpatterns = i18n_patterns(
     url(r'^$', views.index, name='index'),
     url(r'^setup$', views.setup, name='setup'),
+    url(r'^preview/([0-9]+)$', views.preview, name='preview'),
     url(r'^admin/', include(admin.site.urls)),
 )

frontend/frontend/views.py

@@ -9,7 +9,7 @@ from django.core.exceptions import ValidationError
 from django.core.urlresolvers import reverse

 from .forms import IndexForm
-from .settings import DOWNLOADER_PAGE_URL
+from .settings import DOWNLOADER_PAGE_URL, FEED_PAGE_URL
 from .setup_tool import get_selection_tag_ids, build_xpathes_for_items
 from .models import Feed, Field, FeedField
@@ -37,8 +37,7 @@ def setup(request):
         return render(request, 'frontend/setup.html',
             {
                 'external_page_url': external_page_url,
-                'page_url': request.GET['url'],
-                'feed_page_url': reverse('setup_create_feed') # todo: replace with feedpage
+                'page_url': request.GET['url']
             })

     return HttpResponseBadRequest('Url is required')
@@ -75,7 +74,6 @@ def _create_feed(url, xpathes):
     feed_xpath = xpathes[0]
     item_xpathes = xpathes[1]

-    #import pdb; pdb.set_trace()
     feed = Feed(uri=url, xpath=feed_xpath)
     feed.save()
@@ -102,4 +100,15 @@ def setup_create_feed(request):
     xpathes = build_xpathes_for_items(item_names, html_json)
     feed_id = _create_feed(url, xpathes)

-    return HttpResponse(feed_id)
+    return HttpResponse(reverse('preview', args=(feed_id,)))
+
+
+def preview(request, feed_id):
+    if request.method == 'GET':
+        return render(request, 'frontend/preview.html',
+            {
+                'feed_url': FEED_PAGE_URL + feed_id,
+            })
+    return HttpResponseBadRequest('Only GET method supported')
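setup_create_feed now answers with the preview URL rather than the bare feed id, which is exactly what the window.location.href redirect in setup-tool.js consumes. Note that the URL pattern passes feed_id in as a string, so FEED_PAGE_URL + feed_id is plain string concatenation. The resulting round trip, sketched with an invented id:

# POST /setup_create_feed             -> response body '/preview/42'
# GET  /preview/42                    -> renders preview.html with feed_url = 'http://politepol.com/feed/42'
# GET  http://politepol.com/feed/42   -> RSS XML served by the Twisted Downloader (feed.py)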

requirements.txt

@@ -5,6 +5,7 @@ Scrapy==1.0.3
 django-pipeline==1.5.4
 mysqlclient==1.3.7
 w3lib==1.12.0
+feedgenerator==1.8

 #sudo apt-get install nodejs npm
 #npm install -g less
 #ln -s /usr/bin/nodejs /usr/bin/node

settings.py (new symbolic link)

@@ -0,0 +1 @@
+./frontend/frontend/settings.py