diff --git a/downloader.py b/downloader.py
index da94699..389e3e7 100644
--- a/downloader.py
+++ b/downloader.py
@@ -13,7 +13,9 @@
 from scrapy.http import Headers
 from scrapy.responsetypes import responsetypes
 
 from lxml import etree
+import re
+from feed import startFeedRequest
 
 def getPageFactory(url, contextFactory=None, *args, **kwargs):
     """
@@ -87,14 +89,15 @@
     request.finish()
 
 
 def downloadError(error, request=None, page_factory=None):
-    #import pdb; pdb.set_trace()
     request.write('Downloader error: ' + error.value)
     request.finish()
 
-class Counter(resource.Resource):
+class Downloader(resource.Resource):
     isLeaf = True
 
+    feed_regexp = re.compile('^/feed/(\d+)$')
+
     def startRequest(self, request, url):
         page_factory = getPageFactory(url,
             headers={
@@ -118,18 +121,21 @@
     def render_GET(self, request):
         '''
-        Render page for frontend
+        Render page for frontend or RSS feed
         '''
         if 'url' in request.args:
             url = request.args['url'][0]
             self.startRequest(request, url)
             return NOT_DONE_YET
+        elif self.feed_regexp.match(request.uri) is not None:
+            feed_id = self.feed_regexp.match(request.uri).groups()[0]
+            startFeedRequest(request, feed_id)
+            return NOT_DONE_YET
         else:
             return 'Url is required'
 
 
-
-endpoints.serverFromString(reactor, "tcp:1234").listen(server.Site(Counter()))
+endpoints.serverFromString(reactor, "tcp:1234").listen(server.Site(Downloader()))
 
 
 print 'Server starting at http://localhost:1234'
 reactor.run()
diff --git a/feed.py b/feed.py
new file mode 100644
index 0000000..f831d57
--- /dev/null
+++ b/feed.py
@@ -0,0 +1,134 @@
+from twisted.web import server, resource
+from twisted.internet import reactor, endpoints
+from twisted.web.client import HTTPClientFactory, _makeGetterFactory
+from twisted.web.server import NOT_DONE_YET
+
+from scrapy.http.response.text import TextResponse
+from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
+from scrapy.selector import Selector
+
+from scrapy.http import Headers
+from scrapy.responsetypes import responsetypes
+
+from lxml import etree
+
+from feedgenerator import Rss201rev2Feed, Enclosure
+import datetime
+
+import MySQLdb
+from settings import DATABASES
+
+def _getPageFactory(url, contextFactory=None, *args, **kwargs):
+    """
+    Download a web page as a string.
+
+    Download a page. Return a deferred, which will callback with a
+    page (as a string) or errback with a description of the error.
+
+    See L{HTTPClientFactory} to see what extra arguments can be passed.
+ """ + return _makeGetterFactory( + url, + HTTPClientFactory, + contextFactory=contextFactory, + *args, **kwargs) + +def _buildScrapyResponse(page_factory, body): + status = int(page_factory.status) + headers = Headers(page_factory.response_headers) + respcls = responsetypes.from_args(headers=headers, url=page_factory.url) + return respcls(url=page_factory.url, status=status, headers=headers, body=body) + +def element_to_string(element): + s = [element.text] if element.text else [] + for sub_element in element: + s.append(etree.tostring(sub_element)) + s.append(element.tail) + return ''.join(s) + +def _buildFeed(response, feed_config): + tree = response.selector._root.getroottree() + + # get data from html + items = [] + for node in tree.xpath(feed_config['xpath']): + item = {} + for field_name in ['title', 'description']: + if field_name in feed_config['fields']: + element = node.xpath(feed_config['fields'][field_name]) + if element: + item[field_name] = element_to_string(element[0]) + items.append(item) + + #import pdb; pdb.set_trace() + #build feed + feed = Rss201rev2Feed( + title='Polite Pol: ' + feed_config['uri'], + link=feed_config['uri'], + description="Generated by PolitePol.com.\n"+\ + "Url: " + feed_config['uri'], + language="en", + ) + + for item in items: + feed.add_item( + title=item['title'] if 'title' in item else '', + link=feed_config['uri'], + description=item['description'] if 'description' in item else '', + #enclosure=Enclosure(fields[4], "32000", "image/jpeg") if 4 in fields else None, #"Image" + pubdate=datetime.datetime.now() + ) + return feed.writeString('utf-8') + +def _downloadDone(response_str, request=None, page_factory=None, feed_config=None): + response = _buildScrapyResponse(page_factory, response_str) + + response = DecompressionMiddleware().process_response(None, response, None) + + if (isinstance(response, TextResponse)): + response_str = _buildFeed(response, feed_config) + + request.setHeader(b"Content-Type", b'text/xml') + request.write(response_str) + request.finish() + +def _downloadError(error, request=None, page_factory=None): + request.write('Downloader error: ' + error.value) + request.finish() + +def startFeedRequest(request, feed_id): + # get url, xpathes + creds = DATABASES['default'] + db = MySQLdb.connect(host=creds['HOST'], port=int(creds['PORT']), user=creds['USER'], passwd=creds['PASSWORD'], db=creds['NAME']) + feed = {} + with db: + cur = db.cursor() + cur.execute("""select f.uri, f.xpath, fi.name, ff.xpath from frontend_feed f + right join frontend_feedfield ff on ff.feed_id=f.id + left join frontend_field fi on fi.id=ff.field_id + where f.id=%s""", (feed_id,)) + rows = cur.fetchall() + + for row in rows: + if not feed: + feed['uri'] = row[0] + feed['xpath'] = row[1] + feed['fields'] = {} + feed['fields'][row[2]] = row[3] + + if feed: + page_factory = _getPageFactory(feed['uri'], + headers={ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, sdch', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36' + }, + redirectLimit=5, + timeout=10 + ) + d = page_factory.deferred + d.addCallback(_downloadDone, request=request, page_factory=page_factory, feed_config=feed) + d.addErrback(_downloadError, request=request, page_factory=page_factory) + else: + Request.write('Feed generator error: config of feed is empty') + request.finish() + return diff --git a/frontend/frontend/settings.py.example 
index 05c0c4c..efa452f 100644
--- a/frontend/frontend/settings.py.example
+++ b/frontend/frontend/settings.py.example
@@ -38,7 +38,9 @@ INSTALLED_APPS = (
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
+    'django.contrib.sites',
     'pipeline',
+    'frontend',
 )
 
 MIDDLEWARE_CLASSES = (
@@ -153,9 +155,11 @@
             'frontend/assets/js/bootstrap.js',
             'frontend/assets/js/bootstrap_and_overrides.js',
             'frontend/assets/js/setup-tool.js',
+            'frontend/assets/js/jquery.jfeed.js',
         ),
         'output_filename': 'frontend/js/app.js',
     }
 }
 
 DOWNLOADER_PAGE_URL = 'http://politepol.com/downloader?url='
+FEED_PAGE_URL = 'http://politepol.com/feed/'
diff --git a/frontend/frontend/static/frontend/assets/js/setup-tool.js b/frontend/frontend/static/frontend/assets/js/setup-tool.js
index 0c6c394..197a164 100644
--- a/frontend/frontend/static/frontend/assets/js/setup-tool.js
+++ b/frontend/frontend/static/frontend/assets/js/setup-tool.js
@@ -324,7 +324,6 @@ function requestSelection() {
        reject(errMsg);
      }
    });
-    console.log(JSON.stringify(htmlJson));
  });
 else {
   return new Promise(function(resolve, reject){
@@ -418,8 +417,7 @@ function onCreateButtonClick() {
  if (active)
    //todo: freeze UI
    createFeed().then(function(feed_page_url){
-      alert(feed_page_url);
-      //window.location.href = feed_page_url;
+      window.location.href = feed_page_url;
    }, function(error){
      //todo: unfreez UI
      console.log('Server error: '+ error);
@@ -440,7 +438,6 @@ function createFeed() {
      url: "/setup_create_feed",
      data: JSON.stringify({ html: htmlJson, names: name_ids, url:$('#create').data('page-url') }),
      contentType: "application/json; charset=utf-8",
-      dataType: "json",
      headers: {"X-CSRFToken": getCookie('csrftoken')},
      success: function(data){
        resolve(data)
@@ -449,7 +446,6 @@ function createFeed() {
        reject(errMsg);
      }
    });
-    console.log(JSON.stringify(htmlJson));
  });
 else {
   return new Promise(function(resolve, reject){
diff --git a/frontend/frontend/templates/frontend/preview.html b/frontend/frontend/templates/frontend/preview.html
new file mode 100644
index 0000000..142266c
--- /dev/null
+++ b/frontend/frontend/templates/frontend/preview.html
@@ -0,0 +1,73 @@
+{% extends "base.html" %}
+{% load staticfiles %}
+{% load i18n %}
+
+{% block content %}
+
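
A quick way to exercise the new /feed/<id> route (a sketch, not part of the patch: it assumes the Twisted server from downloader.py is running locally on port 1234 and that a feed with id 1 already exists in the frontend_feed table):

    # Hypothetical smoke test (Python 2, to match the codebase); not shipped code.
    import urllib2

    # startFeedRequest() assembles a config shaped like
    #   {'uri': <page url>, 'xpath': <item node xpath>,
    #    'fields': {'title': <xpath>, 'description': <xpath>}}
    # from the frontend_feed tables and answers /feed/<id> with RSS XML.
    body = urllib2.urlopen('http://localhost:1234/feed/1', timeout=15).read()
    assert body.lstrip().startswith('<?xml'), 'expected an RSS/XML response'
    print body[:200]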