v/pol
1
0
mirror of https://github.com/taroved/pol synced 2025-05-21 08:30:24 -07:00

title link fix

This commit is contained in:
Alexandr Nesterenko 2016-07-20 14:57:36 -07:00
parent 3f2c9f6624
commit 47156b6465

16
feed.py
View File

@ -10,6 +10,9 @@ from scrapy.selector import Selector
from scrapy.http import Headers from scrapy.http import Headers
from scrapy.responsetypes import responsetypes from scrapy.responsetypes import responsetypes
import w3lib.url
import w3lib.html
from lxml import etree from lxml import etree
from feedgenerator import Rss201rev2Feed, Enclosure from feedgenerator import Rss201rev2Feed, Enclosure
@ -46,9 +49,9 @@ def element_to_string(element):
s.append(element.tail) s.append(element.tail)
return ''.join(s) return ''.join(s)
def _build_link(doclink, link): def _build_link(html, doc_url, url):
# todo base_url = w3lib.html.get_base_url(html, doc_url)
return link return w3lib.url.urljoin_rfc(base_url, url)
def _buildFeed(response, feed_config): def _buildFeed(response, feed_config):
tree = response.selector._root.getroottree() tree = response.selector._root.getroottree()
@ -57,6 +60,7 @@ def _buildFeed(response, feed_config):
items = [] items = []
for node in tree.xpath(feed_config['xpath']): for node in tree.xpath(feed_config['xpath']):
item = {} item = {}
title_link = None
for field_name in ['title', 'description']: for field_name in ['title', 'description']:
if field_name in feed_config['fields']: if field_name in feed_config['fields']:
element = node.xpath(feed_config['fields'][field_name]) element = node.xpath(feed_config['fields'][field_name])
@ -66,10 +70,10 @@ def _buildFeed(response, feed_config):
if field_name == 'title': if field_name == 'title':
anchor = element[0].xpath('ancestor-or-self::node()[name()="a"]') anchor = element[0].xpath('ancestor-or-self::node()[name()="a"]')
if anchor and anchor[0].get('href'): if anchor and anchor[0].get('href'):
item['title_link'] = _build_link(feed_config['uri'], anchor[0].get('href')) title_link = _build_link(response.body_as_unicode(), feed_config['uri'], anchor[0].get('href'))
if len(item) == len(feed_config['fields']): # all fields are required if len(item) == len(feed_config['fields']): # all fields are required
item['title_link'] = title_link
items.append(item) items.append(item)
#build feed #build feed