Mirror of https://github.com/taroved/pol, synced 2025-05-16 06:10:09 -07:00.
skip certificate verification
This commit is contained in:
parent
ee82b45a22
commit
bb9dc3cc55
@ -17,6 +17,7 @@ from scrapy.selector import Selector
|
||||
|
||||
from scrapy.http import Headers
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
|
||||
|
||||
from lxml import etree
|
||||
import re
|
||||
@ -42,7 +43,12 @@ def check_feed_request_time_limit(url):
|
||||
r.set(url, int(time.time()))
|
||||
return 0
|
||||
|
||||
agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)
|
||||
agent = BrowserLikeRedirectAgent(
|
||||
Agent(reactor,
|
||||
contextFactory=ScrapyClientContextFactory(), # skip certificate verification
|
||||
connectTimeout=10),
|
||||
redirectLimit=5
|
||||
)
|
||||
|
||||
def html2json(el):
|
||||
return [
|
||||
|
11
tests.py
11
tests.py
@ -34,6 +34,13 @@ def test1_get_inner_html():
|
||||
ids = [1,2,3,5,6,8,44,54,99,100,101,103,113,118,120,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,249,250,251,252,253,255,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410] # 254 timeout 344 pp gatevway timeout
|
||||
domain = "politepol.com"
|
||||
|
||||
def parse_feed0(text):
    """Parse a raw RSS document and pull out the channel's basic parts.

    Expects `text` to be an XML byte string with an /rss/channel root path
    (parsed via lxml.etree, imported at module level).

    Returns a 3-element list: [channel title, channel link, list of item
    elements]. Raises IndexError if the document has no /rss/channel.
    """
    channel = etree.fromstring(text).xpath('/rss/channel')[0]
    return [
        channel.xpath('title')[0].text,
        channel.xpath('link')[0].text,
        channel.xpath('item'),
    ]
|
||||
|
||||
def parse_feed(text):
|
||||
ch = etree.fromstring(text.encode('utf-8')).xpath('/rss/channel')
|
||||
title = ch[0].xpath('title')[0].text
|
||||
@ -50,7 +57,7 @@ def crawl(extention):
|
||||
text = r.text.encode('utf-8')
|
||||
with open("tests/%s.%s" % (id, extention), 'w') as f:
|
||||
f.write(text)
|
||||
title, link, items = parse_feed(text)
|
||||
title, link, items = parse_feed0(text)
|
||||
print "Title: %s" % title
|
||||
print "Link: %s" % link
|
||||
print "Items count: %s" % len(items)
|
||||
@ -117,6 +124,8 @@ def diff(ext1, ext2, fields):
|
||||
number += 1
|
||||
if diff > 0:
|
||||
print "Different feed ids: %s" % str(diff)
|
||||
|
||||
print "Example of usage: python tests.py crawl before politepol.com"
|
||||
print str(sys.argv)
|
||||
if len(sys.argv) == 1:
|
||||
test1_get_inner_html()
|
||||
|
Loading…
x
Reference in New Issue
Block a user