mirror of https://github.com/taroved/pol synced 2025-05-16 14:20:10 -07:00

skip certificate verification

Alexandr Nesterenko 2017-08-10 21:11:46 +00:00
parent ee82b45a22
commit bb9dc3cc55
2 changed files with 17 additions and 2 deletions

File 1 of 2:

@@ -17,6 +17,7 @@ from scrapy.selector import Selector
 from scrapy.http import Headers
 from scrapy.responsetypes import responsetypes
+from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
 from lxml import etree
 import re
@@ -42,7 +43,12 @@ def check_feed_request_time_limit(url):
     r.set(url, int(time.time()))
     return 0
 
-agent = BrowserLikeRedirectAgent(Agent(reactor, connectTimeout=10), redirectLimit=5)
+agent = BrowserLikeRedirectAgent(
+    Agent(reactor,
+        contextFactory=ScrapyClientContextFactory(), # skip certificate verification
+        connectTimeout=10),
+    redirectLimit=5
+)
 
 def html2json(el):
     return [
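
The effect of this change is that HTTPS requests made through the agent now use Scrapy's permissive ScrapyClientContextFactory, which does not verify the server certificate. Below is a minimal, self-contained sketch of the same pattern; the URL and User-Agent value are made up for illustration, and it assumes the Python 2 / Twisted / Scrapy versions this repository targets (where Agent still accepts a contextFactory keyword):

# Sketch: fetch an HTTPS page through the patched agent; a self-signed or
# expired certificate no longer aborts the request. The URL is hypothetical.
from twisted.internet import reactor
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.http_headers import Headers
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory

agent = BrowserLikeRedirectAgent(
    Agent(reactor,
        contextFactory=ScrapyClientContextFactory(),  # no certificate verification
        connectTimeout=10),
    redirectLimit=5)

def on_body(body):
    print "fetched %d bytes" % len(body)
    reactor.stop()

d = agent.request('GET', 'https://self-signed.example.org/feed',
                  Headers({'User-Agent': ['politepol-test']}), None)
d.addCallback(readBody)  # read the full response body once headers arrive
d.addCallbacks(on_body, lambda err: (err.printTraceback(), reactor.stop()))
reactor.run()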

File 2 of 2:

@@ -34,6 +34,13 @@ def test1_get_inner_html():
 ids = [1,2,3,5,6,8,44,54,99,100,101,103,113,118,120,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,249,250,251,252,253,255,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410] # 254 timeout 344 pp gatevway timeout
 domain = "politepol.com"
 
+def parse_feed0(text):
+    ch = etree.fromstring(text).xpath('/rss/channel')
+    title = ch[0].xpath('title')[0].text
+    link = ch[0].xpath('link')[0].text
+    items = ch[0].xpath('item')
+    return [title, link, items]
+
 def parse_feed(text):
     ch = etree.fromstring(text.encode('utf-8')).xpath('/rss/channel')
     title = ch[0].xpath('title')[0].text
@@ -50,7 +57,7 @@ def crawl(extention):
         text = r.text.encode('utf-8')
         with open("tests/%s.%s" % (id, extention), 'w') as f:
             f.write(text)
-        title, link, items = parse_feed(text)
+        title, link, items = parse_feed0(text)
         print "Title: %s" % title
         print "Link: %s" % link
         print "Items count: %s" % len(items)
@@ -117,6 +124,8 @@ def diff(ext1, ext2, fields):
             number += 1
     if diff > 0:
         print "Different feed ids: %s" % str(diff)
+
+print "Example of usage: python tests.py crawl before politepol.com"
 print str(sys.argv)
 if len(sys.argv) == 1:
     test1_get_inner_html()