diff --git a/downloader.py b/downloader.py index 6c06e64..a62a49a 100644 --- a/downloader.py +++ b/downloader.py @@ -12,9 +12,11 @@ from twisted.web.html import escape twisted_headers = Headers from scrapy.http.response.text import TextResponse +from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware from scrapy.selector import Selector +from scrapy.http.request import Request from scrapy.http import Headers from scrapy.responsetypes import responsetypes from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory @@ -127,6 +129,9 @@ def buildScrapyResponse(response, body, url): respcls = responsetypes.from_args(headers=headers, url=url) return respcls(url=url, status=status, headers=headers, body=body) +def buildScrapyRequest(url): + return Request(url) + def downloadStarted(response, request, url, feed_config): d = readBody(response) d.addCallback(downloadDone, request=request, response=response, feed_config=feed_config) @@ -139,6 +144,7 @@ def downloadDone(response_str, request, response, feed_config): print 'Response <%s> ready (%s bytes)' % (url, len(response_str)) response = buildScrapyResponse(response, response_str, url) + response = HttpCompressionMiddleware().process_response(Request(url), response, None) response = DecompressionMiddleware().process_response(None, response, None) if (isinstance(response, TextResponse)): diff --git a/requirements.txt b/requirements.txt index d95f09f..a3c13a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ django-pipeline==1.5.4 mysqlclient==1.3.7 w3lib==1.12.0 feedgenerator==1.8 +brotli==0.6.0 #sudo apt-get install nodejs npm #sudo npm install -g less #sudo ln -s /usr/bin/nodejs /usr/bin/node diff --git a/tests.py b/tests.py index 14785c8..959e382 100644 --- a/tests.py +++ b/tests.py @@ -30,8 +30,7 @@ def element_to_string(element, fields=None): def test1_get_inner_html(): root = 
etree.fromstring('12345') assert element_to_unicode(root, 'utf-8') == u'12345' - -ids = [1,2,3,5,6,8,44,54,99,100,101,103,113,118,120,123,124,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,249,250,251,252,253,255,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410] # 254 timeout 344 pp gatevway timeout +ids = 
[1,54,100,131,134,140,146,159,162,166,168,175,176,183,189,190,192,204,205,226,230,236,244,251,253,260,261,263,271,272,273,275,277,279,280,308,311,312,313,315,316,317,318,327,332,333,334,335,337,338,340,347,350,352,354,355,356,357,358,359,360,361,362,363,369,371,373,376,385,399,402,405,406,410,411,412,422,427,448,467,470,471,472,473,477,479,481,512,514,519,522,523,524,526,527,528,529,532,533,536,538,547,557,587,592,597,598,599,600,606,607,608,615,616,617,618,628,629,641,642,643,645,646,647,648,649,653,658,660,673,676,678,680,681,683,685,704,709,710,717,718,719,728,730,732,735,744,745,746,749,757,758,759,772,776,777,778,779,783,784,785,786,789,790,791,792,793,794,795,797,798,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,835,836,839,840,842,843,844,845,846,847,848,849,850,851,852,853,854,855,861,862,863,864,867,868,869,870,871,872,873,874,875,876,877,878,879,880,881,882,883,884,885,886,889,890,891,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,923,924,926,927,928,929,930,931,933,934,935,936,937,938,939,940,941,942,943,944,947,948,949,950] domain = "politepol.com" def parse_feed0(text):