Scrapy framework: configuration file, part 2

Published: 2023-12-08 15:17:42  Author: 木屐呀

① Auto-throttling algorithm

 1 """
 2 17. 自动限速算法
 3     from scrapy.contrib.throttle import AutoThrottle
 4     自动限速设置
 5     1. 获取最小延迟 DOWNLOAD_DELAY
 6     2. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
 7     3. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
 8     4. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
 9     5. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
10     target_delay = latency / self.target_concurrency
11     new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
12     new_delay = max(target_delay, new_delay)
13     new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
14     slot.delay = new_delay
15 """
16 
17 # 开始自动限速
18 # AUTOTHROTTLE_ENABLED = True
19 # The initial download delay
20 # 初始下载延迟
21 # AUTOTHROTTLE_START_DELAY = 5
22 # The maximum download delay to be set in case of high latencies
23 # 最大下载延迟
24 # AUTOTHROTTLE_MAX_DELAY = 10
25 # The average number of requests Scrapy should be sending in parallel to each remote server
26 # 平均每秒并发数
27 # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
28 
29 # Enable showing throttling stats for every response received:
30 # 是否显示
31 # AUTOTHROTTLE_DEBUG = True
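To make the update rule concrete, here is a minimal standalone sketch of the delay calculation (not Scrapy code; the numbers are illustrative):

def adjust_delay(prev_delay, latency, target_concurrency=1.0, min_delay=0.0, max_delay=10.0):
    # If the server needs `latency` seconds per response, sending one request
    # every latency/N seconds keeps about N requests in flight.
    target_delay = latency / target_concurrency
    # Move halfway from the previous delay toward the target...
    new_delay = (prev_delay + target_delay) / 2.0
    # ...but never below the target itself, so slow sites raise the delay immediately.
    new_delay = max(target_delay, new_delay)
    # Clamp to [min_delay, max_delay].
    return min(max(min_delay, new_delay), max_delay)

print(adjust_delay(prev_delay=5.0, latency=0.4))  # 2.7 -> fast responses shrink the delay gradually
print(adjust_delay(prev_delay=1.0, latency=3.0))  # 3.0 -> slow responses raise it in one step
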
import logging

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class AutoThrottle(object):

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""

        # If a server needs `latency` seconds to respond then
        # we should send a request each `latency/N` seconds
        # to have N requests processed in parallel
        target_delay = latency / self.target_concurrency

        # Adjust the delay to make it closer to target_delay
        new_delay = (slot.delay + target_delay) / 2.0

        # If target delay is bigger than old delay, then use it instead of mean.
        # It works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust delay if response status != 200 and new delay is smaller
        # than old one, as error pages (and redirections) are usually small and
        # so tend to reduce latency, thus provoking a positive feedback by
        # reducing delay instead of increasing it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
AutoThrottle source code

Supplementary notes:

from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware  # robots.txt crawling rules
from scrapy.core.downloader.webclient import ScrapyHTTPClientFactory  # where latency is measured: the time from request to response headers
# request.meta['download_latency'] = self.headers_time - self.start_time
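For a quick look at the measured value, download_latency can be read back from response.meta in a callback (a minimal, hypothetical spider):

import scrapy

class LatencySpider(scrapy.Spider):
    name = 'latency_demo'  # hypothetical
    start_urls = ['https://example.com']

    def parse(self, response):
        # Set by the downloader: headers_time - start_time, in seconds
        self.logger.info('download_latency: %.3f s', response.meta['download_latency'])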

② Enabling the HTTP cache

 1 """
 2     目的用于将已经发送的请求或响应缓存下来,以便以后使用
 3 
 4     from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
 5     from scrapy.extensions.httpcache import DummyPolicy
 6     from scrapy.extensions.httpcache import RFC2616Policy
 7     from scrapy.extensions.httpcache import FilesystemCacheStorage
 8 """
 9 # Enable and configure HTTP caching (disabled by default)
10 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
11 
12 from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
13 # 是否启用缓存策略
14 # HTTPCACHE_ENABLED = True
15 
16 # 缓存策略①
17 from scrapy.extensions.httpcache import DummyPolicy
18 # 缓存策略:所有请求均缓存,下次在请求直接访问原来的缓存即可
19 # HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
20 
21 # 缓存策略②:根据Http响应头:Cache-Control、Last-Modified 等进行缓存的策略 -->>常用
22 from scrapy.extensions.httpcache import RFC2616Policy
23 
24 # 缓存超时时间
25 # HTTPCACHE_EXPIRATION_SECS = 0
26 
27 # 缓存忽略的Http状态码
28 # HTTPCACHE_IGNORE_HTTP_CODES = []
29 
30 # 缓存保存路径
31 # HTTPCACHE_DIR = 'httpcache'
32 
33 # 缓存存储的插件
34 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' #可以自定义存储类,遵循File..格式即可
35 # 文件类型存储:存储类型 -->>常用
36 from scrapy.extensions.httpcache import FilesystemCacheStorage #查看源码,文件保存response
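For reference, a typical settings.py fragment with RFC2616-based caching switched on might look like this (the values are illustrative):

HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 0                  # 0 = cached entries never expire by age
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503]  # illustrative: never cache server errors
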
from email.utils import formatdate
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
        ConnectionRefusedError, ConnectionDone, ConnectError, \
        ConnectionLost, TCPTimedOutError
from twisted.web.client import ResponseFailed
from scrapy import signals
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.utils.misc import load_object


class HttpCacheMiddleware(object):

    DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError)

    def __init__(self, settings, stats):
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        self.storage.close_spider(spider)

    def process_request(self, request, spider):
        if request.meta.get('dont_cache', False):
            return

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse

    def process_response(self, request, response, spider):
        if request.meta.get('dont_cache', False):
            return response

        # Skip cached responses and uncacheable requests
        if 'cached' in response.flags or '_dont_cache' in request.meta:
            request.meta.pop('_dont_cache', None)
            return response

        # RFC2616 requires origin server to set Date header,
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if 'Date' not in response.headers:
            response.headers['Date'] = formatdate(usegmt=1)

        # Do not validate first-hand responses
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/firsthand', spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response, request):
            self.stats.inc_value('httpcache/revalidate', spider=spider)
            return cachedresponse

        self.stats.inc_value('httpcache/invalidate', spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def process_exception(self, request, exception, spider):
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
            self.stats.inc_value('httpcache/errorrecovery', spider=spider)
            return cachedresponse

    def _cache_response(self, spider, response, request, cachedresponse):
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value('httpcache/store', spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value('httpcache/uncacheable', spider=spider)
HttpCacheMiddleware
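As the process_request()/process_response() hooks above show, caching can also be skipped for individual requests via the dont_cache meta key, e.g. in a spider callback:

yield scrapy.Request(url, callback=self.parse, meta={'dont_cache': True})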
class DummyPolicy(object):

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, response, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True
DummyPolicy
class RFC2616Policy(object):

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_response_cache_controls = [to_bytes(cc) for cc in
            settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
        self._cc_parsed = WeakKeyDictionary()

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Any other is eligible for caching
        return True

    def should_cache_response(self, response, request):
        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
        # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because cache can not deal with partial contents
        cc = self._parse_cachecontrol(response)
        # obey directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Cache unconditionally if configured to do so
        elif self.always_store:
            return True
        # Any hint on response expiration is good
        elif b'max-age' in cc or b'Expires' in response.headers:
            return True
        # Firefox falls back to a one-year expiration for these statuses if none is set
        elif response.status in (300, 301, 308):
            return True
        # Other statuses without expiration require at least one validator
        elif response.status in (200, 203, 401):
            return b'Last-Modified' in response.headers or b'ETag' in response.headers
        # Any other is probably not eligible for caching
        # Makes no sense to cache responses that do not contain expiration
        # info and cannot be revalidated
        else:
            return False

    def is_cached_response_fresh(self, cachedresponse, request):
        cc = self._parse_cachecontrol(cachedresponse)
        ccreq = self._parse_cachecontrol(request)
        if b'no-cache' in cc or b'no-cache' in ccreq:
            return False

        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)

        reqmaxage = self._get_max_age(ccreq)
        if reqmaxage is not None:
            freshnesslifetime = min(freshnesslifetime, reqmaxage)

        if currentage < freshnesslifetime:
            return True

        if b'max-stale' in ccreq and b'must-revalidate' not in cc:
            # From RFC2616: "Indicates that the client is willing to
            # accept a response that has exceeded its expiration time.
            # If max-stale is assigned a value, then the client is
            # willing to accept a response that has exceeded its
            # expiration time by no more than the specified number of
            # seconds. If no value is assigned to max-stale, then the
            # client is willing to accept a stale response of any age."
            staleage = ccreq[b'max-stale']
            if staleage is None:
                return True

            try:
                if currentage < freshnesslifetime + max(0, int(staleage)):
                    return True
            except ValueError:
                pass

        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False

    def is_cached_response_valid(self, cachedresponse, response, request):
        # Use the cached response if the new response is a server error,
        # as long as the old response didn't specify must-revalidate.
        if response.status >= 500:
            cc = self._parse_cachecontrol(cachedresponse)
            if b'must-revalidate' not in cc:
                return True

        # Use the cached response if the server says it hasn't changed.
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
        if b'Last-Modified' in cachedresponse.headers:
            request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']

        if b'ETag' in cachedresponse.headers:
            request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']

    def _get_max_age(self, cc):
        try:
            return max(0, int(cc[b'max-age']))
        except (KeyError, ValueError):
            return None

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
        cc = self._parse_cachecontrol(response)
        maxage = self._get_max_age(cc)
        if maxage is not None:
            return maxage

        # Parse date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now

        # Try HTTP/1.0 Expires header
        if b'Expires' in response.headers:
            expires = rfc1123_to_epoch(response.headers[b'Expires'])
            # When parsing Expires header fails RFC 2616 section 14.21 says we
            # should treat this as an expiration time in the past.
            return max(0, expires - date) if expires else 0

        # Fallback to heuristic using last-modified header
        # This is not in RFC but on Firefox caching implementation
        lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
        if lastmodified and lastmodified <= date:
            return (date - lastmodified) / 10

        # This request can be cached indefinitely
        if response.status in (300, 301, 308):
            return self.MAXAGE

        # Insufficient information to compute freshness lifetime
        return 0

    def _compute_current_age(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeCurrentAge
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
        currentage = 0
        # If Date header is not set we assume it is a fast connection, and
        # clock is in sync with the server
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
        if now > date:
            currentage = now - date

        if b'Age' in response.headers:
            try:
                age = int(response.headers[b'Age'])
                currentage = max(currentage, age)
            except ValueError:
                pass

        return currentage
RFC2616Policy
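A rough sketch of how this policy reacts to Cache-Control, assuming a Scrapy version that exposes the classes at the paths shown above:

from scrapy.http import Request, Response
from scrapy.settings import Settings
from scrapy.extensions.httpcache import RFC2616Policy

policy = RFC2616Policy(Settings())  # default settings
req = Request('http://example.com')
with_hint = Response('http://example.com', status=200,
                     headers={'Cache-Control': 'max-age=3600'})
no_store = Response('http://example.com', status=200,
                    headers={'Cache-Control': 'no-store'})
print(policy.should_cache_response(with_hint, req))  # True: carries an expiration hint
print(policy.should_cache_response(no_store, req))   # False: no-store always wins
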
class FilesystemCacheStorage(object):

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        self._open = gzip.open if self.use_gzip else open

    def open_spider(self, spider):
        logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
                     extra={'spider': spider})

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not os.path.exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = os.path.join(rpath, 'pickled_meta')
        if not os.path.exists(metapath):
            return  # not found
        mtime = os.stat(metapath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with self._open(metapath, 'rb') as f:
            return pickle.load(f)
FilesystemCacheStorage
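The on-disk layout follows the request fingerprint; a small sketch (request_fingerprint lives at this path in the Scrapy version quoted here):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

fp = request_fingerprint(Request('http://example.com'))
# FilesystemCacheStorage keeps each entry under
#   <HTTPCACHE_DIR>/<spider.name>/<fp[0:2]>/<fp>/
# holding: meta, pickled_meta, request_headers, request_body,
#          response_headers, response_body
print(fp[0:2], fp)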

③ Proxies

1. Default proxy:

Proxies are configured through environment variables:
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

Option 1: use the default middleware
    os.environ
    {
        http_proxy: http://root:woshiniba@192.168.11.11:9999/
        https_proxy: http://192.168.11.11:9999/
    }
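A minimal sketch of option 1, setting the variables from Python before the crawler starts (the proxy URLs are the examples above):

import os

# HttpProxyMiddleware reads these via getproxies() when it is instantiated,
# so they must be set before the crawler is created.
os.environ['http_proxy'] = 'http://root:woshiniba@192.168.11.11:9999/'
os.environ['https_proxy'] = 'http://192.168.11.11:9999/'
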
class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        for type, url in getproxies().items():
            self.proxies[type] = self._get_proxy(url, type)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass).strip()

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
HttpProxyMiddleware
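Because process_request() above checks request.meta['proxy'] first, a proxy can also be assigned per request without touching the environment, e.g. in a spider callback:

# The middleware extracts the credentials from the URL and fills in
# the Proxy-Authorization header itself.
yield scrapy.Request(url, meta={'proxy': 'http://root:woshiniba@192.168.11.11:9999'})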

Analysis:

# from six.moves.urllib.request import getproxies, proxy_bypass
# In the source, getproxies() essentially reads os.environ (the environment variables)
# def _set_proxy(self, request, scheme):
#     creds, proxy = self.proxies[scheme]
#     request.meta['proxy'] = proxy  # the key step
#     if creds:
#         request.headers['Proxy-Authorization'] = b'Basic ' + creds  # key step; can be omitted when no credentials are needed
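A quick way to see what getproxies() picks up (it simply mirrors the *_proxy environment variables):

from six.moves.urllib.request import getproxies
print(getproxies())
# e.g. {'http': 'http://root:woshiniba@192.168.11.11:9999/',
#       'https': 'http://192.168.11.11:9999/'}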

2. A custom HttpProxyMiddleware class:

import base64
import random

import six


def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
        if proxy['user_pass']:  # only add the auth header when credentials exist
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])

# In settings.py:
DOWNLOADER_MIDDLEWARES = {
    'step8_king.middlewares.ProxyMiddleware': 500,
}
Custom proxy middleware

④ HTTPS access

HTTPS access involves two cases:

1. The target site uses a trusted certificate (supported by default)

DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
    """
    Non-peer-certificate verifying HTTPS context factory

    Default OpenSSL method is TLS_METHOD (also called SSLv23_METHOD)
    which allows TLS protocol negotiation

    'A TLS/SSL connection established with [this method] may
     understand the SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.'
    """

    def __init__(self, method=SSL.SSLv23_METHOD, *args, **kwargs):
        super(ScrapyClientContextFactory, self).__init__(*args, **kwargs)
        self._ssl_method = method

    def getCertificateOptions(self):
        # setting verify=True will require you to provide CAs
        # to verify against; in other words: it's not that simple

        # backward-compatible SSL/TLS method:
        #
        # * this will respect `method` attribute in often recommended
        #   `ScrapyClientContextFactory` subclass
        #   (https://github.com/scrapy/scrapy/issues/1429#issuecomment-131782133)
        #
        # * getattr() for `_ssl_method` attribute for context factories
        #   not calling super(..., self).__init__
        return CertificateOptions(verify=False,
                    method=getattr(self, 'method',
                                   getattr(self, '_ssl_method', None)),
                    fixBrokenPeers=True,
                    acceptableCiphers=DEFAULT_CIPHERS)

    # kept for old-style HTTP/1.0 downloader context twisted calls,
    # e.g. connectSSL()
    def getContext(self, hostname=None, port=None):
        return self.getCertificateOptions().getContext()

    def creatorForNetloc(self, hostname, port):
        return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext())
ScrapyClientContextFactory

Source analysis:

# def getCertificateOptions(self):
#     return CertificateOptions(
#         privateKey=None,
#         certificate=None,
#         ...)
# Custom certificate: subclass ScrapyClientContextFactory, override getCertificateOptions,
# and pass your own privateKey and certificate into CertificateOptions

2. The target site uses a custom certificate

DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"

# https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

class MySSLFactory(ScrapyClientContextFactory):
    def getCertificateOptions(self):
        from OpenSSL import crypto
        v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
        v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
        return CertificateOptions(
            privateKey=v1,  # PKey object
            certificate=v2,  # X509 object
            verify=False,
            method=getattr(self, 'method', getattr(self, '_ssl_method', None))
        )

Other:
    Related classes
        scrapy.core.downloader.handlers.http.HttpDownloadHandler
        scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
        scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
    Related settings
        DOWNLOADER_HTTPCLIENTFACTORY
        DOWNLOADER_CLIENTCONTEXTFACTORY
Custom certificate

For details, see: https://www.cnblogs.com/wupeiqi/articles/6229292.html