By default, Scrapy deduplicates requests with scrapy.dupefilter.RFPDupeFilter. The related settings are:
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'  # usually the only setting you change
DUPEFILTER_DEBUG = False
JOBDIR = "directory where the visited-requests log is saved, e.g.: /root/"  # final path: /root/requests.seen
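Besides swapping the filter globally, deduplication can be bypassed per request: Request accepts a dont_filter flag that tells the scheduler to skip the dupe filter for that one request. A minimal sketch to be placed inside a spider callback (parse_detail is a hypothetical callback name):

from scrapy.http import Request

def parse(self, response):
    # dont_filter=True means this request is never checked against the
    # dupe filter, e.g. for login pages you intentionally revisit.
    yield Request(url="https://dig.chouti.com/", callback=self.parse_detail, dont_filter=True)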
A simple, hand-rolled way to deduplicate URLs:
import hashlib

from scrapy.http import Request

# Inside the spider class:
visited_urls = set()  # fingerprints of URLs that have already been requested

def md5(self, url):
    # hash the URL so the set stores short, fixed-length strings
    return hashlib.md5(url.encode('utf-8')).hexdigest()

# Inside a callback:
for page_url in page_urls:
    url = "https://dig.chouti.com%s" % page_url
    md5_url = self.md5(url)
    if md5_url not in self.visited_urls:
        self.visited_urls.add(md5_url)
        yield Request(url=url, method="POST", callback=self.good)
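Hashing the raw URL misses duplicates that differ only in query-parameter order. Scrapy's request_fingerprint (used by RFPDupeFilter below) canonicalizes the URL first, so such requests map to the same fingerprint. A small sketch (the URLs are illustrative):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Same resource, different parameter order: the raw strings differ,
# but the canonicalized fingerprints are identical.
fp1 = request_fingerprint(Request("https://dig.chouti.com/?page=1&sort=hot"))
fp2 = request_fingerprint(Request("https://dig.chouti.com/?sort=hot&page=1"))
assert fp1 == fp2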
Deduplication via the default RFPDupeFilter:
from scrapy.dupefilter import RFPDupeFilter  # the built-in URL dedup filter
The RFPDupeFilter source, which a custom filter can model itself on:
from __future__ import print_function
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint


class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
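Because request_seen delegates to the filter's own request_fingerprint method, the lightest customization is to subclass RFPDupeFilter and override just that one method. A minimal sketch (the class name is made up; include_headers is a real parameter of scrapy.utils.request.request_fingerprint):

from scrapy.dupefilter import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


class HeaderAwareDupeFilter(RFPDupeFilter):
    # Requests that differ only in their Cookie header get distinct
    # fingerprints, so both are crawled.
    def request_fingerprint(self, request):
        return request_fingerprint(request, include_headers=['Cookie'])

Activate it the same way as any dupe filter, by pointing DUPEFILTER_CLASS at it.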
A fully custom URL dedup filter:
class RepeatFilter(object):

    def __init__(self):
        """
        Call order: 2
        """
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Call order: 1
        Invoked when the filter is created.
        """
        print('Loading settings...')
        return cls()

    def request_seen(self, request):
        """
        Call order: 4 (once per request)
        Check whether the current request has been seen before.
        :return: True if already visited; False if not.
        """
        if request.url in self.visited_set:
            return True
        self.visited_set.add(request.url)
        return False

    def open(self):  # can return deferred
        """
        Call order: 3
        Invoked when crawling starts.
        """
        print('open')

    def close(self, reason):  # can return a deferred
        """
        Call order: 5
        Invoked when the spider finishes crawling.
        """
        print('close')

    def log(self, request, spider):  # log that a request has been filtered
        """
        Log the filtered request.
        Invoked every time a duplicate is filtered out.
        """
        pass
# Enable the custom dedup filter in settings.py
# ("test002.duplication" is the duplication.py module inside the test002 project package)
DUPEFILTER_CLASS = "test002.duplication.RepeatFilter"
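Note that RepeatFilter keeps everything in memory, so the seen set is lost when the process exits. A hypothetical extension that persists it across runs, mirroring what RFPDupeFilter does with JOBDIR (the file path and class name here are illustrative):

import os


class PersistentRepeatFilter(RepeatFilter):

    SEEN_PATH = '/tmp/urls.seen'  # illustrative location, pick your own

    def open(self):
        # Reload URLs seen by previous runs, one URL per line.
        if os.path.exists(self.SEEN_PATH):
            with open(self.SEEN_PATH) as f:
                self.visited_set.update(line.rstrip() for line in f)

    def close(self, reason):
        # Write the full set back out when the crawl finishes.
        with open(self.SEEN_PATH, 'w') as f:
            for url in self.visited_set:
                f.write(url + os.linesep)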