scrapy框架之自定义扩展

发布时间: 2023-12-08 15:17:42 作者: 木屐呀

自定义扩展时,利用信号在指定位置注册指定操作。

源码剖析:

1 from scrapy.extensions.telnet import TelnetConsole # imported only to open and read the TelnetConsole source
2 
3 # Enable or disable extensions
4 # See https://doc.scrapy.org/en/latest/topics/extensions.html
5 EXTENSIONS = {
6    'scrapy.extensions.telnet.TelnetConsole': None,  # NOTE(review): per the Scrapy extensions docs an order of None disables the extension - confirm
7    # 'test002.extensions.MyExtend':300,  # uncomment to register the custom extension (300 is its order value)
8 }

查看 TelnetConsole 类:

 1 class TelnetConsole(protocol.ServerFactory):  # extension class quoted from Scrapy; subclasses protocol.ServerFactory
 2 
 3     def __init__(self, crawler):
 4         if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):  # setting is false -> extension not configured
 5             raise NotConfigured
 6         if not TWISTED_CONCH_AVAILABLE:  # same outcome when the TWISTED_CONCH_AVAILABLE flag is false
 7             raise NotConfigured
 8         self.crawler = crawler
 9         self.noisy = False
10         self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]  # setting entries converted to ints
11         self.host = crawler.settings['TELNETCONSOLE_HOST']
12         self.crawler.signals.connect(self.start_listening, signals.engine_started)  # run start_listening on engine_started
13         self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)  # run stop_listening on engine_stopped
14 
15     @classmethod
16     def from_crawler(cls, crawler):  # alternate constructor: builds the instance from the crawler
17         return cls(crawler)
18 
19     def start_listening(self):  # handler registered for signals.engine_started in __init__
20         self.port = listen_tcp(self.portrange, self.host, self)  # result kept so stop_listening can use it
21         h = self.port.getHost()
22         logger.debug("Telnet console listening on %(host)s:%(port)d",
23                      {'host': h.host, 'port': h.port},
24                      extra={'crawler': self.crawler})
25 
26     def stop_listening(self):  # handler registered for signals.engine_stopped in __init__
27         self.port.stopListening()
28 
29     def protocol(self):  # builds a TelnetTransport and hands it the telnet_vars dict
30         telnet_vars = self._get_telnet_vars()
31         return telnet.TelnetTransport(telnet.TelnetBootstrapProtocol,
32             insults.ServerProtocol, manhole.Manhole, telnet_vars)
33 
34     def _get_telnet_vars(self):  # returns the dict of names passed to the transport in protocol()
35         # Note: if you add entries here also update topics/telnetconsole.rst
36         telnet_vars = {
37             'engine': self.crawler.engine,
38             'spider': self.crawler.engine.spider,
39             'slot': self.crawler.engine.slot,
40             'crawler': self.crawler,
41             'extensions': self.crawler.extensions,
42             'stats': self.crawler.stats,
43             'settings': self.crawler.settings,
44             'est': lambda: print_engine_status(self.crawler.engine),  # callable shortcut: prints the engine status
45             'p': pprint.pprint,
46             'prefs': print_live_refs,
47             'hpy': hpy,
48             'help': "This is Scrapy telnet console. For more info see: " \
49                 "https://doc.scrapy.org/en/latest/topics/telnetconsole.html",
50         }
51         self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)  # sends update_telnet_vars with the dict; presumably lets receivers alter it - confirm
52         return telnet_vars

分析:

self.start_listening 和 self.stop_listening 是可以自定义的方法(即信号触发时执行的处理函数)。

signals.engine_started 和 signals.engine_stopped 是指定的信号。

通过 crawler.signals.connect(处理函数, 信号) 即可在指定信号上注册操作。

查找信号:

进入signals查看

 1 engine_started = object()  # each signal below is its own bare object() instance
 2 engine_stopped = object()
 3 spider_opened = object()
 4 spider_idle = object()
 5 spider_closed = object()
 6 spider_error = object()
 7 request_scheduled = object()
 8 request_dropped = object()
 9 response_received = object()
10 response_downloaded = object()
11 item_scraped = object()
12 item_dropped = object()
13 
14 # for backwards compatibility
15 stats_spider_opened = spider_opened  # alias: the same object as spider_opened
16 stats_spider_closing = spider_closed  # alias: the same object as spider_closed
17 stats_spider_closed = spider_closed  # alias: the same object as spider_closed
18 
19 item_passed = item_scraped  # alias: the same object as item_scraped
20 
21 request_received = request_scheduled  # alias: the same object as request_scheduled

根据上面的源码,我们可以仿照它编写自定义扩展(编写完成后,需要在 settings.py 的 EXTENSIONS 中注册,例如取消上文 'test002.extensions.MyExtend': 300 这一行的注释):

 1 from scrapy import signals
 2 
 3 class MyExtend:
 4 
 5     def __init__(self,crawler):
 6         self.crawler = crawler
 7         # 钩子上挂障碍物
 8         # 在指定信号上注册操作
 9         self.crawler.signals.connect(self.start,signals.engine_started)
10         self.crawler.signals.connect(self.close,signals.spider_closed)
11 
12     @classmethod
13     def from_crawler(cls,crawler):
14         return cls(crawler)
15 
16     def start(self):
17         print('signals.engine_started')
18 
19     def close(self):
20         print('signals.spider_closed')