Using Splash and Selenium with Scrapy

Published: 2023-08-02 13:21:23  Author: 看一百次夜空里的深蓝

Getting started with Splash

# Splash, like Selenium, simulates a browser to render pages, which lets you scrape dynamic sites
# Docs: https://splash.readthedocs.io/en/stable/
# Install Splash via Docker: https://splash.readthedocs.io/en/stable/install.html#linux-docker
# docker pull scrapinghub/splash
# Start Splash:
# docker run -it -p 8050:8050 --rm scrapinghub/splash
# Using Splash with Scrapy: https://splash.readthedocs.io/en/stable/faq.html?highlight=scrapy#python-scrapy
# pip3 install scrapy-splash

import requests
from fake_useragent import UserAgent
from urllib.parse import quote

headers = {
    'User-Agent': UserAgent().chrome
}

# Drive Splash with a Lua script (the /execute endpoint) to fetch the page content
def main():
    url = r"https://www.baidu.com"
    lua_script = '''function main(splash, args)
    splash:go('{url}')
    splash:wait(1)
    return splash:html()
end'''.format(url=url)
    response = requests.get(r"http://localhost:8050/execute?lua_source={}".format(quote(lua_script)), headers=headers)
    response.encoding = 'utf-8'
    print(response.text)

# Let Splash render the page directly via the /render.html endpoint
def test1():
    url = r"https://www.baidu.com"
    response = requests.get(r"http://localhost:8050/render.html?url={}".format(quote(url)), headers=headers)
    response.encoding = 'utf-8'
    print(response.text)

if __name__ == '__main__':
    test1()
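
A variant of main() that avoids formatting the URL into the Lua source: the Splash /execute endpoint passes extra query parameters through to the script, where they are available via args. A minimal sketch against the same local Splash instance, reusing the example URL from above:

import requests
from urllib.parse import urlencode

lua_script = '''function main(splash, args)
    splash:go(args.url)          -- URL comes from the extra 'url' query parameter
    splash:wait(1)
    return splash:html()
end'''

params = urlencode({'lua_source': lua_script, 'url': 'https://www.baidu.com'})
response = requests.get('http://localhost:8050/execute?' + params)
response.encoding = 'utf-8'
print(response.text)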


Splash in Scrapy

# Splash configuration (settings.py)
# Splash server URL
SPLASH_URL = 'http://localhost:8050/'
# Downloader middlewares
DOWNLOADER_MIDDLEWARES = {
   "scrapy_splash.SplashCookiesMiddleware": 200,
   "scrapy_splash.SplashMiddleware": 201,
   "myproject.middlewares.MyprojectDownloaderMiddleware": 202
}
# Splash-aware dedup filter
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Splash-aware HTTP cache storage
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
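
The scrapy-splash README additionally recommends a spider middleware that deduplicates Splash arguments, so identical lua_source strings are not stored with every request; an optional addition to the same settings.py:

# Recommended by the scrapy-splash docs (optional): deduplicate Splash args such as lua_source
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}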

import scrapy
from scrapy_splash import SplashRequest


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["cnblogs.com"]
    # start_urls = ["https://www.cnblogs.com/watermeloncode/"]

    def parse(self, response):
        print(response.text)

    def parse1(self, response):
        # spare callback, unused in this example
        print(response.text)

    def start_requests(self):
        lua_script = '''
        function main(splash, args)
            splash:go('{url}')
            splash:wait(1)
            return splash:html()
        end'''.format(url=r"https://www.cnblogs.com/watermeloncode/")
        # To run the Lua script instead, use endpoint='execute' with args={'lua_source': lua_script}
        # yield SplashRequest(r"https://www.cnblogs.com/watermeloncode/", callback=self.parse, endpoint='execute', args={'lua_source': lua_script, 'wait': 1})
        yield SplashRequest(r"https://www.cnblogs.com/watermeloncode/", callback=self.parse, endpoint='render.html', args={'wait': 1})
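
If you take the execute route, the URL does not have to be formatted into the Lua source here either: scrapy-splash forwards the request URL (and everything in args) to the script, where it can be read as args.url. A minimal sketch under that assumption; CnblogsExecuteSpider is a hypothetical name used only to keep the snippet self-contained:

import scrapy
from scrapy_splash import SplashRequest


class CnblogsExecuteSpider(scrapy.Spider):
    name = "cnblogs_execute"
    allowed_domains = ["cnblogs.com"]

    def start_requests(self):
        # The target URL is read from args.url inside the script,
        # so the same Lua source works for any request URL
        lua_script = '''
        function main(splash, args)
            splash:go(args.url)
            splash:wait(args.wait)
            return splash:html()
        end'''
        yield SplashRequest(
            r"https://www.cnblogs.com/watermeloncode/",
            callback=self.parse,
            endpoint='execute',
            args={'lua_source': lua_script, 'wait': 1},
        )

    def parse(self, response):
        print(response.text)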

Selenium in Scrapy

1. Configure the downloader middleware (middlewares.py)

from scrapy.http import HtmlResponse

class MyprojectDownloaderMiddleware:
    def process_request(self, request, spider):
        if spider.name == 'guazi':
            url = request.url
            # Issue the GET request through the browser attached to the spider
            spider.chrome.get(url)
            html = spider.chrome.page_source
            # Returning an HtmlResponse short-circuits the download: Scrapy will not
            # fetch the URL again, because the rendered page is already supplied here.
            return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')

2. Create the browser in the Spider and close it when the spider shuts down

import scrapy
from scrapy import signals
from selenium import webdriver

class GuaziSpider(scrapy.Spider):
    name = "guazi"
    allowed_domains = ["guazi.com"]
    start_urls = ["https://www.guazi.com/"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Every Spider instance is created here;
        # remember to call the parent method to build the Spider object first
        spider = super(GuaziSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Register a shutdown hook: spider.spider_closed runs when the spider_closed signal fires
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        # Attach a browser instance to the Spider object
        spider.chrome = webdriver.Chrome()
        return spider

    def spider_closed(self, spider):
        # Runs on the signals.spider_closed signal: quit the browser
        spider.chrome.quit()
        print('close selenium')

    def parse(self, response):
        print(response.text)
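
A visible Chrome window is fine for debugging, but on a server you will usually want headless mode. A minimal sketch of the browser setup, assuming Selenium 4; build_headless_chrome is a helper introduced here, and the flags are standard Chrome options:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def build_headless_chrome():
    options = Options()
    options.add_argument('--headless=new')        # no visible window (recent Chrome)
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(options=options)

# In from_crawler above, replace webdriver.Chrome() with:
# spider.chrome = build_headless_chrome()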