Integrating Selenium into Scrapy - Example: fetching the recommended items on the Taobao homepage

Published 2023-07-17 11:51:23  Author: 蕝戀

Scrapy's defining strengths are efficiency and asynchronous I/O, so if you insist on integrating Selenium the benefit is limited in practice, because Selenium is slow.

Example: extracting the titles of the recommended items on the Taobao homepage. The spider issues the request as usual; a custom downloader middleware hands the URL to Selenium, scrolls to the bottom so the lazily loaded recommendations render, and returns the rendered HTML to the spider, which extracts the titles.

Spider class: taobao.py

import scrapy
from scrapy.http import HtmlResponse
from scrapy.spiders import Spider


class TaobaoSpider(Spider):
    name = "tb"
    allowed_domains = ["taobao.com"]
    start_urls = ["https://taobao.com"]

    def parse(self, response: HtmlResponse, **kwargs):

        # with open("tb.html", "w", encoding="utf-8") as f:
        #     f.write(response.text)

        # Because of the custom Selenium downloader middleware below, the
        # response body here is the page source rendered by Selenium.
        selector_list = response.xpath('//div[@class="tb-recommend-content-item"]')
        for selector in selector_list:
            title = selector.xpath('./a/div[@class="info-wrapper"]/div[@class="title"]/text()').get()
            print(f"{title=}")
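
The spider above only prints each title. If you would rather have the results flow through Scrapy's item pipelines or feed exports (e.g. scrapy crawl tb -O titles.json), a minimal variant of parse that yields plain dict items could look like this (a sketch, not part of the original post):

    def parse(self, response: HtmlResponse, **kwargs):
        for selector in response.xpath('//div[@class="tb-recommend-content-item"]'):
            title = selector.xpath('./a/div[@class="info-wrapper"]/div[@class="title"]/text()').get()
            # Yield a plain dict so pipelines / feed exports can collect the titles
            yield {"title": title}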

settings.py

# Keep the tunables in settings.py so they are easy to change
##### Selenium integration settings #####
DRIVER_PATH = r'C:\Users\Administrator\Desktop\chromedriver.exe'
# stealth.min.js is injected to bypass common Selenium/webdriver detection
STEALTH_JS = r'C:\Users\Administrator\Desktop\stealth.min.js'

DOWNLOADER_MIDDLEWARES = {
   # Enable the custom Selenium downloader middleware
   "scrapy_demo.middlewares.SeleniumDownLoaderMiddleware": 543,
}
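
These settings are picked up automatically when the spider is started with scrapy crawl tb from the project directory. If you prefer launching it from a script, a small launcher could look like this (a sketch assuming the standard project layout so get_project_settings() finds this settings.py; run.py is not part of the original post):

# run.py (hypothetical launcher, not part of the original project)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl("tb")   # spider name defined in taobao.py
    process.start()       # blocks until the crawl finishes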

SeleniumMiddleware.py - the downloader middleware

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumDownLoaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this class method to create the middleware instance.

        driver_path = crawler.settings["DRIVER_PATH"]
        stealth_js = crawler.settings["STEALTH_JS"]

        # Instantiate the middleware itself with the values read from settings.
        s = cls(driver_path=driver_path, stealth_js=stealth_js)

        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def __init__(self, driver_path, stealth_js, *args, **kwargs):
        # Reading the settings directly would also work:
        # settings = get_project_settings()
        # service = ChromeService(executable_path=settings["DRIVER_PATH"])

        # Load the chromedriver executable
        service = ChromeService(executable_path=driver_path)

        # Browser options
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")

        # Attach to an already-running browser (Chrome must have been started
        # with --remote-debugging-port=9333, see the note after the middleware)
        options.add_experimental_option("debuggerAddress", "127.0.0.1:9333")

        # Create the driver
        self.driver = webdriver.Chrome(service=service, options=options)

        # Anti-detection: inject stealth.min.js into every new document
        # with open(settings["STEALTH_JS"]) as f:
        with open(stealth_js) as f:
            js = f.read()
        self.driver.execute_cdp_cmd(
            cmd="Page.addScriptToEvaluateOnNewDocument",
            cmd_args={
                "source": js
            }
        )
        # Implicit wait of up to 5 seconds when locating elements
        self.driver.implicitly_wait(5)

    def __del__(self):
        # Quit the browser when the middleware object is garbage-collected
        self.driver.quit()

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Open the URL in the Selenium-controlled browser
        self.driver.get(request.url)
        # self.driver.get(
        #     "https://s.taobao.com/search?commend=all&ie=utf8&initiative_id=tbindexz_20170306&page=1&q=%E9%9B%B6%E9%A3%9F&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ssid=s5-e")

        #### Simulate whatever interaction is needed here...
        # self.driver.find_element(By.PARTIAL_LINK_TEXT, "天猫超").click()
        # self.driver.switch_to.window(self.driver.window_handles[-1])
        """
        1. Record the current page height.
        2. Loop forever:
            2-1. Scroll to the very bottom.
            2-2. If the new height is larger than the previous one,
                 more data was loaded below,
                     so store the new height for the next comparison;
                 otherwise we have reached the real bottom and stop.
        """
        # Keep scrolling so the lazily loaded recommendations are rendered
        self.load_data_by_scroll(self.driver)

        # Uncomment to watch the result for a few seconds
        # time.sleep(5)

        # Hand the rendered page back to Scrapy as the response.
        return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding="utf-8", status=200)

    def load_data_by_scroll(self, driver: WebDriver):
        js = 'return document.body.scrollHeight;'
        # Record the current page height
        check_height = driver.execute_script(js)
        while True:
            # Scroll to the bottom; if more content loads, document.body.scrollHeight grows
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                # Check whether document.body.scrollHeight has grown past the previous value.
                # WebDriverWait.until() re-evaluates the lambda every 0.2 s and returns as soon as
                # it is truthy; if the height does not grow within 5 s it raises TimeoutException,
                # which means no more data is being loaded, so we break out of the loop.
                WebDriverWait(driver, 5, 0.2).until(lambda x: x.execute_script(js) > check_height)
                # Store the new height for the next comparison
                check_height = driver.execute_script(js)
            except Exception as e:
                break

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
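
Because the middleware attaches to an existing browser through debuggerAddress 127.0.0.1:9333, a Chrome instance with remote debugging enabled has to be running before the crawl starts. A minimal way to launch one (the binary path and profile directory are assumptions, adjust them for your machine):

import subprocess

# Path to the Chrome binary (assumption, adjust as needed)
CHROME = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

subprocess.Popen([
    CHROME,
    "--remote-debugging-port=9333",                 # must match debuggerAddress in the middleware
    r"--user-data-dir=C:\selenium\chrome-profile",  # separate profile so the debugging port takes effect
])

With that browser open, scrapy crawl tb drives it through the middleware, scrolls the homepage, and the spider prints the recommended item titles.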