Demo of dont_filter on Scrapy Request objects

import scrapy


class BaiduSpider(scrapy.Spider):
    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = ["https://baidu.com"]

    def parse(self, response):
        # First response, for the URL in start_urls
        title = response.xpath('//title/text()').get()
        print(title)
        # Request the same URL again and hand the response to parse_info
        yield scrapy.Request('https://baidu.com', callback=self.parse_info)

    def parse_info(self, response):
        title = response.xpath('//title/text()').get()
        print(title)
        # Request the same URL yet again, trying to call parse_info forever
        yield scrapy.Request('https://baidu.com', callback=self.parse_info)

In theory this is an infinite loop: parse_info requests the same URL over and over and names itself as the callback. In practice the crawl stops after the second response, because Scrapy deduplicates requests by default: before queueing a request, the scheduler asks the dupefilter (RFPDupeFilter unless configured otherwise) whether a request with the same fingerprint has already been seen, and drops it if so.
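A minimal standalone sketch of that check, my own illustration rather than code from the post; it assumes the default dupefilter, scrapy.dupefilters.RFPDupeFilter, whose request_seen() records a request's fingerprint and reports whether the same fingerprint has shown up before:

import scrapy
from scrapy.dupefilters import RFPDupeFilter

df = RFPDupeFilter()  # what the default scheduler consults

first = scrapy.Request('https://baidu.com')
repeat = scrapy.Request('https://baidu.com')

print(bool(df.request_seen(first)))   # False: new fingerprint, request is allowed
print(bool(df.request_seen(repeat)))  # True: duplicate, the scheduler would drop it

The scheduler only runs this check when request.dont_filter is False (the default). The second version of the spider makes the flag explicit by overriding start_requests() and passing dont_filter=True: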

import scrapy


class BaiduSpider(scrapy.Spider):
    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = ["https://baidu.com"]

    def start_requests(self):
        for url in self.start_urls:
            # dont_filter=True: skip the duplicate filter for this request;
            # with the default False, duplicates of an already-seen request are dropped
            yield scrapy.Request(url, dont_filter=True)
    
    def parse(self, response):
        title = response.xpath('//title/text()').get()
        print(title)
        yield scrapy.Request('https://baidu.com', callback=self.parse_info)

    def parse_info(self, response):
        title = response.xpath('//title/text()').get()
        print(title)
        # Note: this repeated request does not set dont_filter, so it is still
        # subject to the duplicate filter (see the note below)
        yield scrapy.Request('https://baidu.com', callback=self.parse_info)
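As written, only the start request skips the duplicate filter, and that is already what Scrapy's default start_requests() does for the URLs in start_urls. The repeated request yielded by parse_info leaves dont_filter at its default of False, so it is still dropped and the crawl ends after the second response, just as in the first spider. To actually see the endless re-visiting that dont_filter=True enables, the repeated request needs the flag as well; a sketch of that change (my own addition, not in the original post):

    def parse_info(self, response):
        title = response.xpath('//title/text()').get()
        print(title)
        # dont_filter=True lets this duplicate request through every time,
        # so parse_info keeps getting scheduled
        yield scrapy.Request('https://baidu.com', callback=self.parse_info, dont_filter=True)

That version really is an infinite loop, so keep the demo bounded, for example with Scrapy's built-in CloseSpider extension:

scrapy crawl baidu -s CLOSESPIDER_PAGECOUNT=10

The -s flag overrides a setting for this run only, and CLOSESPIDER_PAGECOUNT closes the spider after that many responses have been downloaded. Without dont_filter=True on the repeated request, the dupefilter drops it and logs a message along the lines of "Filtered duplicate request: <GET https://baidu.com>".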