CrawlSpider读书网练习

发布时间: 2023-10-05 16:40:23  作者: sgj191024
1.创建项目:scrapy startproject dushuproject
2.跳转到spiders路径: cd dushuproject\dushuproject\spiders
3.创建爬虫类:scrapy genspider read www.dushu.com

  

import scrapy
from readPro.items import ReadproItem


class ReadnetSpider(scrapy.Spider):
    """Scrape book cover image URLs and titles from a dushu.com listing.

    The crawl starts at ``start_urls`` (listing page 1) and then requests
    the numbered listing pages ``base_url + <n> + '.html'`` one after
    another until ``last_page`` has been requested. Every ``<img>`` found
    inside the ``bookslist`` container yields one ``ReadproItem`` with the
    fields ``src`` and ``name``.
    """

    name = 'readNet'
    allowed_domains = ['www.dushu.com']
    start_urls = ['https://www.dushu.com/book/1179_1.html']
    # Prefix of every listing page URL; the page number and '.html' follow.
    base_url = 'https://www.dushu.com/book/1179_'
    # Number of the listing page most recently scheduled.
    page = 1
    # Highest page number that will be requested.
    # NOTE(review): the original bound was `page < 101` checked *before*
    # the increment, so page 101 is requested too. Kept as-is; confirm
    # whether 100 was the intended last page.
    last_page = 101

    def parse(self, response):
        """Yield one item per cover image, then schedule the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``ReadproItem`` objects built from each image's URL and ``alt``
            text, followed by at most one ``scrapy.Request`` for the next
            listing page (handled by this same method).
        """
        print("读书网")
        for img in response.xpath('//div[@class="bookslist"]//li//img'):
            # The URL is read from data-original (presumably a lazy-load
            # attribute -- confirm against the live markup). Fall back to
            # the plain src attribute when data-original is absent or
            # empty, so the item is not emitted without an image URL.
            src = (img.xpath('./@data-original').extract_first()
                   or img.xpath('./@src').extract_first())
            name = img.xpath('./@alt').extract_first()
            print(src, name)
            yield ReadproItem(src=src, name=name)

        # Advance the shared page counter and queue the following page.
        if self.page < self.last_page:
            self.page += 1
            next_url = self.base_url + str(self.page) + '.html'
            yield scrapy.Request(url=next_url, callback=self.parse)