scrapy: CrawlSpider and a Worked Example

Published 2023-05-25 17:43:45  Author: yub4by

1. Introduction to CrawlSpider

CrawlSpider is a subclass of scrapy.Spider built for crawling a site by following links that match a pattern. Instead of hand-writing the link-following logic in parse(), you declare a rules tuple; each Rule pairs a LinkExtractor (which pulls matching links out of every response) with an optional callback and a follow flag.

How it works: starting from start_urls, every downloaded response is run through each Rule's LinkExtractor. The extracted links are scheduled automatically, their responses go to the rule's callback, and with follow=True the rules are applied to those responses in turn.
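
A minimal sketch of the pattern (quotes.toscrape.com and the regex are stand-ins for illustration, not part of this case):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DemoSpider(CrawlSpider):
    name = "demo"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/"]

    rules = (
        # Every link matching the regex is scheduled automatically and its
        # response handed to parse_item; follow=True re-applies the rules
        # to those responses, so the whole pagination gets walked
        Rule(LinkExtractor(allow=r"/page/\d+/"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        yield {"url": response.url}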

2. pymysql

pymysql is a pure-Python MySQL client; the pipeline in section 3.6 uses it to write the scraped items into a MySQL table. Install it into the project environment (pip install pymysql works as well):

conda install pymysql
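
A quick connectivity check before wiring it into Scrapy (host, credentials, and database name below are placeholders for your own):

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='******', db='spider01', charset='utf8')
cursor = conn.cursor()
cursor.execute('select version()')
print(cursor.fetchone())  # e.g. ('8.0.33',)
cursor.close()
conn.close()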

3. Example: dushu.com Spider & Storing the Data

3.1 Requirements

Crawl the book listing under the category page https://www.dushu.com/book/1188.html, extract each book's title and cover image URL, and save both to MySQL.


3.2 Page Analysis

Two observations drive the implementation:

- Pagination URLs follow the pattern /book/1188_N.html (page 2 is /book/1188_2.html, and so on), so one regular expression matches every listing page.
- The covers are lazy-loaded: for each img element under div class="bookslist", the real image URL sits in the data-original attribute (src only holds a placeholder), while the book title sits in alt.

3.3 Create the Project

scrapy startproject scrapy_readbook_6
cd scrapy_readbook_6/scrapy_readbook_6/spiders

3.4 Create the Spider File (this differs from before)

scrapy genspider -t crawl read https://www.dushu.com/book/1188.html

The -t crawl option selects Scrapy's crawl template, so the generated spider subclasses CrawlSpider and already carries a rules attribute instead of a plain parse() method.
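
The generated file looks roughly like this (details vary slightly by Scrapy version; older versions copy the full URL into allowed_domains, which then needs trimming to the bare domain):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ReadSpider(CrawlSpider):
    name = "read"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/book/1188.html"]

    rules = (Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),)

    def parse_item(self, response):
        item = {}
        #item["domain_id"] = response.xpath('//input[@id="sid"]/@value').get()
        #item["name"] = response.xpath('//div[@id="name"]').get()
        #item["description"] = response.xpath('//div[@id="description"]').get()
        return item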

3.5 Implement the Spider

Points to note before editing the generated code:

- LinkExtractor(allow=...) takes a regular expression, and extracted links must also fall inside allowed_domains, so allowed_domains has to be the bare domain (www.dushu.com), not a full URL.
- callback is given as a string (the method name), not a function reference.
- follow=False restricts the crawl to the page links found on the start page; follow=True would keep applying the rule to every page reached.
- The start URL is 1188_1.html rather than 1188.html so that it matches the allow regex; otherwise page 1 would be missed.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_readbook_6.items import ScrapyReadbook6Item


class ReadSpider(CrawlSpider):
    name = "read"
    allowed_domains = ["www.dushu.com"]
    # Use _1, otherwise the first page does not match the Rule and is skipped
    start_urls = ["https://www.dushu.com/book/1188_1.html"]

    rules = (Rule(
        # Regex for the pagination URLs; allowed_domains above must be the
        # bare domain, or the extractor drops these links
        LinkExtractor(allow=r"/book/1188_\d+\.html"),
        callback="parse_item",  # method name as a string
        follow=False  # True would re-apply the rule on every crawled page
    ),)

    def parse_item(self, response):
        # Scrape each book's title and cover image
        img_list = response.xpath('//div[@class="bookslist"]//img')
        for img in img_list:
            # Covers are lazy-loaded: the real URL lives in data-original,
            # while the title is in the alt attribute
            name = img.xpath('./@alt').extract_first()
            src = img.xpath('./@data-original').extract_first()
            book = ScrapyReadbook6Item(name=name, src=src)
            yield book
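
items.py is not shown above; a minimal version consistent with the two fields used would be:

import scrapy


class ScrapyReadbook6Item(scrapy.Item):
    name = scrapy.Field()  # book title
    src = scrapy.Field()   # cover image URL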

scrapy crawl read


3.6 Store the Data in MySQL

Before the MySQL pipeline can run, three things have to exist: a book table in the target database, the connection constants in settings.py, and both pipelines registered under ITEM_PIPELINES.
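
A sketch of that setup; the credentials are placeholders and the column sizes a guess, so adapt them to your environment:

-- run once in the MySQL client
create table book(
    id int primary key auto_increment,
    name varchar(128),
    src varchar(128)
);

# settings.py
DB_HOST = '127.0.0.1'
DB_PORT = 3306           # must be an int, not a string
DB_USER = 'root'
DB_PASSWORD = '******'
DB_NAME = 'spider01'     # placeholder database name
DB_CHARSET = 'utf8'      # pymysql expects 'utf8', not 'utf-8'

ITEM_PIPELINES = {
    'scrapy_readbook_6.pipelines.ScrapyReadbook6Pipeline': 300,
    'scrapy_readbook_6.pipelines.MysqlPipeline': 301,  # lower number runs first
}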

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrapyReadbook6Pipeline:
    def open_spider(self, spider):
        # Runs once when the spider starts: open the output file
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # str(item) dumps the item repr, so the file is not strict JSON
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()


from scrapy.utils.project import get_project_settings  # to read settings.py
import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        # Pull the connection constants out of settings.py
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: pymysql handles the quoting, so titles
        # containing quotes cannot break (or inject into) the SQL
        sql = 'insert into book(name, src) values (%s, %s)'
        self.cursor.execute(sql, (item['name'], item['src']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

scrapy crawl read

After the crawl finishes, the book table holds one row per scraped book, with the title in name and the cover URL in src.
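
A quick sanity check in the MySQL client:

select count(*) from book;
select * from book limit 5;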