Scrapy在pipeline中集成mongodb

发布时间 2023-07-17 11:51:23作者: 蕝戀

settings.py中设置配置项

# MongoDB connection settings read by Bang123Pipeline via get_project_settings().
MONGODB_HOST = "127.0.0.1"
# Default MongoDB port.
MONGODB_PORT = 27017
# Database that holds the scraped collections.
MONGODB_DB_NAME = "bang123"

pipelines.py:

from scrapy.pipelines.images import ImagesPipeline
from itemadapter import is_item, ItemAdapter


class Bang123Pipeline:
    """Item pipeline that stores items from the "bang123" spider into MongoDB.

    Connection parameters come from the project settings:
    MONGODB_HOST / MONGODB_PORT / MONGODB_DB_NAME.
    """

    # Collection the items are written into.
    COLLECTION_NAME = "t_bang123"

    def __init__(self):
        # Imported locally so loading this module does not require pymongo
        # when the pipeline is disabled.
        from scrapy.utils.project import get_project_settings
        from pymongo import MongoClient

        settings = get_project_settings()

        # Read MongoDB connection info from settings.py.
        self.client = MongoClient(
            host=settings["MONGODB_HOST"],
            port=settings["MONGODB_PORT"],
        )
        self.db = self.client[settings["MONGODB_DB_NAME"]]
        self.collection = self.db[self.COLLECTION_NAME]

    def close_spider(self, spider):
        # Scrapy calls this hook when the spider finishes -- deterministic
        # cleanup, unlike the original __del__ which depended on GC timing.
        self.client.close()

    def process_item(self, item, spider):
        """Insert items from the "bang123" spider into MongoDB; pass others through."""
        if spider.name == "bang123":
            data_dict = ItemAdapter(item).asdict()
            # insert_one() returns an InsertOneResult; the actual id lives on
            # .inserted_id (the original printed the result object instead).
            insert_id = self.collection.insert_one(data_dict).inserted_id
            print(f"{insert_id=}")
        else:
            print("不是bang123,不写入mongodb中")

        return item

爬虫文件:

import time

import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from zolwallpaper.items import Bang123Item


class Bang123Spider(CrawlSpider):
    """Crawl www.bang123.cn: follow pagination links and parse detail pages."""

    name = "bang123"
    allowed_domains = ["bang123.cn"]
    start_urls = ["https://www.bang123.cn/"]

    rules = (
        # Pagination pages: follow but do not parse.
        # Dots are escaped -- the original r"...index_\d+.html" let "." match
        # any character, so unintended URLs could also match.
        Rule(LinkExtractor(allow=r"https://www\.bang123\.cn/index_\d+\.html"), follow=True),
        # Detail pages: extract data with parse_item, do not follow further.
        Rule(LinkExtractor(allow=r"https://www\.bang123\.cn/gongshi/\d+\.html"), callback="parse_item", follow=False),
    )

    def parse_item(self, response: HtmlResponse):
        """Extract the title and body paragraphs from a detail page.

        Yields a Bang123Item that the item pipelines then process.
        """
        bang_item = Bang123Item()

        # First article container on the page holds the <h1> title.
        selector = response.xpath('//div[@class="article_content layui-field-box"]')[0]
        title = selector.xpath('./h1/text()').get()

        # Raw <p> HTML fragments of the article body.
        main = response.xpath('//div[@class="content tindent of_table"]/p').getall()

        bang_item["title"] = title
        bang_item["main"] = main

        print(f"【{title=}】")
        print(f"{main=}")
        print("-"*150)

        # Hand the item over to the pipelines.
        yield bang_item