一、 采集北京市政百姓信件内容

发布时间 2023-05-26 22:18:56作者: 又一岁荣枯

letter.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from letterBeijing.items import LetterbeijingItem
import json

class LetterSpider(scrapy.Spider):
    """Scrape Beijing municipal government public letters and their replies.

    The listing endpoint returns pseudo-JSON (unquoted keys, single-quoted
    strings), so ``parse`` normalises the text before ``json.loads``.
    Pagination is driven from ``parse`` one page at a time; the original
    version paginated from ``parse_second`` through a module-level counter
    that was incremented once per *item* (six per page), over-requesting
    pages and relying on the duplicate filter to absorb the excess.
    """

    name = "letter"
    allowed_domains = ["www.beijing.gov.cn"]
    start_urls = ["https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!letterListJson.action?keyword=&startDate=&endDate=&letterType=0&page.pageNo=1&page.pageSize=6&orgtitleLength=26"]

    # Highest listing page requested is MAX_PAGE - 1 (same cap as the
    # original ``i < 150`` check).
    MAX_PAGE = 150

    # Keys that arrive unquoted in the listing response; they are quoted in
    # this exact order to mirror the original normalisation sequence.
    _JSON_KEYS = (
        "page", "pageNo", "totalCount", "totalPages", "pageSize", "result",
        "letterType", "letterTypeName", "letterTitle", "showLetterTitle",
        "writeDate", "orgNames", "showOrgNames", "originalId",
    )

    # Listing URL template; {page} is the 1-based page number.
    _LIST_URL = (
        "https://www.beijing.gov.cn/hudong/hdjl/sindex/"
        "bjah-index-hdjl!letterListJson.action?keyword=&startDate=&endDate="
        "&letterType=0&page.pageNo={page}&page.pageSize=6&orgtitleLength=26"
    )

    # Detail-page URL prefix, keyed by the letter type name reported in the
    # listing (consultation / suggestion).
    _DETAIL_URLS = {
        "咨询": "https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=",
        "建议": "https://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=",
    }

    def parse(self, response):
        """Parse one listing page.

        Yields one detail-page request per letter, then a request for the
        next listing page (pages 1..MAX_PAGE-1).
        """
        # Normalise pseudo-JSON: single quotes -> double quotes, then quote
        # each known key.
        raw = response.text.replace("'", "\"")
        for key in self._JSON_KEYS:
            raw = raw.replace(key + ":", "\"" + key + "\":")

        letters = json.loads(raw)["result"]

        for letter in letters:
            letter_type = letter["letterTypeName"]
            detail_prefix = self._DETAIL_URLS.get(letter_type)
            if detail_prefix is None:
                # Unknown letter type: no detail endpoint to fetch.
                continue
            yield scrapy.Request(
                url=detail_prefix + letter["originalId"],
                callback=self.parse_second,
                meta={
                    "type": letter_type,
                    "title": letter["letterTitle"],
                    "institution": letter["showOrgNames"],
                },
            )

        # Schedule the next listing page; the first request (from
        # start_urls) carries no 'page' meta, so it defaults to page 1.
        page_no = response.meta.get("page", 1)
        next_page = page_no + 1
        if next_page < self.MAX_PAGE:
            yield scrapy.Request(
                url=self._LIST_URL.format(page=next_page),
                callback=self.parse,
                meta={"page": next_page},
            )

    def parse_second(self, response):
        """Parse one detail page into a LetterbeijingItem."""
        # Letter body text
        content = response.xpath('//div[@class="col-xs-12 col-md-12 column p-2 text-muted mx-2"]/text()').extract_first()
        # Official reply text
        reply = response.xpath('//div[@class="col-xs-12 col-md-12 column p-4 text-muted my-3"]/text()').extract_first()

        yield LetterbeijingItem(
            type=response.meta["type"],
            title=response.meta["title"],
            content=content,
            institution=response.meta["institution"],
            reply=reply,
        )

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class LetterbeijingItem(scrapy.Item):
    """Item holding one government-mailbox letter and its official reply."""

    # Letter type (e.g. consultation "咨询" or suggestion "建议")
    type = scrapy.Field()
    # Letter title
    title = scrapy.Field()
    # Letter body text
    content = scrapy.Field()
    # Handling department / institution
    institution = scrapy.Field()
    # Reply content
    reply = scrapy.Field()

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exporters import JsonLinesItemExporter

class LetterbeijingPipeline:
    """Write every scraped item to ``letter.json`` in JSON Lines format."""

    def __init__(self):
        # JsonLinesItemExporter requires a binary file handle.
        self.file = open('letter.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        """Export the item and hand it on to any later pipeline.

        Scrapy requires ``process_item`` to return the item (or raise
        ``DropItem``); the original returned ``None``, which would starve
        any pipeline configured after this one.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        The original named this hook ``close_item``, which Scrapy never
        calls, so the file was never explicitly closed.
        """
        self.file.close()

    # Backward-compatible alias for any code that used the old name.
    close_item = close_spider

letter.json

https://github.com/HE-ZI-YAN/letterBeijing/blob/main/letterBeijing/letter.json