Scraping Beijing municipal public letters (北京市政百姓信件) with requests and BeautifulSoup

Published 2023-06-20 20:41:34  Author: 李迎辉

When I first scraped the site without request headers, the Chuangyu Shield (创宇盾) anti-bot protection kicked in; adding request headers resolved it.
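
A minimal sketch of the fix (it mirrors the request made in the full code further down): send a browser-like User-Agent, plus a Cookie copied from the browser if needed, and the real page comes back instead of the anti-bot response.

    import urllib.request

    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"
    headers = {
        # A browser-like User-Agent (and, if needed, a Cookie copied from the
        # browser) keeps the anti-bot page from being served.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    }
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read()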

Another problem is that the page number of this site is not tied to the URL, so the approach suggested online, crawling successive pages by rewriting the page number into the URL, is not feasible here. The loop used is:

    for page in range(start_page, end_page + 1):
        url = url.format(page)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        # In each <a> tag's onclick, the first argument is the letter type and the second is the letter id
        # The GET request above fetched the page; parse its content with BeautifulSoup
        soup = BeautifulSoup(contents, "html.parser")
        a_tags = soup.find_all('a', onclick=True)
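
Since the page number is not carried in the URL, it presumably has to be passed some other way, for example in the request body. Below is a sketch of what POST-based paging could look like with urllib; the field names pageNo and pageSize are placeholders, not taken from the site, and the real parameter names would have to be read from the browser's developer tools (Network panel).

    import urllib.parse
    import urllib.request

    def fetch_page(url, headers, page_no, page_size=6):
        # Hypothetical field names -- confirm the real ones in the browser's
        # Network panel before relying on this sketch.
        data = urllib.parse.urlencode({
            "pageNo": page_no,
            "pageSize": page_size,
        }).encode("utf-8")
        # Passing data makes urllib issue a POST request instead of a GET.
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request) as response:
            return response.read()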

The full code is below (the scraped results are saved into different txt files according to letter type):

import requests
from bs4 import BeautifulSoup
import urllib.request
import json
import re

def huoqu():
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # 替换为目标网站的URL
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    contents = response.read()
    # In each <a> tag's onclick, the first argument is the letter type and the second is the letter id
    # The GET request above fetched the page; parse its content with BeautifulSoup
    soup = BeautifulSoup(contents, "html.parser")
    return soup


def huoqu1(start_page, end_page):
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # 替换为目标网站的URL
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    f1 = open('G:/python/pythonProject/信件爬取/1.txt', 'a')
    f2 = open('G:/python/pythonProject/信件爬取/2.txt', 'a')
    f3 = open('G:/python/pythonProject/信件爬取/3.txt', 'a')
    for page in range(start_page, end_page + 1):
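        # Note: the URL string contains no "{}" placeholder, so the format() call
        # on the next line does not actually change the URL -- as noted above, the
        # page number is not carried in the URL.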
        url = url.format(page)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        # In each <a> tag's onclick, the first argument is the letter type and the second is the letter id
        # The GET request above fetched the page; parse its content with BeautifulSoup
        soup = BeautifulSoup(contents, "html.parser")
        a_tags = soup.find_all('a', onclick=True)
        for element in a_tags:
            onclick_value = element["onclick"]
            match = re.search(r"letterdetail\('(\d+)', '([^']+)'\)", onclick_value)
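            # Example of the onclick value this pattern matches (the letter id is
            # made up for illustration): letterdetail('1', 'AH23061100001')
            # group(1) -> letter type code, group(2) -> letter id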
            if match:
                onclick_param1 = match.group(1)
                # print(type(onclick_param1))
                onclick_param2 = match.group(2)
                if onclick_param1 == '1':
                    f1.write(onclick_param2+'\n')
                if onclick_param1 == '2':
                    f2.write(onclick_param2+'\n')
                if onclick_param1 == '3':
                    f3.write(onclick_param2+'\n')
                print(f"onclick param 1: {onclick_param1}, onclick param 2: {onclick_param2}")
    f1.flush()
    f2.flush()
    f3.flush()
    f1.close()
    f2.close()
    f3.close()


if __name__ == '__main__':
    huoqu1(1, 173)
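
As a quick sanity check after a run, the number of letter ids captured for each type can be counted back from the output files (a small sketch; the paths are the same ones used in huoqu1 above):

    # Count how many letter ids were saved per type.
    for name in ('1.txt', '2.txt', '3.txt'):
        path = 'G:/python/pythonProject/信件爬取/' + name
        with open(path, 'r') as f:
            ids = [line.strip() for line in f if line.strip()]
        print(name, len(ids), 'letter ids')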