巨潮网年报抓取

发布时间 2023-06-26 22:47:41作者: package_main

巨潮网年报爬取更新


本人测试发现,巨潮网的年报爬取距离我上一篇博客并没有啥变化,逻辑没变,应好多朋友的需要,这里补充代码

import json
import os

import requests


def load_json():
    with open("./stock.json", encoding="utf-8", mode="r") as f:
        return json.loads(f.read())


def query_report(_stock, time_span):
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    data = {
        'pageNum': '1',
        'pageSize': '30',
        'column': 'szse',
        'tabName': 'fulltext',
        'plate': '',
        'stock': f'{_stock["code"]},{_stock["orgId"]}',
        'searchkey': '',
        'secid': '',
        'category': 'category_ndbg_szsh',
        'trade': '',
        'seDate': time_span,
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    response = requests.post(
        'http://www.cninfo.com.cn/new/hisAnnouncement/query',
        headers=headers,
        data=data,
    )
    return response.json()


def download_pdf(d_url, _bank, _file_name):
    # http://static.cninfo.com.cn/finalpage/2023-03-09/1216072952.PDF
    resp = requests.get("http://static.cninfo.com.cn/" + d_url, verify=False)
    if not os.path.exists(os.path.join(os.path.abspath("./"), _bank)):
        os.mkdir(os.path.join(os.path.abspath("./"), _bank))
    with open(f"./{_bank}/{_file_name}.pdf", mode="w", encoding="utf-8") as f:
        f.write(resp.text)


def name_to_stock(_bank):
    for item in stock_dic["stockList"]:
        if item["zwjc"] == _bank:
            return item


if __name__ == '__main__':
    stock_dic = load_json()
    bank_list = ["平安银行", "深南电A", "天健集团"]
    for bank in bank_list:

        stock = name_to_stock(bank)
        if not stock:
            print("[x]未找到该企业")
            exit(0)
        query_result = query_report(_stock=stock, time_span='2022-12-26~2023-06-27')
        print(f"[*]搜索【{bank}】找到:")
        for res in query_result['announcements']:
            print(f"\t[*]{res['adjunctUrl']} {res['announcementTitle']}", end="")
            download_pdf(res['adjunctUrl'], bank, res['announcementTitle'])
            print(f"\r\t[*]{res['adjunctUrl']} {res['announcementTitle']}  ...........   ok")
    print("下载完毕~")

运行结果:

pCa99hR.png