【笔记整理】[案例]使用正则表达式来提取36Kr新闻

发布时间: 2023-07-17 11:15:50 作者: 蕝戀
import datetime
import json
import re

import requests


class Kr36(object):
    """Scrape the 36Kr web-news listing page and dump its entries to ``36kr.json``.

    The listing page embeds its state as a JSON object assigned to
    ``window.initialState`` inside a ``<script>`` tag.  ``run`` downloads the
    page, extracts that JSON with a regular expression, picks a few fields out
    of every list item and writes the result to the output file.

    The output file is opened (and truncated) when the instance is created.
    Call ``close()`` or use the instance as a context manager to release it
    deterministically; ``__del__`` remains as a last-resort fallback.
    """

    # Seconds to wait for the server before giving up.  Without a timeout
    # ``requests`` may block indefinitely on an unresponsive connection.
    timeout = 10

    # Group 1 captures the JSON text assigned to ``window.initialState``.
    # The dot is escaped so that it only matches a literal ".".
    _STATE_PATTERN = re.compile(r'<script>window\.initialState=(.*?)</script>')

    def __init__(self):
        self.url = "https://36kr.com/information/web_news/"
        self.headers = {
            "Host": "36kr.com",
            "referer": "https://36kr.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        self.file = open("36kr.json", "w", encoding="utf-8")

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file when the ``with`` block ends."""
        self.close()

    def parse_data(self, data: str) -> list:
        """Extract the news entries from the page's HTML text.

        Args:
            data: Decoded HTML of the listing page.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``img_url`` and
            ``publish_time`` (ISO-8601 string, naive local time).

        Raises:
            ValueError: If the ``window.initialState`` script is not present
                in ``data`` or its content is not valid JSON.
            KeyError: If the embedded JSON lacks one of the expected keys.
        """
        match = self._STATE_PATTERN.search(data)
        if match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("window.initialState script not found in page data")

        # Convert the captured JSON text into Python objects.
        state = json.loads(match.group(1))

        results_list = []

        # Collect the fields of interest from every list item.
        for item in state["information"]["informationList"]["itemList"]:
            material = item["templateMaterial"]
            url = f"""https://36kr.com/p/{material["itemId"]}"""
            title = material["widgetTitle"]
            img_url = material["widgetImage"]
            # publishTime is divided by 1000 to turn milliseconds into the
            # seconds that fromtimestamp() expects; the result is local time.
            published = datetime.datetime.fromtimestamp(material["publishTime"] / 1000)

            results_list.append({
                "title": title,
                "url": url,
                "img_url": img_url,
                "publish_time": published.isoformat()
            })

        return results_list

    def get_data(self) -> bytes:
        """Send the request and return the raw response body as bytes.

        The body is not decoded here; the caller decides how to decode it.

        Raises:
            requests.RequestException: On connection problems, a timeout, or
                an HTTP error status (4xx/5xx).
        """
        resp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        # Surface HTTP errors here rather than as a confusing parse failure.
        resp.raise_for_status()
        return resp.content

    def save_data(self, data):
        """Serialize ``data`` as indented JSON and write it to the output file."""
        self.file.write(json.dumps(data, ensure_ascii=False, indent=4))
        # Flush right away so the data reaches the OS even if the object is
        # never finalized cleanly.
        self.file.flush()

    def close(self):
        """Close the output file; safe to call more than once."""
        # ``file`` is missing when open() failed in __init__ or when the
        # instance was created without running __init__.
        file = getattr(self, "file", None)
        if file is not None and not file.closed:
            file.close()

    def __del__(self):
        # Fallback cleanup for callers that never call close().
        self.close()

    def run(self):
        """Download the page, parse it and save the extracted entries."""
        resp = self.get_data()
        data = self.parse_data(resp.decode())
        self.save_data(data)

if __name__ == "__main__":
    # Script entry point: build the scraper and perform a single
    # download -> parse -> save pass.
    scraper = Kr36()
    scraper.run()