国家水稻数据中心数据获取

发布时间: 2023-10-03 22:55:05 作者: 这阵风是晚安
import requests
import parsel
import time
import pandas as pd


def get_rice_data(page=1, timeout=10):
    """Fetch one listing page of rice varieties from ricedata.cn.

    Args:
        page: Page number substituted into the ``nation_{page}.htm`` URL.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``pandas.DataFrame`` whose columns are the six names in
        ``rice_header``, with one row per ``<tr>`` of the second table in
        the page body (the first ``<tr>`` is skipped as the header row).
        A cell is ``None`` when its XPath matches nothing. The frame is
        empty when the server replies with an HTTP error status.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://www.ricedata.cn/variety/identified/nation_{page}.htm"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    rice_header = ['序号', '品种名称', '亲本来源', '类型', '原产地/选育单位', '审定编号']
    # One relative XPath per output column, in the same order as rice_header.
    # The second column's text lives inside a link, the others directly in <td>.
    cell_paths = (
        'td[1]/text()',
        'td[2]/a/text()',
        'td[3]/text()',
        'td[4]/text()',
        'td[5]/text()',
        'td[6]/text()',
    )

    res = requests.get(url, headers=headers, timeout=timeout)
    if not res.ok:
        # An error page is not expected to hold the variety table, so report
        # "no rows" explicitly instead of parsing unrelated markup.
        return pd.DataFrame(columns=rice_header, data=[])

    # Let requests detect the page encoding from the body and decode with it.
    res.encoding = res.apparent_encoding
    html = parsel.Selector(res.text)
    # Rows of the second table directly under <body>.
    table_rows = html.xpath('/html/body/table[2]/tr')

    rice_data = [
        [row.xpath(path).get() for path in cell_paths]
        for row in table_rows[1:]  # skip the header row
    ]
    return pd.DataFrame(columns=rice_header, data=rice_data)


if __name__ == "__main__":
    # Crawl listing pages 1..99, collect one DataFrame per page, and write
    # the combined table to rice_data.csv in the working directory.
    rice_datas = []
    for page_no in range(1, 100):
        # Pause before every request to keep the request rate low.
        time.sleep(1)
        print(f"正在获取第{page_no}页数据")
        try:
            rice_datas.append(get_rice_data(page_no))
        except requests.RequestException as exc:
            # A single failed page must not discard the pages already fetched.
            print(f"第{page_no}页获取失败, 已跳过: {exc}")
    if rice_datas:
        rice_info = pd.concat(rice_datas, ignore_index=True)
        rice_info.to_csv('rice_data.csv', mode='w', index=False, sep=',')
        print("保存成功!")
    else:
        # pd.concat raises ValueError when given an empty list.
        print("未获取到任何数据, 未保存文件")