Python爬取大数据实现疫情数据可视化

发布时间 2023-04-18 16:10:51 作者: yesyes1

实现啦!

数据爬取

对于Python爬取数据这一内容,直接使用短短的几行代码就能够实现:

然后根据数据的具体情况,将相关数据存储到后缀名为.csv的文件中:

数据可视化

将存储到.csv文件的数据读取出来,然后将它与Python的数据可视化联系起来,这样就很轻松地实现数据可视化啦!

import requests  # 请求库
import json  # 用于解析json数据
import pandas as pd  # 数据分析库
import time  # 导入时间模块
# Target endpoint for the worldwide summary. The user-agent header disguises
# the request as coming from a desktop browser.
url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-total?t=328100359682'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

req = requests.get(url, headers=headers)
print(req.status_code)  # HTTP status code; 200 means success

# print(req.text)
# ## First-pass parsing of the response body with the json module

data_json = json.loads(req.text)  # decode the JSON text into a dict
# Top-level keys of the payload. This line previously referenced the
# undefined name `ata_json`, which stopped the script with a NameError.
data_json.keys()

data = data_json['data']
data.keys()

import pprint

pp = pprint.PrettyPrinter(indent=2)  # indent nested levels by 2 spaces
# pp.pprint(data['areaTree'])  # inspect the structure of the data

# ## Scrape the real-time figures for every country

# ### Fetch the areaTree data

areaTree = data['areaTree']  # per-country records

pp.pprint(data['areaTree'][0])  # show the layout of a single record

# ### 封装函数并获取数据

# 将提取数据的方法进行封装
def get_data(data, info_list):
    """Flatten a list of nested records into a single DataFrame.

    data: list of dicts; each one carries a ``'today'`` and a ``'total'``
        sub-dict next to its top-level fields.
    info_list: top-level column name(s) to keep - a list of names or a
        single name.

    Returns the selected top-level columns followed by the ``today`` fields
    (prefixed ``today_``) and the ``total`` fields (prefixed ``total_``),
    joined side by side.
    """
    def _expand(section):
        # One row per record, built from the nested dict stored under `section`.
        nested = pd.DataFrame([record[section] for record in data])
        return nested.rename(columns=lambda field: section + '_' + field)

    info = pd.DataFrame(data)[info_list]
    pieces = [info, _expand('today'), _expand('total')]
    return pd.concat(pieces, axis=1)

# Flatten the per-country records, keeping id, update time and name as the
# leading columns.
today_world = get_data(data=areaTree, info_list=['id', 'lastUpdateTime', 'name'])

today_world.head()  # notebook-style preview; the result is discarded in a script

# ### 封装函数并存储数据

def save_data(data, name):
    """Write *data* to a date-stamped CSV file and print a confirmation.

    The file is called ``<name>_<YYYY_MM_DD>.csv`` (local date), uses a comma
    separator and the ``utf-8-sig`` encoding.
    """
    stamp = time.strftime('%Y_%m_%d', time.localtime())
    file_name = '_'.join((name, stamp)) + '.csv'
    data.to_csv(file_name, sep=',', encoding='utf-8-sig')
    print('{} 保存成功! '.format(file_name))

save_data(today_world, 'today_world')  # persist today's worldwide snapshot

# Map every country id to its display name.
country_dict = dict(zip(today_world['id'], today_world['name']))

start = time.time()
# Accumulates every country's history. Starting from None (instead of testing
# for one hard-coded country id) keeps the merge working whichever country is
# fetched first, and when the first request fails.
alltime_country = None
for country_id, country_name in country_dict.items():
    try:
        # Request this country's history by id and decode the JSON payload.
        url = ('https://c.m.163.com/ug/api/wuhan/app/data/list-by-area-code?areaCode='
               + str(country_id))
        req2 = requests.get(url, headers=headers)
        data_json = json.loads(req2.text)

        # Flatten the history and tag every row with the country name.
        country_data = get_data(data_json['data']['list'], 'date')
        country_data['name'] = country_name
    except Exception as exc:
        # Best effort: report the failure and continue with the next country.
        # (The handler previously referenced an undefined name and used a bare
        # `except`, which also swallowed KeyboardInterrupt.)
        print('-' * 20, country_name, '数据抓取失败', repr(exc), '-' * 20)
        continue

    # Merge this country's rows into the running table.
    if alltime_country is None:
        alltime_country = country_data
    else:
        alltime_country = pd.concat([alltime_country, country_data])
    print('-' * 20, country_name, '抓取成功', country_data.shape,
          '以获取数据大小', alltime_country.shape, '累计耗时', round(time.time() - start), '-' * 20)
    # time.sleep(5)  # enable to throttle the requests

if alltime_country is None:
    raise RuntimeError('no country history could be downloaded; nothing to save')
save_data(alltime_country, 'alltime_country')

# ## Data processing

# ### Rename the columns

# NOTE(review): both file names carry a fixed date, whereas save_data stamps
# its output with the current local date - confirm these snapshots exist.
td_wd = pd.read_csv('today_world_2021_12_26.csv')
ac = pd.read_csv('alltime_country_2021_12_26.csv')

# Raw column name -> Chinese display name.
name_dict = {
    'id': '编号',
    'lastUpdateTime': '更新时间',
    'name': '名称',
    'today_confirm': '当日新增确诊',
    'today_suspect': '当日新增疑似',
    'today_heal': '当日新增治愈',
    'today_dead': '当日新增死亡',
    'today_severe': '当日新增重症',
    'today_storeConfirm': '当日现存确诊',
    'today_input': '当日新增输入',
    'total_confirm': '累计确诊',
    'total_suspect': '累计疑似',
    'total_heal': '累计治愈',
    'total_dead': '累计死亡',
    'total_severe': '累计重症',
    'total_input': '累计输入',
}

td_wd = td_wd.rename(columns=name_dict)
ac = ac.rename(columns=name_dict)
td_wd.head()  # notebook-style preview; the result is discarded in a script

# ### Describe the data

# 'Unnamed: 0' is presumably the index column that to_csv wrote out - verify.
td_wd = td_wd.drop(columns='Unnamed: 0')
ac = ac.drop(columns='Unnamed: 0')
td_wd.describe()

# Derive the active-case count and the fatality rate for both tables.
for frame in (td_wd, ac):
    frame['当日现存确诊'] = frame['累计确诊'] - frame['累计治愈'] - frame['累计死亡']
    death_ratio = frame['累计死亡'] / frame['累计确诊']
    # Render with two decimals as text, then parse the text back to float.
    frame['病死率'] = death_ratio.map('{:.2f}'.format).astype('float')

print('统计nan值:')

# Share of missing values per column.
td_wd_nan = td_wd.isna().sum() / len(td_wd)

td_wd_nan.map('{:.1%}'.format)  # percentage view; the result is discarded

td_wd = td_wd.sort_values('病死率', ascending=False)
td_wd.head()

td_wd = td_wd.set_index('名称')  # index the table by country name
# Ten rows with the highest fatality rate, reduced to three columns.
wd_top10 = td_wd.sort_values('病死率', ascending=False).head(10)
wd_top10 = wd_top10[['累计确诊', '累计死亡', '病死率']]
wd_top10

# Names of the first five of those rows.
cn = list(wd_top10.index[:5])

# ## Data visualisation

import matplotlib.pyplot as plt

# SimHei is presumably selected so the Chinese labels render - verify that
# the font is installed on the target machine.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'figure.dpi': 120})

# One horizontal bar chart per column, rows ordered by ascending fatality rate.
top10_ascending = wd_top10.sort_values('病死率')
top10_ascending.plot.barh(
    subplots=True,
    layout=(1, 3),
    sharex=False,
    sharey=True,
    figsize=(7, 4),
)
plt.show()

ac.describe()

print('统计nan值:')

# Share of missing values per column.
ac_nan = ac.isna().sum() / len(ac)

ac_nan.map('{:.1%}'.format)  # percentage view; the result is discarded

# Country name -> list of that country's rows, each row stored as a plain list.
hisc = {}
ac['date'] = ac['date'].apply(lambda day: day[-5:])  # keep the last five characters
ac

for row_label in ac.index:
    country = ac['名称'][row_label]
    hisc.setdefault(country, []).append(list(ac.loc[row_label]))

# cn = list(hisc.keys())

# First element of every stored row, grouped by selected country.
ds1 = {}
for country in cn:
    ds1.setdefault(country, []).extend(row[0] for row in hisc[country])

# Reduce to the values that occur for every selected country.
for position, country in enumerate(cn):
    if position == 0:
        p_a = ds1[country]
    else:
        p_a = set(p_a) & set(ds1[country])

ppd = sorted(p_a)  # shared values in ascending order
ds2 = {}

def _plot_daily_metric(column, title):
    """Draw one line per country in ``cn`` for a single row position.

    column: position of the value inside each row list stored in ``hisc``.
    title: chart title.

    Each value is looked up by its date (row position 0), so it is always
    paired with the matching entry of the sorted date axis ``ppd``. Collecting
    values in row order instead misaligns x and y whenever the rows are not
    already in sorted date order. If a date repeats, the last row wins.
    """
    for country in cn:
        by_date = {row[0]: row[column] for row in hisc[country]}
        plt.plot(ppd, [by_date[day] for day in ppd], label=country)
    # Show every third date, rotated by 60 degrees.
    plt.xticks(range(0, len(ppd), 3), rotation=60)
    plt.legend()
    plt.title(title)
    plt.show()


# NOTE(review): the column positions are carried over unchanged from the
# earlier copy-pasted sections - confirm that they match the CSV layout.
_plot_daily_metric(1, '当日新增确诊')
_plot_daily_metric(2, '当日新增治愈')
_plot_daily_metric(3, '当日新增死亡')
_plot_daily_metric(-1, '病死率')

# Bar chart of one value per country, taken on the last shared date.
latest_day = ppd[-1] if ppd else None
ds2 = {}
for country in cn:
    for row in hisc[country]:
        # Exact comparison: `row[0] in ppd[-1]` was a substring test on a string.
        if row[0] == latest_day:
            # NOTE(review): position 7 is charted as '累计治愈' - confirm that
            # it matches the CSV layout.
            ds2.setdefault(country, []).append(row[7])
ds2k = list(ds2)
sd = [ds2[country][0] for country in ds2k]  # first matching row per country
print(sd)
# Label the bars with the countries that actually produced a value so that
# labels and heights always have the same length.
plt.bar(ds2k, sd)
plt.xticks(rotation=60)  # rotate every tick label by 60 degrees
plt.legend(cn)
plt.title('累计治愈')
plt.show()

(以上代码转载自:https://www.cnblogs.com/double-c/p/15732201.html)