大数据分析——对游戏“野蛮时代”玩家进行大数据分析

发布时间 2023-06-07 13:18:17作者: 林菁

一、选题的背景

  对游戏“野蛮时代”进行数据分析,首先离不开对游戏的了解,不了解游戏很可能会影响自己的分析,从而得出错误的结论。在体验了游戏之余,混迹了微博、贴吧等社群,对游戏的玩法有了一定了解后,对数据集的各种字段代表的含义也有了一定的理解,由此开始对该数据集进行数据分析。本次大数据分析主要分析了一款游戏的用户数据情况,包括付费和非付费用户的比例、每日、每小时新增用户的数量、平均在线时长、付费比率、游戏胜率和游戏场次等多个方面。

  通过对这些数据指标的分析,可以更好地了解用户的需求和行为,从而优化游戏设计和运营策略,提高用户满意度和留存率。

二、数据分析设计方案

  使用了数据库链接工具,了解数据库,通过sql语句查出来符合分析要求的数据通过pandas分析matplotlib和pyecharts画图分析出来可视化结果。

三、数据分析步骤

1、数据源:数据来源于野蛮官网

2、数据清洗过程:

import pandas as pd
from sqlalchemy import create_engine

df = pd.read_csv("tap_fun_test.csv")
df1 = pd.read_csv("tap_fun_train.csv")
list1 = []
df = df[
    ['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',
     'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']
]
df1 = df1[
    ['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',
     'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']
]
list1.append(df)
list1.append(df1)
data = pd.concat(list1)
engine = create_engine('mysql://root:LAS1234567878@12@localhost:3306/big?charset=utf8')
data.to_sql('age_of_barbarians', con=engine, index=False, if_exists='append')

3、大数据分析过程

import os
import pandas as pd
from sqlalchemy import create_engine
from pyecharts import options as opts
from pyecharts.charts import Pie, Line, Bar, Liquid

engine = create_engine('mysql://root:LAS1234567878@12@localhost/big?charset=utf8')

# PU 占比
sql = """
select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,
       sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c1 = (
    Pie()
    .add(
        "",
        [list(z) for z in zip(data.columns, data.values[0])],
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} 占比: {d}%"))
    .render("pie_pu.html")
)
os.system("pie_pu.html")

# DNU 柱形图
sql = """
select cast(register_time as date) as day,
       count(1) as dnu
from age_of_barbarians
group by cast(register_time as date)
order by day;
"""
data = pd.read_sql(con=engine, sql=sql)

c2 = (
    Bar()
    .add_xaxis(list(data['day']))
    .add_yaxis("新增用户数", list(data['dnu']))
    .set_global_opts(title_opts=opts.TitleOpts(title="每日新增用户数量"))
    .render("bar_dnu.html")
)
os.system("bar_dnu.html")

# 每小时注册情况
sql = """
select hour(cast(register_time as datetime)) as hour,
       count(1) as dnu
from age_of_barbarians
group by hour(cast(register_time as datetime))
order by hour;
"""
data = pd.read_sql(con=engine, sql=sql)
c3 = (
    Line()
    .add_xaxis(list(data['hour']))
    .add_yaxis("新增用户数", list(data['dnu']))
    .set_global_opts(title_opts=opts.TitleOpts(title="每小时新增用户数量"))
    .render("line_dnu.html")
)
os.system("line_dnu.html")

# 每小时注册情况
sql = """
select avg(avg_online_minutes) as `平均在线时长`,
       sum(case when pay_price > 0 then avg_online_minutes else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费玩家在线时长`,
       sum(case when pay_price > 0 then 0 else avg_online_minutes end) / sum(case when pay_price > 0 then 0 else 1 end) as `非付费玩家在线时长`
from age_of_barbarians;
"""
data = pd.read_sql(con=engine, sql=sql)
c4 = (
    Bar()
    .add_xaxis(list(data.columns))
    .add_yaxis("平均在线时长(单位:分钟)", list(data.values[0]))
    .set_global_opts(title_opts=opts.TitleOpts(title="平均在线时长"))
    .render("bar_online.html")
)
os.system("bar_online.html")

# 付费比率
sql = """
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) / sum(case when avg_online_minutes > 0 then 1 else 0 end) as `rate`
from age_of_barbarians;  
"""
data = pd.read_sql(con=engine, sql=sql)
c5 = (
    Liquid()
    .add("lq", [data['rate'][0], data['rate'][0]])
    .set_global_opts(title_opts=opts.TitleOpts(title="付费比率"))
    .render("liquid_base.html")
)
os.system("liquid_base.html")

# 用户游戏胜率
sql = """
select 'PVP' as `游戏类型`,
       sum(pvp_win_count) / sum(pvp_battle_count) as `平均胜率`,
       sum(case when pay_price > 0 then pvp_win_count else 0 end) / sum(case when pay_price > 0 then pvp_battle_count else 0 end) as `付费用户胜率`,
       sum(case when pay_price = 0 then pvp_win_count else 0 end) / sum(case when pay_price = 0 then pvp_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians
union all
select 'PVE' as `游戏类型`,
       sum(pve_win_count) / sum(pve_battle_count) as `平均胜率`,
       sum(case when pay_price > 0 then pve_win_count else 0 end) / sum(case when pay_price > 0 then pve_battle_count else 0 end) as `付费用户胜率`,
       sum(case when pay_price = 0 then pve_win_count else 0 end) / sum(case when pay_price = 0 then pve_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c6 = (
    Bar()
    .add_dataset(
        source=[data.columns.tolist()] + data.values.tolist()
    )
    .add_yaxis(series_name="平均胜率", y_axis=[])
    .add_yaxis(series_name="付费用户胜率", y_axis=[])
    .add_yaxis(series_name="非付费用户胜率", y_axis=[])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="游戏胜率"),
        xaxis_opts=opts.AxisOpts(type_="category"),
    )
    .render("dataset_bar_rate.html")
)
os.system("dataset_bar_rate.html")

# 用户游戏场次
sql = """
select 'PVP' as `游戏类型`,
       avg(pvp_battle_count) as `平均场次`,
       sum(case when pay_price > 0 then pvp_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费用户平均场次`,
       sum(case when pay_price = 0 then pvp_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) as `非付费用户平均场次`
from age_of_barbarians
union all
select 'PVE' as `游戏类型`,
       avg(pve_battle_count) as `均场次`,
       sum(case when pay_price > 0 then pve_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费用户平均场次`,
       sum(case when pay_price = 0 then pve_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) as `非付费用户平均场次`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c7 = (
    Bar()
    .add_dataset(
        source=[data.columns.tolist()] + data.values.tolist()
    )
    .add_yaxis(series_name="平均场次", y_axis=[])
    .add_yaxis(series_name="付费用户平均场次", y_axis=[])
    .add_yaxis(series_name="非付费用户平均场次", y_axis=[])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="游戏场次"),
        xaxis_opts=opts.AxisOpts(type_="category"),
    )
    .render("dataset_bar_times.html")
)
os.system("dataset_bar_times.html")

4、数据可视化过程

(1)、解决精度问题、修改字段类型

 

-- 解决精度问题
-- 修改字段类型
alter table age_of_barbarians
    modify register_time timestamp(0);
alter table age_of_barbarians
    modify avg_online_minutes float(10, 2);
alter table age_of_barbarians
    modify pay_price float(10, 2);

 

 

2)、用户分析

 

-- 1.用户分析

-- 用户总量
select count(1) as total, count(distinct user_id) as users
from age_of_barbarians;

-- PU ( Paying Users):付费用户总量
select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,
       sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`
from age_of_barbarians;

-- DNU(Daily New Users): 每日游戏中的新登入用户数量,即每日新用户数。
select cast(register_time as date) as day,
       count(1)                    as dnu
from age_of_barbarians
group by cast(register_time as date)
order by day;

-- 每小时的新登入用户数量
select hour(cast(register_time as datetime)) as hour,
       count(1)                              as dnu
from age_of_barbarians
group by hour(cast(register_time as datetime))

 

 

3)、用户活跃度分析

 

# 2.用户活跃度分析
-- 平均在线时长
select avg(avg_online_minutes)                        as `平均在线时长`,
       sum(case when pay_price > 0 then avg_online_minutes else 0 end) /
       sum(case when pay_price > 0 then 1 else 0 end) as `付费用户在线时长`,
       sum(case when pay_price > 0 then 0 else avg_online_minutes end) /
       sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户在线时长`
from age_of_barbarians;

(4)、用户付费情况分析

 

# --3.用户付费情况分析

-- APA(Active Payment Account):活跃付费用户数。
select count(1) as APA
from age_of_barbarians
where pay_price > 0
  and avg_online_minutes > 0;
-- 60987

-- ARPU(Average Revenue Per User) :平均每用户收入。
select sum(pay_price) / sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_barbarians;
-- 0.582407

-- ARPPU (Average Revenue Per Paying User): 平均每付费用户收入。
select sum(pay_price) / sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end)
from age_of_barbarians;
-- 29.190265

-- PUR(Pay User Rate):付费比率,可通过 APA/AU 计算得出。
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) /
       sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_barbarians;
-- 0.02

-- 付费用户人数,付费总额,付费总次数,平均每人付费,平均每人付费次数,平均每次付费
select count(1)                        as pu,            -- 60988
       sum(pay_price)                  as sum_pay_price, -- 1780226.7
       avg(pay_price)                  as avg_pay_price, -- 29.189786
       sum(pay_count)                  as sum_pay_count, -- 193030
       avg(pay_count)                  as avg_pay_count, -- 3.165
       sum(pay_price) / sum(pay_count) as each_pay_price -- 9.222539
from age_of_barbarians
where pay_price > 0;

 

 (5)、用户习惯分析

# --4.用户习惯分析
#
# --胜率
select 'PVP'                                                         as `游戏类型`,
       sum(pvp_win_count) / sum(pvp_battle_count)                    as `平均胜率`,
       sum(case when pay_price > 0 then pvp_win_count else 0 end) /
       sum(case when pay_price > 0 then pvp_battle_count else 0 end) as `付费用户胜率`,
       sum(case when pay_price = 0 then pvp_win_count else 0 end) /
       sum(case when pay_price = 0 then pvp_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians
union all
select 'PVE'                                                         as `游戏类型`,
       sum(pve_win_count) / sum(pve_battle_count)                    as `平均胜率`,
       sum(case when pay_price > 0 then pve_win_count else 0 end) /
       sum(case when pay_price > 0 then pve_battle_count else 0 end) as `付费用户胜率`,
       sum(case when pay_price = 0 then pve_win_count else 0 end) /
       sum(case when pay_price = 0 then pve_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians;

# --pvp场次
select 'PVP'                                                                                                          as `游戏类型`,
       avg(pvp_battle_count)                                                                                          as `平均场次`,
       sum(case when pay_price > 0 then pvp_battle_count else 0 end) /
       sum(case when pay_price > 0 then 1 else 0 end)                                                                 as `付费用户平均场次`,
       sum(case when pay_price = 0 then pvp_battle_count else 0 end) /
       sum(case when pay_price = 0 then 1 else 0 end)                                                                 as `非付费用户平均场次`
from age_of_barbarians
union all
select 'PVE'                                                                                                          as `游戏类型`,
       avg(pve_battle_count)                                                                                          as `均场次`,
       sum(case when pay_price > 0 then pve_battle_count else 0 end) /
       sum(case when pay_price > 0 then 1 else 0 end)                                                                 as `付费用户平均场次`,
       sum(case when pay_price = 0 then pve_battle_count else 0 end) /
       sum(case when pay_price = 0 then 1 else 0 end)                                                                 as `非付费用户平均场次`
from age_of_barbarians

 

 

5、完成程序源代码

(1)、数据清洗部分

import pandas as pd

from sqlalchemy import create_engine

 

df = pd.read_csv("tap_fun_test.csv")

df1 = pd.read_csv("tap_fun_train.csv")

list1 = []

df = df[

    ['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',

     'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']

]

df1 = df1[

    ['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',

     'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']

]

list1.append(df)

list1.append(df1)

data = pd.concat(list1)

engine = create_engine('mysql://root:LAS1234567878@12@localhost:3306/big?charset=utf8')

data.to_sql('age_of_barbarians', con=engine, index=False, if_exists='append')

(2)、分析python部分

import os

import pandas as pd

from sqlalchemy import create_engine

from pyecharts import options as opts

from pyecharts.charts import Pie, Line, Bar, Liquid

 

engine = create_engine('mysql://root:LAS1234567878@12@localhost/big?charset=utf8')

 

# PU 占比

sql = """

select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,

       sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`

from age_of_barbarians

"""

data = pd.read_sql(con=engine, sql=sql)

c1 = (

    Pie()

    .add(

        "",

        [list(z) for z in zip(data.columns, data.values[0])],

    )

    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} 占比: {d}%"))

    .render("pie_pu.html")

)

os.system("pie_pu.html")

 

# DNU 柱形图

sql = """

select cast(register_time as date) as day,

       count(1) as dnu

from age_of_barbarians

group by cast(register_time as date)

order by day;

"""

data = pd.read_sql(con=engine, sql=sql)

 

c2 = (

    Bar()

    .add_xaxis(list(data['day']))

    .add_yaxis("新增用户数", list(data['dnu']))

    .set_global_opts(title_opts=opts.TitleOpts(title="每日新增用户数量"))

    .render("bar_dnu.html")

)

os.system("bar_dnu.html")

 

# 每小时注册情况

sql = """

select hour(cast(register_time as datetime)) as hour,

       count(1) as dnu

from age_of_barbarians

group by hour(cast(register_time as datetime))

order by hour;

"""

data = pd.read_sql(con=engine, sql=sql)

c3 = (

    Line()

    .add_xaxis(list(data['hour']))

    .add_yaxis("新增用户数", list(data['dnu']))

    .set_global_opts(title_opts=opts.TitleOpts(title="每小时新增用户数量"))

    .render("line_dnu.html")

)

os.system("line_dnu.html")

 

# 每小时注册情况

sql = """

select avg(avg_online_minutes) as `平均在线时长`,

       sum(case when pay_price > 0 then avg_online_minutes else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费玩家在线时长`,

       sum(case when pay_price > 0 then 0 else avg_online_minutes end) / sum(case when pay_price > 0 then 0 else 1 end) as `非付费玩家在线时长`

from age_of_barbarians;

"""

data = pd.read_sql(con=engine, sql=sql)

c4 = (

    Bar()

    .add_xaxis(list(data.columns))

.add_yaxis("平均在线时长(单位:

(3)、分析sql部分(可视化部分)

-- 解决精度问题

-- 修改字段类型

alter table age_of_barbarians

    modify register_time timestamp(0);

alter table age_of_barbarians

    modify avg_online_minutes float(10, 2);

alter table age_of_barbarians

    modify pay_price float(10, 2);

 

-- 1.用户分析

 

-- 用户总量

select count(1) as total, count(distinct user_id) as users

from age_of_barbarians;

 

-- PU ( Paying Users):付费用户总量

select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,

       sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`

from age_of_barbarians;

 

-- DNU(Daily New Users): 每日游戏中的新登入用户数量,即每日新用户数。

select cast(register_time as date) as day,

       count(1)                    as dnu

from age_of_barbarians

group by cast(register_time as date)

order by day;

 

-- 每小时的新登入用户数量

select hour(cast(register_time as datetime)) as hour,

       count(1)                              as dnu

from age_of_barbarians

group by hour(cast(register_time as datetime))

order by hour;

 

 

# 2.用户活跃度分析

-- 平均在线时长

select avg(avg_online_minutes)                        as `平均在线时长`,

       sum(case when pay_price > 0 then avg_online_minutes else 0 end) /

       sum(case when pay_price > 0 then 1 else 0 end) as `付费用户在线时长`,

       sum(case when pay_price > 0 then 0 else avg_online_minutes end) /

       sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户在线时长`

from age_of_barbarians;

 

 

 

# --3.用户付费情况分析

 

-- APA(Active Payment Account):活跃付费用户数。

select count(1) as APA

from age_of_barbarians

where pay_price > 0

  and avg_online_minutes > 0;

-- 60987

 

-- ARPU(Average Revenue Per User) :平均每用户收入。

select sum(pay_price) / sum(case when avg_online_minutes > 0 then 1 else 0 end)

from age_of_barbarians;

-- 0.582407

 

-- ARPPU (Average Revenue Per Paying User): 平均每付费用户收入。

select sum(pay_price) / sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end)

from age_of_barbarians;

-- 29.190265

 

-- PUR(Pay User Rate):付费比率,可通过 APA/AU 计算得出。

select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) /

       sum(case when avg_online_minutes > 0 then 1 else 0 end)

from age_of_

四、总结

 

1、通过对数据的挖掘和对这些数据指标的分析,可以更好地了解用户的需求和行为,从而优化游戏设计和运营策略,提高用户满意度和留存率。

 

2、通过这次的课程设计,加强了我在python高级运用的技能。途中遇到麻烦时还请教了数据库老师,向他答疑解惑,与此同时我也更加清楚和了解如何结合使用sql和python对大数据进行分析。