一、选题的背景
对游戏“野蛮时代”进行数据分析,首先离不开对游戏的了解,不了解游戏很可能会影响自己的分析,从而得出错误的结论。在体验了游戏之余,混迹了微博、贴吧等社群,对游戏的玩法有了一定了解后,对数据集的各种字段代表的含义也有了一定的理解,由此开始对该数据集进行数据分析。本次大数据分析主要分析了一款游戏的用户数据情况,包括付费和非付费用户的比例、每日、每小时新增用户的数量、平均在线时长、付费比率、游戏胜率和游戏场次等多个方面。
通过对这些数据指标的分析,可以更好地了解用户的需求和行为,从而优化游戏设计和运营策略,提高用户满意度和留存率。
二、数据分析设计方案
使用了数据库链接工具,了解数据库,通过sql语句查出来符合分析要求的数据通过pandas分析matplotlib和pyecharts画图分析出来可视化结果。
三、数据分析步骤
1、数据源:数据来源于野蛮官网
2、数据清洗过程:
import pandas as pd
from sqlalchemy import create_engine
df = pd.read_csv("tap_fun_test.csv")
df1 = pd.read_csv("tap_fun_train.csv")
list1 = []
df = df[
['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',
'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']
]
df1 = df1[
['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',
'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']
]
list1.append(df)
list1.append(df1)
data = pd.concat(list1)
engine = create_engine('mysql://root:LAS1234567878@12@localhost:3306/big?charset=utf8')
data.to_sql('age_of_barbarians', con=engine, index=False, if_exists='append')
3、大数据分析过程
import os
import pandas as pd
from sqlalchemy import create_engine
from pyecharts import options as opts
from pyecharts.charts import Pie, Line, Bar, Liquid
engine = create_engine('mysql://root:LAS1234567878@12@localhost/big?charset=utf8')
# PU 占比
sql = """
select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,
sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c1 = (
Pie()
.add(
"",
[list(z) for z in zip(data.columns, data.values[0])],
)
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} 占比: {d}%"))
.render("pie_pu.html")
)
os.system("pie_pu.html")
# DNU 柱形图
sql = """
select cast(register_time as date) as day,
count(1) as dnu
from age_of_barbarians
group by cast(register_time as date)
order by day;
"""
data = pd.read_sql(con=engine, sql=sql)
c2 = (
Bar()
.add_xaxis(list(data['day']))
.add_yaxis("新增用户数", list(data['dnu']))
.set_global_opts(title_opts=opts.TitleOpts(title="每日新增用户数量"))
.render("bar_dnu.html")
)
os.system("bar_dnu.html")
# 每小时注册情况
sql = """
select hour(cast(register_time as datetime)) as hour,
count(1) as dnu
from age_of_barbarians
group by hour(cast(register_time as datetime))
order by hour;
"""
data = pd.read_sql(con=engine, sql=sql)
c3 = (
Line()
.add_xaxis(list(data['hour']))
.add_yaxis("新增用户数", list(data['dnu']))
.set_global_opts(title_opts=opts.TitleOpts(title="每小时新增用户数量"))
.render("line_dnu.html")
)
os.system("line_dnu.html")
# 每小时注册情况
sql = """
select avg(avg_online_minutes) as `平均在线时长`,
sum(case when pay_price > 0 then avg_online_minutes else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费玩家在线时长`,
sum(case when pay_price > 0 then 0 else avg_online_minutes end) / sum(case when pay_price > 0 then 0 else 1 end) as `非付费玩家在线时长`
from age_of_barbarians;
"""
data = pd.read_sql(con=engine, sql=sql)
c4 = (
Bar()
.add_xaxis(list(data.columns))
.add_yaxis("平均在线时长(单位:分钟)", list(data.values[0]))
.set_global_opts(title_opts=opts.TitleOpts(title="平均在线时长"))
.render("bar_online.html")
)
os.system("bar_online.html")
# 付费比率
sql = """
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) / sum(case when avg_online_minutes > 0 then 1 else 0 end) as `rate`
from age_of_barbarians;
"""
data = pd.read_sql(con=engine, sql=sql)
c5 = (
Liquid()
.add("lq", [data['rate'][0], data['rate'][0]])
.set_global_opts(title_opts=opts.TitleOpts(title="付费比率"))
.render("liquid_base.html")
)
os.system("liquid_base.html")
# 用户游戏胜率
sql = """
select 'PVP' as `游戏类型`,
sum(pvp_win_count) / sum(pvp_battle_count) as `平均胜率`,
sum(case when pay_price > 0 then pvp_win_count else 0 end) / sum(case when pay_price > 0 then pvp_battle_count else 0 end) as `付费用户胜率`,
sum(case when pay_price = 0 then pvp_win_count else 0 end) / sum(case when pay_price = 0 then pvp_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians
union all
select 'PVE' as `游戏类型`,
sum(pve_win_count) / sum(pve_battle_count) as `平均胜率`,
sum(case when pay_price > 0 then pve_win_count else 0 end) / sum(case when pay_price > 0 then pve_battle_count else 0 end) as `付费用户胜率`,
sum(case when pay_price = 0 then pve_win_count else 0 end) / sum(case when pay_price = 0 then pve_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c6 = (
Bar()
.add_dataset(
source=[data.columns.tolist()] + data.values.tolist()
)
.add_yaxis(series_name="平均胜率", y_axis=[])
.add_yaxis(series_name="付费用户胜率", y_axis=[])
.add_yaxis(series_name="非付费用户胜率", y_axis=[])
.set_global_opts(
title_opts=opts.TitleOpts(title="游戏胜率"),
xaxis_opts=opts.AxisOpts(type_="category"),
)
.render("dataset_bar_rate.html")
)
os.system("dataset_bar_rate.html")
# 用户游戏场次
sql = """
select 'PVP' as `游戏类型`,
avg(pvp_battle_count) as `平均场次`,
sum(case when pay_price > 0 then pvp_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费用户平均场次`,
sum(case when pay_price = 0 then pvp_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) as `非付费用户平均场次`
from age_of_barbarians
union all
select 'PVE' as `游戏类型`,
avg(pve_battle_count) as `均场次`,
sum(case when pay_price > 0 then pve_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费用户平均场次`,
sum(case when pay_price = 0 then pve_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) as `非付费用户平均场次`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c7 = (
Bar()
.add_dataset(
source=[data.columns.tolist()] + data.values.tolist()
)
.add_yaxis(series_name="平均场次", y_axis=[])
.add_yaxis(series_name="付费用户平均场次", y_axis=[])
.add_yaxis(series_name="非付费用户平均场次", y_axis=[])
.set_global_opts(
title_opts=opts.TitleOpts(title="游戏场次"),
xaxis_opts=opts.AxisOpts(type_="category"),
)
.render("dataset_bar_times.html")
)
os.system("dataset_bar_times.html")
4、数据可视化过程
(1)、解决精度问题、修改字段类型
-- 解决精度问题
-- 修改字段类型
alter table age_of_barbarians
modify register_time timestamp(0);
alter table age_of_barbarians
modify avg_online_minutes float(10, 2);
alter table age_of_barbarians
modify pay_price float(10, 2);
(2)、用户分析
-- 1.用户分析
-- 用户总量
select count(1) as total, count(distinct user_id) as users
from age_of_barbarians;
-- PU ( Paying Users):付费用户总量
select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,
sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`
from age_of_barbarians;
-- DNU(Daily New Users): 每日游戏中的新登入用户数量,即每日新用户数。
select cast(register_time as date) as day,
count(1) as dnu
from age_of_barbarians
group by cast(register_time as date)
order by day;
-- 每小时的新登入用户数量
select hour(cast(register_time as datetime)) as hour,
count(1) as dnu
from age_of_barbarians
group by hour(cast(register_time as datetime))
(3)、用户活跃度分析
# 2.用户活跃度分析
-- 平均在线时长
select avg(avg_online_minutes) as `平均在线时长`,
sum(case when pay_price > 0 then avg_online_minutes else 0 end) /
sum(case when pay_price > 0 then 1 else 0 end) as `付费用户在线时长`,
sum(case when pay_price > 0 then 0 else avg_online_minutes end) /
sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户在线时长`
from age_of_barbarians;
(4)、用户付费情况分析
# --3.用户付费情况分析
-- APA(Active Payment Account):活跃付费用户数。
select count(1) as APA
from age_of_barbarians
where pay_price > 0
and avg_online_minutes > 0;
-- 60987
-- ARPU(Average Revenue Per User) :平均每用户收入。
select sum(pay_price) / sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_barbarians;
-- 0.582407
-- ARPPU (Average Revenue Per Paying User): 平均每付费用户收入。
select sum(pay_price) / sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end)
from age_of_barbarians;
-- 29.190265
-- PUR(Pay User Rate):付费比率,可通过 APA/AU 计算得出。
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) /
sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_barbarians;
-- 0.02
-- 付费用户人数,付费总额,付费总次数,平均每人付费,平均每人付费次数,平均每次付费
select count(1) as pu, -- 60988
sum(pay_price) as sum_pay_price, -- 1780226.7
avg(pay_price) as avg_pay_price, -- 29.189786
sum(pay_count) as sum_pay_count, -- 193030
avg(pay_count) as avg_pay_count, -- 3.165
sum(pay_price) / sum(pay_count) as each_pay_price -- 9.222539
from age_of_barbarians
where pay_price > 0;
(5)、用户习惯分析
# --4.用户习惯分析
#
# --胜率
select 'PVP' as `游戏类型`,
sum(pvp_win_count) / sum(pvp_battle_count) as `平均胜率`,
sum(case when pay_price > 0 then pvp_win_count else 0 end) /
sum(case when pay_price > 0 then pvp_battle_count else 0 end) as `付费用户胜率`,
sum(case when pay_price = 0 then pvp_win_count else 0 end) /
sum(case when pay_price = 0 then pvp_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians
union all
select 'PVE' as `游戏类型`,
sum(pve_win_count) / sum(pve_battle_count) as `平均胜率`,
sum(case when pay_price > 0 then pve_win_count else 0 end) /
sum(case when pay_price > 0 then pve_battle_count else 0 end) as `付费用户胜率`,
sum(case when pay_price = 0 then pve_win_count else 0 end) /
sum(case when pay_price = 0 then pve_battle_count else 0 end) as `非付费用户胜率`
from age_of_barbarians;
# --pvp场次
select 'PVP' as `游戏类型`,
avg(pvp_battle_count) as `平均场次`,
sum(case when pay_price > 0 then pvp_battle_count else 0 end) /
sum(case when pay_price > 0 then 1 else 0 end) as `付费用户平均场次`,
sum(case when pay_price = 0 then pvp_battle_count else 0 end) /
sum(case when pay_price = 0 then 1 else 0 end) as `非付费用户平均场次`
from age_of_barbarians
union all
select 'PVE' as `游戏类型`,
avg(pve_battle_count) as `均场次`,
sum(case when pay_price > 0 then pve_battle_count else 0 end) /
sum(case when pay_price > 0 then 1 else 0 end) as `付费用户平均场次`,
sum(case when pay_price = 0 then pve_battle_count else 0 end) /
sum(case when pay_price = 0 then 1 else 0 end) as `非付费用户平均场次`
from age_of_barbarians
5、完成程序源代码
(1)、数据清洗部分
import pandas as pd
from sqlalchemy import create_engine
df = pd.read_csv("tap_fun_test.csv")
df1 = pd.read_csv("tap_fun_train.csv")
list1 = []
df = df[
['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',
'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']
]
df1 = df1[
['user_id', 'register_time', 'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count', 'pve_battle_count',
'pve_lanch_count', 'pve_win_count', 'avg_online_minutes', 'pay_price', 'pay_count']
]
list1.append(df)
list1.append(df1)
data = pd.concat(list1)
engine = create_engine('mysql://root:LAS1234567878@12@localhost:3306/big?charset=utf8')
data.to_sql('age_of_barbarians', con=engine, index=False, if_exists='append')
(2)、分析python部分
import os
import pandas as pd
from sqlalchemy import create_engine
from pyecharts import options as opts
from pyecharts.charts import Pie, Line, Bar, Liquid
engine = create_engine('mysql://root:LAS1234567878@12@localhost/big?charset=utf8')
# PU 占比
sql = """
select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,
sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`
from age_of_barbarians
"""
data = pd.read_sql(con=engine, sql=sql)
c1 = (
Pie()
.add(
"",
[list(z) for z in zip(data.columns, data.values[0])],
)
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} 占比: {d}%"))
.render("pie_pu.html")
)
os.system("pie_pu.html")
# DNU 柱形图
sql = """
select cast(register_time as date) as day,
count(1) as dnu
from age_of_barbarians
group by cast(register_time as date)
order by day;
"""
data = pd.read_sql(con=engine, sql=sql)
c2 = (
Bar()
.add_xaxis(list(data['day']))
.add_yaxis("新增用户数", list(data['dnu']))
.set_global_opts(title_opts=opts.TitleOpts(title="每日新增用户数量"))
.render("bar_dnu.html")
)
os.system("bar_dnu.html")
# 每小时注册情况
sql = """
select hour(cast(register_time as datetime)) as hour,
count(1) as dnu
from age_of_barbarians
group by hour(cast(register_time as datetime))
order by hour;
"""
data = pd.read_sql(con=engine, sql=sql)
c3 = (
Line()
.add_xaxis(list(data['hour']))
.add_yaxis("新增用户数", list(data['dnu']))
.set_global_opts(title_opts=opts.TitleOpts(title="每小时新增用户数量"))
.render("line_dnu.html")
)
os.system("line_dnu.html")
# 每小时注册情况
sql = """
select avg(avg_online_minutes) as `平均在线时长`,
sum(case when pay_price > 0 then avg_online_minutes else 0 end) / sum(case when pay_price > 0 then 1 else 0 end) as `付费玩家在线时长`,
sum(case when pay_price > 0 then 0 else avg_online_minutes end) / sum(case when pay_price > 0 then 0 else 1 end) as `非付费玩家在线时长`
from age_of_barbarians;
"""
data = pd.read_sql(con=engine, sql=sql)
c4 = (
Bar()
.add_xaxis(list(data.columns))
.add_yaxis("平均在线时长(单位:
(3)、分析sql部分(可视化部分)
-- 解决精度问题
-- 修改字段类型
alter table age_of_barbarians
modify register_time timestamp(0);
alter table age_of_barbarians
modify avg_online_minutes float(10, 2);
alter table age_of_barbarians
modify pay_price float(10, 2);
-- 1.用户分析
-- 用户总量
select count(1) as total, count(distinct user_id) as users
from age_of_barbarians;
-- PU ( Paying Users):付费用户总量
select sum(case when pay_price > 0 then 1 else 0 end) as `付费用户`,
sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户`
from age_of_barbarians;
-- DNU(Daily New Users): 每日游戏中的新登入用户数量,即每日新用户数。
select cast(register_time as date) as day,
count(1) as dnu
from age_of_barbarians
group by cast(register_time as date)
order by day;
-- 每小时的新登入用户数量
select hour(cast(register_time as datetime)) as hour,
count(1) as dnu
from age_of_barbarians
group by hour(cast(register_time as datetime))
order by hour;
# 2.用户活跃度分析
-- 平均在线时长
select avg(avg_online_minutes) as `平均在线时长`,
sum(case when pay_price > 0 then avg_online_minutes else 0 end) /
sum(case when pay_price > 0 then 1 else 0 end) as `付费用户在线时长`,
sum(case when pay_price > 0 then 0 else avg_online_minutes end) /
sum(case when pay_price > 0 then 0 else 1 end) as `非付费用户在线时长`
from age_of_barbarians;
# --3.用户付费情况分析
-- APA(Active Payment Account):活跃付费用户数。
select count(1) as APA
from age_of_barbarians
where pay_price > 0
and avg_online_minutes > 0;
-- 60987
-- ARPU(Average Revenue Per User) :平均每用户收入。
select sum(pay_price) / sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_barbarians;
-- 0.582407
-- ARPPU (Average Revenue Per Paying User): 平均每付费用户收入。
select sum(pay_price) / sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end)
from age_of_barbarians;
-- 29.190265
-- PUR(Pay User Rate):付费比率,可通过 APA/AU 计算得出。
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) /
sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_
四、总结
1、通过对数据的挖掘和对这些数据指标的分析,可以更好地了解用户的需求和行为,从而优化游戏设计和运营策略,提高用户满意度和留存率。
2、通过这次的课程设计,加强了我在python高级运用的技能。途中遇到麻烦时还请教了数据库老师,向他答疑解惑,与此同时我也更加清楚和了解如何结合使用sql和python对大数据进行分析。
标签:数据分析,count,野蛮,pay,sum,玩家,case,else,price From: https://www.cnblogs.com/joyceLinjj/p/17463042.html