第六周-电子商务网站用户行为分析-526互联

访问数据库

import os
import pandas as pd


# 修改工作路径到指定文件夹
#os.chdir("D:/chapter11/demo")

# 第一种连接方式
#from sqlalchemy import create_engine

#engine = create_engine('mysql+pymysql://root:123@192.168.31.140:3306/test?charset=utf8')
#sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)

# 第二种连接方式
import pymysql as pm

con = pm.connect(host='localhost',user='root',password='123456',database='test',charset='utf8')
data = pd.read_sql('select * from all_gzdata',con=con)
con.close()           #关闭连接

# 保存读取的数据
data.to_csv("D:\\360MoveData\\Users\\86130\\Documents\\Tencent Files\\2268756693\\FileRecv\\all_gzdata.csv", index=False, encoding='utf-8')

网页类型设计

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)
# 分析网页类型
counts = [i['fullURLId'].value_counts() for i in sql] #逐块统计
counts = counts.copy()
counts = pd.concat(counts).groupby(level=0).sum()  # 合并统计结果，把相同的统计项合并（即按index分组并求和）
counts = counts.reset_index()  # 重新设置index，将原来的index作为counts的一列。
counts.columns = ['index', 'num']  # 重新设置列名，主要是第二列，默认为0
counts['type'] = counts['index'].str.extract('(\d{3})')  # 提取前三个数字作为类别id
counts_ = counts[['type', 'num']].groupby('type').sum()  # 按类别合并
counts_.sort_values(by='num', ascending=False, inplace=True)  # 降序排列
counts_['ratio'] = counts_.iloc[:,0] / counts_.iloc[:,0].sum()
print(counts_)

         num     ratio
type                  
101   411665  0.491570
199   201426  0.240523
107   182900  0.218401
301    18430  0.022007
102    17357  0.020726
106     3957  0.004725
103     1715  0.002048

知识类型内部统计

# 因为只有107001一类，但是可以继续细分成三类：知识内容页、知识列表页、知识首页
def count107(i): #自定义统计函数
    j = i[['fullURL']][i['fullURLId'].str.contains('107')].copy()  # 找出类别包含107的网址
    j['type'] = None # 添加空列
    j['type'][j['fullURL'].str.contains('info/.+?/')]= '知识首页'
    j['type'][j['fullURL'].str.contains('info/.+?/.+?')]= '知识列表页'
    j['type'][j['fullURL'].str.contains('/\d+?_*\d+?\.html')]= '知识内容页'
    return j['type'].value_counts()
# 注意：获取一次sql对象就需要重新访问一下数据库(!!!)
#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)

counts2 = [count107(i) for i in sql] # 逐块统计
counts2 = pd.concat(counts2).groupby(level=0).sum()  # 合并统计结果
print(counts2)
#计算各个部分的占比
res107 = pd.DataFrame(counts2)
# res107.reset_index(inplace=True)
res107.index.name= '107类型'
res107.rename(columns={'type':'num'}, inplace=True)
res107['比例'] = res107['num'] / res107['num'].sum()
res107.reset_index(inplace = True)
print(res107)

知识内容页    164243
知识列表页      9656
知识首页       9001
Name: type, dtype: int64
   107类型     num        比例
0  知识内容页  164243  0.897993
1  知识列表页    9656  0.052794
2   知识首页    9001  0.049213

统计带“?”的数据

def countquestion(i):  # 自定义统计函数
    j = i[['fullURLId']][i['fullURL'].str.contains('\?')].copy()  # 找出类别包含107的网址
    return j

# 注意获取一次sql对象就需要重新访问一下数据库
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)

counts3 = [countquestion(i)['fullURLId'].value_counts() for i in sql]
counts3 = pd.concat(counts3).groupby(level=0).sum()
print(counts3)

# 求各个类型的占比并保存数据
df1 =  pd.DataFrame(counts3)
df1['perc'] = df1['fullURLId']/df1['fullURLId'].sum()*100
df1.sort_values(by='fullURLId',ascending=False,inplace=True)
print(df1.round(4))

101003        47
102002        25
107001       346
1999001    64718
301001       356
Name: fullURLId, dtype: int64
         fullURLId     perc
1999001      64718  98.8182
301001         356   0.5436
107001         346   0.5283
101003          47   0.0718
102002          25   0.0382

统计199类型中的具体类型占比

def page199(i): #自定义统计函数
    j = i[['fullURL','pageTitle']][(i['fullURLId'].str.contains('199')) & 
         (i['fullURL'].str.contains('\?'))]
    j['pageTitle'].fillna('空',inplace=True)
    j['type'] = '其他' # 添加空列
    j['type'][j['pageTitle'].str.contains('法律快车-律师助手')]= '法律快车-律师助手'
    j['type'][j['pageTitle'].str.contains('咨询发布成功')]= '咨询发布成功'
    j['type'][j['pageTitle'].str.contains('免费发布法律咨询' )] = '免费发布法律咨询'
    j['type'][j['pageTitle'].str.contains('法律快搜')] = '快搜'
    j['type'][j['pageTitle'].str.contains('法律快车法律经验')] = '法律快车法律经验'
    j['type'][j['pageTitle'].str.contains('法律快车法律咨询')] = '法律快车法律咨询'
    j['type'][(j['pageTitle'].str.contains('_法律快车')) | 
            (j['pageTitle'].str.contains('-法律快车'))] = '法律快车'
    j['type'][j['pageTitle'].str.contains('空')] = '空'
    
    return j

# 注意：获取一次sql对象就需要重新访问一下数据库
#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)# 分块读取数据库信息
#sql = pd.read_sql_query('select * from all_gzdata limit 10000', con=engine)

counts4 = [page199(i) for i in sql] # 逐块统计
counts4 = pd.concat(counts4)
d1 = counts4['type'].value_counts()
print(d1)
d2 = counts4[counts4['type']=='其他']
print(d2)
# 求各个部分的占比并保存数据
df1_ =  pd.DataFrame(d1)
df1_['perc'] = df1_['type']/df1_['type'].sum()*100
df1_.sort_values(by='type',ascending=False,inplace=True)
print(df1_)

法律快车-律师助手    49894
法律快车法律咨询      6421
咨询发布成功        5220
快搜            1943
法律快车           818
其他             359
法律快车法律经验        59
空                4
Name: type, dtype: int64
                                                fullURL  \
2631  http://www.lawtime.cn/spelawyer/index.php?py=g...   
2632  http://www.lawtime.cn/spelawyer/index.php?py=g...   
1677  http://m.baidu.com/from=844b/bd_page_type=1/ss...   
4303  http://m.baidu.com/from=0/bd_page_type=1/ssid=...   
3673  http://www.lawtime.cn/lawyer/lll25879862593080...   
...                                                 ...   
4829  http://www.lawtime.cn/spelawyer/index.php?m=se...   
4837  http://www.lawtime.cn/spelawyer/index.php?m=se...   
4842  http://www.lawtime.cn/spelawyer/index.php?m=se...   
8302  http://www.lawtime.cn/spelawyer/index.php?m=se...   
5034  http://www.baidu.com/link?url=O7iBD2KmoJdkHWTZ...   

                                     pageTitle type  
2631   个旧律师成功案例 - 法律快车提供个旧知名律师、优秀律师、专业律师的咨询和推荐   其他  
2632   个旧律师成功案例 - 法律快车提供个旧知名律师、优秀律师、专业律师的咨询和推荐   其他  
1677                          婚姻法论文 - 法律快车法律论文   其他  
4303                什么是机动车？什么是非机动车？ - 法律快车交通事故   其他  
3673                          404错误提示页面 - 法律快车   其他  
...                                        ...  ...  
4829  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐   其他  
4837  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐   其他  
4842  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐   其他  
8302  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐   其他  
5034                 离婚协议书范本（2015年版） - 法律快车婚姻法   其他  

[359 rows x 3 columns]
            type       perc
法律快车-律师助手  49894  77.094471
法律快车法律咨询    6421   9.921506
咨询发布成功      5220   8.065762
快搜          1943   3.002256
法律快车         818   1.263945
其他           359   0.554714
法律快车法律经验      59   0.091165
空              4   0.006181

统计无目的浏览用户中各个类型占比

def xiaguang(i): #自定义统计函数
    j = i.loc[(i['fullURL'].str.contains('\.html'))==False,
              ['fullURL','fullURLId','pageTitle']]
    return j

# 注意获取一次sql对象就需要重新访问一下数据库
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)# 分块读取数据库信息

counts5 = [xiaguang(i) for i in sql]
counts5 = pd.concat(counts5)

xg1 = counts5['fullURLId'].value_counts()
print(xg1)
# 求各个部分的占比
xg_ =  pd.DataFrame(xg1)
xg_.reset_index(inplace=True)
xg_.columns= ['index', 'num']
xg_['perc'] = xg_['num']/xg_['num'].sum()*100
xg_.sort_values(by='num',ascending=False,inplace=True)

xg_['type'] = xg_['index'].str.extract('(\d{3})') #提取前三个数字作为类别id    

xgs_ = xg_[['type', 'num']].groupby('type').sum() #按类别合并
xgs_.sort_values(by='num', ascending=False,inplace=True) #降序排列
xgs_['percentage'] = xgs_['num']/xgs_['num'].sum()*100

print(xgs_.round(4))

统计用户浏览网页次数的情况

# 统计点击次数
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)# 分块读取数据库信息

counts1 = [i['realIP'].value_counts() for i in sql] # 分块统计各个IP的出现次数
counts1 = pd.concat(counts1).groupby(level=0).sum() # 合并统计结果，level=0表示按照index分组
print(counts1)

counts1_ = pd.DataFrame(counts1)
counts1_
counts1['realIP'] = counts1.index.tolist()

counts1_[1]=1  # 添加1列全为1
hit_count = counts1_.groupby('realIP').sum()  # 统计各个“不同点击次数”分别出现的次数
# 也可以使用counts1_['realIP'].value_counts()功能
hit_count.columns=['用户数']
hit_count.index.name = '点击次数'

# 统计1~7次、7次以上的用户人数
hit_count.sort_index(inplace = True)
hit_count_7 = hit_count.iloc[:7,:]
time = hit_count.iloc[7:,0].sum()  # 统计点击次数7次以上的用户数
hit_count_7 = hit_count_7.append([{'用户数':time}], ignore_index=True)
hit_count_7.index = ['1','2','3','4','5','6','7','7次以上']
hit_count_7['用户比例'] = hit_count_7['用户数'] / hit_count_7['用户数'].sum()
print(hit_count_7)

82033         2
95502         1
103182        1
116010        2
136206        1
             ..
4294809358    2
4294811150    1
4294852154    3
4294865422    2
4294917690    1
Name: realIP, Length: 230149, dtype: int64
         用户数      用户比例
1     132119  0.574059
2      44175  0.191941
3      17573  0.076355
4      10156  0.044128
5       5952  0.025862
6       4132  0.017954
7       2632  0.011436
7次以上   13410  0.058267

分析浏览一次的用户行为

# 初始化数据库连接:
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=1024 * 5)
# 分块统计各个IP的点击次数
result = [i['realIP'].value_counts() for i in sql]
click_count = pd.concat(result).groupby(level=0).sum()
click_count = click_count.reset_index()
click_count.columns = ['realIP', 'times']
# 筛选出来点击一次的数据
click_one_data = click_count[click_count['times'] == 1]
# 这里只能再次读取数据 因为sql是一个生成器类型，所以在使用过一次以后，就不能继续使用了。必须要重新执行一次读取。
sql = pd.read_sql('all_gzdata', engine, chunksize=1024 * 5)
# 取出这三列数据
data = [i[['fullURLId', 'fullURL', 'realIP']] for i in sql]
data = pd.concat(data)
# 和并数据 我以click_one_data为基准 按照realIP合并过来，目的方便查看点击一次的网页和realIP
merge_data = pd.merge(click_one_data, data, on='realIP', how='left')
# 点击一次的数据统计 写入数据库 以方便读取 校准无误 写入后就可以注释掉此句代码
#erge_data.to_sql('click_one_count', engine, if_exists='append')
print(merge_data)

# 统计排名前4和其他的网页类型
URL_count_4 = URL_count.iloc[:4,:]
time = hit_count.iloc[4:,0].sum()  # 统计其他的
URLindex = URL_count_4.index.values
URL_count_4 = URL_count_4.append([{'count':time}], ignore_index=True)
URL_count_4.index = [URLindex[0], URLindex[1], URLindex[2], URLindex[3], 
                     '其他']
URL_count_4['比例'] = URL_count_4['count'] / URL_count_4['count'].sum()
print(URL_count_4)

           realIP  times fullURLId  \
0            95502      1    101003   
1           103182      1    101003   
2           136206      1    101003   
3           140151      1    107001   
4           155761      1    101003   
...            ...    ...       ...   
132114  4294737166      1    101003   
132115  4294804343      1    101003   
132116  4294807822      1    101003   
132117  4294811150      1    101003   
132118  4294917690      1    101003   

                                                  fullURL  
0         http://www.lawtime.cn/ask/question_7882607.html  
1         http://www.lawtime.cn/ask/question_7174864.html  
2         http://www.lawtime.cn/ask/question_8246285.html  
3       http://www.lawtime.cn/info/gongsi/slbgfgs/2011...  
4         http://www.lawtime.cn/ask/question_5951952.html  
...                                                   ...  
132114    http://www.lawtime.cn/ask/question_3947040.html  
132115    http://www.lawtime.cn/ask/question_2064846.html  
132116    http://www.lawtime.cn/ask/question_9981155.html  
132117    http://www.lawtime.cn/ask/question_4931163.html  
132118    http://www.lawtime.cn/ask/question_6910223.html  

[132119 rows x 4 columns]
          count        比例
101003   102560  0.649011
107001    19443  0.123037
1999001    9381  0.059364
301001      515  0.003259
其他        26126  0.165328

统计单用户浏览次数为一次的网页

# 在浏览1次的前提下, 得到的网页被浏览的总次数
fullURL_count = pd.DataFrame(real_one.groupby("fullURL")["fullURL"].count())
fullURL_count.columns = ["count"]
fullURL_count["fullURL"] = fullURL_count.index.tolist()
fullURL_count.sort_values(by='count', ascending=False, inplace=True)  # 降序排列

# 网页类型ID统计
fullURLId_count = merge_data['fullURLId'].value_counts()
fullURLId_count = fullURLId_count.reset_index()
fullURLId_count.columns = ['fullURLId', 'count']
fullURLId_count['percent'] = fullURLId_count['count'] / fullURLId_count['count'].sum() * 100
print('*****' * 10)
print(fullURLId_count)

# 用户点击一次 浏览的网页统计
fullURL_count = merge_data['fullURL'].value_counts()
fullURL_count = fullURL_count.reset_index()
fullURL_count.columns = ['fullURL', 'count']
fullURL_count['percent'] = fullURL_count['count'] / fullURL_count['count'].sum() * 100
print('*****' * 10)
print(fullURL_count)

**************************************************
   fullURLId   count    percent
0     101003  102560  77.626988
1     107001   19443  14.716279
2    1999001    9381   7.100417
3     301001     515   0.389800
4     102001      70   0.052983
5     103003      45   0.034060
6     101002      33   0.024977
7     101001      28   0.021193
8     102002      13   0.009840
9     106001      13   0.009840
10    101009       4   0.003028
11    101004       3   0.002271
12    101007       3   0.002271
13    101008       2   0.001514
14    102003       2   0.001514
15    101005       1   0.000757
16    102004       1   0.000757
17    101006       1   0.000757
18    102006       1   0.000757
**************************************************
                                                 fullURL  count   percent
0      http://www.lawtime.cn/info/shuifa/slb/20121119...   1013  0.766733
1      http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...    501  0.379204
2         http://www.lawtime.cn/ask/question_925675.html    423  0.320166
3      http://www.lawtime.cn/info/shuifa/slb/20121119...    367  0.277780
4               http://www.lawtime.cn/ask/exp/13655.html    301  0.227825
...                                                  ...    ...       ...
88030    http://www.lawtime.cn/ask/question_3357263.html      1  0.000757
88031  http://www.lawtime.cn/info/laodong/laodonganli...      1  0.000757
88032  http://www.lawtime.cn/info/lunwen/ipzhuzuo/201...      1  0.000757
88033     http://www.lawtime.cn/ask/question_307554.html      1  0.000757
88034   http://www.lawtime.cn/ask/question_10467655.html      1  0.000757

[88035 rows x 3 columns]

删除不符合规则的网页

import os
import re
import pandas as pd
import pymysql as pm
from random import sample

# 修改工作路径到指定文件夹
os.chdir("D:\\360MoveData\\Users\\86130\\Documents\\Tencent Files\\2268756693\\FileRecv")

# 读取数据
con = pm.connect(host='localhost',user='root',password='123456',database='test',charset='utf8')
data = pd.read_sql('select * from all_gzdata',con=con)
con.close()  # 关闭连接

# 取出107类型数据
index107 = [re.search('107',str(i))!=None for i in data.loc[:,'fullURLId']]
data_107 = data.loc[index107,:]

# 在107类型中筛选出婚姻类数据
index = [re.search('hunyin',str(i))!=None for i in data_107.loc[:,'fullURL']]
data_hunyin = data_107.loc[index,:]

# 提取所需字段(realIP、fullURL)
info = data_hunyin.loc[:,['realIP','fullURL']]

# 去除网址中“？”及其后面内容
da = [re.sub('\?.*','',str(i)) for i in info.loc[:,'fullURL']]
info.loc[:,'fullURL'] = da     # 将info中‘fullURL’那列换成da
# 去除无html网址
index = [re.search('\.html',str(i))!=None for i in info.loc[:,'fullURL']]
index.count(True)   # True 或者 1 ， False 或者 0
info1 = info.loc[index,:]
print("（学号 3110）去除无html网址如下：")
print(info1)

            realIP                                            fullURL
0       2683657840  http://www.lawtime.cn/info/hunyin/hunyinfagui/...
4       2683657840  http://www.lawtime.cn/info/hunyin/hunyinfagui/...
9       1275347569  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
62      1531496412  http://www.lawtime.cn/info/hunyin/hunyinfagui/...
86       838215995  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
...            ...                                                ...
837347  2320911216  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
837362  3458366734  http://www.lawtime.cn/info/hunyin/jhsy/daiyun/...
837370  2526756791  http://www.lawtime.cn/info/hunyin/hynews/20101...
837376  4267065457  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
837434  3271035001  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...

[31199 rows x 2 columns]

还原翻译网址

# 找出翻页和非翻页网址
index = [re.search('/\d+_\d+\.html',i)!=None for i in info1.loc[:,'fullURL']]
index1 = [i==False for i in index]
info1_1 = info1.loc[index,:]   # 带翻页网址
info1_2 = info1.loc[index1,:]  # 无翻页网址
# 将翻页网址还原
da = [re.sub('_\d+\.html','.html',str(i)) for i in info1_1.loc[:,'fullURL']]
info1_1.loc[:,'fullURL'] = da
# 翻页与非翻页网址合并
frames = [info1_1,info1_2]
info2 = pd.concat(frames)
# 或者
info2 = pd.concat([info1_1,info1_2],axis = 0)   # 默认为0，即行合并
# 去重（realIP和fullURL两列相同）
info3 = info2.drop_duplicates()
# 将IP转换成字符型数据
info3.iloc[:,0] = [str(index) for index in info3.iloc[:,0]]
info3.iloc[:,1] = [str(index) for index in info3.iloc[:,1]]
print("（学号 3110）还原的翻译网址如下：")
print(info3)
len(info3)

            realIP                                            fullURL
0       2683657840  http://www.lawtime.cn/info/hunyin/hunyinfagui/...
86       838215995  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
98      1531496412  http://www.lawtime.cn/info/hunyin/hunyinfagui/...
130      923358328  http://www.lawtime.cn/info/hunyin/zhonghun/zho...
140     1275347569  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
...            ...                                                ...
837191  3897562894  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
837362  3458366734  http://www.lawtime.cn/info/hunyin/jhsy/daiyun/...
837370  2526756791  http://www.lawtime.cn/info/hunyin/hynews/20101...
837376  4267065457  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...
837434  3271035001  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...

[16570 rows x 2 columns]

筛选浏览次数不满两次的用户

# 代码11-12 筛选浏览次数不满两次的用户

# 筛选满足一定浏览次数的IP
IP_count = info3['realIP'].value_counts()
# 找出IP集合
IP = list(IP_count.index)
count = list(IP_count.values)
# 统计每个IP的浏览次数，并存放进IP_count数据框中,第一列为IP，第二列为浏览次数
IP_count = pd.DataFrame({'IP':IP,'count':count})
print("（学号 3110）")
print(IP_count)
# 筛选出浏览网址在n次以上的IP集合
n = 2
index = IP_count.loc[:,'count']>n
IP_index = IP_count.loc[index,'IP']
print(IP_index)

              IP  count
0      2609113527    895
1      3812410744    140
2       225896631     59
3       242673847     56
4      1190924814     48
...           ...    ...
10524  3494221838      1
10525  1219597838      1
10526    49885111      1
10527  2861434551      1
10528  2306969614      1

[10529 rows x 2 columns]
0      2609113527
1      3812410744
2       225896631
3       242673847
4      1190924814
          ...    
865    3634500980
866    1519157623
867    3851633265
868    2213364337
869    1938534819
Name: IP, Length: 870, dtype: object

划分数据集

# 划分IP集合为训练集和测试集
index_tr = sample(range(0,len(IP_index)),int(len(IP_index)*0.8))  # 或者np.random.sample
index_te = [i for i in range(0,len(IP_index)) if i not in index_tr]
IP_tr = IP_index[index_tr]
IP_te = IP_index[index_te]
# 将对应数据集划分为训练集和测试集
index_tr = [i in list(IP_tr) for i in info3.loc[:,'realIP']]
index_te = [i in list(IP_te) for i in info3.loc[:,'realIP']]
data_tr = info3.loc[index_tr,:]
data_te = info3.loc[index_te,:]
print("（学号 3110）")
print(len(data_tr))
IP_tr = data_tr.iloc[:,0]  # 训练集IP
url_tr = data_tr.iloc[:,1]  # 训练集网址
IP_tr = list(set(IP_tr))  # 去重处理
url_tr = list(set(url_tr))  # 去重处理
len(url_tr)

4542

2448