pandas-分组过滤聚合

发布时间 2023-12-10 01:22:03作者: 贝壳里的星海

pandas-分组过滤聚合

groupby分组

通过groupby方法对series或者dataframe对象进行分组,该方法会返回一个分组对象

pandas中的groupby函数也是先将df按照某个字段进行拆分,将相同属性分为一组;然后对拆分后的各组执行相应的转换操作;最后输出汇总转换后的各组结果

df.groupby(
    by=None,
    axis=0,
    level=None,
    as_index: 'bool' = True,
    sort: 'bool' = True,
    group_keys: 'bool' = True,
    squeeze: 'bool' = <object object at 0x137ac9240>,
    observed: 'bool' = False,
    dropna: 'bool' = True,
)
返回值 包含有关组信息的groupby对象

groupby的一个属性二个方法:

  • groups:返回一个字典类型的对象,包含分组信息。
  • size():返回每组记录的数量。
  • describe():分组统计信息。
import pandas as pd
import numpy as np
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],'data':[0,5,10,5,10,15,10,15,20]})
print(df)
#   key  data
# 0   A     0
# 1   B     5
# 2   C    10
# 3   A     5
# 4   B    10
# 5   C    15
# 6   A    10
# 7   B    15
# 8   C    20

# 分组,通过key将数据分成ABC三组
grouped=df.groupby(['key']) #通过key分组
#查看分组情况
for dtype,group in grouped:
    print(dtype,group)
    
# A   key  data
# 0   A     0
# 3   A     5
# 6   A    10
# B   key  data
# 1   B     5
# 4   B    10
# 7   B    15
# C   key  data
# 2   C    10
# 5   C    15
# 8   C    20


dfsum=df.groupby(['key']).sum() # 链式调用先分组再用聚合函数聚合
print(dfsum)
#     data
#key
#A      15
#B      30
#C      45
import pandas as pd
import numpy as np


df=pd.DataFrame({'key1':['A','B','C','A','B','C','A','B','C'],  
                'key2':['one','one','one','two','two','two','three','three','three'],
                'data1':[0,5,10,5,10,15,10,15,20],
})

print(df)
#       key1   key2  data1
# 0    A    one      0
# 1    B    one      5
# 2    C    one     10
# 3    A    two      5
# 4    B    two     10
# 5    C    two     15
# 6    A  three     10
# 7    B  three     15
# 8    C  three     20

print()
grouped=df.groupby(['key2'])

for gram,data in grouped:
    print(gram,data)
    


# one   key1 key2  data1
# 0    A  one      0
# 1    B  one      5
# 2    C  one     10
# three   key1   key2  data1
# 6    A  three     10
# 7    B  three     15
# 8    C  three     20
# two   key1 key2  data1
# 3    A  two      5
# 4    B  two     10
# 5    C  two     15
import pandas as pd
#数据集
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
                 'data':[0,5,10,5,10,15,10,15,20]},
                index=[1,2,3,4,5,6,7,8,9])

#自定义函数区分大于5和小于5的数据
def big5(x):  
    result=0
    if x>5:
        result=1
    return result

by_big5=df.groupby(big5)  #根据索引是否大于5进行分组
#查看分组情况
for group_name,group_data in by_big5:
    print(group_data)
    
#   key  data
# 1   A     0
# 2   B     5
# 3   C    10
# 4   A     5
# 5   B    10
#   key  data
# 6   C    15
# 7   A    10
# 8   B    15
# 9   C    20

groupby多列分组

import pandas as pd
#数据集
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
                 'bool':[1,1,1,1,2,2,2,2,2],
                 'data':[0,5,10,5,10,15,10,15,20]
                 
                 },
                index=[1,2,3,4,5,6,7,8,9])

print(df)
#   key  bool  data
# 1   A     1     0
# 2   B     1     5
# 3   C     1    10
# 4   A     1     5
# 5   B     2    10
# 6   C     2    15
# 7   A     2    10
# 8   B     2    15
# 9   C     2    20
grouped=df.groupby(['key','bool'])
print(grouped.sum())
#           data
# key bool
# A   1        5
#     2       10
# B   1        5
#     2       25
# C   1       10
#     2       35

每隔n个分组



import  pandas as pd

d = pd.DataFrame(list(zip(list(range(1,10)), list(range(11,20)))), columns=['a', 'b'])
print(d)

d=d.assign(c=d.groupby(d.index//3)['b'].transform('sum'))
 
print(d)
 
#    a   b
# 0  1  11
# 1  2  12
# 2  3  13
# 3  4  14
# 4  5  15
# 5  6  16
# 6  7  17
# 7  8  18
# 8  9  19
#    a   b   c
# 0  1  11  36
# 1  2  12  36
# 2  3  13  36
# 3  4  14  45
# 4  5  15  45
# 5  6  16  45
# 6  7  17  54
# 7  8  18  54
# 8  9  19  54

filter过滤

DataFrame.filter(self, items=None, like=None, regex=None, axis=None)

根据分组数据进行过滤


import pandas as pd
#数据集

df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
                 'data':[0,5,10,5,10,15,10,15,20]})


def filtersum25(x):
    if x['data'].sum()>25:
        return True
    else:
        return False
    
grouped=df.groupby(['key'])    # 先进行分组

for group_name,group_data in grouped:
    print(group_data)
#   key  data
# 0   A     0
# 3   A     5
# 6   A    10
#   key  data
# 1   B     5
# 4   B    10
# 7   B    15
#   key  data
# 2   C    10
# 5   C    15
# 8   C    20
print()
filter= grouped.filter(filtersum25)   # 再进行分组过滤
print(filter)

#  key  data
#1   B     5
#2   C    10
#4   B    10
#5   C    15
#7   B    15
#8   C    20

agg整合

df = pd.DataFrame([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9],
                   [np.nan, np.nan, np.nan]],
                  columns=['A', 'B', 'C'])
 
print(df)
# -----return------
#      A    B    C
# 0  1.0  2.0  3.0
# 1  4.0  5.0  6.0
# 2  7.0  8.0  9.0
# 3  NaN  NaN  NaN
 
print(df.agg(sum))
# ----return------
# A    12.0
# B    15.0
# C    18.0
# dtype: float64

参考资料

https://zhuanlan.zhihu.com/p/370851569

https://blog.csdn.net/weixin_43790276/article/details/125922102

https://blog.csdn.net/jwtning/article/details/107759838?spm=1001.2101.3001.6650.3&utm_medium=distribute.pc_relevant.none-task-blog-2~default~BlogCommendFromBaidu~Rate-3-107759838-blog-123511623.235^v38^pc_relevant_sort&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2~default~BlogCommendFromBaidu~Rate-3-107759838-blog-123511623.235^v38^pc_relevant_sort&utm_relevant_index=6