【小睿的ML之路】Pandas数据预处理(含泰坦尼克号数据分析案例实战)

发布时间 2023-09-15 01:09:43作者: 郭小睿
import pandas

food_info = pandas.read_csv("food_info.csv",encoding="gbk")
print(food_info)
   名称  价格(元)  糖分(g)  重量(kg)  含水量(mg)
0  苹果    200     20      10       30
1  香蕉    100     50      30       70
2   梨     10     20      30       40
# 使用 sort_values() 排序,不修改原始数据
sorted_info = food_info.sort_values("价格(元)")
print("Sorted DataFrame (Not Inplace):")
print(sorted_info)

# 原始数据不受影响
print("\nOriginal DataFrame:")
print(food_info)

# 使用 sort_values() 排序,并修改原始数据
food_info.sort_values("价格(元)", inplace=True)
print("\nSorted DataFrame (Inplace):")
print(food_info)
Sorted DataFrame (Not Inplace):
   名称  价格(元)  糖分(g)  重量(kg)  含水量(mg)
2   梨     10     20      30       40
1  香蕉    100     50      30       70
0  苹果    200     20      10       30

Original DataFrame:
   名称  价格(元)  糖分(g)  重量(kg)  含水量(mg)
0  苹果    200     20      10       30
1  香蕉    100     50      30       70
2   梨     10     20      30       40

Sorted DataFrame (Inplace):
   名称  价格(元)  糖分(g)  重量(kg)  含水量(mg)
2   梨     10     20      30       40
1  香蕉    100     50      30       70
0  苹果    200     20      10       30
# ascending=True,从小到大,反之从大到小
food_info.sort_values("价格(元)", inplace=True, ascending=False)
print(food_info)
   名称  价格(元)  糖分(g)  重量(kg)  含水量(mg)
0  苹果    200     20      10       30
1  香蕉    100     50      30       70
2   梨     10     20      30       40

# 泰坦尼克号数据分析案例实战
import numpy as np
import pandas as pd


titanic_survival = pd.read_csv("titanic_train.csv")
# 舱位纬度
# pclass 舱位(1头等舱、2二等舱、3三等舱)
# cabin 客舱号
# survived 是否生还,0死亡,1生还
# 乘客纬度
# name姓名、sex性别、age年龄、sibsp兄弟姐妹/配偶数量
# parch父母/小孩数量
# 船票纬度
# ticket 船票信息
# fare 票价
# 地区纬度
# embarked 登船港口
print(titanic_survival.head())
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
age = titanic_survival["Age"]
# print(age.loc[0:10])

age_is_null = pd.isnull(age) # 判断缺失值
# print(age_is_null)

age_null_true = age[age_is_null] # 获取缺失值的索引
# print(age_null_true)

age_null_count = len(age_null_true) #查看指定列缺失值的记录数
print(age_null_count)
177
error_average_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"]) # 求年龄平均值,若数据存在NaN,则无法统计
print(error_average_age)
nan
good_ages = titanic_survival["Age"][age_is_null == False] # 过滤掉缺失值的指定列
# print(good_age)
average_age = sum(good_ages) / len(good_ages) # 求年龄普工君子
print(average_age)
29.69911764705882
good_average_age = titanic_survival["Age"].mean() # 调用mean()函数,过滤掉缺失值的指定列,求平均值
print(good_average_age)
29.69911764705882
#  计算不同舱位的平均票价
passenager_classes = [1,2,3]
fares_by_class = {}
for this_class in passenager_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] # 过滤出每个舱位的数据集
    pclass_fares = pclass_rows["Fare"] # 获取每个舱位数据集的票价记录
    fare_for_class = pclass_fares.mean() # 过滤掉票价列的缺失值,求平均值
    fares_by_class[this_class] = fare_for_class # 赋值-统计

print(fares_by_class)
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
# 计算不同舱位的平均获救概率 pivot_table方法
passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
print(passenger_survival)
        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
# 计算不同舱位的平均年龄,未设置aggfunc默认为求均值mean
passenger_age = titanic_survival.pivot_table(index="Pclass",values="Age")
print(passenger_age)
              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620
# 计算不同登船港口,总票价和总生还人数
port_stats = titanic_survival.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)
print(port_stats)
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
drop_na_columns = titanic_survival.dropna(axis=1) #丢失缺失列
# print(drop_na_columns)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age","Sex"]) #丢失缺失记录,可以指定筛选列
print(new_titanic_survival)
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
885          886         0       3   
886          887         0       2   
887          888         1       1   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ...    ...   
885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
886                              Montvila, Rev. Juozas    male  27.0      0   
887                       Graham, Miss. Margaret Edith  female  19.0      0   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch            Ticket     Fare Cabin Embarked  
0        0         A/5 21171   7.2500   NaN        S  
1        0          PC 17599  71.2833   C85        C  
2        0  STON/O2. 3101282   7.9250   NaN        S  
3        0            113803  53.1000  C123        S  
4        0            373450   8.0500   NaN        S  
..     ...               ...      ...   ...      ...  
885      5            382652  29.1250   NaN        Q  
886      0            211536  13.0000   NaN        S  
887      0            112053  30.0000   B42        S  
889      0            111369  30.0000  C148        C  
890      0            370376   7.7500   NaN        Q  

[714 rows x 12 columns]
# 定位具体值
row_index_83_age = titanic_survival.loc[83,"Age"] # 行号,列名
print(row_index_83_age)
28.0
# 按年龄排序
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])
#重置排序
titanic_reindexed = new_titanic_survival.reset_index(drop=True) 
print(titanic_reindexed.loc[0:10])
     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0      0      347060   7.7750   NaN        S  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
96   male  71.0      0      0    PC 17754  34.6542    A5        C  
116  male  70.5      0      0      370369   7.7500   NaN        Q  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
54   male  65.0      0      1      113509  61.9792   B30        C  
280  male  65.0      0      0      336439   7.7500   NaN        Q  
    PassengerId  Survived  Pclass                                  Name   Sex  \
0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1           852         0       3                   Svensson, Mr. Johan  male   
2           494         0       1               Artagaveytia, Mr. Ramon  male   
3            97         0       1             Goldschmidt, Mr. George B  male   
4           117         0       3                  Connors, Mr. Patrick  male   
5           673         0       2           Mitchell, Mr. Henry Michael  male   
6           746         0       1          Crosby, Capt. Edward Gifford  male   
7            34         0       2                 Wheadon, Mr. Edward H  male   
8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9           281         0       3                      Duane, Mr. Frank  male   
10          457         0       1             Millet, Mr. Francis Davis  male   

     Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0   80.0      0      0       27042  30.0000   A23        S  
1   74.0      0      0      347060   7.7750   NaN        S  
2   71.0      0      0    PC 17609  49.5042   NaN        C  
3   71.0      0      0    PC 17754  34.6542    A5        C  
4   70.5      0      0      370369   7.7500   NaN        Q  
5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
6   70.0      1      1   WE/P 5735  71.0000   B22        S  
7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
8   65.0      0      1      113509  61.9792   B30        C  
9   65.0      0      0      336439   7.7500   NaN        Q  
10  65.0      0      0       13509  26.5500   E38        S