随机森林填充数据

发布时间: 2023-04-06 01:33:29  作者: ThankCAT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
# Load the Boston housing data and split it into a feature matrix and target.
boston = pd.read_csv("./boston_house_prices.csv")

# Target: the median home value (MEDV), relabeled to the integer 0.
boston_target = pd.DataFrame(boston["MEDV"])
boston_target.columns = [0]

# Features: every column except MEDV, relabeled 0..n-1 so columns can be
# addressed by position during the imputation loop below.
boston_data = pd.DataFrame(boston.iloc[:, boston.columns != "MEDV"])
boston_data.columns = range(len(boston_data.columns))

n_samples, n_feature = boston_data.shape

# Plan to blank out 50% of all cells, reproducibly via a fixed seed.
rng = np.random.RandomState(0)
missing_rate = 0.5
n_missing_sample = int(np.floor(n_samples * n_feature * missing_rate))
n_missing_sample
3289
# Draw random (row, column) coordinates for the cells to blank out.  The same
# coordinate can be drawn more than once, so slightly fewer than
# n_missing_sample distinct cells actually end up missing.
missing_feature = rng.randint(0, n_feature, n_missing_sample)
missing_sample = rng.randint(0, n_samples, n_missing_sample)

# Work on a copy so the complete boston_data survives for the benchmark.
# (The original code also made an unused second copy, y_missing — removed.)
x_missing = boston_data.copy()
x_missing = np.array(x_missing)
x_missing[missing_sample, missing_feature] = np.nan

x_missing = pd.DataFrame(x_missing)
x_missing.isna().sum()
0     200
1     201
2     200
3     203
4     202
5     201
6     185
7     197
8     196
9     197
10    204
11    214
12    189
dtype: int64
# Baseline 1: fill every NaN with its column's mean.
im_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
x_missing_mean = im_mean.fit_transform(x_missing)

# Baseline 2: fill every NaN with the constant 0.
im_0 = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
x_missing_0 = im_0.fit_transform(x_missing)

# Working copy for the iterative random-forest imputation below.
x_missing_reg = x_missing.copy()
x_missing_reg.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       306 non-null    float64
 1   1       305 non-null    float64
 2   2       306 non-null    float64
 3   3       303 non-null    float64
 4   4       304 non-null    float64
 5   5       305 non-null    float64
 6   6       321 non-null    float64
 7   7       309 non-null    float64
 8   8       310 non-null    float64
 9   9       309 non-null    float64
 10  10      302 non-null    float64
 11  11      292 non-null    float64
 12  12      317 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB
# Iterative random-forest imputation: fill columns one at a time, starting
# with the column that has the FEWEST missing values (the most information
# available), using every other column plus the price target as predictors.
sortindex = np.argsort(x_missing_reg.isnull().sum(axis=0)).values
for i in sortindex:
    df = x_missing_reg
    # Column i is the "label" we want to predict.
    fillc = df.iloc[:, i]
    # Predictors: all other feature columns plus the house-price target.
    # BUG FIX: this must be axis=1 (side by side).  The original axis=0
    # stacked the two frames vertically, doubling the row count and
    # misaligning features against labels.
    df = pd.concat([df.iloc[:, df.columns != i], boston_target], axis=1)
    # Temporarily zero-fill the predictors' own missing cells so the forest
    # can be fit.
    df_0 = SimpleImputer(
        missing_values=np.nan, strategy="constant", fill_value=0
    ).fit_transform(df)
    # Rows where column i is observed form the training set; the rows where
    # it is missing are what we need to predict.
    y_train = fillc[fillc.notna()]
    y_test = fillc[fillc.isna()]
    if y_test.empty:
        continue  # nothing to fill in this column
    x_train = df_0[y_train.index, :]
    x_test = df_0[y_test.index, :]
    rfc = RandomForestRegressor(n_estimators=100)
    rfc.fit(x_train, y_train)
    Ypredict = rfc.predict(x_test)
    # Write the predictions back into the missing slots of column i.
    x_missing_reg.loc[x_missing_reg.iloc[:, i].isna(), i] = Ypredict
# Benchmark: 10-fold cross-validated MSE of a random forest trained on each
# version of the feature matrix.  The order here must match x_labels below.
X = [boston_data, x_missing_mean, x_missing_0, x_missing_reg]
mse = []
y_full = np.array(boston_target).ravel()
for x in X:
    rfc = RandomForestRegressor(random_state=0, n_estimators=100)
    score = cross_val_score(
        rfc, x, y_full, cv=10, scoring="neg_mean_squared_error"
    ).mean()
    # cross_val_score returns negated MSE; flip the sign back.
    mse.append(score * -1)

# BUG FIX: labels reordered to match X — mean imputation is the second
# entry and zero imputation the third.  They were previously swapped, so
# the bars for the two baselines carried each other's labels.
x_labels = ['Full data',
            'Mean Imputation',
            'Zero Imputation',
            'Regressor Imputation']
colors = ['r', 'g', 'b', 'orange']
plt.figure(figsize=(12, 6))
ax = plt.subplot(111)
for i in np.arange(len(mse)):
    ax.barh(i, mse[i], color=colors[i], alpha=0.6, align='center')
ax.set_title('Imputation Techniques with Boston Data')
# Zoom the x-axis around the observed MSE range so differences are visible.
ax.set_xlim(left=np.min(mse) * 0.9, right=np.max(mse) * 1.1)
ax.set_xlabel('MSE')
ax.set_yticks(np.arange(len(mse)))
ax.set_yticklabels(x_labels)
plt.show()