随机森林代码实现(奥拓数据分类)

发布时间: 2023-12-13 19:05:07 作者: cojames
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training set from the local CSV and preview its first rows
# (the bare expression is only rendered when run as a notebook cell).
data = pd.read_csv("./data/train.csv")
data.head()

import seaborn as sns

# Plot how many rows belong to each target class.  The column is passed via
# the explicit ``x=`` keyword: seaborn >= 0.12 interprets the first
# positional argument as ``data``, so a positional Series is no longer
# treated as the variable to count.
sns.countplot(x=data["target"])
plt.show()

# Random under-sampling needs the feature matrix and the label vector
# separated first: "target" is the label, "id" carries no signal.
y = data["target"]
x = data.drop(["id", "target"], axis=1)

from imblearn.under_sampling import RandomUnderSampler

# Seed the sampler so the retained subset is identical on every run; the
# forests fitted further down already pin random_state=0.
rus = RandomUnderSampler(random_state=0)
x_resampled, y_resampled = rus.fit_resample(x, y)

# Class counts after resampling.  ``x=`` is explicit because seaborn >= 0.12
# reads the first positional argument as ``data``.
sns.countplot(x=y_resampled)
plt.show()

y_resampled
# Convert the class labels into integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_resampled)
y_resampled = le.transform(y_resampled)
y_resampled

# Hold out 20% of the resampled data for evaluation.  The split is seeded so
# that every hyper-parameter sweep below is scored on the same test rows
# from run to run.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    random_state=0,
)

from sklearn.ensemble import RandomForestClassifier

# Baseline model: a forest with out-of-bag scoring switched on and every
# other hyper-parameter left at its default.  ``fit`` returns the estimator
# itself, so construction and training are chained.
estimator = RandomForestClassifier(oob_score=True).fit(x_train, y_train)

# Hard class predictions for the held-out rows, shown next to the truth.
y_pre = estimator.predict(x_test)
y_test, y_pre
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the predictions and the ground truth with a SINGLE encoder
# whose columns are fixed to the classes the forest was trained on.  Fitting
# separately on each array would give mismatched columns whenever one of
# them happens to miss a class, and pinning the categories to
# ``estimator.classes_`` also lines the columns up with ``predict_proba``.
# The dense array is obtained with ``.toarray()`` rather than the
# ``sparse=False`` constructor flag, which newer scikit-learn releases
# renamed to ``sparse_output`` and then removed.
one_hot = OneHotEncoder(categories=[estimator.classes_])
y_pre = one_hot.fit_transform(y_pre.reshape(-1, 1)).toarray()
y_test = one_hot.transform(y_test.reshape(-1, 1)).toarray()
y_test, y_pre
from sklearn.metrics import log_loss

# Log loss of the hard (one-hot) predictions.  The ``eps`` argument is not
# passed: it is deprecated and later removed in newer scikit-learn, and
# 1e-15 was the default in the releases that accepted it.
# ``normalize=True`` is the default as well.
log_loss(y_test, y_pre)
# author's recorded output: 7.637713870225003

# Log loss of the predicted class probabilities, which is the quantity the
# metric is actually designed for.
y_pre_proba = estimator.predict_proba(x_test)
y_pre_proba
log_loss(y_test, y_pre_proba)
# author's recorded output: 0.7611795612521034

# Candidate values for n_estimators.
tuned_parameters = range(10, 200, 10)

# One slot per candidate: out-of-bag accuracy and held-out log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

# Sweep n_estimators while every other hyper-parameter stays fixed.
for i, one_parameter in enumerate(tuned_parameters):
    estimator = RandomForestClassifier(
        n_estimators=one_parameter,
        max_depth=10,
        max_features=10,
        min_samples_leaf=10,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    estimator.fit(x_train, y_train)

    # Out-of-bag accuracy of this forest.
    accuracy_t[i] = estimator.oob_score_

    # Log loss on the held-out rows.  ``eps`` is omitted because it is
    # deprecated and later removed in newer scikit-learn (1e-15 was the old
    # default).
    y_pre = estimator.predict_proba(x_test)
    error_t[i] = log_loss(y_test, y_pre)

# Plot both curves side by side against the swept parameter.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, accuracy_t)
axes[1].plot(tuned_parameters, error_t)

axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("accuracy_t")

axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("error_t")

axes[0].grid()
axes[1].grid()

# Render the figure explicitly, as the earlier count plots do.
plt.show()

 确定max_depth的取值范围
tuned_parameters = range(10,100,10)

# 创建添加accuracy的一个numpy
accuracy_t = np.zeros(len(tuned_parameters)) 

# 创建添加error的一个numpy
error_t = np.zeros(len(tuned_parameters)) 

# 调优过程实现
for i,one_parameter in enumerate(tuned_parameters):
    estimator = RandomForestClassifier(n_estimators=175,
                                       max_depth=one_parameter,
                                       max_features=10,
                                       min_samples_leaf=10,
                                       oob_score=True,
                                       random_state=0,
                                       n_jobs=-1)
    estimator.fit(x_train,y_train)
    
    # 输出accuracy
    accuracy_t[i] = estimator.oob_score_
    
    # 输出log_loss
    y_pre = estimator.predict_proba(x_test)
    error_t[i] = log_loss(y_test,y_pre,eps=1e-15,normalize=True)

# 优化结果过程可视化 
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,accuracy_t)
axes[1].plot(tuned_parameters,error_t)

axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("accuracy_t")

axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("error_t")

axes[0].grid()
axes[1].grid()

# Candidate values for max_features.
tuned_parameters = range(5, 40, 5)

# One slot per candidate: out-of-bag accuracy and held-out log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

# Sweep max_features while every other hyper-parameter stays fixed.
for i, one_parameter in enumerate(tuned_parameters):
    estimator = RandomForestClassifier(
        n_estimators=175,
        max_depth=30,
        max_features=one_parameter,
        min_samples_leaf=10,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    estimator.fit(x_train, y_train)

    # Out-of-bag accuracy of this forest.
    accuracy_t[i] = estimator.oob_score_

    # Log loss on the held-out rows.  ``eps`` is omitted because it is
    # deprecated and later removed in newer scikit-learn (1e-15 was the old
    # default).
    y_pre = estimator.predict_proba(x_test)
    error_t[i] = log_loss(y_test, y_pre)

# Plot both curves side by side against the swept parameter.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, accuracy_t)
axes[1].plot(tuned_parameters, error_t)

axes[0].set_xlabel("max_features")
axes[0].set_ylabel("accuracy_t")

axes[1].set_xlabel("max_features")
axes[1].set_ylabel("error_t")

axes[0].grid()
axes[1].grid()

# Render the figure explicitly, as the earlier count plots do.
plt.show()

# Candidate values for min_samples_leaf (the previous header comment named
# n_estimators, but that parameter is held at 175 in this sweep).
tuned_parameters = range(1, 10, 2)

# One slot per candidate: out-of-bag accuracy and held-out log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

# Sweep min_samples_leaf while every other hyper-parameter stays fixed.
for i, one_parameter in enumerate(tuned_parameters):
    estimator = RandomForestClassifier(
        n_estimators=175,
        max_depth=30,
        max_features=15,
        min_samples_leaf=one_parameter,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    estimator.fit(x_train, y_train)

    # Out-of-bag accuracy of this forest.
    accuracy_t[i] = estimator.oob_score_

    # Log loss on the held-out rows.  ``eps`` is omitted because it is
    # deprecated and later removed in newer scikit-learn (1e-15 was the old
    # default).
    y_pre = estimator.predict_proba(x_test)
    error_t[i] = log_loss(y_test, y_pre)

# Plot both curves side by side against the swept parameter.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, accuracy_t)
axes[1].plot(tuned_parameters, error_t)

axes[0].set_xlabel("min_samples_leaf")
axes[0].set_ylabel("accuracy_t")

axes[1].set_xlabel("min_samples_leaf")
axes[1].set_ylabel("error_t")

axes[0].grid()
axes[1].grid()

# Render the figure explicitly, as the earlier count plots do.
plt.show()


# Final model: refit one forest with the values chosen in the sweeps above
# and report its log loss on the held-out rows.
estimator = RandomForestClassifier(
    n_estimators=175,
    max_depth=30,
    max_features=15,
    min_samples_leaf=1,
    oob_score=True,
    random_state=0,
    n_jobs=-1,
).fit(x_train, y_train)
y_pre_proba = estimator.predict_proba(x_test)
log_loss(y_test, y_pre_proba)
# author's recorded output: 0.7413651159154644