缺失值处理基础语法

发布时间: 2023-03-28 16:58:41  作者: chliyiyu

1、Imputer

# Example 1: fill missing values with scikit-learn's SimpleImputer, then
# train a random forest on the imputed data.
#
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its
# replacement is `sklearn.impute.SimpleImputer`, which always works
# column-wise (the old `axis=0` behaviour) and treats np.nan as the
# missing-value marker by default.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

fileName = '***/abc.xlsx'

df = pd.read_excel(fileName)

# strategy can be 'most_frequent', 'median' or 'mean'.
imp = SimpleImputer(strategy='most_frequent')

# fit_transform returns a NumPy array, so wrap it back into a DataFrame to
# keep the `.iloc` slicing below working.
# NOTE: columns that are entirely NaN are dropped by the imputer, in which
# case the column labels passed here would no longer line up.
df = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

# Last column is the target, everything before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

forest = RandomForestClassifier()
forest.fit(x_train, y_train)

print('accuracy on the training subset:{:.3f}'.format(forest.score(x_train, y_train)))
print('accuracy on the test subset:{:.3f}'.format(forest.score(x_test, y_test)))

 

2、xgboost

# Example 2: train XGBoost directly on data that still contains missing
# values; the `missing` argument is passed through as in the original.
import pandas as pd
import xgboost as xg  # was `from xgboost as xg`, which is a syntax error
from sklearn.model_selection import train_test_split

fileName = '***/abc.xlsx'

df = pd.read_excel(fileName)

# Last column is the target, everything before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = xg.XGBClassifier(missing=None)
model.fit(x_train, y_train)

print('accuracy on the training subset:{:.3f}'.format(model.score(x_train, y_train)))
print('accuracy on the test subset:{:.3f}'.format(model.score(x_test, y_test)))

 

3、catboost

# Example 3: train CatBoost directly on data that still contains missing
# values, with no separate imputation step.
import catboost as cb  # was `from catboost as cb`, which is a syntax error
import pandas as pd
from sklearn.model_selection import train_test_split

fileName = '***/abc.xlsx'

df = pd.read_excel(fileName)

# Last column is the target, everything before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

rf = cb.CatBoostClassifier()
rf.fit(x_train, y_train)

print('accuracy on the training subset:{:.3f}'.format(rf.score(x_train, y_train)))
print('accuracy on the test subset:{:.3f}'.format(rf.score(x_test, y_test)))