Python——基于数据挖掘的上市公司财务造假识别(制造业)

发布时间: 2023-05-29 12:07:07 作者: 小平凡的记录

制造业

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
color = sns.color_palette()

from scipy import stats
from scipy.stats import norm, skew

# Load the manufacturing-sector dataset, then build a copy without the
# FLAG label column to use as the feature table.
t1 = pd.read_csv("制造业.csv")
t1_train = t1.drop(columns="FLAG")
t1
TICKER_SYMBOL ACT_PUBTIME PUBLISH_DATE END_DATE_REP END_DATE REPORT_TYPE FISCAL_PERIOD MERGED_FLAG ACCOUTING_STANDARDS CURRENCY_CD ... CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER FLAG
0 4019 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
1 8166 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
2 11737 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
3 16479 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
4 16842 4 4 3 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13965 4992204 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13966 4992858 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN 0.000 NaN NaN NaN NaN NaN NaN NaN NaN
13967 4993201 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13968 4998808 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13969 4999709 7 7 7 6 A 12 1 CHAS_2007 CNY ... 2.6656 23.084 21.9179 0.6571 0.6256 33.6589 16.4249 0.3692 54.0618 NaN

13970 rows × 363 columns

1 数据预处理

1.1计算缺失率,并降序排序

# Percentage of missing values per feature, highest first.
missing_counts = t1_train.isna().sum()
all_data_na = (missing_counts / len(t1_train) * 100).sort_values(ascending=False)

# The same figures as a one-column table for display.
missing_data = pd.DataFrame({'missing_data': all_data_na})
missing_data
missing_data
ACCRUED_EXP 99.971367
N_INC_BORR_OTH_FI 99.806729
PERPETUAL_BOND_L 99.634932
PREFERRED_STOCK_L 99.606299
PREFERRED_STOCK_E 99.591983
... ...
T_COMPR_INCOME 0.000000
N_INCOME_ATTR_P 0.000000
FINAN_EXP 0.000000
ACT_PUBTIME 0.000000
TICKER_SYMBOL 0.000000

362 rows × 1 columns

将缺失率用图表的方式展示

# Bar chart of the missing-value percentage of every feature, in the order
# of all_data_na (descending missing rate).
f, ax = plt.subplots(figsize=(30, 15))
sns.barplot(x=all_data_na.index, y=all_data_na, ax=ax)   # bar chart
# Rotate the feature names so the tick labels stay readable. The angle is
# passed as a number: Matplotlib documents rotation as a float or
# 'vertical'/'horizontal', and newer releases raise ValueError for the
# string '90'. Setting it through tick_params after plotting also makes the
# result independent of when the tick labels are created.
ax.tick_params(axis='x', labelrotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
Text(0.5, 1.0, 'Percent missing data by feature')

# Names of the features whose missing rate is above 80%
over_80 = all_data_na > 80
missing_data_count1 = all_data_na.index[over_80]

# Names of the features whose missing rate is below 20%
under_20 = all_data_na < 20
missing_data_count2 = all_data_na.index[under_20]

# Report how many features fall into each group
print(missing_data_count1.shape, missing_data_count2.shape)
(93,) (84,)
# Features whose missing rate exceeds 80%.
# missing_data is sorted by descending missing rate, so these features are
# exactly its leading rows. Count them from the threshold instead of
# hard-coding the row number taken from an earlier printout.
n_high_missing = int((missing_data['missing_data'] > 80).sum())
a = missing_data.values[:n_high_missing]
x = pd.DataFrame(a, index=missing_data.index[:n_high_missing])
x
0
ACCRUED_EXP 99.971367
N_INC_BORR_OTH_FI 99.806729
PERPETUAL_BOND_L 99.634932
PREFERRED_STOCK_L 99.606299
PREFERRED_STOCK_E 99.591983
... ...
OP_CL 81.338583
R_D 81.159628
N_CF_OPA_LIAB 80.952040
N_CF_NFA_LIAB 80.952040
OP_TL 80.916249

93 rows × 1 columns

1.2 删除缺失率在80%以上的特征

# Remove the features listed in x (the ones with the highest missing rates)
# from the feature table.
t2 = t1_train.drop(x.index, axis=1)
t2
TICKER_SYMBOL ACT_PUBTIME PUBLISH_DATE END_DATE_REP END_DATE REPORT_TYPE FISCAL_PERIOD MERGED_FLAG ACCOUTING_STANDARDS CURRENCY_CD ... AP_TURNOVER CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER
0 4019 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 8166 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 11737 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 16479 3 3 2 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 16842 4 4 3 1 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13965 4992204 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13966 4992858 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN 0.000 NaN NaN NaN NaN NaN NaN NaN
13967 4993201 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13968 4998808 7 7 7 6 A 12 1 CHAS_2007 CNY ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13969 4999709 7 7 7 6 A 12 1 CHAS_2007 CNY ... 10.6956 2.6656 23.084 21.9179 0.6571 0.6256 33.6589 16.4249 0.3692 54.0618

13970 rows × 269 columns

1.3 对缺失率在20%到80%之间的特征用中位数填充

# Median-impute every feature whose missing rate lies between 20% and 80%.
# The columns are selected by their missing rate rather than by fixed
# positions in the sorted table, so the selection stays correct if the
# number of features in each group changes.
# NOTE(review): the medians are computed over all rows, including the rows
# that are later held out as the test set — confirm this is acceptable.
mid_missing = (all_data_na >= 20) & (all_data_na <= 80)
b = all_data_na.index[mid_missing]
# fillna with a Series of medians fills each column with its own median.
t2[b] = t2[b].fillna(t2[b].median())
t2
TICKER_SYMBOL ACT_PUBTIME PUBLISH_DATE END_DATE_REP END_DATE REPORT_TYPE FISCAL_PERIOD MERGED_FLAG ACCOUTING_STANDARDS CURRENCY_CD ... AP_TURNOVER CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER
0 4019 3 3 2 1 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
1 8166 3 3 2 1 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
2 11737 3 3 2 1 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
3 16479 3 3 2 1 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
4 16842 4 4 3 1 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13965 4992204 7 7 7 6 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
13966 4992858 7 7 7 6 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 0.0000 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
13967 4993201 7 7 7 6 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
13968 4998808 7 7 7 6 A 12 1 CHAS_2007 CNY ... 4.8617 1.0942 149.7293 4.1120 3.0696 2.7145 74.30515 87.75175 0.5354 8.49245
13969 4999709 7 7 7 6 A 12 1 CHAS_2007 CNY ... 10.6956 2.6656 23.0840 21.9179 0.6571 0.6256 33.65890 16.42490 0.3692 54.06180

13970 rows × 269 columns

1.4 对缺失率20%以下的数据使用KNN填充

from sklearn.impute import KNNImputer

# Column names at positions 278..335 of the sorted missing-rate table.
d = missing_data.index[278:336]

# Fill the remaining gaps in those columns from the 10 nearest rows.
imputer = KNNImputer(n_neighbors=10)
knn_filled = imputer.fit_transform(t2[d])
t2[d] = knn_filled
print(t2.isna().sum())
TICKER_SYMBOL    0
ACT_PUBTIME      0
PUBLISH_DATE     0
END_DATE_REP     0
END_DATE         0
                ..
TFA_TURNOVER     0
DAYS_AP          0
DAYS_INVEN       0
TA_TURNOVER      0
AR_TURNOVER      0
Length: 269, dtype: int64

1.5 删除与预测是否造假结果无关的特征因子

删除以下 10 个与预测是否造假结果无关的特征因子:股票代码、实际披露时间、发布时间、报告截止日期、截止日期、报告类型、会计区间、合并标志(1-合并,2-母公司)、会计准则、货币代码。

# Identifier and report-metadata columns that are removed before modelling.
id_columns = [
    "TICKER_SYMBOL",
    "ACT_PUBTIME",
    "PUBLISH_DATE",
    "END_DATE_REP",
    "END_DATE",
    "REPORT_TYPE",
    "FISCAL_PERIOD",
    "MERGED_FLAG",
    "ACCOUTING_STANDARDS",
    "CURRENCY_CD",
]
t2 = t2.drop(columns=id_columns)

1.6 查看是否还存在缺失值

# Number of columns that still contain at least one missing value.
t2.isnull().any(axis=0).sum()
0

1.7 对数据进行标准化

from sklearn.preprocessing import StandardScaler

# Standardize every remaining feature and wrap the resulting array in a
# frame that keeps the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(t2)
t4 = pd.DataFrame(scaled_values, columns=t2.columns)
t4
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA ... AP_TURNOVER CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER
0 -0.110544 -0.106696 -0.161667 -0.182694 -0.067294 -0.177580 -0.271929 -0.054680 -0.201905 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
1 -0.036496 1.088871 -0.182107 -0.052401 -0.085668 -0.026558 0.016419 -0.171927 0.060346 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
2 0.070766 -0.189223 0.057981 -0.140868 0.021829 -0.115114 -0.100801 0.073932 -0.023286 -0.110754 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
3 -0.039637 -0.205146 -0.184401 -0.159863 -0.062639 -0.060387 -0.197651 0.346521 -0.105029 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
4 -0.244743 -0.199970 -0.265148 -0.148300 -0.085668 -0.182752 -0.279125 -0.178592 -0.283117 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13965 -0.245654 -0.175257 -0.248184 -0.192613 -0.085668 -0.180662 -0.279316 -0.178050 -0.279115 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13966 -0.204023 -0.205182 -0.257308 -0.191965 -0.085668 -0.175087 -0.270255 -0.175323 -0.266459 -0.087742 ... -0.071554 -0.120861 -0.063158 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13967 -0.227119 -0.204127 -0.201336 -0.164736 -0.085668 -0.164288 -0.183161 -0.162139 -0.237732 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13968 0.100220 -0.204577 -0.038156 -0.128786 -0.085668 -0.128173 0.075970 -0.152256 -0.019633 -0.068500 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13969 1.609554 0.222399 1.025478 0.436742 -0.085668 0.581671 0.691256 0.019537 1.080323 -0.087742 ... 0.306581 2.042642 -0.058198 0.017576 -0.069761 -0.013200 -0.009263 -0.050238 -0.593860 -0.006262

13970 rows × 259 columns

2 划分数据集

以前5年的数据作为训练集和验证集(train),第6年的数据作为测试集(test)

# The first five years of data form the training/validation pool (train);
# the sixth year is held out as the test set (test).
n_train_rows = 11310  # row count of the first five years, per the original split

# Take explicit copies: adding a column to a bare iloc slice of t4 triggers
# pandas' SettingWithCopyWarning, because the slice may be a view of t4.
train = t4.iloc[:n_train_rows, :].copy()
test = t4.iloc[n_train_rows:, :259].copy()

# Attach the label; the assignment aligns on the row index that train
# shares with t1.
train["FLAG"] = t1["FLAG"]
train
  train["FLAG"]=t1["FLAG"]
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA ... CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER FLAG
0 -0.110544 -0.106696 -0.161667 -0.182694 -0.067294 -0.177580 -0.271929 -0.054680 -0.201905 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
1 -0.036496 1.088871 -0.182107 -0.052401 -0.085668 -0.026558 0.016419 -0.171927 0.060346 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
2 0.070766 -0.189223 0.057981 -0.140868 0.021829 -0.115114 -0.100801 0.073932 -0.023286 -0.110754 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
3 -0.039637 -0.205146 -0.184401 -0.159863 -0.062639 -0.060387 -0.197651 0.346521 -0.105029 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
4 -0.244743 -0.199970 -0.265148 -0.148300 -0.085668 -0.182752 -0.279125 -0.178592 -0.283117 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11305 -0.248180 -0.177748 -0.244404 -0.195324 -0.085668 -0.182942 -0.277525 -0.177054 -0.279415 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
11306 -0.218672 -0.196336 -0.255531 -0.193333 -0.085668 -0.160477 -0.268560 -0.174560 -0.270623 -0.087742 ... -1.587291 2.125000 -0.031456 -0.041665 -0.012392 0.005011 2.477695 -1.585968 -0.054737 0.0
11307 -0.200565 -0.204200 -0.232985 -0.177734 -0.085668 -0.175507 -0.207126 -0.160690 -0.242246 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
11308 -0.101380 -0.197020 -0.049710 -0.100780 -0.085668 -0.178231 0.042636 -0.123428 -0.095392 -0.064501 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
11309 1.326097 0.141651 0.889289 0.170126 0.029234 0.515964 0.529339 0.024492 0.854325 2.263116 ... 0.710450 -0.058365 0.016279 -0.071072 -0.013249 -0.009230 -0.050121 -0.719755 0.004747 0.0

11310 rows × 260 columns

import pandas as pd

# Write both splits to Excel workbooks.
for frame, path in ((train, "训练集、验证集.xlsx"), (test, "测试集.xlsx")):
    frame.to_excel(path)

2.1 样本不均衡处理

from collections import Counter

# Feature matrix (every column except the label) and label vector of the
# training pool. The label is excluded by name rather than by hard-coded
# row/column positions.
X_train1 = train.drop(columns="FLAG").to_numpy()
y_train1 = train.FLAG.values

# Class distribution of the label. In the run recorded with this notebook
# it is 11219 zeros against 91 ones (roughly 123:1), i.e. heavily imbalanced.
print(Counter(y_train1))
Counter({0.0: 11219, 1.0: 91})
import matplotlib.pyplot as plt

# Class counts shown in the pie chart and their labels
x = [11219, 91]
labels = ['0', '1']

# Draw the pie inside a framed 8x8 axes area, centred at (4, 4)
fig, ax = plt.subplots()
wedge_style = {"linewidth": 1, "edgecolor": "white"}
ax.pie(
    x,
    labels=labels,
    radius=3,
    center=(4, 4),
    wedgeprops=wedge_style,
    autopct='%.1f%%',
    frame=True,
)

axis_ticks = np.arange(1, 8)
ax.set(xlim=(0, 8), xticks=axis_ticks, ylim=(0, 8), yticks=axis_ticks)

plt.show()

from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE. sampling_strategy=0.2 requests
# a minority/majority ratio of 0.2, i.e. about five class-0 rows per
# class-1 row.
oversample = SMOTE(random_state=42, sampling_strategy=0.2)
resampled = oversample.fit_resample(X_train1, y_train1)
X_os, y_os = resampled
print(Counter(y_os))
Counter({0.0: 11219, 1.0: 2243})
# Dimensions of the oversampled feature matrix.
np.shape(X_os)
(13462, 259)
import pandas as pd

# Rebuild a labelled frame from the oversampled arrays: the features first,
# the label appended as the last column, then train's column names applied.
a1 = pd.DataFrame(X_os).assign(**{"259": y_os})
a1.columns = train.columns  # attach the column names
a1
# Disabled export kept from the original: a.to_excel("洗好的数据.xlsx")
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA ... CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER FLAG
0 -0.110544 -0.106696 -0.161667 -0.182694 -0.067294 -0.177580 -0.271929 -0.054680 -0.201905 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
1 -0.036496 1.088871 -0.182107 -0.052401 -0.085668 -0.026558 0.016419 -0.171927 0.060346 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
2 0.070766 -0.189223 0.057981 -0.140868 0.021829 -0.115114 -0.100801 0.073932 -0.023286 -0.110754 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
3 -0.039637 -0.205146 -0.184401 -0.159863 -0.062639 -0.060387 -0.197651 0.346521 -0.105029 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
4 -0.244743 -0.199970 -0.265148 -0.148300 -0.085668 -0.182752 -0.279125 -0.178592 -0.283117 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13457 -0.194605 -0.204111 -0.235016 -0.192665 -0.095194 -0.166159 -0.266567 -0.157112 -0.255136 -0.082809 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 1.0
13458 -0.231584 -0.196071 -0.240270 -0.175277 -0.085668 0.100651 -0.104562 -0.135625 -0.215914 -0.093468 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 1.0
13459 -0.172396 -0.090448 -0.126067 -0.083162 -0.085668 -0.109957 -0.217281 -0.088728 -0.182285 -0.087742 ... -0.459342 -0.013455 -0.022908 -0.067021 -0.013149 -0.008900 -0.020502 -0.563696 -0.051747 1.0
13460 0.220213 0.428407 0.539064 0.129878 0.930931 0.152119 0.261990 0.167506 0.343409 -0.036143 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 1.0
13461 0.015222 0.092724 0.156657 0.054408 -0.085668 -0.104527 0.074379 -0.154946 0.042608 -0.085088 ... -0.462918 0.015973 -0.025581 0.770781 0.029360 -0.008998 0.032707 -0.261838 0.219923 1.0

13462 rows × 260 columns

# Feature-only view of the oversampled data (label column removed).
a2 = a1.drop(columns=["FLAG"])
a2
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA ... AP_TURNOVER CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER
0 -0.110544 -0.106696 -0.161667 -0.182694 -0.067294 -0.177580 -0.271929 -0.054680 -0.201905 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
1 -0.036496 1.088871 -0.182107 -0.052401 -0.085668 -0.026558 0.016419 -0.171927 0.060346 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
2 0.070766 -0.189223 0.057981 -0.140868 0.021829 -0.115114 -0.100801 0.073932 -0.023286 -0.110754 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
3 -0.039637 -0.205146 -0.184401 -0.159863 -0.062639 -0.060387 -0.197651 0.346521 -0.105029 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
4 -0.244743 -0.199970 -0.265148 -0.148300 -0.085668 -0.182752 -0.279125 -0.178592 -0.283117 -0.087742 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13457 -0.194605 -0.204111 -0.235016 -0.192665 -0.095194 -0.166159 -0.266567 -0.157112 -0.255136 -0.082809 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13458 -0.231584 -0.196071 -0.240270 -0.175277 -0.085668 0.100651 -0.104562 -0.135625 -0.215914 -0.093468 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13459 -0.172396 -0.090448 -0.126067 -0.083162 -0.085668 -0.109957 -0.217281 -0.088728 -0.182285 -0.087742 ... 0.103209 -0.459342 -0.013455 -0.022908 -0.067021 -0.013149 -0.008900 -0.020502 -0.563696 -0.051747
13460 0.220213 0.428407 0.539064 0.129878 0.930931 0.152119 0.261990 0.167506 0.343409 -0.036143 ... -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13461 0.015222 0.092724 0.156657 0.054408 -0.085668 -0.104527 0.074379 -0.154946 0.042608 -0.085088 ... -0.040714 -0.462918 0.015973 -0.025581 0.770781 0.029360 -0.008998 0.032707 -0.261838 0.219923

13462 rows × 259 columns

2.2 划分训练集、验证集

# Split the oversampled five-year manufacturing data into a training part
# and a validation part (80% / 20%).
from sklearn.model_selection import train_test_split
import pandas as pd

# NOTE(review): a1 already contains the SMOTE-generated rows, so synthetic
# samples can end up in the validation part — confirm this is intended.
split_parts = train_test_split(a1, random_state=0, test_size=0.2)
train_data, test_data1 = split_parts
# Validation set
test_data1
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA ... CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER FLAG
10307 -0.237276 -0.149240 -0.209804 -0.164470 -0.085668 -0.159014 -0.254104 -0.178466 -0.256564 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
6913 2.860131 1.594538 0.480208 2.647223 -0.085668 1.097509 8.681112 1.615339 3.903271 0.161419 ... -1.074846 -0.055897 -0.007669 -0.055792 -0.012656 -0.009343 -0.045661 -0.929869 -0.047602 0.0
7530 -0.236536 -0.178238 -0.227843 -0.184537 -0.085668 -0.175392 -0.267297 -0.177575 -0.268344 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
8204 -0.247465 -0.196230 -0.194191 -0.192289 -0.085668 -0.174138 -0.264405 -0.158679 -0.264602 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
11212 0.204918 -0.167106 0.011418 -0.155947 0.087527 -0.077206 0.006010 -0.041689 0.031319 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11098 -0.204823 -0.191856 -0.257564 -0.195262 -0.085668 -0.181840 -0.262093 -0.140187 -0.261923 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
451 -0.146761 -0.059989 -0.141490 -0.150467 -0.087393 -0.178277 -0.149462 -0.135039 -0.148100 -0.064996 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
5634 -0.217310 -0.048995 -0.150073 -0.185264 -0.085668 -0.147963 -0.269352 -0.144624 -0.223935 -0.085432 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
4379 -0.225533 -0.153682 -0.128768 -0.162863 -0.111326 -0.171458 -0.151384 0.109614 -0.174713 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
7712 -0.229782 -0.188186 -0.269640 -0.172389 -0.085668 -0.171999 -0.278531 -0.171582 -0.279448 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0

2693 rows × 260 columns

# Training set (the 80% part produced by the train/validation split)
train_data
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA ... CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER FLAG
11732 -0.115619 -0.159654 -0.215413 -0.066286 -0.055511 -0.135853 -0.226678 -0.152113 -0.203331 -0.087742 ... 2.124952 -0.047612 -0.005608 -0.059694 -0.012875 -0.009210 -0.042106 0.674642 0.081817 1.0
2849 -0.232070 -0.138400 -0.136509 -0.174268 -0.085668 -0.166162 -0.091276 -0.171693 -0.196572 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
4938 0.147070 0.316345 0.139180 0.228339 0.028565 1.242537 -0.003413 -0.167509 0.159136 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
10029 -0.214163 -0.154632 -0.230625 -0.187055 -0.085668 -0.165990 -0.213737 -0.150521 -0.240806 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
5420 -0.210466 -0.171303 -0.145414 -0.145187 -0.085668 -0.153739 -0.053173 -0.062340 -0.168963 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13123 -0.244645 -0.113926 -0.200190 -0.112910 -0.095363 -0.145159 -0.109428 -0.093625 -0.201378 -0.092453 ... -0.958218 0.086090 -0.028829 -0.063935 -0.013042 -0.007976 0.044354 -1.065248 -0.053682 1.0
3264 -0.230011 -0.182265 -0.250535 -0.182594 -0.114804 -0.176142 -0.289794 -0.178597 -0.276976 -0.087742 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
9845 -0.078908 -0.201787 -0.191256 -0.195170 -0.085668 -0.155015 -0.258678 -0.178345 -0.205729 -0.087742 ... -0.812152 -0.054565 0.101067 -0.071234 -0.013280 -0.009285 -0.052963 -1.054317 -0.045676 1.0
10799 -0.239831 -0.174050 -0.177255 -0.192585 -0.085668 -0.101249 -0.273990 -0.151724 -0.254010 -0.103202 ... -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602 0.0
2732 1.702119 0.274794 -0.037051 0.189707 2.197139 -0.087228 0.200058 -0.144783 0.668527 -0.087742 ... 0.010898 -0.026220 -0.022478 -0.055215 -0.012707 -0.009076 -0.031109 0.198842 -0.051366 0.0

10769 rows × 260 columns

# Validation features only: remove the FLAG label from the validation set.
test_data2 = test_data1.drop(columns="FLAG")

3 造假指标模型建立

from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn.metrics import auc
#特征重要性选择
from xgboost import plot_importance
# Training data: the first 259 columns as features, FLAG as the label
X_train = train_data.iloc[:, :259].to_numpy(copy=True)
y_train = train_data["FLAG"].to_numpy(copy=True)
# Validation data: labels only
y = test_data1["FLAG"].to_numpy(copy=True)
# Feature table of the oversampled data (label column removed)
feature_1 = a1.drop(columns='FLAG')
feature_1
CASH_C_EQUIV NOTES_RECEIV AR PREPAYMENT INT_RECEIV OTH_RECEIV INVENTORIES OTH_CA T_CA AVAIL_FOR_SALE_FA LT_EQUITY_INVEST INVEST_REAL_ESTATE FIXED_ASSETS CIP INTAN_ASSETS GOODWILL LT_AMOR_EXP DEFER_TAX_ASSETS OTH_NCA T_NCA T_ASSETS ST_BORR NOTES_PAYABLE AP ADVANCE_RECEIPTS PAYROLL_PAYABLE TAXES_PAYABLE INT_PAYABLE DIV_PAYABLE OTH_PAYABLE NCL_WITHIN_1Y OTH_CL T_CL LT_BORR LT_PAYABLE ESTIMATED_LIAB DEFER_REVENUE DEFER_TAX_LIAB T_NCL T_LIAB PAID_IN_CAPITAL CAPITAL_RESER SPECIAL_RESER SURPLUS_RESER RETAINED_EARNINGS T_EQUITY_ATTR_P MINORITY_INT T_SH_EQUITY T_LIAB_EQUITY OTH_COMPRE_INCOME C_PAID_OTH_FINAN_A N_CF_FR_INVEST_A C_FR_BORR N_CF_OPERATE_A C_FR_CAP_CONTR C_PAID_INVEST C_FR_OTH_FINAN_A C_PAID_OTH_INVEST_A C_INF_FR_INVEST_A C_PAID_G_S ... TSE_TA C_TA TEAP_IC LT_AMOR_EXP_TA NCA_TA ST_BORR_TA NCL_TA EQU_MULTIPLIER CAP_FIX_RATIO N_TAN_A_TA REPAY_TA ID_IC AP_TA INVEN_TA CL_TA ADV_R_TA AR_TA TEAP_TA T_FIXED_A_TA FIXED_A_TA TRE_TA CA_TA INTAN_A_TA AIL_TR VAL_CHG_P_TR COGS_TR SELL_EXP_TR PERIOD_EXP_TR INV_INC_TR IT_TP OPA_P_TP OP_TR FINAN_EXP_TR VAL_CHG_P_TP NI_CUT_NI OPA_P_TR N_NOPI_TP R_TR NOPG_TR NI_TR TCOGS_TR TP_TR NOPL_TR ADMIN_EXP_TR EBITDA_TR BTAX_SURCHG_TR IT_TR EBIT_TR OP_TP DAYS_AR AP_TURNOVER CA_TURNOVER OPER_CYCLE INVEN_TURNOVER FA_TURNOVER TFA_TURNOVER DAYS_AP DAYS_INVEN TA_TURNOVER AR_TURNOVER
0 -0.110544 -0.106696 -0.161667 -0.182694 -0.067294 -0.177580 -0.271929 -0.054680 -0.201905 -0.087742 -0.142028 -0.086266 -0.200848 -0.164105 -0.277532 -0.163757 -0.234584 -0.116027 -0.114840 -0.232795 -0.228647 -0.229273 -0.225301 -0.204865 -0.163052 -0.185749 -0.160000 -0.198005 -0.077002 -0.183300 -0.187825 -0.081143 -0.254193 -0.203991 -0.139548 -0.074228 -0.097049 -0.109180 -0.209598 -0.257919 -0.252988 0.005258 -0.146179 -0.153425 -0.171217 -0.157934 -0.154017 -0.167159 -0.228648 -0.048196 -0.157168 0.275978 -0.286845 -0.143991 -0.194064 -0.048096 -0.153902 -0.159625 -0.019451 -0.183022 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
1 -0.036496 1.088871 -0.182107 -0.052401 -0.085668 -0.026558 0.016419 -0.171927 0.060346 -0.087742 -0.153631 -0.086266 -0.186912 -0.173041 -0.111736 -0.163757 -0.172424 -0.023101 -0.114840 -0.209060 -0.067471 -0.229273 0.827724 0.093545 0.092487 0.069331 0.087165 -0.198005 -0.077002 -0.053152 -0.187825 -0.076646 0.043979 -0.150792 -0.139548 -0.074228 0.100215 -0.109180 -0.186920 -0.010804 -0.109135 -0.403590 -0.146179 0.015458 0.038215 -0.141095 -0.154017 -0.152307 -0.067472 -0.048196 -0.157168 0.182495 -0.232185 0.099247 -0.194064 -0.121115 -0.153902 -0.159625 -0.154439 -0.020811 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
2 0.070766 -0.189223 0.057981 -0.140868 0.021829 -0.115114 -0.100801 0.073932 -0.023286 -0.110754 -0.067454 -0.120642 -0.200108 0.077432 -0.148904 0.586702 -0.089601 -0.113231 -0.059961 -0.105702 -0.064816 -0.316157 -0.231363 -0.137904 -0.011700 -0.191030 -0.024851 -0.198005 -0.105756 -0.108134 -0.187825 -0.076646 -0.200263 -0.150792 -0.139548 -0.074228 0.260842 -0.062466 -0.163225 -0.202685 -0.029154 0.651345 -0.146179 -0.042070 -0.044831 0.188303 -0.064006 0.157133 -0.064817 -0.048196 -0.157168 -0.431703 -0.250391 0.005059 -0.243632 0.327603 -0.153902 -0.159625 0.228867 -0.158202 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
3 -0.039637 -0.205146 -0.184401 -0.159863 -0.062639 -0.060387 -0.197651 0.346521 -0.105029 -0.087742 -0.120960 -0.086266 -0.221463 -0.199303 -0.177723 -0.171951 -0.027574 -0.127111 -0.117747 -0.236460 -0.175166 -0.230604 -0.164503 -0.125771 -0.142193 -0.045072 -0.156540 -0.209112 -0.077002 -0.148004 -0.242135 -0.076646 -0.188544 -0.116849 -0.139548 -0.074228 -0.128884 -0.039623 -0.162012 -0.192893 -0.286260 0.076386 -0.146179 -0.171182 -0.131627 -0.123079 -0.177217 -0.135453 -0.175168 -0.062454 -0.157168 0.120646 -0.198916 -0.093394 -0.245538 0.044030 -0.189950 0.105294 0.073860 -0.109958 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
4 -0.244743 -0.199970 -0.265148 -0.148300 -0.085668 -0.182752 -0.279125 -0.178592 -0.283117 -0.087742 -0.120960 -0.086266 -0.282834 -0.196378 -0.306345 -0.163757 -0.189353 -0.135653 -0.114840 -0.288391 -0.302069 -0.312094 -0.194654 -0.204394 -0.168588 -0.229982 -0.185895 -0.217441 -0.108710 -0.183572 -0.187825 -0.076646 -0.259576 -0.150792 -0.139548 -0.074228 -0.162071 -0.109180 -0.218931 -0.264605 -0.401198 -0.402463 -0.146179 -0.176925 -0.219024 -0.355697 -0.179565 -0.341002 -0.302070 -0.048196 -0.176421 0.224868 -0.285642 -0.201230 -0.194064 -0.121115 -0.153902 -0.159625 -0.156845 -0.177266 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13457 -0.194605 -0.204111 -0.235016 -0.192665 -0.095194 -0.166159 -0.266567 -0.157112 -0.255136 -0.082809 -0.144837 -0.086266 -0.234505 -0.179108 -0.247421 -0.173251 -0.202890 -0.117034 -0.115165 -0.249601 -0.267204 -0.241517 -0.233453 -0.203503 -0.160887 -0.201682 -0.171888 -0.198005 -0.077002 -0.156267 -0.187825 -0.076646 -0.251653 -0.150792 -0.139548 -0.085263 -0.142678 -0.124578 -0.215774 -0.257401 -0.138162 -0.345235 -0.146179 -0.164382 -0.194570 -0.269117 -0.180658 -0.264808 -0.267205 -0.048232 -0.157168 0.282769 -0.277337 -0.214423 -0.237723 -0.089799 -0.174903 -0.159625 -0.081430 -0.168416 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13458 -0.231584 -0.196071 -0.240270 -0.175277 -0.085668 0.100651 -0.104562 -0.135625 -0.215914 -0.093468 -0.075270 0.064067 -0.170791 -0.197127 -0.291951 -0.174709 -0.235937 -0.132742 -0.119921 -0.205796 -0.223482 -0.183847 -0.242438 -0.113028 -0.165859 -0.198643 -0.352867 -0.101828 -0.078867 0.014294 -0.243134 -0.077372 -0.178927 -0.182819 -0.139548 0.072025 -0.158947 -0.109180 -0.191181 -0.192376 -0.015725 -0.131649 -0.146179 -0.104321 -0.381811 -0.255928 -0.208644 -0.257609 -0.223483 -0.037582 -0.157168 0.219093 -0.215293 -0.297951 -0.196203 -0.126649 0.038585 -0.159625 -0.140010 -0.169934 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13459 -0.172396 -0.090448 -0.126067 -0.083162 -0.085668 -0.109957 -0.217281 -0.088728 -0.182285 -0.087742 -0.123844 -0.120632 -0.234071 -0.054888 -0.188660 -0.163757 -0.128242 -0.031042 -0.114840 -0.221879 -0.212135 -0.252805 -0.194654 -0.079903 -0.078436 -0.206288 -0.167967 0.184241 -0.077002 -0.023900 -0.187825 -0.076646 -0.186352 -0.150792 -0.139548 0.849701 -0.147337 -0.109180 -0.130924 -0.183368 0.092909 -0.037259 -0.222978 -0.104361 -0.472813 -0.247493 -0.145101 -0.243325 -0.212136 -0.048196 -0.157168 0.284709 -0.227859 -0.224250 -0.194064 -0.041249 -0.132306 -0.192049 -0.032639 -0.181799 ... 0.011101 -0.091564 -0.763986 -0.071247 0.996515 -0.014163 -0.009115 0.068415 0.295576 0.011159 -0.254426 0.988053 0.000208 0.124612 -0.011954 -0.011675 0.593479 0.011130 2.383589 2.401008 0.011301 -1.000220 0.351214 -0.008270 -0.051766 1.752778 -0.611433 -0.008477 -0.053741 -0.113486 0.027801 0.008361 -0.008459 -0.013867 0.070320 0.008370 -0.032261 0.038886 -0.008778 0.008361 -0.008370 0.008355 -0.008532 -0.008450 -0.472817 0.014996 -0.339346 0.008169 0.032168 0.010236 0.103209 -0.459342 -0.013455 -0.022908 -0.067021 -0.013149 -0.008900 -0.020502 -0.563696 -0.051747
13460 0.220213 0.428407 0.539064 0.129878 0.930931 0.152119 0.261990 0.167506 0.343409 -0.036143 -0.133788 -0.086266 0.574080 1.685421 0.752913 -0.025926 -0.010816 0.002251 0.311119 0.653651 0.514408 0.162320 -0.251056 0.137400 -0.154652 -0.148068 0.305776 2.339665 -0.077002 -0.083185 -0.018381 0.503922 0.097436 -0.155138 -0.158031 -0.074228 0.376548 0.404251 1.020602 0.333509 -0.014059 1.199035 -0.146179 0.190460 0.870264 0.871184 -0.024417 0.765729 0.514407 -0.079826 0.026608 -0.935087 0.135455 0.398947 -0.243537 -0.125528 -0.021834 -0.024758 -0.114136 0.050575 ... 0.011147 -0.127655 0.007816 -0.090990 -0.018944 -0.014594 -0.009392 -0.020234 -0.079771 0.011144 -0.158489 -0.020358 -0.016036 -0.144829 -0.011974 -0.010509 -0.168016 0.011145 -0.132080 -0.126937 0.011562 0.019110 -0.139211 -0.008475 -0.044078 0.087840 -0.156706 -0.008489 -0.041648 0.010630 0.016545 0.008475 -0.008480 -0.011569 0.034511 0.008481 -0.022897 0.038886 -0.008784 0.008472 -0.008481 0.008471 -0.008531 -0.008533 -0.003868 -0.044050 -0.054146 0.008459 0.022926 -0.012153 -0.071554 -0.120861 -0.030988 -0.022326 -0.046674 -0.012361 -0.008952 -0.031444 -0.112856 -0.047602
13461 0.015222 0.092724 0.156657 0.054408 -0.085668 -0.104527 0.074379 -0.154946 0.042608 -0.085088 -0.117501 -0.086266 0.083504 -0.131142 -0.256025 -0.170433 0.338018 -0.074333 0.013288 -0.046307 0.001719 -0.154665 0.307370 0.160793 -0.096559 -0.026013 -0.139392 -0.198598 -0.077002 0.065349 0.128240 -0.076646 0.029736 0.175793 -0.115495 -0.083274 0.024891 -0.113892 0.041858 0.034694 -0.113173 0.029863 -0.146179 -0.023991 -0.028819 -0.031130 -0.140034 -0.050337 0.001718 -0.020687 -0.167773 0.198412 -0.077565 0.225320 -0.194064 -0.131067 -0.063616 -0.128632 -0.114486 -0.004891 ... 0.011075 0.102449 -0.414292 -0.090990 -0.811893 -0.014594 -0.009182 0.034719 -0.130274 0.011120 -0.128349 0.499511 -0.014519 1.970255 -0.011936 -0.004084 -0.587565 0.011085 -0.768414 -0.818383 0.011551 0.806651 -0.422075 -0.008475 -0.038851 0.377146 -0.372627 -0.008509 -0.038106 0.069489 0.016263 0.008474 -0.008479 -0.008330 0.041396 0.008481 -0.024758 0.015333 -0.008814 0.008469 -0.008481 0.008468 -0.008534 -0.008605 -0.058418 0.213152 -0.037802 0.008452 0.024738 -0.018517 -0.040714 -0.462918 0.015973 -0.025581 0.770781 0.029360 -0.008998 0.032707 -0.261838 0.219923

13462 rows × 259 columns

3.1 Logistic Regression 调参过程

在模型中先固定参数的默认值,然后进行参数调节,进行网格搜索与专业文献查阅寻找精确度最高而又不引起模型过拟合的参数值.模型优化评价指标为 AUC 值.

在逻辑回归模型中,需要调整的参数共有 2 个:penalty 与 C, 其中 penalty 是正则化方法,C 为逻辑回归中的超参数,表示正则化强度的倒数,在模型中默认为 1,表示正则项与损失函数的比值为 1:1.C 越小,损失函数在目标函数中所占的权重越小,对模型参数的惩罚越重,正则化作用越强.

from sklearn.linear_model import LogisticRegression

# Logistic regression, tuning C. Candidate values explored: [0.05, 0.1, 0.2, 0.3].
# max_iter is raised above the default of 100 because lbfgs previously stopped
# at the iteration limit (ConvergenceWarning), so the reported AUC came from a
# model that had not converged.
clf1 = LogisticRegression(C=0.2, penalty="l2", max_iter=5000).fit(X_train, y_train)

# Probability column for the second class (label 1, scored as positive below).
y_pred_gbc = clf1.predict_proba(test_data2)[:, 1]

# ROC curve and the area under it (AUC) against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(





0.884673004897724
# Logistic regression, tuning penalty. Candidates: [l1, l2, none].
# This run disables regularisation. The string "none" is the spelling accepted
# by the scikit-learn version used here; NOTE(review): newer releases expect
# penalty=None instead -- confirm before upgrading.
# max_iter is raised because lbfgs previously hit the default iteration limit
# (ConvergenceWarning), so the reported AUC came from a non-converged model.
clf2 = LogisticRegression(penalty="none", max_iter=5000).fit(X_train, y_train)
y_pred_gbc = clf2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(





0.9013195044655719

逻辑回归属于线性判别模型,而本文所处理的数据集维度较高,故可能存在其他非线性模型能够表现的更好.

3.2 SVM 调参过程

SVM 需要调整的参数有 2 个,分别为 kernel 和 C,

其中 kernel 代表核方法,可选的函数有:“poly”:多项式核函数,“rbf”:高斯核函数 (径向基函数),“linear”:线性核函数,“sigmoid”:Sigmoid 核函数.核函数在 SVM 中发挥着重要功能,在简化向量内积运算方面起着重要作用,其中高斯核函数在非线性分类问题上应用广泛.

C 代表错误项的惩罚系数,在软间隔分类中应用较多.C 越大,对错误样本的惩罚力度就越大,训练的样本准确率越高.但是容易产生过拟合现象,机器模型的泛化能力降低.相反,C 取较小的值时,允许训练样本中存在错误分类的样本,能够增强模型的泛化能力.

from sklearn import svm

# SVM kernel tuning. Candidates: ["linear", "rbf", "sigmoid", "poly"].
# probability=True enables predict_proba, which the ROC computation needs.
svm1 = svm.SVC(kernel='rbf', probability=True)
svm1.fit(X_train, y_train)

# Score the test rows with the probability of the second class (label 1).
y_pred_gbc = svm1.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.8988533563814463
# SVM penalty-coefficient (C) tuning; the kernel is left at the SVC default.
# This run uses C=0.003.
svm2 = svm.SVC(C=0.003, probability=True)
svm2.fit(X_train, y_train)

y_pred_gbc = svm2.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.8741169691731491

3.3 RF 调参过程

在随机森林模型中需要调节的参数有 4 个,分别为

max_depth:树的最大深度、

n_estimators:树模型的数量、

min_samples_split:中间节点分支所需的最小样本数量、

min_samples_leaf:叶节点存在所需的最小样本数量.

为了防止模型出现过拟合现象,本文在调节其他参数时控制 max_depth=3.

from sklearn.ensemble import RandomForestClassifier

# Random forest, tuning max_depth. Candidates: [3, 5, 7, 8, 11, 13].
RF1 = RandomForestClassifier(max_depth=3, random_state=0)
RF1.fit(X_train, y_train)

# Probability of the second class (label 1) for each test row.
y_pred_gbc = RF1.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.8762738884087198
# Random forest, tuning n_estimators with max_depth held at 3.
# Candidates: [300, 400, 500, 600, 700].
RF2 = RandomForestClassifier(n_estimators=600, random_state=0, max_depth=3)
RF2.fit(X_train, y_train)

y_pred_gbc = RF2.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.8856746374723902
# Random forest, tuning min_samples_leaf with max_depth held at 3.
# Candidates listed by the author: [10, 20, 40, 60, 70, 80, 100].
# NOTE(review): the value used here (30) is not in that list -- confirm.
RF3 = RandomForestClassifier(min_samples_leaf=30, random_state=0, max_depth=3)
RF3.fit(X_train, y_train)

y_pred_gbc = RF3.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.8766320944972631
# Random forest, tuning min_samples_split with max_depth held at 3.
# Candidates: [60, 70, 80, 90, 110, 130].
RF4 = RandomForestClassifier(min_samples_split=80, random_state=0, max_depth=3)
RF4.fit(X_train, y_train)

y_pred_gbc = RF4.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.8757802746566791

3.4 DT 调参过程

在决策树模型中需要调整的参数共有 4 个,分别为

max_depth:树的最大深度、

min_samples_split:中间节点分支所需要的最小样本量、

min_samples_leaf:叶节点存在所需的最小样本量、

max_leaf_nodes:最大叶子节点数.

为了防止模型出现过拟合现象,本文调节其他参数对模型的 AUC 影响时控制 max_depth=6

from sklearn import tree

# Decision tree, tuning max_depth. Candidates: [5, 6, 7, 8, 9, 10].
# random_state is fixed because the tree permutes features at each split, so
# ties between equally good splits can otherwise resolve differently from run
# to run and blur the comparison between tuning cells. 0 matches the seed used
# for the random forests in this file.
DT1 = tree.DecisionTreeClassifier(max_depth=6, random_state=0).fit(X_train, y_train)

# Probability of the second class (label 1) for each test row.
y_pred_gbc = DT1.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9026846249879958
# Decision tree, tuning min_samples_leaf with max_depth held at 6.
# Candidates: [2, 3, 6, 8].
# NOTE: this rebinds DT1, replacing the max_depth-only model fitted earlier.
# random_state is fixed so AUC differences between tuning cells reflect the
# parameter change rather than random tie-breaking between splits.
DT1 = tree.DecisionTreeClassifier(
    min_samples_leaf=3, max_depth=6, random_state=0
).fit(X_train, y_train)
y_pred_gbc = DT1.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9008700662633248
# Decision tree, tuning max_leaf_nodes with max_depth held at 6.
# Candidates: [50, 60, 70, 80, 100].
# random_state is fixed so AUC differences between tuning cells reflect the
# parameter change rather than random tie-breaking between splits.
DT2 = tree.DecisionTreeClassifier(
    max_leaf_nodes=70, max_depth=6, random_state=0
).fit(X_train, y_train)
y_pred_gbc = DT2.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9006472678382791
# To limit overfitting, max_depth is held at 6 while the other parameters are tuned.
# Decision tree, tuning min_samples_split. Candidates: [2, 3, 4, 5, 6, 8].
# random_state is fixed so AUC differences between tuning cells reflect the
# parameter change rather than random tie-breaking between splits.
DT3 = tree.DecisionTreeClassifier(
    min_samples_split=3, max_depth=6, random_state=0
).fit(X_train, y_train)
y_pred_gbc = DT3.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9005358686257564

——筛选在DT算法中特征重要性系数前20个指标

# Rank the features by the importance the DT3 model assigned to them.
# The raw importances are multiplied by 10000 (presumably for readability).
DT_importances = DT3.feature_importances_ * 10000
DT = (
    pd.Series(DT_importances, index=a2.columns)  # label each score with its column
    .sort_values(ascending=False)                # most important first
    .to_frame(name='feature_importances')
)
# Show the 20 highest-ranked features.
DT.head(20)
feature_importances
DILUTED_EPS 1876.459630
ESTIMATED_LIAB 1418.956658
RETAINED_EARNINGS 1274.737100
ASSETS_DISP_GAIN 1189.341616
C_FR_CAP_CONTR 350.146297
CASH_C_EQUIV 305.869870
DEFER_TAX_LIAB 305.319042
INT_RECEIV 278.811896
CURRENT_RATIO 254.761597
N_CF_FR_FINAN_A 245.185566
OTH_GAIN 214.434895
GOODWILL 207.893979
N_INCOME 199.309895
CL_TA 190.726453
NOPERATE_EXP 180.024039
OTH_CL 167.889986
GAIN_INVEST 160.657452
DIV_PAYABLE 157.439514
IT_TR 117.051125
A_J_INVEST_INCOME 101.434231

3.5 XGBoost 调参过程

XGBoost 需要调节的参数共有 9 个,下面本文只介绍对该模型相对重要的两个参数:

第一个参数是 n_estimators,在 XGBoost 模型中这个参数发挥着重要作用,表示该模型中分类器的个数,该参数的值越大,模型的学习能力就会越强.

第二个参数是 learning_rate,表示集成模型中的学习速率,又称步长,用于控制每次迭代的更新幅度;有效地调节该参数值能够防止模型出现过拟合现象,XGBoost 中默认值为 0.3,调节范围为[0,1].

本文为了尽可能防止模型出现过拟合,在调节其他参数的值时将学习率设定为0.001.

from xgboost import XGBClassifier
from xgboost import plot_importance

# XGBoost, tuning learning_rate. Candidates: [0.001, 0.002, 0.003, 0.0035].
XGBoost1 = XGBClassifier(learning_rate=0.001)
XGBoost1.fit(X_train, y_train)

# Probability of the second class (label 1) for each test row.
y_pred_gbc = XGBoost1.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[22:58:21] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089825218476904
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning n_estimators. Candidates: [100, 110, 120, 200, 300].
XGBoost2 = XGBClassifier(n_estimators=120, learning_rate=0.001)
XGBoost2.fit(X_train, y_train)

y_pred_gbc = XGBoost2.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[22:58:59] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.911076058772688
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning max_depth. Candidates: [2, 3, 5, 6, 7, 10].
XGBoost3 = XGBClassifier(max_depth=6, learning_rate=0.001)
XGBoost3.fit(X_train, y_train)

y_pred_gbc = XGBoost3.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[22:59:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089825218476904
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning min_child_weight. Candidates: [1, 3, 4, 5, 7, 8].
XGBoost4 = XGBClassifier(min_child_weight=3, learning_rate=0.001)
XGBoost4.fit(X_train, y_train)

y_pred_gbc = XGBoost4.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:00:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9118438490348603
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning gamma. Candidates listed by the author: [0.2, 0.3, 0.5, 0.6, 0.7, 0.8].
# NOTE(review): the value used here (0.4) is not in that list -- confirm.
XGBoost5 = XGBClassifier(gamma=0.4, learning_rate=0.001)
XGBoost5.fit(X_train, y_train)

y_pred_gbc = XGBoost5.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:01:06] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089844425237683
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning colsample_bytree (fraction of columns sampled per tree).
# Candidates: [0.6, 0.7, 0.8, 0.85, 0.9].
# The keyword was previously misspelled "colsample_btree"; XGBoost warned that
# the parameter "might not be used" and the AUC equalled the untuned baseline,
# so the setting never took effect.
XGBoost7 = XGBClassifier(colsample_bytree=0.85, learning_rate=0.001).fit(X_train, y_train)
y_pred_gbc = XGBoost7.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
[23:01:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "colsample_btree" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)



[23:01:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089825218476904
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning reg_alpha. Candidates: [0.1, 0.2, 0.25, 0.3, 0.35].
XGBoost8 = XGBClassifier(reg_alpha=0.2, learning_rate=0.001)
XGBoost8.fit(X_train, y_train)

y_pred_gbc = XGBoost8.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:02:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9087702871410737
# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# XGBoost, tuning reg_lambda. Candidates: [0.15, 0.3, 0.5, 0.8].
XGBoost9 = XGBClassifier(reg_lambda=0.3, learning_rate=0.001)
XGBoost9.fit(X_train, y_train)

y_pred_gbc = XGBoost9.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:03:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9160871026601364

——筛选在XGBoost算法中特征重要性系数前20个指标

# Rank the features by the importance the XGBoost9 model assigned to them.
# The raw importances are multiplied by 10000 (presumably for readability).
XGBoost_importances = XGBoost9.feature_importances_ * 10000
XGBoost = (
    pd.Series(XGBoost_importances, index=a2.columns)  # label each score with its column
    .sort_values(ascending=False)                     # most important first
    .to_frame(name='feature_importances')
)
# Show the 20 highest-ranked features.
XGBoost.head(20)
feature_importances
DILUTED_EPS 1092.639893
ASSETS_DISP_GAIN 839.089539
RETAINED_EARNINGS 501.631378
T_CA 428.929138
ESTIMATED_LIAB 363.835968
DEFER_TAX_LIAB 311.677368
CURRENT_RATIO 294.901703
N_CF_FR_FINAN_A 284.164001
GOODWILL 239.289185
N_INCOME 238.080658
CL_TA 224.853226
INVENTORIES 224.492996
ROE_A 214.091843
NOPERATE_EXP 196.429474
OTH_CA 192.453537
OTH_CL 191.517349
C_FR_MINO_S_SUBS 191.478973
GAIN_INVEST 187.904297
C_INF_FR_INVEST_A 180.430847
CASH_C_EQUIV 178.665329

3.6 GBM 调参过程

该模型需要调节的参数共有 7 个,本文选取了对该模型相对重要的几个参数进行调节.

第一个参数是:max_depth:模型中树的最大深度.

第二个参数是 n_estimators:模型中分类器的数量,该参数在模型中的作用较为强大,可以有效的提升模型的学习能力.

第三个参数是 learning_rate:学习率,该参数的有效调节对模型是否会过拟合发挥着重要作用,参数的取值范围为 [0,1],默认值为 0.1.为了能够有效的提升模型的泛化能力并且防止模型出现过拟合现象,本文经过网格搜索法并且查阅大量机器学习专业文献将 learning_rate 设置为 0.0088.

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting, tuning learning_rate.
# Candidates: [0.004, 0.007, 0.0076, 0.0088, 0.009].
GBM1 = GradientBoostingClassifier(learning_rate=0.0088)
GBM1.fit(X_train, y_train)

# Probability of the second class (label 1) for each test row.
y_pred_gbc = GBM1.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9016575434552964
# Gradient boosting, tuning n_estimators with learning_rate held at 0.0088.
# Candidates: [110, 120, 130, 140, 160].
GBM2 = GradientBoostingClassifier(n_estimators=130, learning_rate=0.0088)
GBM2.fit(X_train, y_train)

y_pred_gbc = GBM2.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9171185057140113
# Gradient boosting, tuning subsample with learning_rate held at 0.0088.
# Candidates: [0.1, 0.2, 0.25, 0.3, 0.4].
# subsample < 1.0 fits each tree on a randomly drawn fraction of the rows, so
# a seed is fixed to make the reported AUC repeatable; 0 matches the seed used
# for the random forests in this file.
GBM3 = GradientBoostingClassifier(
    subsample=0.3, learning_rate=0.0088, random_state=0
).fit(X_train, y_train)
y_pred_gbc = GBM3.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9131902429655239
# Gradient boosting, tuning min_samples_split with learning_rate held at 0.0088.
# Candidates: [2, 3, 4, 5, 6].
GBM4 = GradientBoostingClassifier(min_samples_split=4, learning_rate=0.0088)
GBM4.fit(X_train, y_train)

y_pred_gbc = GBM4.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9016575434552964
# Gradient boosting, tuning min_samples_leaf with learning_rate held at 0.0088.
# Candidates: [2, 3, 4, 6, 7, 9].
GBM5 = GradientBoostingClassifier(min_samples_leaf=3, learning_rate=0.0088)
GBM5.fit(X_train, y_train)

y_pred_gbc = GBM5.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9021089023336215
# Gradient boosting, tuning max_depth with learning_rate held at 0.0088.
# Candidates: [2, 3, 4, 5, 8].
GBM6 = GradientBoostingClassifier(max_depth=3, learning_rate=0.0088)
GBM6.fit(X_train, y_train)

y_pred_gbc = GBM6.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9016575434552964
# Gradient boosting, tuning validation_fraction with learning_rate held at 0.0088.
# Candidates: [0.1, 0.3, 0.4, 0.5, 0.7, 0.8].
# NOTE(review): per the scikit-learn docs validation_fraction only takes effect
# when n_iter_no_change is set, which it is not here; the AUC printed for this
# cell equals the learning_rate-only run -- confirm this cell tunes anything.
GBM7 = GradientBoostingClassifier(validation_fraction=0.1, learning_rate=0.0088)
GBM7.fit(X_train, y_train)

y_pred_gbc = GBM7.predict_proba(test_data2)[:, 1]

# Area under the ROC curve against the true labels y.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
0.9016575434552964

——筛选在GBM算法中特征重要性系数前20个指标

# Rank the features by the importance the GBM2 model assigned to them.
# The raw importances are multiplied by 10000 (presumably for readability).
GBM_importances = GBM2.feature_importances_ * 10000
GBM = (
    pd.Series(GBM_importances, index=a2.columns)  # label each score with its column
    .sort_values(ascending=False)                 # most important first
    .to_frame(name='feature_importances')
)
# Show the 20 highest-ranked features.
GBM.head(20)
feature_importances
DILUTED_EPS 2545.405622
ASSETS_DISP_GAIN 1393.862699
RETAINED_EARNINGS 1201.324132
ESTIMATED_LIAB 1123.032285
NCA_DISPLOSS 496.913323
C_FR_CAP_CONTR 429.072252
OTH_GAIN 392.266813
NOPERATE_EXP 216.007424
PROC_SELL_INVEST 191.362051
T_CA 179.957164
DEFER_TAX_LIAB 172.171302
N_CF_FR_INVEST_A 157.682156
INT_PAYABLE 157.357831
DIV_PAYABLE 103.399260
C_PAID_OTH_FINAN_A 95.313147
INT_RECEIV 87.237988
REV_PS 86.175900
C_INF_FR_INVEST_A 76.066347
ADVANCE_RECEIPTS 64.396530
T_EQUITY_ATTR_P 62.686717