制造业

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
color = sns.color_palette()

from scipy import stats
from scipy.stats import norm, skew

# Load the raw table; the path is relative to the notebook's working directory.
t1 = pd.read_csv("制造业.csv")
# Feature-only frame: every column except the FLAG label.
t1_train = t1.drop(columns=["FLAG"])
# Notebook display of the raw table.
t1

	TICKER_SYMBOL	ACT_PUBTIME	PUBLISH_DATE	END_DATE_REP	END_DATE	REPORT_TYPE	FISCAL_PERIOD	MERGED_FLAG	ACCOUTING_STANDARDS	CURRENCY_CD	...	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER	FLAG
0	4019	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
1	8166	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
2	11737	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
3	16479	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
4	16842	4	4	3	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13965	4992204	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13966	4992858	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	0.000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13967	4993201	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13968	4998808	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13969	4999709	7	7	7	6	A	12	1	CHAS_2007	CNY	...	2.6656	23.084	21.9179	0.6571	0.6256	33.6589	16.4249	0.3692	54.0618	NaN

13970 rows × 363 columns

1 数据预处理

1.1 计算缺失率，并降序排序

# Per-column count of missing cells, converted to a percentage of all rows and
# ordered from the most to the least incomplete column.
na_counts = t1_train.isna().sum()
all_data_na = (na_counts / len(t1_train) * 100).sort_values(ascending=False)

# Same figures as a one-column DataFrame for display and later slicing.
missing_data = pd.DataFrame({'missing_data': all_data_na})
missing_data

	missing_data
ACCRUED_EXP	99.971367
N_INC_BORR_OTH_FI	99.806729
PERPETUAL_BOND_L	99.634932
PREFERRED_STOCK_L	99.606299
PREFERRED_STOCK_E	99.591983
...	...
T_COMPR_INCOME	0.000000
N_INCOME_ATTR_P	0.000000
FINAN_EXP	0.000000
ACT_PUBTIME	0.000000
TICKER_SYMBOL	0.000000

362 rows × 1 columns

将缺失率用图表的方式展示

# Bar chart of the missing-value percentage of every feature.
f, ax = plt.subplots(figsize=(30, 15))
# The rotation must be a number (or 'vertical' / 'horizontal'); the string
# '90' is rejected with a ValueError by newer Matplotlib releases.
plt.xticks(rotation=90)
sns.barplot(x=all_data_na.index, y=all_data_na)  # one bar per feature
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)

Text(0.5, 1.0, 'Percent missing data by feature')

# Labels of the features whose missing rate is above 80 %.
missing_data_count1 = all_data_na[all_data_na > 80].index

# Labels of the features whose missing rate is below 20 %.
missing_data_count2 = all_data_na[all_data_na < 20].index

# Report the size of each group.
print(missing_data_count1.shape, missing_data_count2.shape)

(93,) (84,)

# Features whose missing rate is above 80 %.
# missing_data is ordered by missing rate, highest first, so these features
# are its leading rows. Their number is derived from the threshold instead of
# being hard-coded, which keeps the selection right if the input data changes.
high_missing_count = int((missing_data['missing_data'] > 80).sum())
a = missing_data.values[:high_missing_count]
x = pd.DataFrame(a, index=missing_data.index[:high_missing_count])
x

	0
ACCRUED_EXP	99.971367
N_INC_BORR_OTH_FI	99.806729
PERPETUAL_BOND_L	99.634932
PREFERRED_STOCK_L	99.606299
PREFERRED_STOCK_E	99.591983
...	...
OP_CL	81.338583
R_D	81.159628
N_CF_OPA_LIAB	80.952040
N_CF_NFA_LIAB	80.952040
OP_TL	80.916249

93 rows × 1 columns

1.2 删除缺失率在80%以上的特征

# Remove the columns listed in x (the features with more than 80 % missing
# values) from the feature frame.
t2 = t1_train.drop(labels=x.index, axis=1)
t2

	TICKER_SYMBOL	ACT_PUBTIME	PUBLISH_DATE	END_DATE_REP	END_DATE	REPORT_TYPE	FISCAL_PERIOD	MERGED_FLAG	ACCOUTING_STANDARDS	CURRENCY_CD	...	AP_TURNOVER	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER
0	4019	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	8166	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	11737	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	16479	3	3	2	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	16842	4	4	3	1	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13965	4992204	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13966	4992858	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	0.000	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13967	4993201	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13968	4998808	7	7	7	6	A	12	1	CHAS_2007	CNY	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
13969	4999709	7	7	7	6	A	12	1	CHAS_2007	CNY	...	10.6956	2.6656	23.084	21.9179	0.6571	0.6256	33.6589	16.4249	0.3692	54.0618

13970 rows × 269 columns

1.3 对缺失率在20%到80%之间的特征用中位数填充

# Columns whose missing rate lies in the 20 %-80 % band (both ends inclusive,
# i.e. everything not covered by the ">80" and "<20" groups). The band is
# selected by rate rather than by hard-coded row positions.
missing_rate = missing_data['missing_data']
b = missing_data.index[(missing_rate >= 20) & (missing_rate <= 80)]
# Replace the gaps of each such column with that column's median.
for o in b:
    t2[o] = t2[o].fillna(t2[o].median())
t2

	TICKER_SYMBOL	ACT_PUBTIME	PUBLISH_DATE	END_DATE_REP	END_DATE	REPORT_TYPE	FISCAL_PERIOD	MERGED_FLAG	ACCOUTING_STANDARDS	CURRENCY_CD	...	AP_TURNOVER	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER
0	4019	3	3	2	1	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
1	8166	3	3	2	1	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
2	11737	3	3	2	1	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
3	16479	3	3	2	1	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
4	16842	4	4	3	1	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13965	4992204	7	7	7	6	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
13966	4992858	7	7	7	6	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	0.0000	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
13967	4993201	7	7	7	6	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
13968	4998808	7	7	7	6	A	12	1	CHAS_2007	CNY	...	4.8617	1.0942	149.7293	4.1120	3.0696	2.7145	74.30515	87.75175	0.5354	8.49245
13969	4999709	7	7	7	6	A	12	1	CHAS_2007	CNY	...	10.6956	2.6656	23.0840	21.9179	0.6571	0.6256	33.65890	16.42490	0.3692	54.06180

13970 rows × 269 columns

1.4 对缺失率20%以下的数据使用KNN填充

from sklearn.impute import KNNImputer

# Column labels at positions 278-335 of the missing-rate ranking.
# NOTE(review): the columns ranked from position 336 onwards are not imputed
# here - presumably they contain no missing values; confirm.
d = missing_data.index[278:336]

# Impute the remaining gaps in those columns with a 10-neighbour KNN imputer.
imputer = KNNImputer(n_neighbors=10)
knn_filled = imputer.fit_transform(t2[d])
t2[d] = knn_filled
# Per-column count of values that are still missing.
print(t2.isnull().sum())

TICKER_SYMBOL    0
ACT_PUBTIME      0
PUBLISH_DATE     0
END_DATE_REP     0
END_DATE         0
                ..
TFA_TURNOVER     0
DAYS_AP          0
DAYS_INVEN       0
TA_TURNOVER      0
AR_TURNOVER      0
Length: 269, dtype: int64

1.5 删除与预测是否造假结果无关的特征因子

删除股票代码、实际披露时间、发布时间、报告截止日期、截止日期、报告类型、会计期间、合并标志（1-合并，2-母公司）、会计准则、货币代码，共 10 个与预测是否造假结果无关的特征因子

# Identifier and reporting-metadata columns that are excluded from the
# feature set.
metadata_columns = [
    "TICKER_SYMBOL",
    "ACT_PUBTIME",
    "PUBLISH_DATE",
    "END_DATE_REP",
    "END_DATE",
    "REPORT_TYPE",
    "FISCAL_PERIOD",
    "MERGED_FLAG",
    "ACCOUTING_STANDARDS",
    "CURRENCY_CD",
]
t2 = t2.drop(columns=metadata_columns)

1.6 查看是否还存在缺失值

# Number of columns that still contain at least one missing value.
t2.isnull().any(axis=0).sum()

1.7 对数据进行标准化

from sklearn.preprocessing import StandardScaler

# Standardise every feature column and wrap the resulting array in a
# DataFrame that carries the original column labels.
# NOTE(review): the scaler is fitted on all rows, including the rows that are
# later split off as the test set - consider fitting on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(t2)
t4 = pd.DataFrame(scaled_values, columns=t2.columns)
t4

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	...	AP_TURNOVER	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER
0	-0.110544	-0.106696	-0.161667	-0.182694	-0.067294	-0.177580	-0.271929	-0.054680	-0.201905	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
1	-0.036496	1.088871	-0.182107	-0.052401	-0.085668	-0.026558	0.016419	-0.171927	0.060346	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
2	0.070766	-0.189223	0.057981	-0.140868	0.021829	-0.115114	-0.100801	0.073932	-0.023286	-0.110754	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
3	-0.039637	-0.205146	-0.184401	-0.159863	-0.062639	-0.060387	-0.197651	0.346521	-0.105029	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
4	-0.244743	-0.199970	-0.265148	-0.148300	-0.085668	-0.182752	-0.279125	-0.178592	-0.283117	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13965	-0.245654	-0.175257	-0.248184	-0.192613	-0.085668	-0.180662	-0.279316	-0.178050	-0.279115	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13966	-0.204023	-0.205182	-0.257308	-0.191965	-0.085668	-0.175087	-0.270255	-0.175323	-0.266459	-0.087742	...	-0.071554	-0.120861	-0.063158	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13967	-0.227119	-0.204127	-0.201336	-0.164736	-0.085668	-0.164288	-0.183161	-0.162139	-0.237732	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13968	0.100220	-0.204577	-0.038156	-0.128786	-0.085668	-0.128173	0.075970	-0.152256	-0.019633	-0.068500	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13969	1.609554	0.222399	1.025478	0.436742	-0.085668	0.581671	0.691256	0.019537	1.080323	-0.087742	...	0.306581	2.042642	-0.058198	0.017576	-0.069761	-0.013200	-0.009263	-0.050238	-0.593860	-0.006262

13970 rows × 259 columns

2 划分数据集

以前5年数据为训练集、验证集train,第6年为测试集test

# First five years of data -> train (training + validation pool);
# sixth year -> test.
# train is copied because a column is added to it below; assigning to a bare
# iloc slice triggers pandas' SettingWithCopyWarning.
train = t4.iloc[:11310, :].copy()
test = t4.iloc[11310:, :259]

# Attach the label column; rows are matched to t1 by index.
train["FLAG"] = t1["FLAG"]
train

  train["FLAG"]=t1["FLAG"]

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	...	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER	FLAG
0	-0.110544	-0.106696	-0.161667	-0.182694	-0.067294	-0.177580	-0.271929	-0.054680	-0.201905	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
1	-0.036496	1.088871	-0.182107	-0.052401	-0.085668	-0.026558	0.016419	-0.171927	0.060346	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
2	0.070766	-0.189223	0.057981	-0.140868	0.021829	-0.115114	-0.100801	0.073932	-0.023286	-0.110754	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
3	-0.039637	-0.205146	-0.184401	-0.159863	-0.062639	-0.060387	-0.197651	0.346521	-0.105029	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
4	-0.244743	-0.199970	-0.265148	-0.148300	-0.085668	-0.182752	-0.279125	-0.178592	-0.283117	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11305	-0.248180	-0.177748	-0.244404	-0.195324	-0.085668	-0.182942	-0.277525	-0.177054	-0.279415	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
11306	-0.218672	-0.196336	-0.255531	-0.193333	-0.085668	-0.160477	-0.268560	-0.174560	-0.270623	-0.087742	...	-1.587291	2.125000	-0.031456	-0.041665	-0.012392	0.005011	2.477695	-1.585968	-0.054737	0.0
11307	-0.200565	-0.204200	-0.232985	-0.177734	-0.085668	-0.175507	-0.207126	-0.160690	-0.242246	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
11308	-0.101380	-0.197020	-0.049710	-0.100780	-0.085668	-0.178231	0.042636	-0.123428	-0.095392	-0.064501	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
11309	1.326097	0.141651	0.889289	0.170126	0.029234	0.515964	0.529339	0.024492	0.854325	2.263116	...	0.710450	-0.058365	0.016279	-0.071072	-0.013249	-0.009230	-0.050121	-0.719755	0.004747	0.0

11310 rows × 260 columns

import pandas as pd

# Write both splits to Excel workbooks in the working directory.
for workbook_name, frame in (("训练集、验证集.xlsx", train), ("测试集.xlsx", test)):
    frame.to_excel(workbook_name)

2.1 样本不均衡处理

from collections import Counter

# Feature matrix: every column of train except the FLAG label. Selecting by
# name avoids repeating the frame's row and column counts as magic numbers.
X_train1 = np.array(train.drop(columns="FLAG"))
# Label vector.
y_train1 = train.FLAG.values

# Class distribution of the labels. The classes are heavily imbalanced
# (the recorded run printed 11219 zeros against 91 ones).
print(Counter(y_train1))

Counter({0.0: 11219, 1.0: 91})

import matplotlib.pyplot as plt

# Class counts for the chart, computed from the label vector instead of being
# typed in by hand, so the chart cannot drift away from the data.
x = [int((y_train1 == 0).sum()), int((y_train1 == 1).sum())]
labels = ['0', '1']

# Pie chart of the two classes, drawn inside a framed 8 x 8 axes.
fig, ax = plt.subplots()
ax.pie(x, radius=3, center=(4, 4), labels=labels,
       wedgeprops={"linewidth": 1, "edgecolor": "white"}, autopct='%.1f%%', frame=True)

ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))

plt.show()

from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE. sampling_strategy=0.2 asks for a
# minority:majority ratio of 0.2, i.e. roughly five 0-rows for every 1-row
# (the recorded run printed 11219 against 2243).
oversample = SMOTE(sampling_strategy=0.2, random_state=42)
resampled = oversample.fit_resample(X_train1, y_train1)
X_os, y_os = resampled
print(Counter(y_os))

Counter({0.0: 11219, 1.0: 2243})

# Dimensions (rows, columns) of the resampled feature matrix.
X_os.shape

(13462, 259)

import pandas as pd

# Rebuild a labelled DataFrame from the resampled arrays: the features take
# every column label of train except the last, and the labels take the last.
feature_labels = train.columns[:-1]
label_name = train.columns[-1]
a1 = pd.DataFrame(X_os, columns=feature_labels)
a1[label_name] = y_os
a1
# (The Excel export of the cleaned data is disabled in this notebook.)

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	...	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER	FLAG
0	-0.110544	-0.106696	-0.161667	-0.182694	-0.067294	-0.177580	-0.271929	-0.054680	-0.201905	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
1	-0.036496	1.088871	-0.182107	-0.052401	-0.085668	-0.026558	0.016419	-0.171927	0.060346	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
2	0.070766	-0.189223	0.057981	-0.140868	0.021829	-0.115114	-0.100801	0.073932	-0.023286	-0.110754	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
3	-0.039637	-0.205146	-0.184401	-0.159863	-0.062639	-0.060387	-0.197651	0.346521	-0.105029	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
4	-0.244743	-0.199970	-0.265148	-0.148300	-0.085668	-0.182752	-0.279125	-0.178592	-0.283117	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13457	-0.194605	-0.204111	-0.235016	-0.192665	-0.095194	-0.166159	-0.266567	-0.157112	-0.255136	-0.082809	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	1.0
13458	-0.231584	-0.196071	-0.240270	-0.175277	-0.085668	0.100651	-0.104562	-0.135625	-0.215914	-0.093468	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	1.0
13459	-0.172396	-0.090448	-0.126067	-0.083162	-0.085668	-0.109957	-0.217281	-0.088728	-0.182285	-0.087742	...	-0.459342	-0.013455	-0.022908	-0.067021	-0.013149	-0.008900	-0.020502	-0.563696	-0.051747	1.0
13460	0.220213	0.428407	0.539064	0.129878	0.930931	0.152119	0.261990	0.167506	0.343409	-0.036143	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	1.0
13461	0.015222	0.092724	0.156657	0.054408	-0.085668	-0.104527	0.074379	-0.154946	0.042608	-0.085088	...	-0.462918	0.015973	-0.025581	0.770781	0.029360	-0.008998	0.032707	-0.261838	0.219923	1.0

13462 rows × 260 columns

# Resampled data without the FLAG label column.
a2 = a1.drop(columns=["FLAG"])
a2

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	...	AP_TURNOVER	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER
0	-0.110544	-0.106696	-0.161667	-0.182694	-0.067294	-0.177580	-0.271929	-0.054680	-0.201905	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
1	-0.036496	1.088871	-0.182107	-0.052401	-0.085668	-0.026558	0.016419	-0.171927	0.060346	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
2	0.070766	-0.189223	0.057981	-0.140868	0.021829	-0.115114	-0.100801	0.073932	-0.023286	-0.110754	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
3	-0.039637	-0.205146	-0.184401	-0.159863	-0.062639	-0.060387	-0.197651	0.346521	-0.105029	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
4	-0.244743	-0.199970	-0.265148	-0.148300	-0.085668	-0.182752	-0.279125	-0.178592	-0.283117	-0.087742	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13457	-0.194605	-0.204111	-0.235016	-0.192665	-0.095194	-0.166159	-0.266567	-0.157112	-0.255136	-0.082809	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13458	-0.231584	-0.196071	-0.240270	-0.175277	-0.085668	0.100651	-0.104562	-0.135625	-0.215914	-0.093468	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13459	-0.172396	-0.090448	-0.126067	-0.083162	-0.085668	-0.109957	-0.217281	-0.088728	-0.182285	-0.087742	...	0.103209	-0.459342	-0.013455	-0.022908	-0.067021	-0.013149	-0.008900	-0.020502	-0.563696	-0.051747
13460	0.220213	0.428407	0.539064	0.129878	0.930931	0.152119	0.261990	0.167506	0.343409	-0.036143	...	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13461	0.015222	0.092724	0.156657	0.054408	-0.085668	-0.104527	0.074379	-0.154946	0.042608	-0.085088	...	-0.040714	-0.462918	0.015973	-0.025581	0.770781	0.029360	-0.008998	0.032707	-0.261838	0.219923

13462 rows × 259 columns

2.2 划分训练集、验证集

import pandas as pd
from sklearn.model_selection import train_test_split

# Cut the first-five-year manufacturing data into a training part and a
# validation part (20 % validation, fixed random seed).
# NOTE(review): a1 was produced by SMOTE before this split, so synthetic rows
# can end up in the validation part - consider splitting first and
# oversampling only the training part.
split_parts = train_test_split(a1, test_size=0.2, random_state=0)
train_data, test_data1 = split_parts
# Validation set
test_data1

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	...	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER	FLAG
10307	-0.237276	-0.149240	-0.209804	-0.164470	-0.085668	-0.159014	-0.254104	-0.178466	-0.256564	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
6913	2.860131	1.594538	0.480208	2.647223	-0.085668	1.097509	8.681112	1.615339	3.903271	0.161419	...	-1.074846	-0.055897	-0.007669	-0.055792	-0.012656	-0.009343	-0.045661	-0.929869	-0.047602	0.0
7530	-0.236536	-0.178238	-0.227843	-0.184537	-0.085668	-0.175392	-0.267297	-0.177575	-0.268344	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
8204	-0.247465	-0.196230	-0.194191	-0.192289	-0.085668	-0.174138	-0.264405	-0.158679	-0.264602	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
11212	0.204918	-0.167106	0.011418	-0.155947	0.087527	-0.077206	0.006010	-0.041689	0.031319	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11098	-0.204823	-0.191856	-0.257564	-0.195262	-0.085668	-0.181840	-0.262093	-0.140187	-0.261923	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
451	-0.146761	-0.059989	-0.141490	-0.150467	-0.087393	-0.178277	-0.149462	-0.135039	-0.148100	-0.064996	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
5634	-0.217310	-0.048995	-0.150073	-0.185264	-0.085668	-0.147963	-0.269352	-0.144624	-0.223935	-0.085432	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
4379	-0.225533	-0.153682	-0.128768	-0.162863	-0.111326	-0.171458	-0.151384	0.109614	-0.174713	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
7712	-0.229782	-0.188186	-0.269640	-0.172389	-0.085668	-0.171999	-0.278531	-0.171582	-0.279448	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0

2693 rows × 260 columns

# Training set
train_data

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	...	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER	FLAG
11732	-0.115619	-0.159654	-0.215413	-0.066286	-0.055511	-0.135853	-0.226678	-0.152113	-0.203331	-0.087742	...	2.124952	-0.047612	-0.005608	-0.059694	-0.012875	-0.009210	-0.042106	0.674642	0.081817	1.0
2849	-0.232070	-0.138400	-0.136509	-0.174268	-0.085668	-0.166162	-0.091276	-0.171693	-0.196572	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
4938	0.147070	0.316345	0.139180	0.228339	0.028565	1.242537	-0.003413	-0.167509	0.159136	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
10029	-0.214163	-0.154632	-0.230625	-0.187055	-0.085668	-0.165990	-0.213737	-0.150521	-0.240806	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
5420	-0.210466	-0.171303	-0.145414	-0.145187	-0.085668	-0.153739	-0.053173	-0.062340	-0.168963	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13123	-0.244645	-0.113926	-0.200190	-0.112910	-0.095363	-0.145159	-0.109428	-0.093625	-0.201378	-0.092453	...	-0.958218	0.086090	-0.028829	-0.063935	-0.013042	-0.007976	0.044354	-1.065248	-0.053682	1.0
3264	-0.230011	-0.182265	-0.250535	-0.182594	-0.114804	-0.176142	-0.289794	-0.178597	-0.276976	-0.087742	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
9845	-0.078908	-0.201787	-0.191256	-0.195170	-0.085668	-0.155015	-0.258678	-0.178345	-0.205729	-0.087742	...	-0.812152	-0.054565	0.101067	-0.071234	-0.013280	-0.009285	-0.052963	-1.054317	-0.045676	1.0
10799	-0.239831	-0.174050	-0.177255	-0.192585	-0.085668	-0.101249	-0.273990	-0.151724	-0.254010	-0.103202	...	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602	0.0
2732	1.702119	0.274794	-0.037051	0.189707	2.197139	-0.087228	0.200058	-0.144783	0.668527	-0.087742	...	0.010898	-0.026220	-0.022478	-0.055215	-0.012707	-0.009076	-0.031109	0.198842	-0.051366	0.0

10769 rows × 260 columns

# Validation features: the validation set without its FLAG label column.
test_data2 = test_data1.drop(columns=["FLAG"])

3 造假指标模型建立

from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn.metrics import auc
#特征重要性选择
from xgboost import plot_importance

# Training data: feature matrix (all columns except FLAG) and label vector.
X_train = np.array(train_data.drop(columns="FLAG"))
y_train = np.array(train_data.FLAG)
# Validation data: ground-truth labels.
y = np.array(test_data1.FLAG)

# Feature columns of the resampled data (FLAG label removed).
feature_1 = a1.drop(columns=['FLAG'])
feature_1

	CASH_C_EQUIV	NOTES_RECEIV	AR	PREPAYMENT	INT_RECEIV	OTH_RECEIV	INVENTORIES	OTH_CA	T_CA	AVAIL_FOR_SALE_FA	LT_EQUITY_INVEST	INVEST_REAL_ESTATE	FIXED_ASSETS	CIP	INTAN_ASSETS	GOODWILL	LT_AMOR_EXP	DEFER_TAX_ASSETS	OTH_NCA	T_NCA	T_ASSETS	ST_BORR	NOTES_PAYABLE	AP	ADVANCE_RECEIPTS	PAYROLL_PAYABLE	TAXES_PAYABLE	INT_PAYABLE	DIV_PAYABLE	OTH_PAYABLE	NCL_WITHIN_1Y	OTH_CL	T_CL	LT_BORR	LT_PAYABLE	ESTIMATED_LIAB	DEFER_REVENUE	DEFER_TAX_LIAB	T_NCL	T_LIAB	PAID_IN_CAPITAL	CAPITAL_RESER	SPECIAL_RESER	SURPLUS_RESER	RETAINED_EARNINGS	T_EQUITY_ATTR_P	MINORITY_INT	T_SH_EQUITY	T_LIAB_EQUITY	OTH_COMPRE_INCOME	C_PAID_OTH_FINAN_A	N_CF_FR_INVEST_A	C_FR_BORR	N_CF_OPERATE_A	C_FR_CAP_CONTR	C_PAID_INVEST	C_FR_OTH_FINAN_A	C_PAID_OTH_INVEST_A	C_INF_FR_INVEST_A	C_PAID_G_S	...	TSE_TA	C_TA	TEAP_IC	LT_AMOR_EXP_TA	NCA_TA	ST_BORR_TA	NCL_TA	EQU_MULTIPLIER	CAP_FIX_RATIO	N_TAN_A_TA	REPAY_TA	ID_IC	AP_TA	INVEN_TA	CL_TA	ADV_R_TA	AR_TA	TEAP_TA	T_FIXED_A_TA	FIXED_A_TA	TRE_TA	CA_TA	INTAN_A_TA	AIL_TR	VAL_CHG_P_TR	COGS_TR	SELL_EXP_TR	PERIOD_EXP_TR	INV_INC_TR	IT_TP	OPA_P_TP	OP_TR	FINAN_EXP_TR	VAL_CHG_P_TP	NI_CUT_NI	OPA_P_TR	N_NOPI_TP	R_TR	NOPG_TR	NI_TR	TCOGS_TR	TP_TR	NOPL_TR	ADMIN_EXP_TR	EBITDA_TR	BTAX_SURCHG_TR	IT_TR	EBIT_TR	OP_TP	DAYS_AR	AP_TURNOVER	CA_TURNOVER	OPER_CYCLE	INVEN_TURNOVER	FA_TURNOVER	TFA_TURNOVER	DAYS_AP	DAYS_INVEN	TA_TURNOVER	AR_TURNOVER
0	-0.110544	-0.106696	-0.161667	-0.182694	-0.067294	-0.177580	-0.271929	-0.054680	-0.201905	-0.087742	-0.142028	-0.086266	-0.200848	-0.164105	-0.277532	-0.163757	-0.234584	-0.116027	-0.114840	-0.232795	-0.228647	-0.229273	-0.225301	-0.204865	-0.163052	-0.185749	-0.160000	-0.198005	-0.077002	-0.183300	-0.187825	-0.081143	-0.254193	-0.203991	-0.139548	-0.074228	-0.097049	-0.109180	-0.209598	-0.257919	-0.252988	0.005258	-0.146179	-0.153425	-0.171217	-0.157934	-0.154017	-0.167159	-0.228648	-0.048196	-0.157168	0.275978	-0.286845	-0.143991	-0.194064	-0.048096	-0.153902	-0.159625	-0.019451	-0.183022	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
1	-0.036496	1.088871	-0.182107	-0.052401	-0.085668	-0.026558	0.016419	-0.171927	0.060346	-0.087742	-0.153631	-0.086266	-0.186912	-0.173041	-0.111736	-0.163757	-0.172424	-0.023101	-0.114840	-0.209060	-0.067471	-0.229273	0.827724	0.093545	0.092487	0.069331	0.087165	-0.198005	-0.077002	-0.053152	-0.187825	-0.076646	0.043979	-0.150792	-0.139548	-0.074228	0.100215	-0.109180	-0.186920	-0.010804	-0.109135	-0.403590	-0.146179	0.015458	0.038215	-0.141095	-0.154017	-0.152307	-0.067472	-0.048196	-0.157168	0.182495	-0.232185	0.099247	-0.194064	-0.121115	-0.153902	-0.159625	-0.154439	-0.020811	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
2	0.070766	-0.189223	0.057981	-0.140868	0.021829	-0.115114	-0.100801	0.073932	-0.023286	-0.110754	-0.067454	-0.120642	-0.200108	0.077432	-0.148904	0.586702	-0.089601	-0.113231	-0.059961	-0.105702	-0.064816	-0.316157	-0.231363	-0.137904	-0.011700	-0.191030	-0.024851	-0.198005	-0.105756	-0.108134	-0.187825	-0.076646	-0.200263	-0.150792	-0.139548	-0.074228	0.260842	-0.062466	-0.163225	-0.202685	-0.029154	0.651345	-0.146179	-0.042070	-0.044831	0.188303	-0.064006	0.157133	-0.064817	-0.048196	-0.157168	-0.431703	-0.250391	0.005059	-0.243632	0.327603	-0.153902	-0.159625	0.228867	-0.158202	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
3	-0.039637	-0.205146	-0.184401	-0.159863	-0.062639	-0.060387	-0.197651	0.346521	-0.105029	-0.087742	-0.120960	-0.086266	-0.221463	-0.199303	-0.177723	-0.171951	-0.027574	-0.127111	-0.117747	-0.236460	-0.175166	-0.230604	-0.164503	-0.125771	-0.142193	-0.045072	-0.156540	-0.209112	-0.077002	-0.148004	-0.242135	-0.076646	-0.188544	-0.116849	-0.139548	-0.074228	-0.128884	-0.039623	-0.162012	-0.192893	-0.286260	0.076386	-0.146179	-0.171182	-0.131627	-0.123079	-0.177217	-0.135453	-0.175168	-0.062454	-0.157168	0.120646	-0.198916	-0.093394	-0.245538	0.044030	-0.189950	0.105294	0.073860	-0.109958	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
4	-0.244743	-0.199970	-0.265148	-0.148300	-0.085668	-0.182752	-0.279125	-0.178592	-0.283117	-0.087742	-0.120960	-0.086266	-0.282834	-0.196378	-0.306345	-0.163757	-0.189353	-0.135653	-0.114840	-0.288391	-0.302069	-0.312094	-0.194654	-0.204394	-0.168588	-0.229982	-0.185895	-0.217441	-0.108710	-0.183572	-0.187825	-0.076646	-0.259576	-0.150792	-0.139548	-0.074228	-0.162071	-0.109180	-0.218931	-0.264605	-0.401198	-0.402463	-0.146179	-0.176925	-0.219024	-0.355697	-0.179565	-0.341002	-0.302070	-0.048196	-0.176421	0.224868	-0.285642	-0.201230	-0.194064	-0.121115	-0.153902	-0.159625	-0.156845	-0.177266	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13457	-0.194605	-0.204111	-0.235016	-0.192665	-0.095194	-0.166159	-0.266567	-0.157112	-0.255136	-0.082809	-0.144837	-0.086266	-0.234505	-0.179108	-0.247421	-0.173251	-0.202890	-0.117034	-0.115165	-0.249601	-0.267204	-0.241517	-0.233453	-0.203503	-0.160887	-0.201682	-0.171888	-0.198005	-0.077002	-0.156267	-0.187825	-0.076646	-0.251653	-0.150792	-0.139548	-0.085263	-0.142678	-0.124578	-0.215774	-0.257401	-0.138162	-0.345235	-0.146179	-0.164382	-0.194570	-0.269117	-0.180658	-0.264808	-0.267205	-0.048232	-0.157168	0.282769	-0.277337	-0.214423	-0.237723	-0.089799	-0.174903	-0.159625	-0.081430	-0.168416	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13458	-0.231584	-0.196071	-0.240270	-0.175277	-0.085668	0.100651	-0.104562	-0.135625	-0.215914	-0.093468	-0.075270	0.064067	-0.170791	-0.197127	-0.291951	-0.174709	-0.235937	-0.132742	-0.119921	-0.205796	-0.223482	-0.183847	-0.242438	-0.113028	-0.165859	-0.198643	-0.352867	-0.101828	-0.078867	0.014294	-0.243134	-0.077372	-0.178927	-0.182819	-0.139548	0.072025	-0.158947	-0.109180	-0.191181	-0.192376	-0.015725	-0.131649	-0.146179	-0.104321	-0.381811	-0.255928	-0.208644	-0.257609	-0.223483	-0.037582	-0.157168	0.219093	-0.215293	-0.297951	-0.196203	-0.126649	0.038585	-0.159625	-0.140010	-0.169934	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13459	-0.172396	-0.090448	-0.126067	-0.083162	-0.085668	-0.109957	-0.217281	-0.088728	-0.182285	-0.087742	-0.123844	-0.120632	-0.234071	-0.054888	-0.188660	-0.163757	-0.128242	-0.031042	-0.114840	-0.221879	-0.212135	-0.252805	-0.194654	-0.079903	-0.078436	-0.206288	-0.167967	0.184241	-0.077002	-0.023900	-0.187825	-0.076646	-0.186352	-0.150792	-0.139548	0.849701	-0.147337	-0.109180	-0.130924	-0.183368	0.092909	-0.037259	-0.222978	-0.104361	-0.472813	-0.247493	-0.145101	-0.243325	-0.212136	-0.048196	-0.157168	0.284709	-0.227859	-0.224250	-0.194064	-0.041249	-0.132306	-0.192049	-0.032639	-0.181799	...	0.011101	-0.091564	-0.763986	-0.071247	0.996515	-0.014163	-0.009115	0.068415	0.295576	0.011159	-0.254426	0.988053	0.000208	0.124612	-0.011954	-0.011675	0.593479	0.011130	2.383589	2.401008	0.011301	-1.000220	0.351214	-0.008270	-0.051766	1.752778	-0.611433	-0.008477	-0.053741	-0.113486	0.027801	0.008361	-0.008459	-0.013867	0.070320	0.008370	-0.032261	0.038886	-0.008778	0.008361	-0.008370	0.008355	-0.008532	-0.008450	-0.472817	0.014996	-0.339346	0.008169	0.032168	0.010236	0.103209	-0.459342	-0.013455	-0.022908	-0.067021	-0.013149	-0.008900	-0.020502	-0.563696	-0.051747
13460	0.220213	0.428407	0.539064	0.129878	0.930931	0.152119	0.261990	0.167506	0.343409	-0.036143	-0.133788	-0.086266	0.574080	1.685421	0.752913	-0.025926	-0.010816	0.002251	0.311119	0.653651	0.514408	0.162320	-0.251056	0.137400	-0.154652	-0.148068	0.305776	2.339665	-0.077002	-0.083185	-0.018381	0.503922	0.097436	-0.155138	-0.158031	-0.074228	0.376548	0.404251	1.020602	0.333509	-0.014059	1.199035	-0.146179	0.190460	0.870264	0.871184	-0.024417	0.765729	0.514407	-0.079826	0.026608	-0.935087	0.135455	0.398947	-0.243537	-0.125528	-0.021834	-0.024758	-0.114136	0.050575	...	0.011147	-0.127655	0.007816	-0.090990	-0.018944	-0.014594	-0.009392	-0.020234	-0.079771	0.011144	-0.158489	-0.020358	-0.016036	-0.144829	-0.011974	-0.010509	-0.168016	0.011145	-0.132080	-0.126937	0.011562	0.019110	-0.139211	-0.008475	-0.044078	0.087840	-0.156706	-0.008489	-0.041648	0.010630	0.016545	0.008475	-0.008480	-0.011569	0.034511	0.008481	-0.022897	0.038886	-0.008784	0.008472	-0.008481	0.008471	-0.008531	-0.008533	-0.003868	-0.044050	-0.054146	0.008459	0.022926	-0.012153	-0.071554	-0.120861	-0.030988	-0.022326	-0.046674	-0.012361	-0.008952	-0.031444	-0.112856	-0.047602
13461	0.015222	0.092724	0.156657	0.054408	-0.085668	-0.104527	0.074379	-0.154946	0.042608	-0.085088	-0.117501	-0.086266	0.083504	-0.131142	-0.256025	-0.170433	0.338018	-0.074333	0.013288	-0.046307	0.001719	-0.154665	0.307370	0.160793	-0.096559	-0.026013	-0.139392	-0.198598	-0.077002	0.065349	0.128240	-0.076646	0.029736	0.175793	-0.115495	-0.083274	0.024891	-0.113892	0.041858	0.034694	-0.113173	0.029863	-0.146179	-0.023991	-0.028819	-0.031130	-0.140034	-0.050337	0.001718	-0.020687	-0.167773	0.198412	-0.077565	0.225320	-0.194064	-0.131067	-0.063616	-0.128632	-0.114486	-0.004891	...	0.011075	0.102449	-0.414292	-0.090990	-0.811893	-0.014594	-0.009182	0.034719	-0.130274	0.011120	-0.128349	0.499511	-0.014519	1.970255	-0.011936	-0.004084	-0.587565	0.011085	-0.768414	-0.818383	0.011551	0.806651	-0.422075	-0.008475	-0.038851	0.377146	-0.372627	-0.008509	-0.038106	0.069489	0.016263	0.008474	-0.008479	-0.008330	0.041396	0.008481	-0.024758	0.015333	-0.008814	0.008469	-0.008481	0.008468	-0.008534	-0.008605	-0.058418	0.213152	-0.037802	0.008452	0.024738	-0.018517	-0.040714	-0.462918	0.015973	-0.025581	0.770781	0.029360	-0.008998	0.032707	-0.261838	0.219923

13462 rows × 259 columns

3.1 Logistic Regression 调参过程

在模型中先固定参数的默认值，然后进行参数调节，进行网格搜索与专业文献查阅寻找精确度最高而又不引起模型过拟合的参数值．模型优化评价指标为 AUC 值．

在逻辑回归模型中，需要调整的参数共有 2 个：penalty 与 C．其中 penalty 是正则化方法；C 为逻辑回归中的超参数，表示正则化强度的倒数，在模型中默认为 1，表示正则项与损失函数的比值为 1:1．C 越小，损失函数在目标函数中所占的权重越小，对模型参数的惩罚越重，正则化作用越强．

from sklearn.linear_model import LogisticRegression

# Fit logistic regression with L2 regularisation; C tuning range: [0.05, 0.1, 0.2, 0.3].
# max_iter is raised above the default of 100 because lbfgs stopped at the
# iteration limit on this data (ConvergenceWarning), so the coefficients were
# not the converged solution for the chosen C.
clf1 = LogisticRegression(C=0.2, penalty="l2", max_iter=1000).fit(X_train, y_train)

# Predicted probability of the positive class (column 1 of predict_proba).
y_pred_gbc = clf1.predict_proba(test_data2)[:, 1]

# AUC of the predicted probabilities against the labels `y`.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(





0.884673004897724

# Penalty tuning range: [l1, l2, none]; this run fits the unregularised model.
# max_iter is raised above the default of 100 because lbfgs hit the iteration
# limit on this data (ConvergenceWarning) before converging.
# NOTE(review): newer scikit-learn releases spell this option penalty=None;
# the string "none" is kept to match the version this notebook was run with.
clf2 = LogisticRegression(penalty="none", max_iter=1000).fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = clf2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(





0.9013195044655719

逻辑回归属于线性判别模型，而本文所处理的数据集维度较高，故可能存在其他非线性模型能够表现得更好．

3.2 SVM 调参过程

SVM 需要调整的参数有 2 个，分别为 kernel 和 C，

其中 kernel 代表核方法，可选的函数有：“poly”：多项式核函数，“rbf”：高斯核函数 (径向基函数)，“linear”：线性核函数，“sigmoid”：Sigmoid 核函数．核函数在 SVM 中发挥着重要功能，在简化向量内积运算方面起着重要作用，其中高斯核函数在非线性分类问题上应用广泛．

C 代表错误项的惩罚系数，在软间隔分类中应用较多．C 越大，对错误样本的惩罚力度就越大，训练的样本准确率越高．但是容易产生过拟合现象，机器模型的泛化能力降低．相反，C 取较小的值时，允许训练样本中存在错误分类的样本，能够增强模型的泛化能力．

from sklearn import svm

# Kernel tuning range: ["linear", "rbf", "sigmoid", "poly"]
# probability=True is needed so that predict_proba is available on the SVC.
svm1 = svm.SVC(probability=True, kernel='rbf')
svm1.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = svm1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8988533563814463

# Penalty coefficient C tuning; the kernel is left at the SVC default here.
svm2 = svm.SVC(probability=True, C=0.003)
svm2.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = svm2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8741169691731491

3.3 RF 调参过程

在随机森林模型中需要调节的参数有 4 个，分别为

max_depth：树的最大深度、

n_estimators：树模型的数量、

min_samples_split：中间节点分支所需的最小样本数量、

min_samples_leaf：叶节点存在所需的最小样本数量．

为了防止模型出现过拟合现象，本文在调节其他参数时控制 max_depth=3．

from sklearn.ensemble import RandomForestClassifier

# max_depth tuning range: [3, 5, 7, 8, 11, 13]
RF1 = RandomForestClassifier(random_state=0, max_depth=3)
RF1.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = RF1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8762738884087198

# n_estimators tuning range: [300, 400, 500, 600, 700]; max_depth held at 3.
RF2 = RandomForestClassifier(max_depth=3, n_estimators=600, random_state=0)
RF2.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = RF2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8856746374723902

# min_samples_leaf tuning range: [10, 20, 40, 60, 70, 80, 100]; max_depth held at 3.
# NOTE(review): the value used below (30) is not in the listed range — confirm.
RF3 = RandomForestClassifier(max_depth=3, min_samples_leaf=30, random_state=0)
RF3.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = RF3.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8766320944972631

# min_samples_split tuning range: [60, 70, 80, 90, 110, 130]; max_depth held at 3.
RF4 = RandomForestClassifier(max_depth=3, min_samples_split=80, random_state=0)
RF4.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = RF4.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8757802746566791

3.4 DT 调参过程

在决策树模型中需要调整的参数共有 4 个，分别为

max_depth：树的最大深度、

min_samples_split：中间节点分支所需要的最小样本量、

min_samples_leaf：叶节点存在所需的最小样本量、

max_leaf_nodes：最大叶子节点数．

为了防止模型出现过拟合现象，本文调节其他参数对模型的 AUC 影响时控制 max_depth=6

from sklearn import tree

# max_depth tuning range: [5, 6, 7, 8, 9, 10]
DT1 = tree.DecisionTreeClassifier(max_depth=6)
DT1.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = DT1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9026846249879958

# min_samples_leaf tuning range: [2, 3, 6, 8]; max_depth held at 6.
# Note: this rebinds the name DT1 to the newly fitted model.
DT1 = tree.DecisionTreeClassifier(max_depth=6, min_samples_leaf=3)
DT1.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = DT1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9008700662633248

# max_leaf_nodes tuning range: [50, 60, 70, 80, 100]; max_depth held at 6.
DT2 = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=70)
DT2.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = DT2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9006472678382791

# To limit overfitting, max_depth is held at 6 while other parameters are tuned.
# min_samples_split tuning range: [2, 3, 4, 5, 6, 8]
DT3 = tree.DecisionTreeClassifier(max_depth=6, min_samples_split=3)
DT3.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = DT3.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9005358686257564

——筛选在DT算法中特征重要性系数前20个指标

# Rank the DT3 feature importances (multiplied by 10000), labelled with the
# column names of `a2`, and display the 20 largest.
DT_importances = DT3.feature_importances_ * 10000
DT = (
    pd.Series(DT_importances, index=a2.columns)
    .sort_values(ascending=False)
    .to_frame(name='feature_importances')
)
DT.head(20)

	feature_importances
DILUTED_EPS	1876.459630
ESTIMATED_LIAB	1418.956658
RETAINED_EARNINGS	1274.737100
ASSETS_DISP_GAIN	1189.341616
C_FR_CAP_CONTR	350.146297
CASH_C_EQUIV	305.869870
DEFER_TAX_LIAB	305.319042
INT_RECEIV	278.811896
CURRENT_RATIO	254.761597
N_CF_FR_FINAN_A	245.185566
OTH_GAIN	214.434895
GOODWILL	207.893979
N_INCOME	199.309895
CL_TA	190.726453
NOPERATE_EXP	180.024039
OTH_CL	167.889986
GAIN_INVEST	160.657452
DIV_PAYABLE	157.439514
IT_TR	117.051125
A_J_INVEST_INCOME	101.434231

3.5 XGBoost 调参过程

XGBoost 需要调节的参数共有 9 个，下面本文只介绍对该模型相对重要的两个参数：

第一个参数是 n_estimators，在 XGBoost 模型中这个参数发挥着重要作用，表示该模型中分类器的个数，该参数的值越大，模型的学习能力就会越强．

第二个参数是 learning_rate，表示集成模型中的学习速率，又被称为步长，用于控制迭代速率；有效地调节该参数值能够防止模型出现过拟合现象，默认值为 0.1，调节范围为 [0,1]．

本文为了尽可能防止模型出现过拟合，在调节其他参数的值时将学习率设定为0.001．

from xgboost import XGBClassifier
from xgboost import plot_importance

# learning_rate tuning range: [0.001, 0.002, 0.003, 0.0035]
XGBoost1 = XGBClassifier(learning_rate=0.001)
XGBoost1.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[22:58:21] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089825218476904

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# n_estimators tuning range: [100, 110, 120, 200, 300]
XGBoost2 = XGBClassifier(learning_rate=0.001, n_estimators=120)
XGBoost2.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[22:58:59] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.911076058772688

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# max_depth tuning range: [2, 3, 5, 6, 7, 10]
XGBoost3 = XGBClassifier(learning_rate=0.001, max_depth=6)
XGBoost3.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost3.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[22:59:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089825218476904

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# min_child_weight tuning range: [1, 3, 4, 5, 7, 8]
XGBoost4 = XGBClassifier(learning_rate=0.001, min_child_weight=3)
XGBoost4.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost4.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:00:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9118438490348603

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# gamma tuning range: [0.2, 0.3, 0.5, 0.6, 0.7, 0.8]
# NOTE(review): the value used below (0.4) is not in the listed range — confirm.
XGBoost5 = XGBClassifier(learning_rate=0.001, gamma=0.4)
XGBoost5.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost5.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:01:06] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089844425237683

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# colsample_bytree tuning range: [0.6, 0.7, 0.8, 0.85, 0.9]
# The keyword was previously misspelled "colsample_btree", which XGBoost
# reported as a parameter that "might not be used", so column subsampling
# was never applied and the run reproduced the learning_rate-only baseline.
XGBoost7 = XGBClassifier(colsample_bytree=0.85, learning_rate=0.001).fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost7.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

[23:01:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "colsample_btree" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)



[23:01:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9089825218476904

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# reg_alpha tuning range: [0.1, 0.2, 0.25, 0.3, 0.35]
XGBoost8 = XGBClassifier(learning_rate=0.001, reg_alpha=0.2)
XGBoost8.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost8.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:02:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9087702871410737

# To limit overfitting, learning_rate is held at 0.001 while other parameters are tuned.
# reg_lambda tuning range: [0.15, 0.3, 0.5, 0.8]
XGBoost9 = XGBClassifier(learning_rate=0.001, reg_lambda=0.3)
XGBoost9.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = XGBoost9.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

D:\anaconda3\envs\Rshimmering\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


[23:03:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.





0.9160871026601364

——筛选在XGBoost算法中特征重要性系数前20个指标

# Rank the XGBoost9 feature importances (multiplied by 10000), labelled with
# the column names of `a2`, and display the 20 largest.
XGBoost_importances = XGBoost9.feature_importances_ * 10000
XGBoost = (
    pd.Series(XGBoost_importances, index=a2.columns)
    .sort_values(ascending=False)
    .to_frame(name='feature_importances')
)
XGBoost.head(20)

	feature_importances
DILUTED_EPS	1092.639893
ASSETS_DISP_GAIN	839.089539
RETAINED_EARNINGS	501.631378
T_CA	428.929138
ESTIMATED_LIAB	363.835968
DEFER_TAX_LIAB	311.677368
CURRENT_RATIO	294.901703
N_CF_FR_FINAN_A	284.164001
GOODWILL	239.289185
N_INCOME	238.080658
CL_TA	224.853226
INVENTORIES	224.492996
ROE_A	214.091843
NOPERATE_EXP	196.429474
OTH_CA	192.453537
OTH_CL	191.517349
C_FR_MINO_S_SUBS	191.478973
GAIN_INVEST	187.904297
C_INF_FR_INVEST_A	180.430847
CASH_C_EQUIV	178.665329

3.6 GBM 调参过程

该模型需要调节的参数共有 7 个，本文选取了对该模型相对重要的几个参数进行调节．

第一个参数是：max_depth：模型中树的最大深度．

第二个参数是 n_estimators：模型中分类器的数量，该参数在模型中的作用较为强大，可以有效的提升模型的学习能力．

第三个参数是 learning_rate：学习率，该参数的有效调节对模型是否会过拟合发挥着重要作用，参数的取值范围为 [0,1]，默认值为 0.1．为了能够有效地提升模型的泛化能力并且防止模型出现过拟合现象，本文经过网格搜索法并且查阅大量机器学习专业文献将 learning_rate 设置为 0.0088．

from sklearn.ensemble import GradientBoostingClassifier

# learning_rate tuning range: [0.004, 0.007, 0.0076, 0.0088, 0.009]
GBM1 = GradientBoostingClassifier(learning_rate=0.0088)
GBM1.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM1.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9016575434552964

# n_estimators tuning range: [110, 120, 130, 140, 160]; learning_rate held at 0.0088.
GBM2 = GradientBoostingClassifier(learning_rate=0.0088, n_estimators=130)
GBM2.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM2.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9171185057140113

# subsample tuning range: [0.1, 0.2, 0.25, 0.3, 0.4]; learning_rate held at 0.0088.
GBM3 = GradientBoostingClassifier(learning_rate=0.0088, subsample=0.3)
GBM3.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM3.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9131902429655239

# min_samples_split tuning range: [2, 3, 4, 5, 6]; learning_rate held at 0.0088.
GBM4 = GradientBoostingClassifier(learning_rate=0.0088, min_samples_split=4)
GBM4.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM4.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9016575434552964

# min_samples_leaf tuning range: [2, 3, 4, 6, 7, 9]; learning_rate held at 0.0088.
GBM5 = GradientBoostingClassifier(learning_rate=0.0088, min_samples_leaf=3)
GBM5.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM5.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9021089023336215

# max_depth tuning range: [2, 3, 4, 5, 8]; learning_rate held at 0.0088.
GBM6 = GradientBoostingClassifier(learning_rate=0.0088, max_depth=3)
GBM6.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM6.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9016575434552964

# validation_fraction tuning range: [0.1, 0.3, 0.4, 0.5, 0.7, 0.8]; learning_rate held at 0.0088.
# NOTE(review): per the scikit-learn docs, validation_fraction is only used
# when n_iter_no_change is set, which it is not here — confirm whether early
# stopping was intended, otherwise this parameter has no effect on the fit.
GBM7 = GradientBoostingClassifier(learning_rate=0.0088, validation_fraction=0.1)
GBM7.fit(X_train, y_train)
# Positive-class probability, then AUC against the labels `y`.
y_pred_gbc = GBM7.predict_proba(test_data2)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_gbc, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.9016575434552964

——筛选在GBM算法中特征重要性系数前20个指标

# Rank the GBM2 feature importances (multiplied by 10000), labelled with the
# column names of `a2`, and display the 20 largest.
GBM_importances = GBM2.feature_importances_ * 10000
GBM = (
    pd.Series(GBM_importances, index=a2.columns)
    .sort_values(ascending=False)
    .to_frame(name='feature_importances')
)
GBM.head(20)

	feature_importances
DILUTED_EPS	2545.405622
ASSETS_DISP_GAIN	1393.862699
RETAINED_EARNINGS	1201.324132
ESTIMATED_LIAB	1123.032285
NCA_DISPLOSS	496.913323
C_FR_CAP_CONTR	429.072252
OTH_GAIN	392.266813
NOPERATE_EXP	216.007424
PROC_SELL_INVEST	191.362051
T_CA	179.957164
DEFER_TAX_LIAB	172.171302
N_CF_FR_INVEST_A	157.682156
INT_PAYABLE	157.357831
DIV_PAYABLE	103.399260
C_PAID_OTH_FINAN_A	95.313147
INT_RECEIV	87.237988
REV_PS	86.175900
C_INF_FR_INVEST_A	76.066347
ADVANCE_RECEIPTS	64.396530
T_EQUITY_ATTR_P	62.686717