import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_excel('./GermanCredit.xls', sheet_name='Data')  # read the 'Data' sheet of the xls file
data.head()
num_features = ['DURATION','AMOUNT','INSTALL_RATE','AGE','NUM_CREDITS','NUM_DEPENDENTS']
cat_features = data.columns.drop(num_features + ['OBS#'])
data.isnull().sum()
# no missing values in any column
OBS# 0
CHK_ACCT 0
DURATION 0
HISTORY 0
NEW_CAR 0
USED_CAR 0
FURNITURE 0
RADIO/TV 0
EDUCATION 0
RETRAINING 0
AMOUNT 0
SAV_ACCT 0
EMPLOYMENT 0
INSTALL_RATE 0
MALE_DIV 0
MALE_SINGLE 0
MALE_MAR_or_WID 0
CO-APPLICANT 0
GUARANTOR 0
PRESENT_RESIDENT 0
REAL_ESTATE 0
PROP_UNKN_NONE 0
AGE 0
OTHER_INSTALL 0
RENT 0
OWN_RES 0
NUM_CREDITS 0
JOB 0
NUM_DEPENDENTS 0
TELEPHONE 0
FOREIGN 0
RESPONSE 0
dtype: int64
DURATION is the loan term, spread between 4 and 72 months, and the distribution looks like a right-skewed (long right tail) normal-ish curve — a histogram shows it more clearly:
plt.hist(data['DURATION'])
(array([171., 262., 337., 57., 86., 17., 54., 2., 13., 1.]),array([ 4. , 10.8, 17.6, 24.4, 31.2, 38. , 44.8, 51.6, 58.4, 65.2, 72. ]),<a list of 10 Patch objects>)
x ≤ 20: dua_rank = 1
20 < x ≤ 40: dua_rank = 2
40 < x < 60: dua_rank = 3
60 ≤ x ≤ 72: dua_rank = 4
Create a new feature dua_rank from these bins and add it to new_data; sklearn's KBinsDiscretizer could also do the binning — see the sketch below.
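A minimal KBinsDiscretizer sketch first. This is an assumption-laden alternative, not the original code: with strategy='uniform' the learned edges (4, 21, 38, 55, 72) only approximate the hand-picked 20/40/60 cut points.
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
# ordinal codes come back 0..3 as floats in a 2-D array, so cast, flatten, shift to 1..4
dua_rank_kbd = kbd.fit_transform(data[['DURATION']]).astype(int).ravel() + 1
The manual loop used in this post: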
dua_rank = []
duration = data['DURATION']
for i in duration:
    if i <= 20:
        dua_rank.append(1)
    elif i <= 40:
        dua_rank.append(2)
    elif i < 60:
        dua_rank.append(3)
    else:
        dua_rank.append(4)
plt.hist(dua_rank,bins = 4)
(array([554., 365., 67., 14.]),array([1. , 1.75, 2.5 , 3.25, 4. ]),<a list of 4 Patch objects>)
new_data = data.copy()
new_data['dua_rank'] = dua_rank
new_data.head()
OBS# | CHK_ACCT | DURATION | HISTORY | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | ... | OTHER_INSTALL | RENT | OWN_RES | NUM_CREDITS | JOB | NUM_DEPENDENTS | TELEPHONE | FOREIGN | RESPONSE | dua_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 6 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 |
1 | 2 | 1 | 48 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | 0 | 3 |
2 | 3 | 3 | 12 | 4 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 1 | 1 |
3 | 4 | 0 | 42 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 2 | 2 | 0 | 0 | 1 | 3 |
4 | 5 | 0 | 24 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 2 |
5 rows × 33 columns
plt.hist(data['AMOUNT'])
(array([445., 293., 97., 80., 38., 19., 14., 8., 5., 1.]),array([ 250. , 2067.4, 3884.8, 5702.2, 7519.6, 9337. , 11154.4,12971.8, 14789.2, 16606.6, 18424. ]),<a list of 10 Patch objects>)
The same discretization could be done with sklearn's KBinsDiscretizer; here the deciles are computed by hand (a pd.qcut sketch follows below).
percent = np.percentile(data['AMOUNT'], [i * 10 for i in range(1,10)])
# rank = 1 + number of decile edges <= value; equivalent to the original 10-branch if/elif chain
amount_rank = np.searchsorted(percent, data['AMOUNT'], side='right') + 1
new_data['amount_rank'] = amount_rank
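A minimal pandas alternative for the decile ranking (a sketch; ties at bin edges may fall differently than the manual percentile comparison):
amount_rank_q = pd.qcut(data['AMOUNT'], q=10, labels=range(1, 11)).astype(int)  # decile labels 1..10
The same one-liner covers the AGE quartiles below: pd.qcut(data['AGE'], 4, labels=[1, 2, 3, 4]).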
data['INSTALL_RATE'].value_counts()
4 476
2 231
3 157
1 136
Name: INSTALL_RATE, dtype: int64
data['AGE'].describe()
count 1000.000000
mean 35.546000
std 11.375469
min 19.000000
25% 27.000000
50% 33.000000
75% 42.000000
max 75.000000
Name: AGE, dtype: float64
percent = np.percentile(data['AGE'], [25, 50, 75])
age_rank = []
for i in data['AGE']:
    if i <= percent[0]:
        age_rank.append(1)
    elif i <= percent[1]:
        age_rank.append(2)
    elif i <= percent[2]:
        age_rank.append(3)
    else:
        age_rank.append(4)
new_data['age_rank'] = age_rank
new_data.head()
OBS# | CHK_ACCT | DURATION | HISTORY | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | ... | OWN_RES | NUM_CREDITS | JOB | NUM_DEPENDENTS | TELEPHONE | FOREIGN | RESPONSE | dua_rank | amount_rank | age_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 6 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 2 | 4 |
1 | 2 | 1 | 48 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 1 | 2 | 1 | 0 | 0 | 0 | 3 | 9 | 1 |
2 | 3 | 3 | 12 | 4 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 1 | 1 | 2 | 0 | 0 | 1 | 1 | 5 | 4 |
3 | 4 | 0 | 42 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 2 | 2 | 0 | 0 | 1 | 3 | 10 | 4 |
4 | 5 | 0 | 24 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 2 | 9 | 4 |
5 rows × 35 columns
data['NUM_CREDITS'].describe()
count 1000.000000
mean 1.407000
std 0.577654
min 1.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 4.000000
Name: NUM_CREDITS, dtype: float64
plt.hist(data['NUM_CREDITS'],bins = 4)
(array([633., 333., 28., 6.]),array([1. , 1.75, 2.5 , 3.25, 4. ]),<a list of 4 Patch objects>)
data['NUM_DEPENDENTS'].describe()
count 1000.000000
mean 1.155000
std 0.362086
min 1.000000
25% 1.000000
50% 1.000000
75% 1.000000
max 2.000000
Name: NUM_DEPENDENTS, dtype: float64
plt.hist(data['NUM_DEPENDENTS'],bins = 2)
(array([845., 155.]), array([1. , 1.5, 2. ]), <a list of 2 Patch objects>)
new_data.drop(['AGE','AMOUNT','DURATION'], axis = 1,inplace=True)
So we one-hot encode the HISTORY and JOB features:
history = data['HISTORY']
new_history = pd.get_dummies(history,prefix='histor')
job = data['JOB']
new_job = pd.get_dummies(job, prefix= 'job')
new_data = pd.concat([new_data, new_history, new_job], axis = 1)
new_data.drop(['HISTORY', 'JOB'], axis = 1,inplace=True)
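For reference, the get_dummies/concat/drop steps above collapse into a single call when run on the frame that still contains HISTORY and JOB (a sketch meant to replace, not follow, those four lines):
# columns= encodes just the listed features and drops the raw columns itself
new_data = pd.get_dummies(new_data, columns=['HISTORY', 'JOB'], prefix=['histor', 'job'])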
new_data
OBS# | CHK_ACCT | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | SAV_ACCT | EMPLOYMENT | ... | age_rank | histor_0 | histor_1 | histor_2 | histor_3 | histor_4 | job_0 | job_1 | job_2 | job_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 4 | ... | 4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | ... | 4 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
3 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | ... | 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | ... | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 996 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | ... | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
996 | 997 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | ... | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
997 | 998 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | ... | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
998 | 999 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
999 | 1000 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
1000 rows × 39 columns
new_data.info()  # note the parentheses — bare `new_data.info` only echoes the bound method
<output trimmed: 1000 rows × 39 columns, no missing values>
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import graphviz
from sklearn.svm import SVC
x = new_data.drop(['RESPONSE'], axis = 1)
y = new_data.loc[:,['RESPONSE']]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 777)
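RESPONSE is imbalanced (the classic German Credit split is 700 good vs. 300 bad), so a stratified split is a safer default — a sketch, not part of the original run:
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=777, stratify=y['RESPONSE'])  # keeps the 7:3 class ratio in both splits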
model = DecisionTreeClassifier(criterion='gini', max_depth=4, min_samples_split=4, max_features=6)
params = {'criterion': ['gini', 'entropy'],
          'max_depth': range(1, 30),
          'min_samples_split': range(2, 10),
          'min_samples_leaf': range(1, 6)}
cv = GridSearchCV(model, param_grid=params, n_jobs=-1, verbose=1, scoring='accuracy', cv=5)
cv.fit(data.iloc[:, :-1], data.iloc[:, -1])  # grid search on the original features for comparison
Fitting 5 folds for each of 2320 candidates, totalling 11600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 1348 tasks | elapsed: 2.4s
[Parallel(n_jobs=-1)]: Done 8248 tasks | elapsed: 15.7s
[Parallel(n_jobs=-1)]: Done 11600 out of 11600 | elapsed: 22.5s finished
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(criterion='gini', max_depth=4,
                                                    max_features=6, min_samples_split=4),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'], 'max_depth': range(1, 30),
                         'min_samples_leaf': range(1, 6), 'min_samples_split': range(2, 10)},
             refit=True, return_train_score=False, scoring='accuracy', verbose=1)
model1 = cv.best_estimator_
cv.best_params_,cv.best_score_
({'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}, 0.734)
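best_score_ above is a cross-validation estimate over all 1000 rows, not a hold-out score. To use the split created earlier (illustrative only, since the grid search has already seen the test rows):
model1.fit(train_x, train_y.values.ravel())         # refit the best tree on the training split only
print(model1.score(test_x, test_y.values.ravel()))  # accuracy on the held-out 20%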
plt.figure(figsize= (9,6))
plt.bar(data.iloc[:,:-1].columns, model1.feature_importances_)
plt.xticks(rotation=90)
plt.show()
cv.fit(x, y)  # the same grid search, now on the engineered features
model2 = cv.best_estimator_
cv.best_params_,cv.best_score_
Fitting 5 folds for each of 2320 candidates, totalling 11600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 1328 tasks | elapsed: 3.0s
[Parallel(n_jobs=-1)]: Done 7928 tasks | elapsed: 18.5s
[Parallel(n_jobs=-1)]: Done 11600 out of 11600 | elapsed: 27.5s finished
({'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}, 0.728)
model2
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=6, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
plt.figure(figsize= (9,6))
plt.bar(x.columns, model2.feature_importances_)
plt.xticks(rotation=90)
plt.show()
graph_data = tree.export_graphviz(model2, out_file=None, feature_names=x.columns, filled=True, rounded=True)
graph = graphviz.Source(graph_data)
graph
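Outside a notebook the tree will not render inline; graphviz can write it to disk instead (the filename here is arbitrary):
graph.format = 'png'
graph.render('credit_tree', cleanup=True)  # writes credit_tree.png; cleanup removes the intermediate DOT source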
model = SVC()
params = {'C':range(1,10)}
cv = GridSearchCV(model,param_grid=params, verbose = 1,cv = 5,scoring='accuracy',n_jobs=-1)
cv.fit(x, y)
model1 = cv.best_estimator_
cv.best_score_
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 38 out of 45 | elapsed: 1.3s remaining: 0.2s
[Parallel(n_jobs=-1)]: Done 45 out of 45 | elapsed: 1.5s finished
F:\Anaconda3\lib\site-packages\sklearn\utils\validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
F:\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
0.653
cv.fit(data.iloc[:,:-1], data.iloc[:,-1])
model2 = cv.best_estimator_
cv.best_score_
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Done 38 out of 45 | elapsed: 1.6s remaining: 0.2s
[Parallel(n_jobs=-1)]: Done 45 out of 45 | elapsed: 1.8s finished
F:\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
0.7
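Both warnings point at real issues: y should be 1-D, and an SVM is sensitive to unscaled features (AMOUNT alone spans 250 to 18424). A hedged sketch that addresses both, with arbitrary pipeline step names:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
svc_pipe = Pipeline([('scale', StandardScaler()),  # scaling happens inside each CV fold, avoiding leakage
                     ('svc', SVC(gamma='scale'))])  # pin gamma to silence the FutureWarning
params = {'svc__C': range(1, 10)}
cv = GridSearchCV(svc_pipe, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
cv.fit(x, y.values.ravel())  # ravel() silences the column-vector warning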
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500, random_state=2)
params = {'n_estimators': range(1, 1000)}  # 999 candidates — hence the ~16-minute search below
cv = GridSearchCV(model, param_grid=params, verbose=1, n_jobs=-1, scoring='accuracy')
cv.fit(x, y)  # random forest grid search on the engineered features
rfc1= cv.best_estimator_
F:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Fitting 3 folds for each of 999 candidates, totalling 2997 fits
[Parallel(n_jobs=-1)]: Done 75 tasks | elapsed: 3.7s
[Parallel(n_jobs=-1)]: Done 526 tasks | elapsed: 30.7s
[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 1.1min
[Parallel(n_jobs=-1)]: Done 1126 tasks | elapsed: 2.3min
[Parallel(n_jobs=-1)]: Done 1576 tasks | elapsed: 4.5min
[Parallel(n_jobs=-1)]: Done 2126 tasks | elapsed: 8.2min
[Parallel(n_jobs=-1)]: Done 2776 tasks | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 2997 out of 2997 | elapsed: 16.4min finished
F:\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:715: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
rfc = cv.best_estimator_
rfc
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=673,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)
cv.best_score_
0.759
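As with the single trees, the forest's averaged importances are worth a look (a sketch mirroring the earlier bar plots; rfc was fit on the engineered x):
plt.figure(figsize=(9, 6))
plt.bar(x.columns, rfc.feature_importances_)  # mean impurity-based importance across the 673 trees
plt.xticks(rotation=90)
plt.show()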
cv.fit(data.iloc[:, :-1], y.iloc[:, -1])  # the same search on the original features
rfc2= cv.best_estimator_
F:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Fitting 3 folds for each of 999 candidates, totalling 2997 fits
[Parallel(n_jobs=-1)]: Done 75 tasks | elapsed: 4.0s
[Parallel(n_jobs=-1)]: Done 423 tasks | elapsed: 21.4s
[Parallel(n_jobs=-1)]: Done 673 tasks | elapsed: 49.5s
[Parallel(n_jobs=-1)]: Done 1023 tasks | elapsed: 1.9min
[Parallel(n_jobs=-1)]: Done 1473 tasks | elapsed: 4.0min
[Parallel(n_jobs=-1)]: Done 2023 tasks | elapsed: 7.5min
[Parallel(n_jobs=-1)]: Done 2673 tasks | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 2997 out of 2997 | elapsed: 16.5min finished
cv.best_score_  # 0.773 here vs. 0.759 on the engineered features — the raw columns win slightly
0.773