Part 3 covers feature engineering, one of the most important steps in determining how well a data-mining model performs.
Our learning goals:
1. Learn data preprocessing, including feature preprocessing, handling missing values and outliers, and data binning.
2. Learn the corresponding methods for feature interaction, encoding, and selection.
First, import the libraries we need:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

# Load the training and test sets
train_data = pd.read_csv('train.csv')
test_A_data = pd.read_csv('testA.csv')
# Separate numerical features from categorical features
numerical_fea = list(train_data.select_dtypes(exclude = ['object']).columns)
category_fea = list(train_data.select_dtypes(include = ['object']).columns)
label = 'isDefault'
numerical_fea.remove(label)
Through EDA in the previous part we gained a rough understanding of the features, so now we can address the more obvious problems we found one by one.
Raw data inevitably contains missing values, and how we fill them affects the final result, so we can try several filling methods and keep the one that works best.
First, some simple ways to fill missing values (shown for reference; we do not apply them here):
# Fill all missing values with 0
train_data = train_data.fillna(0)
# Forward-fill: replace each missing value with the last valid value above it
train_data = train_data.fillna(axis=0, method='ffill')
# Backward-fill: replace each missing value with the next valid value below it,
# filling at most 2 consecutive missing values
train_data = train_data.fillna(axis=0, method='bfill', limit=2)
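Since the best filling method is data-dependent, one way to "try several fills and keep the best" is to score each candidate with cross-validation. A minimal sketch under simple assumptions; the helper score_fill, the strategy names, and the logistic-regression scorer are illustrative, not from the source:
# Hedged sketch: compare fill strategies by cross-validated AUC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def score_fill(df, strategy):
    if strategy == 'zero':
        filled = df.fillna(0)
    else:  # 'median'
        filled = df.fillna(df.median(numeric_only=True))
    X, y = filled[numerical_fea], filled[label]
    return cross_val_score(LogisticRegression(max_iter=1000), X, y,
                           cv=3, scoring='roc_auc').mean()

for strategy in ['zero', 'median']:
    print(strategy, score_fill(train_data, strategy))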
Let's first look at the overall missing-value situation:
train_data.isnull().sum()
'''
results:
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 1
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 1
regionCode 0
dti 239
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 405
revolBal 0
revolUtil 531
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 1
policyCode 0
n0 40270
n1 40270
n2 40270
n2.1 40270
n4 33239
n5 40270
n6 40270
n7 40270
n8 40271
n9 40270
n10 33239
n11 69752
n12 40270
n13 40270
n14 40270
dtype: int64
'''
We can see that the n0-n14 features and employmentLength have many missing values, while employmentTitle, postCode, dti, pubRecBankruptcies, revolUtil, and title have only a few. Our approach here: fill numerical variables with the median and categorical variables with the mode:
train_data[numerical_fea] = train_data[numerical_fea].fillna(train_data[numerical_fea].median())
# mode() returns a DataFrame (there may be ties), so take the first row
train_data[category_fea] = train_data[category_fea].fillna(train_data[category_fea].mode().iloc[0])
# Check the missing values again
train_data.isnull().sum()
We find that only the employmentLength feature still has unfilled missing values; we will handle it below.
Let's first do some preprocessing on the categorical features.
# The earliest date in the data
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Convert the column to datetime, then subtract the earliest date to get a
# day count as the new feature issueDateDT
for data in [train_data, test_A_data]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Construct the time feature
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x - startdate).dt.days

# Map employmentLength strings such as '2 years' to integers
def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split(' ')[0])

for data in [train_data, test_A_data]:
    data['employmentLength'].replace(to_replace='< 1 year', value='0 year', inplace=True)
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

# Keep only the year part of earliesCreditLine
for data in [train_data, test_A_data]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda x: x[-4:])
In general there are two approaches to handling categorical features:
1. If the variable has an inherent order, such as grade in this data, use a numeric mapping:
('A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7)
2. If the categories have no ordinal relationship, use one-hot encoding.
# Some of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, 'number of categories:', train_data[f].nunique())

# grade has an ordering (A is better than B, and so on), so map it to numbers
for data in [train_data, test_A_data]:
    data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})

# For unordered, purely categorical features with more than 2 categories that
# are not high-dimensional and sparse, one-hot encoding can be used. Note that
# pd.get_dummies returns a new DataFrame, so reassign explicitly rather than
# rebinding a loop variable (which would leave the originals unchanged):
onehot_cols = ['subGrade', 'homeOwnership', 'verificationStatus', 'regionCode']
train_data = pd.get_dummies(train_data, columns=onehot_cols, drop_first=True)
test_A_data = pd.get_dummies(test_A_data, columns=onehot_cols, drop_first=True)
# A function that flags outliers with the 3-sigma rule and adds a new column fea_outliers
def find_outliers_by_3sigma(data, fea):
    data_mean = np.mean(data[fea])
    data_std = np.std(data[fea])
    lower_rule = data_mean - 3 * data_std
    upper_rule = data_mean + 3 * data_std
    # A value is an outlier if it falls outside [mean - 3*std, mean + 3*std]
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: 'outlier' if x < lower_rule or x > upper_rule else 'normal')
    return data

# After flagging outliers, we can analyze their relationship with the target variable
for fea in numerical_fea:
    train_data = find_outliers_by_3sigma(train_data, fea)
    print(train_data[fea + '_outliers'].value_counts())
    print(train_data.groupby(fea + '_outliers')['isDefault'].sum())
    print('*' * 20)

# Remove the outliers
for fea in numerical_fea:
    train_data = train_data[train_data[fea + '_outliers'] == 'normal']
    train_data = train_data.reset_index(drop=True)
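Besides the 3-sigma rule, the boxplot (IQR) rule is a common alternative for flagging outliers; a minimal sketch, where the helper name find_outliers_by_iqr is illustrative:
# Hedged sketch: boxplot/IQR outlier rule as an alternative to 3-sigma
def find_outliers_by_iqr(data, fea):
    q1, q3 = data[fea].quantile(0.25), data[fea].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    data[fea + '_outliers_iqr'] = data[fea].apply(
        lambda x: 'outlier' if x < lower or x > upper else 'normal')
    return data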
# Fixed-width binning via division: each bin spans a loanAmnt range of 1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
# Log binning: map to exponentially wide bins with a log function
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
# Quantile binning
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
# Chi-square binning and other methods are left for self-study; a sketch follows below
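The text leaves chi-square binning as self-study; below is a minimal ChiMerge-style sketch under the usual formulation: start from fine quantile bins, then repeatedly merge the pair of adjacent bins whose class distributions are most similar (smallest chi-square statistic). The names chi2_of_pair, chi_merge, and the parameters max_bins/init_bins are illustrative, not from the source:
# Hedged sketch: ChiMerge-style chi-square binning
def chi2_of_pair(counts):
    # Chi-square statistic for a 2 x n_classes contingency table
    total = counts.sum()
    row_sums = counts.sum(axis=1, keepdims=True)
    col_sums = counts.sum(axis=0, keepdims=True)
    expected = row_sums * col_sums / total
    mask = expected > 0  # ignore cells with zero expected count
    return (((counts - expected) ** 2)[mask] / expected[mask]).sum()

def chi_merge(values, labels, max_bins=5, init_bins=50):
    # Start from fine-grained quantile bins
    df = pd.DataFrame({'v': values, 'y': labels})
    df['bin'] = pd.qcut(df['v'], init_bins, labels=False, duplicates='drop')
    # counts[i, c] = number of samples of class c falling in bin i
    counts = df.groupby(['bin', 'y']).size().unstack(fill_value=0).values
    edges = list(df.groupby('bin')['v'].max())
    while len(counts) > max_bins:
        chis = [chi2_of_pair(counts[i:i + 2]) for i in range(len(counts) - 1)]
        i = int(np.argmin(chis))       # merge the most similar adjacent pair
        counts[i] += counts[i + 1]
        counts = np.delete(counts, i + 1, axis=0)
        del edges[i]                   # merged bin inherits the next bin's upper edge
    return edges                       # upper edges of the final bins

# e.g. edges = chi_merge(train_data['loanAmnt'], train_data['isDefault'], max_bins=5)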
# Target mean encoding for grade and subGrade
for col in ['grade', 'subGrade']:
    temp_dict = train_data.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()
    train_data[col + '_target_mean'] = train_data[col].map(temp_dict)
    test_A_data[col + '_target_mean'] = test_A_data[col].map(temp_dict)

# Other derived variables: ratios to the group mean and std
for df in [train_data, test_A_data]:
    for item in ['n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
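One caveat: computing the target mean on the full training set leaks label information into the encoded feature. A hedged out-of-fold variant is sketched below; it is not the source's method, and the '_oof' column suffix is illustrative:
# Hedged sketch: out-of-fold target mean encoding to reduce label leakage
folds = KFold(n_splits=5, shuffle=True, random_state=2020)
for col in ['grade', 'subGrade']:
    train_data[col + '_target_mean_oof'] = np.nan
    for trn_idx, val_idx in folds.split(train_data):
        # means computed only on the other folds, then applied to this fold
        fold_means = train_data.iloc[trn_idx].groupby(col)['isDefault'].mean()
        train_data.loc[train_data.index[val_idx], col + '_target_mean_oof'] = \
            train_data[col].iloc[val_idx].map(fold_means).values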
LabelEncoder: encode categorical features so they can be fed directly into the model.
## label-encode: subGrade, postCode, title
# High-cardinality categorical features need to be converted
from sklearn.preprocessing import LabelEncoder

# tqdm displays a progress bar for the loop
for col in tqdm(['subGrade', 'postCode', 'title', 'employmentTitle']):
    le = LabelEncoder()
    # astype() converts the column's dtype; .values returns a NumPy array
    le.fit(list(train_data[col].astype(str).values) + list(test_A_data[col].astype(str).values))
    print(le.classes_)
    train_data[col] = le.transform(list(train_data[col].astype(str).values))
    test_A_data[col] = le.transform(list(test_A_data[col].astype(str).values))
print('Label Encoding Finished!')
Some algorithms need additional, algorithm-specific feature engineering, such as normalization.
# Hand-written min-max normalization
for fea in tqdm(['columns that need normalizing']):
    data[fea] = (data[fea] - min(data[fea])) / (max(data[fea]) - min(data[fea]))
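MinMaxScaler was imported at the top but never used; the hand-written loop above is equivalent to the scikit-learn version below. A sketch, where cols_to_scale is a placeholder list of columns:
# Hedged sketch: the same min-max scaling with the already-imported MinMaxScaler
cols_to_scale = ['loanAmnt']  # placeholder: the columns you choose to scale
scaler = MinMaxScaler()
# Fit on the training data only, then apply the same transform to the test set
train_data[cols_to_scale] = scaler.fit_transform(train_data[cols_to_scale])
test_A_data[cols_to_scale] = scaler.transform(test_A_data[cols_to_scale])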
Feature selection methods fall into three families:
1. Filter methods:
   (1) variance threshold; (2) correlation coefficient (Pearson); (3) chi-square test; (4) mutual information
2. Wrapper methods:
   (1) RFE (recursive feature elimination)
   (2) RFECV (recursive feature elimination with cross-validation)
3. Embedded methods:
   (1) penalty-based feature selection
   (2) tree-model-based feature selection
Next we introduce these three families in turn:
1. Filter methods
# 1. Variance threshold
# Compute the variance of each feature and keep only those above a threshold
# (a small variance means the values barely change, so the feature likely has
# little influence on the target)
from sklearn.feature_selection import VarianceThreshold
# threshold is the variance cutoff
VarianceThreshold(threshold=3).fit_transform(train, target_train)

# 2. Correlation coefficient method
# Use the Pearson coefficient to measure each feature's linear correlation with the target.
# The result lies in [-1, 1]: -1 is perfect negative correlation, +1 is perfect
# positive correlation, and 0 means no linear correlation.
# Note: 0 only means no *linear* relationship; it does not imply independence.
# Combine SelectKBest with a Pearson-based scoring function to keep the best K features.
# The scoring function takes the feature matrix and target vector and returns a
# (scores, p-values) pair, with one entry per feature.
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

def pearson_score(X, y):
    # pearsonr returns (correlation, p-value) for one feature at a time
    # (SelectKBest keeps the largest scores; use abs() if both signs matter)
    scores, pvalues = zip(*[pearsonr(X[:, i], y) for i in range(X.shape[1])])
    return np.array(scores), np.array(pvalues)

# k is the number of features to keep
SelectKBest(pearson_score, k=5).fit_transform(train, target_train)

# 3. Chi-square test
# Also tests the correlation between features and the target
# Note: chi2 requires non-negative feature values, otherwise it raises
# "Input X must be non-negative"
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# k is the number of features to keep
SelectKBest(chi2, k=5).fit_transform(train, target_train)

# 4. Mutual information method
# Also measures the dependence between features and the target
from sklearn.feature_selection import SelectKBest
from minepy import MINE

# MINE's API is not functional-style, so wrap it in a function that returns a
# (score, p-value) tuple, with the p-value fixed at 0.5 as a placeholder
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

def mic_score(X, y):
    scores, pvalues = zip(*[mic(X[:, i], y) for i in range(X.shape[1])])
    return np.array(scores), np.array(pvalues)

# k is the number of features to keep
SelectKBest(mic_score, k=2).fit_transform(train, target_train)
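If minepy is hard to install, scikit-learn's built-in mutual_info_classif serves the same purpose; it returns scores only, which SelectKBest also accepts. A brief sketch:
# Hedged alternative: scikit-learn's built-in mutual information estimator
from sklearn.feature_selection import SelectKBest, mutual_info_classif
SelectKBest(mutual_info_classif, k=2).fit_transform(train, target_train)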
2. Wrapper (RFE)
Recursive feature elimination (RFE) trains a base model for multiple rounds; after each round it removes the features with the smallest weight coefficients and retrains on the remaining feature set. The RFE class in sklearn.feature_selection implements this; example code with logistic regression:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination; returns the data restricted to the selected features
# estimator is the base model; n_features_to_select is the number of features to keep
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train, target_train)
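RFECV, listed above but not demonstrated, wraps RFE in cross-validation so the number of features is chosen automatically. A minimal sketch using the same logistic-regression base model:
# Hedged sketch: RFE with cross-validation to pick the feature count automatically
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

selector = RFECV(estimator=LogisticRegression(max_iter=1000), step=1,
                 cv=5, scoring='roc_auc')
selected = selector.fit_transform(train, target_train)
print('optimal number of features:', selector.n_features_)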
3. Embedded (penalty-based)
Penalty-based feature selection uses a base model with a regularization penalty; besides selecting features, it also reduces dimensionality.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Feature selection with L1-penalized logistic regression as the base model
# (recent scikit-learn versions require solver='liblinear' for the L1 penalty)
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver='liblinear')).fit_transform(train, target_train)
Tree-based feature selection: a tree model such as GBDT can also serve as the base model. Combining the SelectFromModel class from sklearn.feature_selection with a GBDT model:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# Feature selection with GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)
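To see which columns a fitted SelectFromModel kept, get_support() returns a boolean mask over the input features; a usage sketch, assuming train is a DataFrame:
# Hedged sketch: recover the names of the selected columns
selector = SelectFromModel(GradientBoostingClassifier()).fit(train, target_train)
selected_cols = train.columns[selector.get_support()]
print(list(selected_cols))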
# Drop columns we no longer need
for data in [train_data, test_A_data]:
    data.drop(['issueDate', 'id'], axis=1, inplace=True)

# Forward-fill any remaining missing values (use the value above)
train_data = train_data.fillna(axis=0, method='ffill')
# 'id' was already dropped above, so only remove the label here
x_train = train_data.drop(['isDefault'], axis=1)
# Compute the correlation of each feature with the target
data_corr = x_train.corrwith(train_data.isDefault)
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values

# Or simply look at a heatmap
data_numeric = train_data[numerical_fea]
correlation = data_numeric.corr()
f, ax = plt.subplots(figsize=(7, 7))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)

features = [f for f in train_data.columns if f not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in f]
x_train = train_data[features]
x_test = test_A_data[features]
y_train = train_data['isDefault']

def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      'silent': True,
                      }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            # XGBoost predicts on a DMatrix, so wrap the test set as well
            test_pred = model.predict(clf.DMatrix(test_x), ntree_limit=model.best_ntree_limit)
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        train[valid_index] = val_pred
        # accumulate (not overwrite) the fold-averaged test predictions
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

# Evaluate against the released answers for test set A
testA_result = pd.read_csv('../testA_result.csv')
roc_auc_score(testA_result['isDefault'].values, lgb_test)