Predict the sale price of each house from the residential data for Ames, Iowa.
This is a regression problem, evaluated with root mean squared error (RMSE).
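As a minimal sketch (not part of the original notebook) of that metric: the competition scores RMSE on the logarithm of the price, so errors on cheap and expensive houses are penalized comparably. The sample numbers below are made up.

import numpy as np
from sklearn.metrics import mean_squared_error

def rmsle(y_true, y_pred):
    # RMSE on log1p-transformed prices
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

print(rmsle(np.array([200000., 150000.]), np.array([210000., 140000.])))  # ≈ 0.06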
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import boxcox1p
import missingno as msno
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Load the data
house_train = pd.read_csv('/home/aistudio/data/data32288/train.csv')
house_test = pd.read_csv('/home/aistudio/data/data32288/test.csv')
house_train.shape, house_test.shape
((1460, 81), (1459, 80))
house_train.info()
print('-'*40)
house_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             1460 non-null   int64
 1   MSSubClass     1460 non-null   int64
 2   MSZoning       1460 non-null   object
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64
 5   Street         1460 non-null   object
 6   Alley          91 non-null     object
 7   LotShape       1460 non-null   object
 8   LandContour    1460 non-null   object
 9   Utilities      1460 non-null   object
 10  LotConfig      1460 non-null   object
 11  LandSlope      1460 non-null   object
 12  Neighborhood   1460 non-null   object
 13  Condition1     1460 non-null   object
 14  Condition2     1460 non-null   object
 15  BldgType       1460 non-null   object
 16  HouseStyle     1460 non-null   object
 17  OverallQual    1460 non-null   int64
 18  OverallCond    1460 non-null   int64
 19  YearBuilt      1460 non-null   int64
 20  YearRemodAdd   1460 non-null   int64
 21  RoofStyle      1460 non-null   object
 22  RoofMatl       1460 non-null   object
 23  Exterior1st    1460 non-null   object
 24  Exterior2nd    1460 non-null   object
 25  MasVnrType     1452 non-null   object
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object
 28  ExterCond      1460 non-null   object
 29  Foundation     1460 non-null   object
 30  BsmtQual       1423 non-null   object
 31  BsmtCond       1423 non-null   object
 32  BsmtExposure   1422 non-null   object
 33  BsmtFinType1   1423 non-null   object
 34  BsmtFinSF1     1460 non-null   int64
 35  BsmtFinType2   1422 non-null   object
 36  BsmtFinSF2     1460 non-null   int64
 37  BsmtUnfSF      1460 non-null   int64
 38  TotalBsmtSF    1460 non-null   int64
 39  Heating        1460 non-null   object
 40  HeatingQC      1460 non-null   object
 41  CentralAir     1460 non-null   object
 42  Electrical     1459 non-null   object
 43  1stFlrSF       1460 non-null   int64
 44  2ndFlrSF       1460 non-null   int64
 45  LowQualFinSF   1460 non-null   int64
 46  GrLivArea      1460 non-null   int64
 47  BsmtFullBath   1460 non-null   int64
 48  BsmtHalfBath   1460 non-null   int64
 49  FullBath       1460 non-null   int64
 50  HalfBath       1460 non-null   int64
 51  BedroomAbvGr   1460 non-null   int64
 52  KitchenAbvGr   1460 non-null   int64
 53  KitchenQual    1460 non-null   object
 54  TotRmsAbvGrd   1460 non-null   int64
 55  Functional     1460 non-null   object
 56  Fireplaces     1460 non-null   int64
 57  FireplaceQu    770 non-null    object
 58  GarageType     1379 non-null   object
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object
 61  GarageCars     1460 non-null   int64
 62  GarageArea     1460 non-null   int64
 63  GarageQual     1379 non-null   object
 64  GarageCond     1379 non-null   object
 65  PavedDrive     1460 non-null   object
 66  WoodDeckSF     1460 non-null   int64
 67  OpenPorchSF    1460 non-null   int64
 68  EnclosedPorch  1460 non-null   int64
 69  3SsnPorch      1460 non-null   int64
 70  ScreenPorch    1460 non-null   int64
 71  PoolArea       1460 non-null   int64
 72  PoolQC         7 non-null      object
 73  Fence          281 non-null    object
 74  MiscFeature    54 non-null     object
 75  MiscVal        1460 non-null   int64
 76  MoSold         1460 non-null   int64
 77  YrSold         1460 non-null   int64
 78  SaleType       1460 non-null   object
 79  SaleCondition  1460 non-null   object
 80  SalePrice      1460 non-null   int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             1459 non-null   int64
 1   MSSubClass     1459 non-null   int64
 2   MSZoning       1455 non-null   object
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64
 5   Street         1459 non-null   object
 6   Alley          107 non-null    object
 7   LotShape       1459 non-null   object
 8   LandContour    1459 non-null   object
 9   Utilities      1457 non-null   object
 10  LotConfig      1459 non-null   object
 11  LandSlope      1459 non-null   object
 12  Neighborhood   1459 non-null   object
 13  Condition1     1459 non-null   object
 14  Condition2     1459 non-null   object
 15  BldgType       1459 non-null   object
 16  HouseStyle     1459 non-null   object
 17  OverallQual    1459 non-null   int64
 18  OverallCond    1459 non-null   int64
 19  YearBuilt      1459 non-null   int64
 20  YearRemodAdd   1459 non-null   int64
 21  RoofStyle      1459 non-null   object
 22  RoofMatl       1459 non-null   object
 23  Exterior1st    1458 non-null   object
 24  Exterior2nd    1458 non-null   object
 25  MasVnrType     1443 non-null   object
 26  MasVnrArea     1444 non-null   float64
 27  ExterQual      1459 non-null   object
 28  ExterCond      1459 non-null   object
 29  Foundation     1459 non-null   object
 30  BsmtQual       1415 non-null   object
 31  BsmtCond       1414 non-null   object
 32  BsmtExposure   1415 non-null   object
 33  BsmtFinType1   1417 non-null   object
 34  BsmtFinSF1     1458 non-null   float64
 35  BsmtFinType2   1417 non-null   object
 36  BsmtFinSF2     1458 non-null   float64
 37  BsmtUnfSF      1458 non-null   float64
 38  TotalBsmtSF    1458 non-null   float64
 39  Heating        1459 non-null   object
 40  HeatingQC      1459 non-null   object
 41  CentralAir     1459 non-null   object
 42  Electrical     1459 non-null   object
 43  1stFlrSF       1459 non-null   int64
 44  2ndFlrSF       1459 non-null   int64
 45  LowQualFinSF   1459 non-null   int64
 46  GrLivArea      1459 non-null   int64
 47  BsmtFullBath   1457 non-null   float64
 48  BsmtHalfBath   1457 non-null   float64
 49  FullBath       1459 non-null   int64
 50  HalfBath       1459 non-null   int64
 51  BedroomAbvGr   1459 non-null   int64
 52  KitchenAbvGr   1459 non-null   int64
 53  KitchenQual    1458 non-null   object
 54  TotRmsAbvGrd   1459 non-null   int64
 55  Functional     1457 non-null   object
 56  Fireplaces     1459 non-null   int64
 57  FireplaceQu    729 non-null    object
 58  GarageType     1383 non-null   object
 59  GarageYrBlt    1381 non-null   float64
 60  GarageFinish   1381 non-null   object
 61  GarageCars     1458 non-null   float64
 62  GarageArea     1458 non-null   float64
 63  GarageQual     1381 non-null   object
 64  GarageCond     1381 non-null   object
 65  PavedDrive     1459 non-null   object
 66  WoodDeckSF     1459 non-null   int64
 67  OpenPorchSF    1459 non-null   int64
 68  EnclosedPorch  1459 non-null   int64
 69  3SsnPorch      1459 non-null   int64
 70  ScreenPorch    1459 non-null   int64
 71  PoolArea       1459 non-null   int64
 72  PoolQC         3 non-null      object
 73  Fence          290 non-null    object
 74  MiscFeature    51 non-null     object
 75  MiscVal        1459 non-null   int64
 76  MoSold         1459 non-null   int64
 77  YrSold         1459 non-null   int64
 78  SaleType       1458 non-null   object
 79  SaleCondition  1459 non-null   object
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB
# Descriptive statistics
house_train.describe().T
 | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
Id | 1460.0 | 730.500000 | 421.610009 | 1.0 | 365.75 | 730.5 | 1095.25 | 1460.0 |
MSSubClass | 1460.0 | 56.897260 | 42.300571 | 20.0 | 20.00 | 50.0 | 70.00 | 190.0 |
LotFrontage | 1201.0 | 70.049958 | 24.284752 | 21.0 | 59.00 | 69.0 | 80.00 | 313.0 |
LotArea | 1460.0 | 10516.828082 | 9981.264932 | 1300.0 | 7553.50 | 9478.5 | 11601.50 | 215245.0 |
OverallQual | 1460.0 | 6.099315 | 1.382997 | 1.0 | 5.00 | 6.0 | 7.00 | 10.0 |
OverallCond | 1460.0 | 5.575342 | 1.112799 | 1.0 | 5.00 | 5.0 | 6.00 | 9.0 |
YearBuilt | 1460.0 | 1971.267808 | 30.202904 | 1872.0 | 1954.00 | 1973.0 | 2000.00 | 2010.0 |
YearRemodAdd | 1460.0 | 1984.865753 | 20.645407 | 1950.0 | 1967.00 | 1994.0 | 2004.00 | 2010.0 |
MasVnrArea | 1452.0 | 103.685262 | 181.066207 | 0.0 | 0.00 | 0.0 | 166.00 | 1600.0 |
BsmtFinSF1 | 1460.0 | 443.639726 | 456.098091 | 0.0 | 0.00 | 383.5 | 712.25 | 5644.0 |
BsmtFinSF2 | 1460.0 | 46.549315 | 161.319273 | 0.0 | 0.00 | 0.0 | 0.00 | 1474.0 |
BsmtUnfSF | 1460.0 | 567.240411 | 441.866955 | 0.0 | 223.00 | 477.5 | 808.00 | 2336.0 |
TotalBsmtSF | 1460.0 | 1057.429452 | 438.705324 | 0.0 | 795.75 | 991.5 | 1298.25 | 6110.0 |
1stFlrSF | 1460.0 | 1162.626712 | 386.587738 | 334.0 | 882.00 | 1087.0 | 1391.25 | 4692.0 |
2ndFlrSF | 1460.0 | 346.992466 | 436.528436 | 0.0 | 0.00 | 0.0 | 728.00 | 2065.0 |
LowQualFinSF | 1460.0 | 5.844521 | 48.623081 | 0.0 | 0.00 | 0.0 | 0.00 | 572.0 |
GrLivArea | 1460.0 | 1515.463699 | 525.480383 | 334.0 | 1129.50 | 1464.0 | 1776.75 | 5642.0 |
BsmtFullBath | 1460.0 | 0.425342 | 0.518911 | 0.0 | 0.00 | 0.0 | 1.00 | 3.0 |
BsmtHalfBath | 1460.0 | 0.057534 | 0.238753 | 0.0 | 0.00 | 0.0 | 0.00 | 2.0 |
FullBath | 1460.0 | 1.565068 | 0.550916 | 0.0 | 1.00 | 2.0 | 2.00 | 3.0 |
HalfBath | 1460.0 | 0.382877 | 0.502885 | 0.0 | 0.00 | 0.0 | 1.00 | 2.0 |
BedroomAbvGr | 1460.0 | 2.866438 | 0.815778 | 0.0 | 2.00 | 3.0 | 3.00 | 8.0 |
KitchenAbvGr | 1460.0 | 1.046575 | 0.220338 | 0.0 | 1.00 | 1.0 | 1.00 | 3.0 |
TotRmsAbvGrd | 1460.0 | 6.517808 | 1.625393 | 2.0 | 5.00 | 6.0 | 7.00 | 14.0 |
Fireplaces | 1460.0 | 0.613014 | 0.644666 | 0.0 | 0.00 | 1.0 | 1.00 | 3.0 |
GarageYrBlt | 1379.0 | 1978.506164 | 24.689725 | 1900.0 | 1961.00 | 1980.0 | 2002.00 | 2010.0 |
GarageCars | 1460.0 | 1.767123 | 0.747315 | 0.0 | 1.00 | 2.0 | 2.00 | 4.0 |
GarageArea | 1460.0 | 472.980137 | 213.804841 | 0.0 | 334.50 | 480.0 | 576.00 | 1418.0 |
WoodDeckSF | 1460.0 | 94.244521 | 125.338794 | 0.0 | 0.00 | 0.0 | 168.00 | 857.0 |
OpenPorchSF | 1460.0 | 46.660274 | 66.256028 | 0.0 | 0.00 | 25.0 | 68.00 | 547.0 |
EnclosedPorch | 1460.0 | 21.954110 | 61.119149 | 0.0 | 0.00 | 0.0 | 0.00 | 552.0 |
3SsnPorch | 1460.0 | 3.409589 | 29.317331 | 0.0 | 0.00 | 0.0 | 0.00 | 508.0 |
ScreenPorch | 1460.0 | 15.060959 | 55.757415 | 0.0 | 0.00 | 0.0 | 0.00 | 480.0 |
PoolArea | 1460.0 | 2.758904 | 40.177307 | 0.0 | 0.00 | 0.0 | 0.00 | 738.0 |
MiscVal | 1460.0 | 43.489041 | 496.123024 | 0.0 | 0.00 | 0.0 | 0.00 | 15500.0 |
MoSold | 1460.0 | 6.321918 | 2.703626 | 1.0 | 5.00 | 6.0 | 8.00 | 12.0 |
YrSold | 1460.0 | 2007.815753 | 1.328095 | 2006.0 | 2007.00 | 2008.0 | 2009.00 | 2010.0 |
SalePrice | 1460.0 | 180921.195890 | 79442.502883 | 34900.0 | 129975.00 | 163000.0 | 214000.00 | 755000.0 |
msno.matrix(house_train, labels=True)
msno.bar(house_train)
msno.heatmap(house_train)
data_null = house_train.isnull().sum()
data_null[data_null>0].sort_values(ascending=False)
PoolQC 1453
MiscFeature 1406
Alley 1369
Fence 1179
FireplaceQu 690
LotFrontage 259
GarageYrBlt 81
GarageType 81
GarageFinish 81
GarageQual 81
GarageCond 81
BsmtFinType2 38
BsmtExposure 38
BsmtFinType1 37
BsmtCond 37
BsmtQual 37
MasVnrArea 8
MasVnrType 8
Electrical 1
dtype: int64
# numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for col in house_train.columns:
    if house_train[col].dtype in numeric_dtypes:
        if col in ['TotalSF', 'Total_Bathrooms', 'Total_porch_sf', 'haspool',
                   'hasgarage', 'hasbsmt', 'hasfireplace']:
            pass
        else:
            numeric.append(col)

fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 120))
# Adjust the subplot layout
plt.subplots_adjust(right=2,top=2)
# Show 8 evenly spaced colors from the husl color space
sns.color_palette("husl", 8)
# Start enumerating from 1
for i, feature in enumerate(list(house_train[numeric]), 1):
    if feature == 'MiscVal':
        break
    plt.subplot(len(list(numeric)), 3, i)
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=house_train)
    # labelpad: controls the offset of the axis label from the ticks
    plt.xlabel('{}'.format(feature), size=15, labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)
    for j in range(2):
        plt.tick_params(axis='x', labelsize=12)
        plt.tick_params(axis='y', labelsize=12)
    plt.legend(loc='best', prop={'size': 10})
plt.show()
house_train['SalePrice'].describe()
count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
Name: SalePrice, dtype: float64
f, ax = plt.subplots(1, 2, figsize=(16, 6))
sns.distplot(house_train['SalePrice'], fit=stats.norm, ax=ax[0])
sns.boxplot(house_train['SalePrice'])

# skewness and kurtosis
print("Skewness: {}".format(house_train['SalePrice'].skew()))
print("Kurtosis: {}".format(house_train['SalePrice'].kurt()))
Skewness: 1.8828757597682129
Kurtosis: 6.536281860064529
fig = plt.figure()
stats.probplot(house_train['SalePrice'],plot=plt)
((array([-3.30513952, -3.04793228, -2.90489705, ..., 2.90489705,3.04793228, 3.30513952]),array([ 34900, 35311, 37900, ..., 625000, 745000, 755000])),(74160.16474519414, 180921.19589041095, 0.9319665641512983))
# Transform the target variable
house_train['SalePrice'] = np.log1p(house_train['SalePrice'])
fig = plt.figure()
stats.probplot(house_train['SalePrice'],plot=plt)
((array([-3.30513952, -3.04793228, -2.90489705, ..., 2.90489705,3.04793228, 3.30513952]),array([10.46027076, 10.47197813, 10.54273278, ..., 13.34550853,13.52114084, 13.53447435])),(0.398259646654151, 12.024057394918403, 0.9953761551826702))
def draw_corr(data):
    corr = data.corr()
    plt.subplots(figsize=(12, 12))
    sns.heatmap(corr, vmax=1, square=True, cmap='Blues')
    plt.show()

draw_corr(house_train)
# The 10 features most correlated with SalePrice
corrmat = house_train.corr()
plt.subplots(figsize=(10,8))
k = 10
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(house_train[cols].values.T)
# annot_kws: when annot=True, sets annotation properties (size, color, weight, italics, etc.)
# fmt: number format; keep 2 decimal places here
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
            yticklabels=cols.values, xticklabels=cols.values)
f,ax = plt.subplots(figsize=(8,6))
fig = sns.boxplot(x='OverallQual', y="SalePrice", data=house_train)
# Drop outliers
mask = (house_train['OverallQual']<5)&(house_train['SalePrice']>12)
house_train.drop(house_train[mask].index, inplace=True)
house_train.plot.scatter(x='OverallQual', y='SalePrice')
house_train.plot.scatter(x='GrLivArea', y='SalePrice',alpha=0.3)
# Drop the two outliers in the lower-right corner
mask = (house_train['GrLivArea']>4000)&(house_train['SalePrice']<12.5)
house_train = house_train.drop(house_train[mask].index)

# After removing the outliers
fig,ax = plt.subplots()
ax.scatter(x=house_train['GrLivArea'],y=house_train['SalePrice'])
plt.xlabel('GrLivArea',fontsize=13)
plt.ylabel('SalePrice',fontsize=13)
house_train.plot.scatter(x='GarageCars', y='SalePrice', alpha=0.3)
house_train.plot.scatter(x='GarageArea', y='SalePrice')
# Drop outliers
mask = (house_train['GarageArea']>1100)&(house_train['SalePrice']<12.5)
house_train.drop(house_train[mask].index, inplace=True)
house_train.plot.scatter(x='GarageArea', y='SalePrice')
house_train.plot.scatter(x='TotalBsmtSF', y='SalePrice')
house_train.plot.scatter(x='1stFlrSF', y='SalePrice')
house_train.plot.scatter(x='FullBath', y='SalePrice')
house_train.plot.scatter(x='YearBuilt', y='SalePrice')
# Drop outliers
mask = (house_train['YearBuilt']<1900)&(house_train['SalePrice']>12.3)
house_train = house_train.drop(house_train[mask].index)

# After removing the outliers
house_train.plot.scatter(x='YearBuilt', y='SalePrice')
house_train.plot.scatter(x='YearRemodAdd', y='SalePrice')
# Reset the index
house_train.reset_index(drop=True, inplace=True)
Merge the test set with the training set and run the feature engineering on the combined data.
train_num = house_train.shape[0]
test_num = house_test.shape[0]
train_y = house_train.SalePrice.values
all_data = pd.concat((house_train, house_test)).reset_index(drop=True)
all_data.drop(['SalePrice','Id'],axis=1,inplace=True)
all_data.shape,train_num,test_num
((2908, 79), 1449, 1459)
count = all_data.isnull().sum().sort_values(ascending=False)
ratio = count/len(all_data)*100
cols_type = all_data[count.index].dtypes
missing_data = pd.concat([count, ratio, cols_type], axis=1, keys=['count', 'ratio', 'cols_type'])
missing_data=missing_data[missing_data.ratio>0]
missing_data
 | count | ratio | cols_type |
---|---|---|---|
PoolQC | 2899 | 99.690509 | object |
MiscFeature | 2804 | 96.423659 | object |
Alley | 2711 | 93.225585 | object |
Fence | 2338 | 80.398900 | object |
FireplaceQu | 1418 | 48.762036 | object |
LotFrontage | 484 | 16.643741 | float64 |
GarageCond | 159 | 5.467675 | object |
GarageQual | 159 | 5.467675 | object |
GarageYrBlt | 159 | 5.467675 | float64 |
GarageFinish | 159 | 5.467675 | object |
GarageType | 157 | 5.398900 | object |
BsmtCond | 82 | 2.819807 | object |
BsmtExposure | 82 | 2.819807 | object |
BsmtQual | 81 | 2.785420 | object |
BsmtFinType2 | 80 | 2.751032 | object |
BsmtFinType1 | 79 | 2.716644 | object |
MasVnrType | 24 | 0.825309 | object |
MasVnrArea | 23 | 0.790922 | float64 |
MSZoning | 4 | 0.137552 | object |
BsmtHalfBath | 2 | 0.068776 | float64 |
Utilities | 2 | 0.068776 | object |
Functional | 2 | 0.068776 | object |
BsmtFullBath | 2 | 0.068776 | float64 |
BsmtFinSF2 | 1 | 0.034388 | float64 |
BsmtFinSF1 | 1 | 0.034388 | float64 |
Exterior2nd | 1 | 0.034388 | object |
BsmtUnfSF | 1 | 0.034388 | float64 |
TotalBsmtSF | 1 | 0.034388 | float64 |
Exterior1st | 1 | 0.034388 | object |
SaleType | 1 | 0.034388 | object |
Electrical | 1 | 0.034388 | object |
KitchenQual | 1 | 0.034388 | object |
GarageArea | 1 | 0.034388 | float64 |
GarageCars | 1 | 0.034388 | float64 |
# Visualization
f,axis = plt.subplots(figsize=(15,12))
plt.xticks(rotation='90')
sns.barplot(x=missing_data.index, y=missing_data.ratio)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
The feature documentation provided by the competition explains that some values are missing simply because the house genuinely lacks that kind of feature. For those features we impute according to dtype: categorical features get a new category ('None') and numeric features get 0. The remaining missing values are imputed with the mode.
str_cols = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "MasVnrType", "MSSubClass"]
for col in str_cols:
    all_data[col].fillna("None", inplace=True)
del str_cols, col
num_cols=["BsmtUnfSF","TotalBsmtSF","BsmtFinSF2","BsmtFinSF1","BsmtFullBath","BsmtHalfBath", "MasVnrArea","GarageCars","GarageArea","GarageYrBlt"]
for col in num_cols:
    all_data[col].fillna(0, inplace=True)
del num_cols, col
other_cols = ["MSZoning", "Electrical", "KitchenQual", "Exterior1st", "Exterior2nd", "SaleType"]
for col in other_cols:
    all_data[col].fillna(all_data[col].mode()[0], inplace=True)
del other_cols, col
Neighboring houses on the same street usually share similar lot-frontage values, so fill LotFrontage with the median of each neighborhood.
all_data["LotFrontage"] = upby("Neighborhood")["LotFrontage"].transform(lambda x: x.dian()))
Utilities has only 2 missing values and almost every row is AllPub, so consider dropping it.
all_data["Utilities"].isnull().sum()
2
all_data["Utilities"].value_counts()
AllPub 2905
NoSeWa 1
Name: Utilities, dtype: int64
# Drop the feature
all_data.drop(['Utilities'],axis=1,inplace=True)
all_data["Functional"] = all_data["Functional"].fillna("Typ")
# Check for remaining missing values
mask = all_data.isnull().sum().sort_values(ascending=False) > 0
all_data.columns[mask]
Index([], dtype='object')
# Encode the ordinal variables
def custom_coding(x):
    if x == 'Ex':
        r = 0
    elif x == 'Gd':
        r = 1
    elif x == 'TA':
        r = 2
    elif x == 'Fa':
        r = 3
    elif x == 'None':
        r = 4
    else:
        r = 5
    return r
## Ordinal feature encoding
cols = ['BsmtCond','BsmtQual','ExterCond','ExterQual','FireplaceQu','GarageCond','GarageQual','HeatingQC','KitchenQual','PoolQC']
for col in cols:
    all_data[col] = all_data[col].apply(custom_coding)
del cols, col
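As a side note, the same mapping can be written more compactly with a dict and Series.map. This is an equivalent sketch of the loop above, not an extra step to run (the columns are already encoded at this point):

quality_map = {'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'None': 4}
for col in ['BsmtCond', 'BsmtQual', 'ExterCond', 'ExterQual', 'FireplaceQu',
            'GarageCond', 'GarageQual', 'HeatingQC', 'KitchenQual', 'PoolQC']:
    # values outside the dict (the else branch above) become NaN, so fill with 5
    all_data[col] = all_data[col].map(quality_map).fillna(5).astype(int)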
Some features are stored as numbers even though the values have no quantitative meaning, for example years and category codes. Convert them to strings so they are treated as categorical variables.
cols = ['MSSubClass', 'YrSold', 'MoSold', 'OverallCond', "MSZoning", "BsmtFullBath", "BsmtHalfBath", "HalfBath","Functional", "Electrical", "KitchenQual","KitchenAbvGr", "SaleType", "Exterior1st", "Exterior2nd", "YearBuilt", "YearRemodAdd", "GarageYrBlt","BedroomAbvGr","LowQualFinSF"]
for col in cols:
    all_data[col] = all_data[col].astype(str)
del cols, col
# Label-encode the year-like features
from sklearn.preprocessing import LabelEncoder
str_cols = ["YearBuilt", "YearRemodAdd", 'GarageYrBlt', "YrSold", 'MoSold']
for col in str_cols:
    all_data[col] = LabelEncoder().fit_transform(all_data[col])
# Label-encode features that are needed to build other meaningful features below
lab_cols = ['Heating', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
            'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley',
            'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold', 'MSZoning',
            'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
            'Exterior1st', 'MasVnrType', 'Foundation', 'GarageType', 'SaleType', 'SaleCondition']
for col in lab_cols:
    new_col = "labfit_" + col
    all_data[new_col] = LabelEncoder().fit_transform(all_data[col])
del col, str_cols, lab_cols, new_col
# Area-related features matter a lot for price; add a total-area feature
all_data['TotalHouseArea'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
# The gap between the remodel year (YearRemodAdd) and the sale year (YrSold) usually also affects price
all_data['YearsSinceRemodel'] = all_data['YrSold'].astype(int) - all_data['YearRemodAdd'].astype(int)
# The overall quality of the house is another important price driver
all_data['Total_Home_Quality'] = all_data['OverallQual'].astype(int) + all_data['OverallCond'].astype(int)
Whether certain spaces exist in a house is often an important price factor: for example, a house with a three-season porch (3SsnPorch) or an open porch (OpenPorchSF) may be pricier than one without. So add several indicator features for these spaces. (Note that the flags below are 1 when the corresponding area is zero, i.e., they actually mark the absence of the space.)
all_data['HasWoodDeck'] = (all_data['WoodDeckSF'] == 0) * 1
all_data['HasOpenPorch'] = (all_data['OpenPorchSF'] == 0) * 1
all_data['HasEnclosedPorch'] = (all_data['EnclosedPorch'] == 0) * 1
all_data['Has3SsnPorch'] = (all_data['3SsnPorch'] == 0) * 1
all_data['HasScreenPorch'] = (all_data['ScreenPorch'] == 0) * 1
# Total house area plus garage area
all_data["TotalAllArea"] = all_data["TotalHouseArea"] + all_data["GarageArea"]
# Product of total house area and overall material quality
all_data["TotalHouse_and_OverallQual"] = all_data["TotalHouseArea"] * all_data["OverallQual"]
# Product of above-ground living area and overall material quality
all_data["GrLivArea_and_OverallQual"] = all_data["GrLivArea"] * all_data["OverallQual"]
# Product of lot area and overall material quality
all_data["LotArea_and_OverallQual"] = all_data["LotArea"] * all_data["OverallQual"]
# Product of the general zoning class and total house area
all_data["MSZoning_and_TotalHouse"] = all_data["labfit_MSZoning"] * all_data["TotalHouseArea"]
# Sum of the zoning-class code and the overall quality score
all_data["MSZoning_and_OverallQual"] = all_data["labfit_MSZoning"] + all_data["OverallQual"]
# Sum of the zoning-class code and the construction year
all_data["MSZoning_and_YearBuilt"] = all_data["labfit_MSZoning"] + all_data["YearBuilt"]
# Product of the neighborhood code and total house area
all_data["Neighborhood_and_TotalHouse"] = all_data["labfit_Neighborhood"] * all_data["TotalHouseArea"]
all_data["Neighborhood_and_OverallQual"] = all_data["labfit_Neighborhood"] + all_data["OverallQual"]
all_data["Neighborhood_and_YearBuilt"] = all_data["labfit_Neighborhood"] + all_data["YearBuilt"]# 1型成品的面积和房屋材质指标乘积
all_data["BsmtFinSF1_and_OverallQual"] = all_data["BsmtFinSF1"] * all_data["OverallQual"]
## Product of the home functionality rating and total house area
all_data["Functional_and_TotalHouse"] = all_data["labfit_Functional"] * all_data["TotalHouseArea"]
all_data["Functional_and_OverallQual"] = all_data["labfit_Functional"] + all_data["OverallQual"]
all_data["TotalHouse_and_LotArea"] = all_data["TotalHouseArea"] + all_data["LotArea"]
# Product of the proximity-to-road/railroad code and total house area
all_data["Condition1_and_TotalHouse"] = all_data["labfit_Condition1"] * all_data["TotalHouseArea"]
all_data["Condition1_and_OverallQual"] = all_data["labfit_Condition1"] + all_data["OverallQual"]
# Total of the basement-related areas
all_data["Bsmt"] = all_data["BsmtFinSF1"] + all_data["BsmtFinSF2"] + all_data["BsmtUnfSF"]
# Above-ground full bathrooms plus total rooms above ground
all_data["Rooms"] = all_data["FullBath"]+all_data["TotRmsAbvGrd"]
# Total porch area: open porch, enclosed porch, three-season porch, screen porch
all_data["PorchArea"] = all_data["OpenPorchSF"]+all_data["EnclosedPorch"]+all_data["3SsnPorch"]+all_data["ScreenPorch"]
## Total area of all functional spaces (house, basement, garage, porches, etc.)
all_data["TotalPlace"] = all_data["TotalAllArea"] + all_data["PorchArea"]
Apply a log transform to the numeric features whose absolute skewness exceeds 0.75, pulling non-normal data closer to a normal distribution, which linear models prefer.
Why change the distribution of the original numeric features with a function transform?
Common transforms include the log transform and the Box-Cox transform, with the log transform being the most widely used: taking logs preserves the nature of the data and the relationships between variables, while compressing the scale of the variable, which greatly simplifies computation.
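To see the effect concretely, here is a small sketch on one heavily right-skewed feature (LotArea); the exact numbers depend on the rows dropped earlier.

from scipy.stats import skew
x = all_data['LotArea']
print(skew(x.dropna()), skew(np.log1p(x).dropna()))  # skewness drops sharply after log1p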
from scipy.stats import norm, skew

# Compute the skewness of each numeric feature
num_features = all_data.select_dtypes(include=['int64','float64','int32']).copy()
num_feature_names = list(num_features.columns)
skewed_feats = all_data[num_feature_names].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness[skewness["Skew"].abs()>0.75]
 | Skew |
---|---|
MiscVal | 21.915535 |
PoolArea | 17.661095 |
LotArea | 13.334935 |
labfit_Condition2 | 12.437736 |
TotalHouse_and_LotArea | 12.380094 |
labfit_Heating | 12.136394 |
LotArea_and_OverallQual | 11.799484 |
3SsnPorch | 11.354131 |
labfit_LandSlope | 5.009358 |
BsmtFinSF2 | 4.137116 |
EnclosedPorch | 4.005089 |
ScreenPorch | 3.926054 |
GarageCond | 3.153395 |
labfit_Condition1 | 3.005668 |
GarageQual | 2.863557 |
MasVnrArea | 2.619878 |
Condition1_and_TotalHouse | 2.544979 |
BsmtCond | 2.542349 |
OpenPorchSF | 2.493685 |
PorchArea | 2.232411 |
labfit_BldgType | 2.186631 |
BsmtFinSF1_and_OverallQual | 2.017572 |
WoodDeckSF | 1.852261 |
TotalHouse_and_OverallQual | 1.615116 |
GrLivArea_and_OverallQual | 1.485190 |
1stFlrSF | 1.264660 |
LotFrontage | 1.106714 |
GrLivArea | 1.048644 |
TotalHouseArea | 1.012116 |
BsmtFinSF1 | 0.982488 |
BsmtUnfSF | 0.919524 |
TotalAllArea | 0.891388 |
TotalPlace | 0.887892 |
2ndFlrSF | 0.853227 |
Neighborhood_and_TotalHouse | 0.852391 |
ExterQual | -0.784824 |
ExterCond | -0.838720 |
Functional_and_OverallQual | -0.920453 |
labfit_BsmtExposure | -1.116930 |
labfit_MSZoning | -1.745237 |
HasEnclosedPorch | -1.880501 |
labfit_Fence | -1.990335 |
labfit_SaleCondition | -2.785113 |
HasScreenPorch | -2.915483 |
labfit_PavedDrive | -2.979584 |
labfit_BsmtFinType2 | -3.036904 |
labfit_CentralAir | -3.461892 |
labfit_SaleType | -3.737598 |
labfit_Functional | -4.062504 |
Has3SsnPorch | -8.695256 |
labfit_Street | -16.166862 |
PoolQC | -20.309793 |
Set the threshold to 1 and apply a log transform to the features whose absolute skewness exceeds it.
skew_cols = list(skewness[skewness["Skew"].abs()>1].index)
for col in skew_cols:
    # Alternative: Box-Cox transform for the highly skewed features
    # all_data[col] = boxcox1p(all_data[col], 0.15)
    # Log-transform the features whose skewness exceeds the threshold
    all_data[col] = np.log1p(all_data[col])
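As a verification sketch, recompute the skewness of the transformed columns; most should be much closer to 0, though near-constant indicator-like columns may still exceed the threshold.

check = all_data[skew_cols].apply(lambda x: skew(x.dropna())).abs()
print(check.sort_values(ascending=False).head())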
# Inspect the remaining string (categorical) features
all_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2908 entries, 0 to 2907
Columns: 135 entries, MSSubClass to TotalPlace
dtypes: float64(54), int64(40), object(41)
memory usage: 3.0+ MB
# One-hot encode the remaining string features to turn them into numeric features
all_data = pd.get_dummies(all_data)
all_data.head()
 | LotFrontage | LotArea | OverallQual | YearBuilt | YearRemodAdd | MasVnrArea | ExterQual | ExterCond | BsmtQual | BsmtCond | ... | SaleType_ConLw | SaleType_New | SaleType_Oth | SaleType_WD | SaleCondition_Abnorml | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.189655 | 9.042040 | 7 | 109 | 53 | 5.283204 | 1 | 2 | 1 | 1.098612 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 4.394449 | 9.169623 | 6 | 82 | 26 | 0.000000 | 2 | 2 | 1 | 1.098612 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 4.234107 | 9.328212 | 7 | 107 | 52 | 5.093750 | 1 | 2 | 1 | 1.098612 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 4.110874 | 9.164401 | 7 | 24 | 20 | 0.000000 | 2 | 2 | 2 | 0.693147 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 4.442651 | 9.565284 | 8 | 106 | 50 | 5.860786 | 1 | 2 | 1 | 1.098612 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 393 columns
all_data.info()
all_data.shape
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2908 entries, 0 to 2907
Columns: 393 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(54), int64(40), uint8(299)
memory usage: 2.9 MB
(2908, 393)
With only 2908 rows of data but 393 features after encoding, the feature space is quite large, so we use Lasso here for feature selection (dimensionality reduction).
# Split back into train and test sets
def split_data(all_data, train_index):
    cols = list(all_data.columns)
    # Replace infinities produced during feature engineering with the column median
    for col in cols:
        all_data[col].values[np.isinf(all_data[col].values)] = all_data[col].median()
    del cols, col
    train_data = all_data[:train_index]
    test_data = all_data[train_index:]
    return train_data, test_data
train_X,test_X = split_data(all_data,train_num)
train_X.shape,test_X.shape,train_y.shape
((1449, 393), (1459, 393), (1449,))
1. Standardization that is robust to outliers
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Scale the training features
train_X = scaler.fit_transform(train_X)
# Scale the test features
test_X = scaler.transform(test_X)
2. Modeling
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.001)
lasso_model.fit(train_X,train_y)
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,normalize=False, positive=False, precompute=False, random_state=None,selection='cyclic', tol=0.0001, warm_start=False)
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Show up to 100 characters per value (default is 50)
pd.set_option('max_colwidth', 100)

# Index and feature importance
FI_lasso = pd.DataFrame({"Feature Importance": lasso_model.coef_}, index=all_data.columns)
# Sort from high to low
FI_lasso.sort_values("Feature Importance",ascending=False).round(5).head(10)
 | Feature Importance |
---|---|
Neighborhood_Crawfor | 0.09052 |
Total_Home_Quality | 0.08677 |
TotalPlace | 0.07877 |
GrLivArea | 0.06999 |
KitchenQual_0 | 0.05483 |
Functional_and_TotalHouse | 0.04605 |
labfit_SaleCondition | 0.04488 |
Exterior1st_BrkFace | 0.04458 |
YearBuilt | 0.03844 |
MSZoning_and_YearBuilt | 0.03626 |
3. Visualization
# Features with non-zero coefficients
FI_lasso=FI_lasso[FI_lasso["Feature Importance"] !=0 ].sort_values("Feature Importance")
FI_lasso.plot(kind="barh",figsize=(12,40), color='g')
plt.xticks(rotation=90)
display(FI_lasso.shape)
4. Feature selection
# Select the features
choose_cols = FI_lasso.index.tolist()
choose_data = all_data[choose_cols].copy()
choose_data.shape
(2908, 86)
# Split the data set
train_X, test_X = choose_data[:train_num], choose_data[train_num:]
# Standardize
scaler = RobustScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
train_X.shape, test_X.shape, train_y.shape
((1449, 86), (1459, 86), (1449,))
# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge,RidgeCV,Lasso,LinearRegression
from sklearn.linear_model import ElasticNet,ElasticNetCV,SGDRegressor,BayesianRidge
from sklearn.svm import SVR,LinearSVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Misc
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# 12-fold cross-validation
kf = KFold(n_splits=12,random_state=42,shuffle=True)
# Root mean squared error
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X, y):
    # neg_mean_squared_error is the negated MSE, so flip the sign before the square root
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse
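A usage sketch for the helper (the alpha here is an arbitrary placeholder, not a tuned value):

print(cv_rmse(Ridge(alpha=10), train_X, train_y).mean())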
The newly built features are correlated with the original ones, which may introduce strong multicollinearity, so reduce the dimensionality with PCA.
pca_model = PCA(n_components=60)
train_X = pca_model.fit_transform(train_X)
test_X = pca_model.transform(test_X)
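It is worth checking how much variance the 60 retained components keep; a sketch:

print(pca_model.explained_variance_ratio_.sum())  # fraction of variance retained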
For a given problem we can usually build several models from the data, each explaining its internal structure from a different angle, and fusing their solutions can yield a better result. In ensemble learning the task is to build diverse individual learners and combine them well. The key point is that models trained on the same training set are highly correlated, whereas we want them to differ, so they can cover each other's mistakes and achieve a better overall result.
Stacking and Blending are two such ensemble strategies. They differ in how the individual learners are fused: the former combines them nonlinearly through a meta-learner, the latter linearly.
# Grid search
def get_best_model_and_accuracy(model, params, X, y):
    # error_score=0.: if a fit raises an error, that candidate scores 0
    grid = GridSearchCV(model, params, scoring='neg_mean_squared_error',
                        cv=5, n_jobs=-1, error_score=0.)
    grid.fit(X, y)  # fit the model and parameters
    # the classic performance metric
    print("Best Score: {}".format(np.sqrt(-grid.best_score_)))
    # the parameters that achieved the best score
    print("Best Parameters: {}".format(grid.best_params_))
    # average fit time (seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # average scoring time (seconds); hints at real-world prediction latency
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))
    grid.cv_results_['mean_test_score'] = np.sqrt(-grid.cv_results_['mean_test_score'])
    # print every parameter combination with its metrics
    print(pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'std_test_score']])
    return grid
param_Lasso = {'alpha': [0.0004,0.0005,0.0006],'max_iter':[10000],'random_state':[1]}
Lasso_grid =get_best_model_and_accuracy(Lasso(),param_Lasso,train_X,train_y)
Best Score: 0.11233809637926326
Best Parameters: {'alpha': 0.0004, 'max_iter': 10000, 'random_state': 1}
Average Time to Fit (s): 0.002
Average Time to Score (s): 0.0

                                                     params  mean_test_score  std_test_score
0  {'alpha': 0.0004, 'max_iter': 10000, 'random_state': 1}         0.112338        0.000861
1  {'alpha': 0.0005, 'max_iter': 10000, 'random_state': 1}         0.112341        0.000884
2  {'alpha': 0.0006, 'max_iter': 10000, 'random_state': 1}         0.112416        0.000907
param_Ridge = {'alpha':[35,40,45,50,55]}
Ridge_grid =get_best_model_and_accuracy(Ridge(),param_Ridge,train_X,train_y)
Best Score: 0.11201108834987004
Best Parameters: {'alpha': 35}
Average Time to Fit (s): 0.001
Average Time to Score (s): 0.0

        params  mean_test_score  std_test_score
0 {'alpha': 35} 0.112011 0.000953
1 {'alpha': 40} 0.112035 0.000967
2 {'alpha': 45} 0.112073 0.000980
3 {'alpha': 50} 0.112122 0.000991
4 {'alpha': 55} 0.112180 0.001001
param_SVR = {'C':[11,12,13,14,15],'kernel':["rbf"],"gamma":[0.0003,0.0004],"epsilon":[0.008,0.009]}
SVR_grid =get_best_model_and_accuracy(SVR(),param_SVR,train_X,train_y)
Best Score: 0.11185206657627142
Best Parameters: {'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}
Average Time to Fit (s): 0.317
Average Time to Score (s): 0.044

                                                          params  mean_test_score  std_test_score
0   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112221        0.001143
1   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111954        0.001126
2   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112240        0.001131
3   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.112010        0.001115
4   {'C': 12, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112148        0.001147
5   {'C': 12, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111916        0.001105
6   {'C': 12, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112193        0.001135
7   {'C': 12, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111954        0.001103
8   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112077        0.001141
9   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111902        0.001092
10  {'C': 13, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112097        0.001137
11  {'C': 13, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111994        0.001098
12  {'C': 14, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112045        0.001135
13  {'C': 14, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111888        0.001081
14  {'C': 14, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112054        0.001127
15  {'C': 14, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111958        0.001082
16  {'C': 15, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112021        0.001123
17  {'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111852        0.001068
18  {'C': 15, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112056        0.001113
19  {'C': 15, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111902        0.001071
param_KernelRidge = {'alpha': [0.3, 0.4, 0.5], 'kernel': ["polynomial"], 'degree': [3], 'coef0': [0.8, 1, 1.2]}
KernelRidge_grid = get_best_model_and_accuracy(KernelRidge(), param_KernelRidge, train_X, train_y)
Best Score: 0.12053877269961878
Best Parameters: {'alpha': 0.5, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}
Average Time to Fit (s): 0.207
Average Time to Score (s): 0.037

                                                             params  mean_test_score  std_test_score
0  {'alpha': 0.3, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.131492        0.001534
1    {'alpha': 0.3, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.124723        0.001179
2  {'alpha': 0.3, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.123360        0.001052
3  {'alpha': 0.4, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.132097        0.001687
4    {'alpha': 0.4, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.123652        0.001257
5  {'alpha': 0.4, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.121633        0.001096
6  {'alpha': 0.5, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.133186        0.001837
7    {'alpha': 0.5, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.123168        0.001331
8  {'alpha': 0.5, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.120539        0.001138
ElasticNet can be seen as a middle ground between Lasso and Ridge. It also regularizes ordinary linear regression, but its loss function is neither purely L1 nor purely L2 regularization; instead, a weight parameter ρ balances the shares of the L1 and L2 penalties.
See also: Ridge regression, Lasso regression, and ElasticNet regression (machine-learning algorithms).
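Concretely, scikit-learn's ElasticNet minimizes the objective sketched below, where l1_ratio plays the role of the mixing weight ρ; the alpha value in the example is only a placeholder, not a tuned setting.

# 1/(2*n_samples) * ||y - Xw||^2_2 + alpha*rho*||w||_1 + 0.5*alpha*(1-rho)*||w||^2_2
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=0.0008, l1_ratio=0.5)  # rho = 0.5: equal L1/L2 mix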
param_ElasticNet = {'alpha':[0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3,0.5],'max_iter':[10000],'random_state':[3]}
ElasticNet_grid =get_best_model_and_accuracy(ElasticNet(),param_ElasticNet,train_X,train_y)
Best Score: 0.11223819703859092
Best Parameters: {'alpha': 0.005, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}
Average Time to Fit (s): 0.001
Average Time to Score (s): 0.0

                                                                     params  mean_test_score  std_test_score
0   {'alpha': 0.0008, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}         0.112599        0.000791
1    {'alpha': 0.0008, 'l1_ratio': 0.1, 'max_iter': 10000, 'random_state': 3}         0.112573        0.000795
2    {'alpha': 0.0008, 'l1_ratio': 0.3, 'max_iter': 10000, 'random_state': 3}         0.112379        0.000828
3    {'alpha': 0.0008, 'l1_ratio': 0.5, 'max_iter': 10000, 'random_state': 3}         0.112327        0.000865
4    {'alpha': 0.004, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}         0.112244        0.000872
5     {'alpha': 0.004, 'l1_ratio': 0.1, 'max_iter': 10000, 'random_state': 3}         0.112254        0.000888
6     {'alpha': 0.004, 'l1_ratio': 0.3, 'max_iter': 10000, 'random_state': 3}         0.113251        0.001022
7     {'alpha': 0.004, 'l1_ratio': 0.5, 'max_iter': 10000, 'random_state': 3}         0.114522        0.001099
8    {'alpha': 0.005, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}         0.112238        0.000895
9     {'alpha': 0.005, 'l1_ratio': 0.1, 'max_iter': 10000, 'random_state': 3}         0.112282        0.000914
10    {'alpha': 0.005, 'l1_ratio': 0.3, 'max_iter': 10000, 'random_state': 3}         0.113737        0.001056
11    {'alpha': 0.005, 'l1_ratio': 0.5, 'max_iter': 10000, 'random_state': 3}         0.115224        0.001138
bay = BayesianRidge()
xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3,
                   min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571,
                   subsample=0.5213, silent=1, random_state=7, nthread=-1)
lgbm = LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=700,
                     max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.25,
                     feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6,
                     min_sum_hessian_in_leaf=11)
GBR = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
                                max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state=5)
A stacking learner has two layers. The first layer is a set of base learners that each make predictions and pass them on; the second-layer learner then predicts from the first layer's outputs.
Stacking overfits easily, so train it with K-Fold:
Split the training set into 5 folds and iterate 5 times. In every iteration, train each base model on 4 folds and predict the remaining hold-out fold; also save each base model's predictions on the test set. After the 5-fold loop we have a matrix of shape (training rows × number of models): during CV each base model predicts every hold-out fold once, which together yields predictions for the entire training set. Training the second layer on this matrix gives model B.
Average each base model's saved test-set predictions into a matrix of shape (test rows × number of models): each base model predicts the test set once per fold, so average its 5 predictions before assembling the matrix.
Model B then makes the final prediction on this test matrix.
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, mod, meta_model):
        self.mod = mod                # first-layer learners
        self.meta_model = meta_model  # second-layer (meta) learner
        # number of folds used to build the stacked features
        self.k = 5
        self.kf = KFold(n_splits=self.k, random_state=42, shuffle=True)

    # Train on the data
    def fit(self, X, y):
        # self.saved_model holds every fitted first-layer learner
        self.saved_model = [list() for i in self.mod]
        # shape: training rows x number of models
        oof_train = np.zeros((X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):  # index and the model itself
            # indices of the train/validation parts for each fold
            for train_index, val_index in self.kf.split(X, y):
                renew_model = clone(model)  # copy the model
                # train on the fold's training part
                renew_model.fit(X[train_index], y[train_index])
                # keep the fitted model
                self.saved_model[i].append(renew_model)
                # store this model's out-of-fold predictions
                oof_train[val_index, i] = renew_model.predict(X[val_index])
        # train the meta-learner, using only the first-layer predictions as features
        self.meta_model.fit(oof_train, y)
        return self

    # Predict on new data
    def predict(self, X):
        # first-layer predictions for the whole test set;
        # np.column_stack joins columns, mean(axis=1) averages across the folds
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model])
        # the meta-learner's final prediction on the first-layer outputs
        return self.meta_model.predict(whole_test)

    ## Build the stacked features from the first-layer results
    def get_oof(self, X, y, test_X):
        oof = np.zeros((X.shape[0], len(self.mod)))
        test_single = np.zeros((test_X.shape[0], self.k))
        test_mean = np.zeros((test_X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for j, (train_index, val_index) in enumerate(self.kf.split(X, y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index], y[train_index])
                # save the predictions
                oof[val_index, i] = clone_model.predict(X[val_index])
                test_single[:, j] = clone_model.predict(test_X)
            # average each model's K-fold test-set predictions
            test_mean[:, i] = test_single.mean(axis=1)
        return oof, test_mean
lasso = Lasso_grid.best_estimator_
ridge = Ridge_grid.best_estimator_
svr = SVR_grid.best_estimator_
ker = KernelRidge_grid.best_estimator_
ela = ElasticNet_grid.best_estimator_

stack_model = stacking(mod=[bay, lasso, ridge, svr, ker, ela], meta_model=ker)

# Check accuracy on the training set
score = cv_rmse(stack_model,train_X,train_y)
print(score.mean())
0.10746634249868159
# Build the second-layer features
x_train_stack, x_test_stack = stack_model.get_oof(train_X, train_y, test_X)
train_X.shape,train_y.shape,test_X.shape
((1449, 60), (1449,), (1459, 60))
# Six models in the first layer
x_train_stack.shape, x_test_stack.shape
((1449, 6), (1459, 6))
In ordinary stacking, once the first-layer predictions are available you just train and predict with the second-layer model. In this case, however, we try concatenating the first-layer stacked features with the original features and feed the combined features into stacking once more.
# Concatenate the stacking features with the original features, horizontally
x_train_add = np.hstack((train_X,x_train_stack))
x_test_add = np.hstack((test_X, x_test_stack))
x_train_add.shape, x_test_add.shape
((1449, 66), (1459, 66))
# Accuracy with the concatenated features; it improves
score = cv_rmse(stack_model,x_train_add,train_y)
print(score.mean())
0.10195220877304757
Retrain with stacking on x_train_add, train_y, and x_test_add.
param_Lasso = {'alpha': [0.0004,0.0005,0.0006],'max_iter':[10000],'random_state':[1]}
Lasso_2 =get_best_model_and_accuracy(Lasso(),param_Lasso,x_train_add,train_y)
Best Score: 0.11162310214215297
Best Parameters: {'alpha': 0.0005, 'max_iter': 10000, 'random_state': 1}
Average Time to Fit (s): 0.009
Average Time to Score (s): 0.0

                                                     params  mean_test_score  std_test_score
0  {'alpha': 0.0004, 'max_iter': 10000, 'random_state': 1}         0.111637        0.000880
1  {'alpha': 0.0005, 'max_iter': 10000, 'random_state': 1}         0.111623        0.000896
2  {'alpha': 0.0006, 'max_iter': 10000, 'random_state': 1}         0.111662        0.000909
param_Ridge = {'alpha':[35,40,45,50,55]}
Ridge_2 =get_best_model_and_accuracy(Ridge(),param_Ridge,x_train_add,train_y)
Best Score: 0.1118608032209135
Best Parameters: {'alpha': 35}
Average Time to Fit (s): 0.002
Average Time to Score (s): 0.0

        params  mean_test_score  std_test_score
0 {'alpha': 35} 0.111861 0.000949
1 {'alpha': 40} 0.111892 0.000962
2 {'alpha': 45} 0.111924 0.000973
3 {'alpha': 50} 0.111960 0.000983
4 {'alpha': 55} 0.111999 0.000992
param_SVR = {'C':[11,12,13,14,15],'kernel':["rbf"],"gamma":[0.0003,0.0004],"epsilon":[0.008,0.009]}
SVR_2 =get_best_model_and_accuracy(SVR(),param_SVR,x_train_add,train_y)
Best Score: 0.11187202151025108
Best Parameters: {'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}
Average Time to Fit (s): 0.316
Average Time to Score (s): 0.044

                                                          params  mean_test_score  std_test_score
0   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112114        0.001168
1   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111980        0.001131
2   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112167        0.001164
3   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.112013        0.001132
4   {'C': 12, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112075        0.001161
5   {'C': 12, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111909        0.001112
6   {'C': 12, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112136        0.001158
7   {'C': 12, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111960        0.001113
8   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112050        0.001159
9   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111898        0.001082
10  {'C': 13, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112133        0.001152
11  {'C': 13, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111930        0.001096
12  {'C': 14, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112024        0.001159
13  {'C': 14, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111873        0.001057
14  {'C': 14, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112087        0.001149
15  {'C': 14, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111928        0.001074
16  {'C': 15, 'epsilon': 0.008, 'gamma': 0.0003, 'kernel': 'rbf'}         0.111989        0.001150
17  {'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111872        0.001046
18  {'C': 15, 'epsilon': 0.009, 'gamma': 0.0003, 'kernel': 'rbf'}         0.112041        0.001143
19  {'C': 15, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'}         0.111910        0.001060
param_KernelRidge = {'alpha': [0.3, 0.4, 0.5], 'kernel': ["polynomial"], 'degree': [3], 'coef0': [0.8, 1, 1.2]}
KernelRidge_2 = get_best_model_and_accuracy(KernelRidge(), param_KernelRidge, x_train_add, train_y)
Best Score: 0.11754411372302964
Best Parameters: {'alpha': 0.5, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}
Average Time to Fit (s): 0.184
Average Time to Score (s): 0.032

                                                             params  mean_test_score  std_test_score
0  {'alpha': 0.3, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.121835        0.002417
1    {'alpha': 0.3, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.119743        0.002347
2  {'alpha': 0.3, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.118019        0.002291
3  {'alpha': 0.4, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.121416        0.002253
4    {'alpha': 0.4, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.119359        0.002201
5  {'alpha': 0.4, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.117628        0.002159
6  {'alpha': 0.5, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.121293        0.002123
7    {'alpha': 0.5, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.119272        0.002083
8  {'alpha': 0.5, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.117544        0.002051
param_ElasticNet = {'alpha':[0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3,0.5],'max_iter':[10000],'random_state':[3]}
ElasticNet_2 =get_best_model_and_accuracy(ElasticNet(),param_ElasticNet,x_train_add,train_y)
Best Score: 0.10667612140906058
Best Parameters: {'alpha': 0.0008, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}
Average Time to Fit (s): 0.025
Average Time to Score (s): 0.0

                                                                     params  mean_test_score  std_test_score
0   {'alpha': 0.0008, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}         0.106676        0.000741
1    {'alpha': 0.0008, 'l1_ratio': 0.1, 'max_iter': 10000, 'random_state': 3}         0.107021        0.000758
2    {'alpha': 0.0008, 'l1_ratio': 0.3, 'max_iter': 10000, 'random_state': 3}         0.111335        0.000889
3    {'alpha': 0.0008, 'l1_ratio': 0.5, 'max_iter': 10000, 'random_state': 3}         0.111619        0.000880
4    {'alpha': 0.004, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}         0.111584        0.000877
5     {'alpha': 0.004, 'l1_ratio': 0.1, 'max_iter': 10000, 'random_state': 3}         0.111586        0.000891
6     {'alpha': 0.004, 'l1_ratio': 0.3, 'max_iter': 10000, 'random_state': 3}         0.112205        0.001007
7     {'alpha': 0.004, 'l1_ratio': 0.5, 'max_iter': 10000, 'random_state': 3}         0.113027        0.001072
8    {'alpha': 0.005, 'l1_ratio': 0.08, 'max_iter': 10000, 'random_state': 3}         0.111594        0.000896
9     {'alpha': 0.005, 'l1_ratio': 0.1, 'max_iter': 10000, 'random_state': 3}         0.111623        0.000914
10    {'alpha': 0.005, 'l1_ratio': 0.3, 'max_iter': 10000, 'random_state': 3}         0.112603        0.001041
11    {'alpha': 0.005, 'l1_ratio': 0.5, 'max_iter': 10000, 'random_state': 3}         0.113622        0.001111
bay_2 = BayesianRidge()
xgb_2 = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3,
                     min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571,
                     subsample=0.5213, silent=1, random_state=7, nthread=-1)
lgbm_2 = LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=700,
                       max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.25,
                       feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6,
                       min_sum_hessian_in_leaf=11)
GBR_2 = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
                                  max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
                                  loss='huber', random_state=5)
lasso_2 = Lasso_2.best_estimator_
ridge_2 = Ridge_2.best_estimator_
svr_2 = SVR_2.best_estimator_
ker_2 = KernelRidge_2.best_estimator_
ela_2 = ElasticNet_2.best_estimator_

stack_model_2 = stacking(mod=[bay_2, lasso_2, ridge_2, svr_2, ker_2, ela_2], meta_model=ker_2)
last_x_train_stack, last_x_test_stack = stack_model_2.get_oof(x_train_add, train_y, x_test_add)
last_x_train_stack.shape, last_x_test_stack.shape
((1449, 6), (1459, 6))
Parameter search for the second-layer KernelRidge model
param_ker = {'alpha':[0.2,0.3,0.4,0.5], 'kernel':["polynomial"],'degree':[3,4],'coef0':[0.8,1,1.2]}
Ker_stack_model=get_best_model_and_accuracy(KernelRidge(),param_ker,last_x_train_stack,train_y).best_estimator_
Best Score: 0.08808555947636867
Best Parameters: {'alpha': 0.2, 'coef0': 0.8, 'degree': 4, 'kernel': 'polynomial'}
Average Time to Fit (s): 0.186
Average Time to Score (s): 0.03

                                                              params  mean_test_score  std_test_score
0   {'alpha': 0.2, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.089836        0.000473
1   {'alpha': 0.2, 'coef0': 0.8, 'degree': 4, 'kernel': 'polynomial'}         0.088086        0.000600
2     {'alpha': 0.2, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.089773        0.000480
3     {'alpha': 0.2, 'coef0': 1, 'degree': 4, 'kernel': 'polynomial'}         0.088102        0.000599
4   {'alpha': 0.2, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.089749        0.000485
5   {'alpha': 0.2, 'coef0': 1.2, 'degree': 4, 'kernel': 'polynomial'}         0.088118        0.000599
6   {'alpha': 0.3, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.090043        0.000456
7   {'alpha': 0.3, 'coef0': 0.8, 'degree': 4, 'kernel': 'polynomial'}         0.088470        0.000586
8     {'alpha': 0.3, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.089935        0.000462
9     {'alpha': 0.3, 'coef0': 1, 'degree': 4, 'kernel': 'polynomial'}         0.088486        0.000586
10  {'alpha': 0.3, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.089885        0.000468
11  {'alpha': 0.3, 'coef0': 1.2, 'degree': 4, 'kernel': 'polynomial'}         0.088501        0.000586
12  {'alpha': 0.4, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.090214        0.000443
13  {'alpha': 0.4, 'coef0': 0.8, 'degree': 4, 'kernel': 'polynomial'}         0.088732        0.000579
14    {'alpha': 0.4, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.090070        0.000449
15    {'alpha': 0.4, 'coef0': 1, 'degree': 4, 'kernel': 'polynomial'}         0.088748        0.000580
16  {'alpha': 0.4, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.089994        0.000455
17  {'alpha': 0.4, 'coef0': 1.2, 'degree': 4, 'kernel': 'polynomial'}         0.088762        0.000581
18  {'alpha': 0.5, 'coef0': 0.8, 'degree': 3, 'kernel': 'polynomial'}         0.090358        0.000434
19  {'alpha': 0.5, 'coef0': 0.8, 'degree': 4, 'kernel': 'polynomial'}         0.088920        0.000576
20    {'alpha': 0.5, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'}         0.090184        0.000439
21    {'alpha': 0.5, 'coef0': 1, 'degree': 4, 'kernel': 'polynomial'}         0.088936        0.000577
22  {'alpha': 0.5, 'coef0': 1.2, 'degree': 3, 'kernel': 'polynomial'}         0.090086        0.000445
23  {'alpha': 0.5, 'coef0': 1.2, 'degree': 4, 'kernel': 'polynomial'}         0.088949        0.000578
cv_rmse(Ker_stack_model,last_x_train_stack,train_y).mean()
0.08791312508608311
# Note: the target was log-transformed earlier, so invert predictions with expm1
y_pred_stack = np.expm1(Ker_stack_model.predict(last_x_test_stack))
Alternatively, fit and predict directly with the stacking class:
stack_model = stacking(mod=[lgbm, ela, svr, ridge, lasso, bay, xgb, GBR, ker],
                       meta_model=KernelRidge(alpha=0.2, kernel='polynomial', degree=4, coef0=0.8))
stack_model.fit(x_train_add, train_y)
y_pred_stack_2 = np.expm1(stack_model.predict(x_test_add))  # expm1 inverts the earlier log1p
xgb.fit(last_x_train_stack,train_y)
y_pred_xgb = np.expm1(xgb.predict(last_x_test_stack))
# Cross-validation
cv_rmse(xgb,x_train_stack,train_y).mean()
0.1139198877562616
# Training-set error
y_train_xgb = xgb.predict(last_x_train_stack)
rmse(y_train_xgb,train_y)
0.08778404527191365
lgbm.fit(last_x_train_stack,train_y)
y_pred_lgbm = np.expm1(lgbm.predict(last_x_test_stack))
cv_rmse(lgbm, x_train_stack, train_y).mean()
0.1161628433489873
y_train_lgbm = lgbm.predict(last_x_train_stack)
rmse(y_train_lgbm,train_y)
0.10937253913955777
# Blend the models with a weighted average
y_pred = (0.7*y_pred_stack)+(0.15*y_pred_xgb)+(0.15*y_pred_lgbm)
submission = pd.read_csv("/home/aistudio/data/data32288/submission.csv")
submission.shape, y_pred.shape
((1459, 2), (1459,))
submission.iloc[:,1] = y_pred
submission.to_csv(r'./house_submission.csv', index=False)
submission.head()
 | Id | SalePrice |
---|---|---|
0 | 1461 | 119962.721230 |
1 | 1462 | 161987.446003 |
2 | 1463 | 188901.912081 |
3 | 1464 | 194701.643631 |
4 | 1465 | 194480.370160 |
Blending differs from Stacking mainly in that the second-stage features are not obtained from K-Fold out-of-fold predictions; instead a hold-out set is carved out, and the second-stage stacker model is fitted on the first-stage models' predictions for that hold-out set. In other words, the K-Fold CV in the Stacking pipeline is replaced by hold-out CV.
Steps:
1. Split the training set into a training part and a hold-out (validation) part.
2. Fit each first-stage model on the training part, then predict both the hold-out part and the test set.
3. Fit the second-stage model on the hold-out predictions and apply it to the first-stage test-set predictions.
Blending's advantages: it is simpler than Stacking, and since the two stages use disjoint data there is no information leak between them.
Blending's disadvantages: only a fraction of the data is used for the second stage, and a small hold-out set makes the result easy to overfit.
from sklearn.model_selection import StratifiedKFold, train_test_split

# The individual models used in the fusion
clfs = [BayesianRidge(), Lasso(), Ridge(), SVR(), KernelRidge(), ElasticNet()]

# Split the training data into train and validation parts
X_train, X_val, y_train, y_val = train_test_split(train_X,train_y,test_size=0.33, random_state=1855)
dataset_val = np.zeros((X_val.shape[0], len(clfs)))    # predictions on the validation set
dataset_test = np.zeros((test_X.shape[0], len(clfs)))  # predictions on the test set

# Train each individual model in turn
for j, clf in enumerate(clfs):
    # Train on X_train; the model's predictions become new features for stage 2
    clf.fit(X_train, y_train)
    dataset_val[:, j] = clf.predict(X_val)
    # For the test set, use these k models' predictions directly as new features
    dataset_test[:, j] = clf.predict(test_X)

# The model used for the fusion
clf = XGBRegressor()
clf.fit(dataset_val, y_val)
# Note: the target was log-transformed earlier
y_submission = np.expm1(clf.predict(dataset_test))
cv_rmse(clf,train_X,train_y).mean()
0.14310972129182878
y_submission
array([122274.41, 142203.67, 176042.67, ..., 164987.31, 107128.92,250321.12], dtype=float32)
y_pred_stack
array([118603.60717676, 162614.48976635, 190387.78002988, ...,179561.60366542, 117042.61233382, 223750.10906997])
# Using the mlxtend package
stack_gen = StackingCVRegressor(regressors=(lgbm, ela, svr, ridge, lasso, bay, xgb, GBR, ker),
                                meta_regressor=ker,
                                use_features_in_secondary=True)
# The meta-regressor is trained on the base regressors' predictions plus the original data set
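A hypothetical usage sketch (the notebook below only cross-validates stack_gen; these two lines show how it would be fitted and applied, converting to arrays as StackingCVRegressor expects):

stack_gen_model = stack_gen.fit(np.array(train_X), np.array(train_y))
y_pred_gen = np.expm1(stack_gen_model.predict(np.array(test_X)))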
Get the cross-validation score of each model
scores = {}

score = cv_rmse(lgbm, train_X, train_y)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lgbm'] = (score.mean(), score.std())
lightgbm: 0.1280 (0.0148)
score = cv_rmse(ela,train_X,train_y)
print("ElasticNet: {:.4f} ({:.4f})".an(), score.std()))
scores['ela'] = (an(), score.std())
ElasticNet: 0.1108 (0.0151)
score = cv_rmse(svr,train_X,train_y)
print("SVR: {:.4f} ({:.4f})".an(), score.std()))
scores['svr'] = (an(), score.std())
SVR: 0.1096 (0.0172)
score = cv_rmse(ridge,train_X,train_y)
print("ridge: {:.4f} ({:.4f})".an(), score.std()))
scores['ridge'] = (an(), score.std())
ridge: 0.1106 (0.0154)
score = cv_rmse(lasso,train_X,train_y)
print("Lasso: {:.4f} ({:.4f})".an(), score.std()))
scores['Lasso'] = (an(), score.std())
Lasso: 0.1108 (0.0150)
score = cv_rmse(bay,train_X,train_y)
print("bay: {:.4f} ({:.4f})".an(), score.std()))
scores['bay'] = (an(), score.std())
bay: 0.1106 (0.0152)
score = cv_rmse(xgb,train_X,train_y)
print("xgb: {:.4f} ({:.4f})".an(), score.std()))
scores['xgb'] = (an(), score.std())
xgb: 0.1259 (0.0156)
score = cv_rmse(GBR,train_X,train_y)
print("GBR: {:.4f} ({:.4f})".an(), score.std()))
scores['GBR'] = (an(), score.std())
GBR: 0.1326 (0.0189)
score = cv_rmse(ker,train_X,train_y)
print("ker: {:.4f} ({:.4f})".an(), score.std()))
scores['ker'] = (an(), score.std())
ker: 0.1178 (0.0167)
score = cv_rmse(stack_gen,train_X,train_y)
print("stack_gen: {:.4f} ({:.4f})".an(), score.std()))
scores['stack_gen'] = (an(), score.std())
stack_gen: 0.1338 (0.0191)
Identify the best-performing model
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

ax = sns.pointplot(x=list(scores.keys()), y=[score for score, _ in scores.values()],
                   markers=['o'], linestyles=['-'])
for i, score in enumerate(scores.values()):
    ax.text(i, score[0] + 0.002, '{:.6f}'.format(score[0]),
            horizontalalignment='left', size='large', color='black', weight='semibold')

plt.ylabel('Score (RMSE)', size=20, labelpad=12.5)
plt.xlabel('Model', size=20, labelpad=12.5)
plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)
plt.title('Scores of Models', size=20)
plt.show()