在机器学习领域,集成学习是一种将多个模型结合起来以提高预测性能的技术。不同的机器学习模型能够以不同的方式提取数据中的模式,通过整合这些模型,可以获得一个性能更优的模型。本文将探讨常见的集成学习方法及其缺陷,然后介绍如何通过堆叠(Stacking)和混合(Blending)技术来克服这些缺陷,并展示如何在Python中实现这些技术。
集成学习方法通过结合不同的模型来构建一个最优模型。常见的方法之一是构建多个模型,然后取它们的平均值或多数投票结果。这种方法简单易用,通常能够获得较好的结果。然而,问题在于较弱的模型会与较强的模型获得相同的权重,这有时会导致得分下降。另一种方法是构建多个模型并取它们的加权平均值。这种方法可以为较强的模型分配更高的权重,但确定权重并非易事,也不是非常理想。
为了解决上述问题,可以使用堆叠和混合技术来进行集成建模。具体步骤如下:
将使用已经处理过的贷款申请数据来预测利率。首先,加载数据并将其分为两部分。
import pandas as pd
import numpy as np
# Load the pre-cleaned loan data: features and the target (interest rate).
# NOTE(review): absolute Windows paths — this only runs on the author's
# machine; consider making the data directory configurable.
x_train = pd.read_csv("C:/Users/chakr/Desktop/Clean_data/X_train_reg.csv")
y_train = pd.read_csv("C:/Users/chakr/Desktop/Clean_data/y_train_reg.csv")
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows (x_train2/y_train2) as the blending/validation
# split; x_train1/y_train1 feed the first-layer models.
x_train1, x_train2, y_train1, y_train2 = train_test_split(x_train, y_train, test_size=0.25, random_state=42)
接下来,定义一个函数,将数据集分为N份(函数默认N=5,本文后续实际调用时使用20份)。
def get_dataset(x_train, y_train, N=5):
    """Shuffle the data and cut it into N contiguous folds for stacking.

    Parameters
    ----------
    x_train : pd.DataFrame of features.
    y_train : pd.DataFrame with a single target column.
    N : number of folds (the last fold absorbs the remainder rows).

    Returns
    -------
    (train_data, y_data, test_data, y_shuffled) where, for fold i,
    train_data[i]/y_data[i] hold the rows OUTSIDE the fold and
    test_data[i] the rows inside it; y_shuffled is the full shuffled
    target, row-aligned with the concatenation of test_data.
    """
    # Shuffle features and target together so rows stay aligned.
    merge = pd.concat([x_train, y_train], axis=1)
    merge = merge.sample(frac=1, random_state=1).reset_index(drop=True)
    y_train = merge.iloc[:, (merge.shape[1] - 1):]   # last column = target
    x_train = merge.iloc[:, :(merge.shape[1] - 1)]
    n = len(x_train)
    # Fold boundaries: N-1 folds of size n//N, the last one takes the rest.
    z = int(n / N)
    start = [z * i for i in range(N)]
    stop = [z * i for i in range(1, N)] + [n]
    train_data, y_data, test_data = [], [], []
    for s, e in zip(start, stop):
        # The excluded range [s, e) is contiguous, so the kept positions are
        # two aranges — O(n) per fold instead of the quadratic `not in` scan.
        keep = np.concatenate([np.arange(0, s), np.arange(e, n)])
        train_data.append(x_train.iloc[keep, :])
        y_data.append(y_train.iloc[keep, :])
        test_data.append(x_train.iloc[s:e, :])
    return (train_data, y_data, test_data, y_train)
现在有了以下数据集:
接下来,定义第一层模型,并为每个模型分配一个代码。
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor, Pool
# First-layer (base) models of the ensemble; each learns the data differently.
models = [LinearRegression(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
CatBoostRegressor(logging_level='Silent')]
# One short identifier per model, in the same order as `models`; used as the
# column names for each model's predictions.
code = ['lin_reg', 'dtree_reg', 'Knn_reg', 'cat_reg']
然后,定义一个预测函数,用于所有模型的预测。
def stack(x_train, y_train, x_test, models, code):
    """Fit every first-layer model and collect its test predictions.

    Parameters
    ----------
    x_train, y_train : training features/target for the first-layer models.
    x_test : rows to predict.
    models : list of estimators exposing fit/predict.
    code : list of column names, one per model (same order as `models`).

    Returns
    -------
    pd.DataFrame with one column of predictions per model.
    """
    result_df = pd.DataFrame()
    for name, reg in zip(code, models):
        reg.fit(x_train, y_train)
        # ravel() replaces the hand-rolled flatten_list helper: predictions
        # may come back (n, 1) when the target is a one-column DataFrame.
        result_df[name] = np.asarray(reg.predict(x_test)).ravel()
    return result_df
最后,为每个折(fold)进行预测,并将各折的预测结果拼接成最终的数据框(DataFrame)。
# Out-of-fold first-layer predictions, one block of rows per fold.
# Collect the per-fold frames and concatenate ONCE: the original pattern
# `final_df = pd.concat([final_df, current_df])` inside the loop copies the
# accumulated frame on every iteration (quadratic), and seeding it with an
# empty DataFrame(columns=code) forces object dtype.
fold_frames = [stack(train_data[i], y_data[i], test_data[i], models, code)
               for i in range(len(train_data))]
final_df = pd.concat(fold_frames)
# Second-layer test features: first-layer models refit on the full training
# split, predicting on the held-out blending split.
final_test = stack(x_train1, y_train1, x_train2, models, code)
接下来,构建第二层模型。
# Second-layer (meta) model, trained on the out-of-fold first-layer predictions.
reg2 = CatBoostRegressor(logging_level='Silent')
# NOTE(review): final_y is not defined anywhere in this excerpt — it should be
# the shuffled target returned as the 4th element of get_dataset; confirm the
# omitted call site assigns it.
reg2.fit(final_df, final_y)
test_pred = reg2.predict(final_test)
# RMSE on the held-out blending split (mean_squared_error is symmetric in its
# two array arguments, so the (pred, true) order is harmless here).
mean_squared_error(test_pred, y_train2) ** 0.5
def stackblend_reg(x_train, y_train, x_test, models, code, N=20, final_layer=None):
    """Stacking/blending regressor.

    Splits (x_train, y_train) into N folds, collects out-of-fold predictions
    from every estimator in `models` (columns named by `code`), fits
    `final_layer` on those predictions, and returns its predictions for the
    first-layer forecasts of `x_test`.

    `final_layer` defaults to a fresh LinearRegression per call. The original
    default `final_layer=LinearRegression()` was a mutable default argument:
    one estimator instance shared — and repeatedly refit — across every call.
    """
    if final_layer is None:
        final_layer = LinearRegression()

    def get_dataset(x_train, y_train, N=5):
        # Shuffle features+target together, then cut into N contiguous folds;
        # the last fold absorbs the remainder rows.
        merge = pd.concat([x_train, y_train], axis=1)
        merge = merge.sample(frac=1, random_state=1).reset_index(drop=True)
        y_sh = merge.iloc[:, (merge.shape[1] - 1):]
        x_sh = merge.iloc[:, :(merge.shape[1] - 1)]
        n = len(x_sh)
        z = int(n / N)
        start = [z * i for i in range(N)]
        stop = [z * i for i in range(1, N)] + [n]
        train_data, y_data, test_data = [], [], []
        for s, e in zip(start, stop):
            keep = np.concatenate([np.arange(0, s), np.arange(e, n)])
            train_data.append(x_sh.iloc[keep, :])
            y_data.append(y_sh.iloc[keep, :])
            test_data.append(x_sh.iloc[s:e, :])
        return (train_data, y_data, test_data, y_sh)

    def stack(x_train, y_train, x_test, models=models, code=code):
        # Fit each first-layer model and collect its predictions, one
        # column per model.
        preds = pd.DataFrame()
        for name, reg in zip(code, models):
            reg.fit(x_train, y_train)
            preds[name] = np.asarray(reg.predict(x_test)).ravel()
        return preds

    train_data, y_data, test_data, final_y = get_dataset(x_train, y_train, N)
    # Out-of-fold first-layer predictions: concatenate once, not per fold.
    final_df = pd.concat([stack(train_data[i], y_data[i], test_data[i], models, code)
                          for i in range(len(train_data))])
    # Second-layer test features: models refit on the full training data.
    final_test = stack(x_train, y_train, x_test, models, code)
    final_layer.fit(final_df, final_y)
    return final_layer.predict(final_test)
# Run the full stacking pipeline: 20 folds, fresh first-layer models, and a
# CatBoost regressor as the second (meta) layer.
stack_pred = stackblend_reg(x_train1, y_train1, x_train2,
models=[LinearRegression(),
DecisionTreeRegressor(),
KNeighborsRegressor(),
CatBoostRegressor(logging_level='Silent')],
code=['lin_reg', 'dtree_reg', 'Knn_reg', 'cat_reg'], N=20,
final_layer=CatBoostRegressor(logging_level='Silent'))
# RMSE of the stacked ensemble on the held-out blending split.
mean_squared_error(stack_pred, y_train2) ** 0.5