from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import imodels
import imodels.algebraic.gam_multitask


class ResidualBoostingRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, estimator, n_estimators=10):
        """
        A meta-estimator that fits a base estimator to the residuals of the
        previous estimators.

        Parameters:
        - estimator: The estimator to fit on the residual of the previous step.
        - n_estimators: The number of estimators to fit.
        """
        self.estimator = estimator
        self.n_estimators = n_estimators
    def fit(self, X, y):
        """
        Fit the ensemble of base estimators on the training data.

        Parameters:
        - X: array-like of shape (n_samples, n_features)
            Training data.
        - y: array-like of shape (n_samples,)
            Target values.

        Returns:
        - self: object
        """
        # Check that X and y have the correct shape
        X, y = check_X_y(X, y)

        self.estimators_ = []
        current_prediction = np.zeros(y.shape)
        for _ in range(self.n_estimators):
            # Each round fits a fresh clone to what the ensemble so far got wrong
            residual = y - current_prediction
            estimator = clone(self.estimator)
            estimator.fit(X, residual)
            self.estimators_.append(estimator)
            current_prediction += estimator.predict(X)
        return self
    def predict(self, X):
        """
        Predict regression target for X.

        Parameters:
        - X: array-like of shape (n_samples, n_features)
            The input samples.

        Returns:
        - y_pred: ndarray of shape (n_samples,)
            The predicted values.
        """
        # Check that fit has been called
        check_is_fitted(self)
        # Input validation
        X = check_array(X)

        # The ensemble prediction is the sum of the stage-wise residual fits
        predictions = sum(estimator.predict(X)
                          for estimator in self.estimators_)
        return predictions
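

# Illustrative sanity check, not part of the original module (the helper name
# below is hypothetical). With n_estimators=1 the first residual is just y,
# so the boosted model should reproduce a single clone of the base estimator.
def _check_single_stage_matches_base():
    from sklearn.datasets import make_regression
    X, y = make_regression(n_samples=100, n_features=4, random_state=0)
    base = DecisionTreeRegressor(max_depth=3, random_state=0)
    boosted = ResidualBoostingRegressor(
        estimator=base, n_estimators=1).fit(X, y)
    single = clone(base).fit(X, y)
    assert np.allclose(boosted.predict(X), single.predict(X))
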
class SimpleBaggingRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, estimator, n_estimators=10, random_state=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit(self, X, y):
        self.estimators_ = []
        rng = np.random.default_rng(self.random_state)
        for _ in range(self.n_estimators):
            # Bootstrap sample: draw n_samples rows with replacement
            sample_indices = rng.choice(
                X.shape[0], size=X.shape[0], replace=True)
            X_sample = X[sample_indices]
            y_sample = y[sample_indices]

            # Fit a fresh clone of the base estimator on the bootstrap sample
            estimator = clone(self.estimator)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)
        return self
    def predict(self, X):
        # Collect predictions from each base estimator
        predictions = np.array([estimator.predict(X)
                                for estimator in self.estimators_])
        # Aggregate predictions by averaging across the ensemble
        return np.mean(predictions, axis=0)
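

# Illustrative usage sketch, not part of the original module (the helper name
# is hypothetical). Bagging averages bootstrap-resampled fits, which mainly
# reduces variance for high-variance base learners such as unpruned trees.
def _demo_bagging_variance_reduction():
    from sklearn.datasets import make_regression
    X, y = make_regression(n_samples=200, n_features=5, noise=10.0,
                           random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)
    tree = DecisionTreeRegressor(random_state=0)
    single_mse = mean_squared_error(
        y_test, clone(tree).fit(X_train, y_train).predict(X_test))
    bagged = SimpleBaggingRegressor(estimator=tree, n_estimators=25,
                                    random_state=0).fit(X_train, y_train)
    bagged_mse = mean_squared_error(y_test, bagged.predict(X_test))
    print(f'single-tree MSE: {single_mse:.1f}, bagged MSE: {bagged_mse:.1f}')
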

if __name__ == '__main__':
    X, y, feature_names = imodels.get_clean_dataset('california_housing')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Subsample rows and features so the demo runs quickly
    X_train = X_train[:50, :2]
    y_train = y_train[:50]
    X_test = X_test[:50, :2]
    y_test = y_test[:50]

    # Any sklearn-style regressor works as the base estimator, e.g.:
    # estimator = DecisionTreeRegressor(max_depth=3)
    estimator = imodels.algebraic.gam_multitask.MultiTaskGAMRegressor()

    for n_estimators in [1, 3, 5]:
        # ResidualBoostingRegressor(estimator=estimator,
        #                           n_estimators=n_estimators) works here too
        regressor = SimpleBaggingRegressor(
            estimator=estimator, n_estimators=n_estimators)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        mse_train = mean_squared_error(y_train, regressor.predict(X_train))
        mse = mean_squared_error(y_test, y_pred)
        print(
            f'MSE with {n_estimators} estimators: {mse:.2f} (train: {mse_train:.2f})')