Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

transforms.py 4.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
  1. '''Shared transforms between different interpretable models
  2. '''
  3. import numpy as np
  4. from sklearn.base import BaseEstimator, TransformerMixin
  5. import pandas as pd
  6. class Winsorizer():
  7. """Performs Winsorization 1->1*
  8. Warning: this class should not be used directly.
  9. """
  10. def __init__(self, trim_quantile=0.0):
  11. self.trim_quantile = trim_quantile
  12. self.winsor_lims = None
  13. def train(self, X):
  14. # get winsor limits
  15. self.winsor_lims = np.ones([2, X.shape[1]]) * np.inf
  16. self.winsor_lims[0, :] = -np.inf
  17. if self.trim_quantile > 0:
  18. for i_col in np.arange(X.shape[1]):
  19. lower = np.percentile(X[:, i_col], self.trim_quantile * 100)
  20. upper = np.percentile(
  21. X[:, i_col], 100 - self.trim_quantile * 100)
  22. self.winsor_lims[:, i_col] = [lower, upper]
  23. def trim(self, X):
  24. X_ = X.copy()
  25. X_ = np.where(X > self.winsor_lims[1, :], np.tile(self.winsor_lims[1, :], [X.shape[0], 1]),
  26. np.where(X < self.winsor_lims[0, :], np.tile(self.winsor_lims[0, :], [X.shape[0], 1]), X))
  27. return X_
  28. class FriedScale():
  29. """Performs scaling of linear variables according to Friedman et alpha_l. 2005 Sec 5
  30. Each variable is first Winsorized l->l*, then standardised as 0.4 x l* / std(l*)
  31. Warning: this class should not be used directly.
  32. """
  33. def __init__(self, winsorizer=None):
  34. self.scale_multipliers = None
  35. self.winsorizer = winsorizer
  36. def train(self, X):
  37. # get multipliers
  38. if self.winsorizer != None:
  39. X_trimmed = self.winsorizer.trim(X)
  40. else:
  41. X_trimmed = X
  42. scale_multipliers = np.ones(X.shape[1])
  43. for i_col in np.arange(X.shape[1]):
  44. num_uniq_vals = len(np.unique(X[:, i_col]))
  45. if num_uniq_vals > 2: # don't scale binary variables which are effectively already rules
  46. scale_multipliers[i_col] = 0.4 / \
  47. (1.0e-12 + np.std(X_trimmed[:, i_col]))
  48. self.scale_multipliers = scale_multipliers
  49. def scale(self, X):
  50. if self.winsorizer != None:
  51. return self.winsorizer.trim(X) * self.scale_multipliers
  52. else:
  53. return X * self.scale_multipliers
  54. class CorrelationScreenTransformer(BaseEstimator, TransformerMixin):
  55. '''Finds correlated features above a magnitude threshold
  56. and zeros out all but the first of them
  57. '''
  58. def __init__(self, threshold=1.0):
  59. # Initialize with a correlation threshold
  60. self.threshold = threshold
  61. self.correlated_feature_sets = []
  62. def fit(self, X, y=None):
  63. # Check if X is a pandas DataFrame; if not, convert it to DataFrame
  64. if not isinstance(X, pd.DataFrame):
  65. X = pd.DataFrame(X)
  66. # Calculate the correlation matrix
  67. corr_matrix = X.corr().abs()
  68. # Identify the features that are correlated based on the threshold
  69. for i in range(len(corr_matrix.columns)):
  70. for j in range(i):
  71. if corr_matrix.iloc[i, j] >= self.threshold or corr_matrix.iloc[i, j] <= -self.threshold:
  72. # Find the set this feature belongs to
  73. found_set = False
  74. for feature_set in self.correlated_feature_sets:
  75. if i in feature_set or j in feature_set:
  76. feature_set.update([i, j])
  77. found_set = True
  78. break
  79. if not found_set:
  80. self.correlated_feature_sets.append(set([i, j]))
  81. # Convert the sets to list of lists where each sublist has indexes to keep and to remove
  82. self.to_keep_remove = []
  83. for feature_set in self.correlated_feature_sets:
  84. feature_list = list(feature_set)
  85. # keep the first, remove the rest
  86. self.to_keep_remove.append((feature_list[0], feature_list[1:]))
  87. return self
  88. def transform(self, X):
  89. # Again, check if X is a pandas DataFrame; if not, convert it
  90. input_type = type(X)
  91. if not isinstance(X, pd.DataFrame):
  92. X = pd.DataFrame(X)
  93. # Set the identified correlated features (except the first) to 0
  94. X_transformed = X.copy()
  95. for keep, to_remove in self.to_keep_remove:
  96. X_transformed.iloc[:, to_remove] = 0
  97. if input_type == np.ndarray:
  98. X_transformed == X_transformed.values
  99. return X_transformed
  100. if __name__ == '__main__':
  101. X = np.random.randn(5, 5)
  102. X[:, 0] = [1, 1, 0, 1, 1]
  103. X[:, 1] = X[:, 0]
  104. transformer = CorrelationScreenTransformer()
  105. print(X)
  106. X_transformed = transformer.fit_transform(X)
  107. print(X_transformed)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...