Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

simple.py 2.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
  1. import numpy as np
  2. import pandas as pd
  3. from sklearn.preprocessing import KBinsDiscretizer
  4. class SimpleDiscretizer:
  5. def __init__(self, n_bins: int = 8, strategy: str = 'uniform'):
  6. self.n_bins = n_bins
  7. self.strategy = strategy
  8. def fit(self, X: np.array, feature_labels: np.array):
  9. self.is_categorical = np.array([set(np.unique(X[:, i])).issubset({0, 1}) for i in np.arange(X.shape[1])])
  10. if False not in self.is_categorical:
  11. self.feature_labels = feature_labels
  12. self.discretizer = None
  13. return
  14. if isinstance(feature_labels, list):
  15. feature_labels = np.array(feature_labels)
  16. # X_categorical = X[:, self.is_categorical]
  17. X_categorical_columns = feature_labels[self.is_categorical]
  18. # X_numeric = X[:, ~self.is_categorical]
  19. X_numeric_columns = feature_labels[~self.is_categorical]
  20. self.discretizer = KBinsDiscretizer(n_bins=self.n_bins, encode='onehot', strategy=self.strategy)
  21. # X_numeric_discretized = self.discretizer.fit(X_numeric)
  22. discretized_featnames = []
  23. for feat_name, bin_edges in zip(X_numeric_columns, self.discretizer.bin_edges_):
  24. be_str = bin_edges.astype(str)
  25. discretized_featnames += (
  26. [f'{feat_name}_' + '_to_'.join([be_str[i], be_str[i + 1]]) for i in range(bin_edges.shape[0] - 1)]
  27. )
  28. self.featnames_after_disc = np.append(discretized_featnames, X_categorical_columns)
  29. def transform(self, X: np.array):
  30. if self.discretizer is None:
  31. return pd.DataFrame(X, columns=self.feature_labels)
  32. X_categorical = X[:, self.is_categorical]
  33. X_numeric = X[:, ~self.is_categorical]
  34. X_numeric_discretized = self.discretizer.transform(X_numeric).toarray()
  35. X_concat = np.concatenate((X_numeric_discretized, X_categorical), axis=1)
  36. X_df_onehot = pd.DataFrame(X_concat, columns=self.featnames_after_disc)
  37. return X_df_onehot
  38. def fit_transform(self, X: np.array, feature_labels: np.array):
  39. self.fit(X, feature_labels)
  40. return self.transform(X)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...