Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

rf_neighbors.py 3.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
  1. import sys
  2. import numpy as np
  3. from sklearn.ensemble import RandomForestClassifier
  4. from sklearn.feature_selection import SelectFromModel
  5. from sklearn.linear_model import Lasso
  6. from sklearn.model_selection import KFold
  7. sys.path.append('lib')
  8. import collections
  9. cell_nums_feature_selection = np.array([1])
  10. cell_nums_train = np.array([1, 2, 3, 4, 5])
  11. cell_nums_test = np.array([6])
  12. def get_rf_neighbors(df, feat_names, outcome_def='y_thresh',
  13. balancing='ros', balancing_ratio=1, out_name='results/classify/test.pkl',
  14. feature_selection=None, feature_selection_num=3, seed=42):
  15. # pre-processing same as train.train
  16. np.random.seed(seed)
  17. X = df[feat_names]
  18. y = df[outcome_def].values
  19. m = RandomForestClassifier(n_estimators=100)
  20. kf = KFold(n_splits=len(cell_nums_train))
  21. # feature selection on cell num 1
  22. feature_selector = None
  23. if feature_selection is not None:
  24. if feature_selection == 'select_lasso':
  25. feature_selector_model = Lasso()
  26. elif feature_selection == 'select_rf':
  27. feature_selector_model = RandomForestClassifier()
  28. # select only feature_selection_num features
  29. feature_selector = SelectFromModel(feature_selector_model, threshold=-np.inf,
  30. max_features=feature_selection_num)
  31. idxs = df.cell_num.isin(cell_nums_feature_selection)
  32. feature_selector.fit(X[idxs], y[idxs])
  33. X = feature_selector.transform(X)
  34. support = np.array(feature_selector.get_support())
  35. else:
  36. support = np.ones(len(feat_names)).astype(np.bool)
  37. # split testing data based on cell num
  38. idxs_test = df.cell_num.isin(cell_nums_test)
  39. X_test, Y_test = X[idxs_test], y[idxs_test]
  40. idxs_train = df.cell_num.isin(cell_nums_train)
  41. X_train, Y_train = X[idxs_train], y[idxs_train]
  42. # num_pts_by_fold_cv = []
  43. # build dictionary, key is leaf node, value is list of training samples in the node
  44. m.fit(X_train, Y_train)
  45. node_indices = m.apply(X_train)
  46. node_indices_test = m.apply(X_test)
  47. similarity_matrix = np.zeros((len(X_test), len(X_train)))
  48. for tree in range(100):
  49. node_samples = collections.defaultdict(list)
  50. for i in range(len(X_train)):
  51. node_samples[node_indices[i, tree]].append(i)
  52. for i in range(len(X_test)):
  53. node = node_indices_test[i, tree]
  54. for j in node_samples[node]:
  55. similarity_matrix[i, j] += 1
  56. preds_proba = m.predict_proba(X_test)[:, 1]
  57. # nearest neighbors and similarity
  58. nearest_neighbors = [np.argsort(similarity_matrix[i, :])[::-1][:10] for i in range(len(X_test))]
  59. similarity = [np.sort(similarity_matrix[i, :])[::-1][:10] for i in range(len(X_test))]
  60. idxs_test = np.where(idxs_test == True)
  61. idxs_train = np.where(idxs_train == True)
  62. df_train = df.iloc[idxs_train]
  63. df_test = df.iloc[idxs_test]
  64. df_test['preds_proba'] = preds_proba
  65. df_test['nearest_neighbors'] = nearest_neighbors
  66. df_test['similarity'] = similarity
  67. return df_train, df_test
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...