Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

main.py 3.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
  1. import dagshub
  2. import argparse
  3. import pandas as pd
  4. from sklearn import preprocessing
  5. from sklearn.linear_model import SGDClassifier
  6. from sklearn.tree import DecisionTreeClassifier
  7. from sklearn.ensemble import RandomForestClassifier
  8. from sklearn.metrics import (
  9. roc_auc_score,
  10. average_precision_score,
  11. accuracy_score,
  12. precision_score,
  13. recall_score,
  14. f1_score,
  15. )
  16. from sklearn.model_selection import train_test_split
  17. import joblib
  18. import numpy as np
  19. # Constant
  20. drop_cols = ["Name", "SibSp", "Parch", "Ticket"]
  21. obj_col = "Survived"
  22. train_df_path = "Data/train.csv"
  23. test_df_path = "Data/test.csv"
  24. sub_df_path = "Data/sample_submission.csv"
  25. ## feature engineering
  26. def feature_engineering(raw_df):
  27. df = raw_df.copy()
  28. df["Cabin"] = df["Cabin"].apply(lambda x: x[:1] if x is not np.nan else np.nan)
  29. df["Ticket"] = df["Ticket"].apply(lambda x: str(x).split()[0])
  30. df["Family"] = df["SibSp"] + df["Parch"]
  31. return df
  32. ## train model
  33. def fit_model(train_X, train_y, random_state=42):
  34. clf = RandomForestClassifier(n_estimators=50, random_state=random_state)
  35. clf.fit(train_X, train_y)
  36. return clf
  37. # cat
  38. def to_category(train_df, test_df):
  39. cat = ["Sex", "Cabin", "Embarked"]
  40. for col in cat:
  41. le = preprocessing.LabelEncoder()
  42. train_df[col] = le.fit_transform(train_df[col])
  43. test_df[col] = le.transform(test_df[col])
  44. return train_df, test_df
  45. # evaluation
  46. def eval_model(clf, X, y):
  47. y_proba = clf.predict_proba(X)[:, 1]
  48. y_pred = clf.predict(X)
  49. return {
  50. "roc_auc": roc_auc_score(y, y_proba),
  51. "average_precision": average_precision_score(y, y_proba),
  52. "accuracy": accuracy_score(y, y_pred),
  53. "precision": precision_score(y, y_pred),
  54. "recall": recall_score(y, y_pred),
  55. "f1": f1_score(y, y_pred),
  56. }
  57. ## submission
  58. def submission(clf, X):
  59. sub = pd.read_csv(sub_df_path)
  60. sub[obj_col] = clf.predict(X)
  61. sub.to_csv("Submission/submission.csv", index=False)
  62. # train
  63. def train():
  64. print("Loading data...")
  65. df_train = pd.read_csv(train_df_path, index_col="PassengerId")
  66. df_test = pd.read_csv(test_df_path, index_col="PassengerId")
  67. print("Engineering features...")
  68. y = df_train[obj_col]
  69. X = feature_engineering(df_train).drop(drop_cols + [obj_col], axis=1)
  70. test_df = feature_engineering(df_test).drop(drop_cols, axis=1)
  71. X, test_df = to_category(X, test_df)
  72. X.fillna(0, inplace=True)
  73. test_df.fillna(0, inplace=True)
  74. with dagshub.dagshub_logger() as logger:
  75. print("Training model...")
  76. X_train, X_test, y_train, y_test = train_test_split(
  77. X, y, test_size=0.33, random_state=42, stratify=y
  78. )
  79. model = fit_model(X_train, y_train)
  80. print("Saving trained model...")
  81. joblib.dump(model, "Model/model.joblib")
  82. logger.log_hyperparams(model_class=type(model).__name__)
  83. logger.log_hyperparams({"model": model.get_params()})
  84. print("Evaluating model...")
  85. train_metrics = eval_model(model, X_train, y_train)
  86. print("Train metrics:")
  87. print(train_metrics)
  88. logger.log_metrics({f"train__{k}": v for k, v in train_metrics.items()})
  89. test_metrics = eval_model(model, X_test, y_test)
  90. print("Test metrics:")
  91. print(test_metrics)
  92. logger.log_metrics({f"test__{k}": v for k, v in test_metrics.items()})
  93. print("Creating Submission File...")
  94. submission(model, test_df)
  95. if __name__ == "__main__":
  96. train()
Tip!

Press p or ← to see the previous file, or n or → to see the next file.

Comments

Loading...