1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
# Standard library
import argparse

# Third-party
import dagshub
import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Constant
# Columns removed from the feature matrix before training
# (SibSp/Parch are folded into the derived "Family" feature first).
drop_cols = ["Name", "SibSp", "Parch", "Ticket"]
# Name of the target column / submission column.
obj_col = "Survived"
# Input CSV locations (relative to the repo root).
train_df_path = "Data/train.csv"
test_df_path = "Data/test.csv"
sub_df_path = "Data/sample_submission.csv"
## feature engineering
def feature_engineering(raw_df):
    """Derive model features from the raw Titanic dataframe.

    Works on a copy, so ``raw_df`` is left untouched.

    - "Cabin" is reduced to its deck letter (first character).
    - "Ticket" is reduced to its first whitespace-separated token.
    - "Family" = SibSp + Parch (total relatives aboard).

    Returns the transformed copy.
    """
    df = raw_df.copy()
    # Use pd.isna for missing-value detection: the original identity
    # check `x is not np.nan` only matched the exact np.nan singleton
    # and raised TypeError on None (and misses pd.NA).
    df["Cabin"] = df["Cabin"].apply(lambda x: np.nan if pd.isna(x) else x[:1])
    df["Ticket"] = df["Ticket"].apply(lambda x: str(x).split()[0])
    df["Family"] = df["SibSp"] + df["Parch"]
    return df
## train model
def fit_model(train_X, train_y, random_state=42):
    """Fit and return a 50-tree random forest on the given features/labels."""
    model = RandomForestClassifier(n_estimators=50, random_state=random_state)
    # .fit() returns the fitted estimator itself, so this is one expression.
    return model.fit(train_X, train_y)
# cat
def to_category(train_df, test_df):
    """Label-encode the categorical columns ("Sex", "Cabin", "Embarked")
    in place on both frames and return them.

    The encoder is fitted on the union of train and test values: the
    original fitted on the train column only, so any category appearing
    solely in the test set made ``le.transform`` raise ValueError.

    NOTE(review): NaNs in Cabin/Embarked still reach LabelEncoder here
    (fillna happens later in train()) — confirm the sklearn version in
    use tolerates them.
    """
    cat_cols = ["Sex", "Cabin", "Embarked"]
    for col in cat_cols:
        le = preprocessing.LabelEncoder()
        # Fit on every value seen in either split so transform never
        # encounters an unseen label.
        le.fit(pd.concat([train_df[col], test_df[col]], axis=0))
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])
    return train_df, test_df
# evaluation
def eval_model(clf, X, y):
    """Return a dict of ranking and thresholded classification metrics
    for ``clf`` evaluated on (X, y).
    """
    # Probability of the positive class drives the ranking metrics;
    # hard predictions drive the thresholded ones.
    proba = clf.predict_proba(X)[:, 1]
    preds = clf.predict(X)
    ranking = {
        "roc_auc": roc_auc_score(y, proba),
        "average_precision": average_precision_score(y, proba),
    }
    thresholded = {
        "accuracy": accuracy_score(y, preds),
        "precision": precision_score(y, preds),
        "recall": recall_score(y, preds),
        "f1": f1_score(y, preds),
    }
    return {**ranking, **thresholded}
## submission
def submission(clf, X):
    """Fill the sample-submission template with predictions for ``X``
    and write it to Submission/submission.csv.
    """
    template = pd.read_csv(sub_df_path)
    template[obj_col] = clf.predict(X)
    template.to_csv("Submission/submission.csv", index=False)
# train
def train():
    """End-to-end pipeline: load the train/test CSVs, engineer features,
    fit a random forest on a stratified split, log hyperparameters and
    metrics to DagsHub, persist the model, and write the submission file.
    """
    print("Loading data...")
    df_train = pd.read_csv(train_df_path, index_col="PassengerId")
    df_test = pd.read_csv(test_df_path, index_col="PassengerId")
    print("Engineering features...")
    # Capture the target before it is dropped from the feature matrix.
    y = df_train[obj_col]
    X = feature_engineering(df_train).drop(drop_cols + [obj_col], axis=1)
    test_df = feature_engineering(df_test).drop(drop_cols, axis=1)
    X, test_df = to_category(X, test_df)
    # NOTE(review): missing values are imputed with 0 *after* label
    # encoding — confirm this ordering is intentional.
    X.fillna(0, inplace=True)
    test_df.fillna(0, inplace=True)
    # Everything logged inside this context is recorded by DagsHub.
    with dagshub.dagshub_logger() as logger:
        print("Training model...")
        # Stratify so the survival ratio is preserved in both splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42, stratify=y
        )
        model = fit_model(X_train, y_train)
        print("Saving trained model...")
        joblib.dump(model, "Model/model.joblib")
        logger.log_hyperparams(model_class=type(model).__name__)
        logger.log_hyperparams({"model": model.get_params()})
        print("Evaluating model...")
        train_metrics = eval_model(model, X_train, y_train)
        print("Train metrics:")
        print(train_metrics)
        logger.log_metrics({f"train__{k}": v for k, v in train_metrics.items()})
        test_metrics = eval_model(model, X_test, y_test)
        print("Test metrics:")
        print(test_metrics)
        logger.log_metrics({f"test__{k}": v for k, v in test_metrics.items()})
        print("Creating Submission File...")
        submission(model, test_df)
# Script entry point: run the full training pipeline.
if __name__ == "__main__":
    train()
|