```python
import os
import sys
import json
import pickle

import yaml
import dagshub
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_curve,
)

params = yaml.safe_load(open('params.yaml'))['train']
np.set_printoptions(suppress=True)

if len(sys.argv) != 5:
    sys.stderr.write('Argument error. Usage:\n')
    sys.stderr.write('\tpython train.py data-dir-path model-dir-path scores-file-path plots-file-path\n')
    sys.exit(1)

train_input = os.path.join(sys.argv[1], 'ks_train.csv')
test_input = os.path.join(sys.argv[1], 'ks_test.csv')
model_output = os.path.join(sys.argv[2], 'ks_model.pkl')
test_output = os.path.join(sys.argv[2], 'ks_test.csv')
score_path = sys.argv[3]
plots_file = sys.argv[4]

split = params['split']

# Split the labeled data into train and held-out evaluation sets
df = pd.read_csv(train_input)
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1], df.iloc[:, -1],
    test_size=split,
    random_state=43
)

# Model training
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

# Log the model class and its hyperparameters to DAGsHub
with dagshub.dagshub_logger() as logger:
    logger.log_hyperparams(model_class=type(clf).__name__)
    logger.log_hyperparams({'model': clf.get_params()})

# Model evaluation
y_tr_pred = clf.predict(X_train)
y_ts_pred = clf.predict(X_test)

# Write train and test metrics to the scores file
with open(score_path, 'w') as pf:
    json.dump({
        'train': {
            'accuracy': accuracy_score(y_train, y_tr_pred),
            'precision': precision_score(y_train, y_tr_pred),
            'recall': recall_score(y_train, y_tr_pred),
            'f1-score': f1_score(y_train, y_tr_pred),
        },
        'test': {
            'accuracy': accuracy_score(y_test, y_ts_pred),
            'precision': precision_score(y_test, y_ts_pred),
            'recall': recall_score(y_test, y_ts_pred),
            'f1-score': f1_score(y_test, y_ts_pred),
        },
    }, pf)

# Write precision-recall curve points to the plots file
precision, recall, thresholds = precision_recall_curve(y_test, y_ts_pred)
with open(plots_file, 'w') as fd:
    json.dump({'prc': [
        {'precision': float(p), 'recall': float(r), 'threshold': float(t)}
        for p, r, t in zip(precision, recall, thresholds)
    ]}, fd)

# Make predictions for the unlabeled test set and save them alongside it
test_df = pd.read_csv(test_input)
preds = clf.predict(test_df)
test_df['prediction'] = preds
test_df.to_csv(test_output, index=False)


def save_model(path, model):
    # Serialize the trained model to disk
    msg = 'The output model : {}\n'
    sys.stderr.write(msg.format(model))
    with open(path, 'wb') as fd:
        pickle.dump(model, fd, pickle.HIGHEST_PROTOCOL)


save_model(model_output, clf)
```
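One caveat worth noting: `precision_recall_curve` above is fed the hard 0/1 labels from `clf.predict`, so the resulting curve only contains a couple of points. If a fuller curve is preferred, the predicted probabilities could be used instead. A minimal sketch, assuming a binary target whose positive-class probability sits in the second column of `predict_proba`:

```python
# Sketch: build the PR curve from probability scores instead of hard labels.
# Assumes binary classification; column 1 of predict_proba is the
# positive-class probability.
y_ts_scores = clf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_ts_scores)
```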