train.py

import os
import sys
import yaml
import json
import pickle

import lightgbm as lgb
import dagshub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

params = yaml.safe_load(open('params.yaml'))['train']
np.set_printoptions(suppress=True)

if len(sys.argv) != 5:
    sys.stderr.write('Argument error. Usage:\n')
    sys.stderr.write('\tpython train.py data-dir-path model-dir-path scores-file plots-file\n')
    sys.exit(1)

train_input = os.path.join(sys.argv[1], 'ks_train.csv')
test_input = os.path.join(sys.argv[1], 'ks_test.csv')
model_output = os.path.join(sys.argv[2], 'ks_model.pkl')
test_output = os.path.join(sys.argv[2], 'ks_test.csv')
score_path = sys.argv[3]
plots_file = sys.argv[4]

split = params['split']

# Split the training data into train and validation sets
df = pd.read_csv(train_input)
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1], df.iloc[:, -1],
    test_size=split,
    random_state=43
)

# Model training
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

# Log the model class and hyperparameters to DagsHub
with dagshub.dagshub_logger() as logger:
    logger.log_hyperparams(model_class=type(clf).__name__)
    logger.log_hyperparams({'model': clf.get_params()})

# Model evaluation on both splits
y_tr_pred = clf.predict(X_train)
y_ts_pred = clf.predict(X_test)

# Write accuracy, precision, recall and F1 for both splits to the scores file
with open(score_path, 'w') as pf:
    json.dump({
        'train': {'accuracy': accuracy_score(y_train, y_tr_pred),
                  'precision': precision_score(y_train, y_tr_pred),
                  'recall': recall_score(y_train, y_tr_pred),
                  'f1-score': f1_score(y_train, y_tr_pred)},
        'test': {'accuracy': accuracy_score(y_test, y_ts_pred),
                 'precision': precision_score(y_test, y_ts_pred),
                 'recall': recall_score(y_test, y_ts_pred),
                 'f1-score': f1_score(y_test, y_ts_pred)}
    }, pf)

# Precision-recall curve points (computed from the hard predictions) for the plots file
precision, recall, thresholds = precision_recall_curve(y_test, y_ts_pred)
with open(plots_file, 'w') as fd:
    json.dump({'prc': [{
        'precision': float(p),
        'recall': float(r),
        'threshold': float(t)
    } for p, r, t in zip(precision, recall, thresholds)
    ]}, fd)

# Make predictions for the held-out test set and save them alongside the features
test_df = pd.read_csv(test_input)
preds = clf.predict(test_df)
test_df['prediction'] = preds
test_df.to_csv(test_output, index=False)


def save_model(path, model):
    msg = 'The output model: {}\n'
    sys.stderr.write(msg.format(model))
    with open(path, 'wb') as fd:
        pickle.dump(model, fd, pickle.HIGHEST_PROTOCOL)


save_model(model_output, clf)
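A minimal sketch of how the pickled model written by `save_model()` could be loaded back for scoring. The file paths here are assumptions for illustration; the actual locations depend on the `model-dir-path` and `data-dir-path` arguments passed to the script.

```python
import pickle
import pandas as pd

# Load the classifier dumped by save_model() above.
# 'models/ks_model.pkl' is a hypothetical path: <model-dir-path>/ks_model.pkl.
with open('models/ks_model.pkl', 'rb') as fd:
    clf = pickle.load(fd)

# Score rows that have the same feature columns used for training.
# 'data/ks_test.csv' is likewise a hypothetical path: <data-dir-path>/ks_test.csv.
new_rows = pd.read_csv('data/ks_test.csv')
print(clf.predict(new_rows))
```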