train.py

import os
import pickle
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV

# Load the monthly sales data; month 33 (the last date_block_num) is held
# out as the validation set, everything before it is used for training.
training_df = pd.read_csv('../readonly/final_project_data/training_data.csv')
train_df = training_df.loc[training_df.date_block_num != 33, :]
val_df = training_df.loc[training_df.date_block_num == 33, :]

# The first five columns are the features, the sixth column is the target.
X_tr, y_tr = train_df.iloc[:, 0:5], train_df.iloc[:, 5]
X_val, y_val = val_df.iloc[:, 0:5], val_df.iloc[:, 5]

# Learning-rate schedule: exponential decay from 0.05, floored at 1e-3.
def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

# Keyword arguments forwarded to LGBMRegressor.fit() by the search below.
# Note: LightGBM expects its own metric name here ('mae'), not the
# scikit-learn scorer string 'neg_mean_absolute_error'.
fit_params = {'early_stopping_rounds': 30,
              'eval_metric': 'mae',
              'eval_set': [(X_val, y_val)],
              'eval_names': ['valid'],
              # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_005_decay_power_099)],
              'verbose': 100,
              'categorical_feature': 'auto'}
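
# Hedged sketch (not in the original script): how the commented-out decay
# callback above could be enabled. lgb.reset_parameter re-evaluates
# learning_rate at every boosting round; the names below are illustrative.
decay_cb = lgb.reset_parameter(learning_rate=learning_rate_005_decay_power_099)
fit_params_with_decay = dict(fit_params, callbacks=[decay_cb])
# gs.fit(X_tr, y_tr, **fit_params_with_decay)   # drop-in for the fit below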

# Search space for the randomized hyper-parameter search.
param_test = {'num_leaves': sp_randint(6, 50),
              'min_child_samples': sp_randint(100, 500),
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# Number of hyper-parameter points sampled by the randomized search.
n_HP_points_to_test = 10

# n_estimators is set to a "large value": the actual number of trees built
# depends on early stopping; 5000 only defines the absolute maximum.
clf = lgb.LGBMRegressor(max_depth=-1, random_state=314, silent=True,
                        n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='neg_mean_absolute_error',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)
gs.fit(X_tr, y_tr, **fit_params)
print('Best score reached: {} with params: {}'.format(gs.best_score_, gs.best_params_))
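
# Optional check (not in the original script): the full search trace lives
# in the standard scikit-learn attribute cv_results_; rank all sampled
# configurations by their mean cross-validated score.
cv_results = pd.DataFrame(gs.cv_results_)
print(cv_results[['mean_test_score', 'std_test_score', 'params']]
      .sort_values('mean_test_score', ascending=False))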

# The search refits on the best configuration itself (refit=True);
# persist the winning estimator with pickle.
fin_model = gs.best_estimator_
file_name = '../readonly/models/lightgbm_model_1.sav'
pickle.dump(fin_model, open(file_name, 'wb'))
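
# Hedged sketch (not in the original script): the pickled model can be
# restored in a fresh session the same way it was saved.
with open(file_name, 'rb') as f:
    loaded_model = pickle.load(f)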

# test_df is never loaded in the original script; the path below is an
# assumed placeholder and should point at the actual test file.
test_df = pd.read_csv('../readonly/final_project_data/test_data.csv')
preds = fin_model.predict(test_df.loc[:, ['date_block_num', 'shop_id', 'item_id', 'cat_id', 'item_price']])
res_df = pd.DataFrame({'ID': test_df.iloc[:, 0].values, 'item_cnt_month': preds})
res_df.to_csv('../readonly/final_project_data/result.csv', index=False)
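
# Hedged sanity check (not in the original script): score the refit model
# on the held-out month before submitting, using scikit-learn's MAE.
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(y_val, fin_model.predict(X_val))
print('Validation MAE: {:.4f}'.format(val_mae))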