1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
import numpy as np
import pandas as pd
import os
import pickle
import warnings

# Load the full training table and hold out the final month (date_block_num 33)
# as a validation set; everything earlier is used for training.
training_df = pd.read_csv('../readonly/final_project_data/training_data.csv')
train_df = training_df.loc[training_df.date_block_num != 33, :]
val_df = training_df.loc[training_df.date_block_num == 33, :]

# Columns 0-4 are the features, column 5 is the target.
# NOTE(review): positional slicing assumes a fixed column order in the CSV — confirm.
X_tr, y_tr = train_df.iloc[:, 0:5], train_df.iloc[:, 5]
X_val, y_val = val_df.iloc[:, 0:5], val_df.iloc[:, 5]
def learning_rate_005_decay_power_099(current_iter):
    """Exponentially decayed learning rate for boosting round *current_iter*.

    Starts at 0.05, multiplies by 0.99 each iteration, and never drops
    below the floor of 1e-3.
    """
    decayed = 0.05 * (0.99 ** current_iter)
    return max(decayed, 1e-3)
import lightgbm as lgb

# Keyword arguments forwarded to LGBMRegressor.fit() by the hyper-parameter search.
# FIX: LightGBM's built-in mean-absolute-error metric is 'l1' (alias 'mae');
# 'neg_mean_absolute_error' is a scikit-learn *scorer* name and LightGBM
# rejects it as an unknown metric, breaking early stopping.
fit_params = {"early_stopping_rounds": 30,        # stop if validation MAE stalls for 30 rounds
              "eval_metric": 'l1',                # mean absolute error on the eval set
              "eval_set": [(X_val, y_val)],       # held-out month 33
              'eval_names': ['valid'],
              # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_005_decay_power_099)],
              'verbose': 100,                     # log evaluation every 100 rounds
              'categorical_feature': 'auto'}
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space sampled by the randomized hyper-parameter search.
# sp_uniform(loc, scale) draws uniformly from [loc, loc + scale].
param_test = {
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),         # [0.2, 1.0]
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),  # [0.4, 1.0]
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}
# Number of hyper-parameter combinations the randomized search will sample.
n_HP_points_to_test = 10

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# n_estimators is a deliberately large ceiling: early stopping (supplied via
# fit_params at fit time) decides the actual number of boosting rounds.
clf = lgb.LGBMRegressor(
    max_depth=-1,
    random_state=314,
    silent=True,
    n_jobs=4,
    n_estimators=5000,
)

gs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='neg_mean_absolute_error',  # sklearn scorer used to rank candidates
    cv=3,
    refit=True,                         # retrain the best candidate on the full training fold
    random_state=314,
    verbose=True,
)
# Run the randomized search; fit_params supplies the eval set, metric and
# early stopping forwarded to each LGBMRegressor.fit() call.
gs.fit(X_tr, y_tr, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

# Persist the refit best estimator.
fin_model = gs.best_estimator_
files_name = '../readonly/models/lightgbm_model_1.sav'
# NOTE(review): path is under a 'readonly' tree — confirm this directory is writable.
# FIX: use a context manager; the previous open(...) handle was never closed.
with open(files_name, 'wb') as model_file:
    pickle.dump(fin_model, model_file)

# NOTE(review): test_df is not defined anywhere in this file — it must be
# loaded (presumably from the final-project test CSV) before this point,
# otherwise this line raises NameError. TODO confirm where test_df comes from.
preds = fin_model.predict(test_df.loc[:, ["date_block_num", "shop_id", "item_id", "cat_id", "item_price"]])
dt = {'ID': test_df.iloc[:, 0].values, 'item_cnt_month': preds}
res_df = pd.DataFrame(data=dt)
res_df.to_csv('../readonly/final_project_data/result.csv', index=False)
|