1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
import numpy as np
import pandas as pd
import os
import pickle
import warnings

# Load the full training table and hold out the final month (date_block_num 33)
# as a validation set; everything earlier is used for training.
training_df = pd.read_csv('../readonly/final_project_data/training_data.csv')
train_df = training_df.loc[training_df.date_block_num != 33, :]
val_df = training_df.loc[training_df.date_block_num == 33, :]

# Columns 0-4 are the features, column 5 is the target.
# NOTE(review): positional slicing assumes a fixed column order in the CSV — confirm.
X_tr, y_tr = train_df.iloc[:, 0:5], train_df.iloc[:, 5]
X_val, y_val = val_df.iloc[:, 0:5], val_df.iloc[:, 5]
def learning_rate_005_decay_power_099(current_iter):
    """Exponentially decayed learning rate for boosting round *current_iter*.

    Starts at 0.05, multiplies by 0.99 each iteration, and never drops
    below the floor of 1e-3.
    """
    decayed = 0.05 * (0.99 ** current_iter)
    return max(decayed, 1e-3)
import lightgbm as lgb

# Keyword arguments forwarded to LGBMRegressor.fit() by the hyper-parameter search.
# FIX: LightGBM's built-in mean-absolute-error metric is 'l1' (alias 'mae');
# 'neg_mean_absolute_error' is a scikit-learn *scorer* name and LightGBM
# rejects it as an unknown metric, breaking early stopping.
fit_params = {"early_stopping_rounds": 30,        # stop if validation MAE stalls for 30 rounds
              "eval_metric": 'l1',                # mean absolute error on the eval set
              "eval_set": [(X_val, y_val)],       # held-out month 33
              'eval_names': ['valid'],
              # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_005_decay_power_099)],
              'verbose': 100,                     # log evaluation every 100 rounds
              'categorical_feature': 'auto'}
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space sampled by the randomized hyper-parameter search.
# sp_uniform(loc, scale) draws uniformly from [loc, loc + scale].
param_test = {
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),         # [0.2, 1.0]
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),  # [0.4, 1.0]
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}
# Number of hyper-parameter combinations the randomized search will sample.
n_HP_points_to_test = 10

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# n_estimators is a deliberately large ceiling: early stopping (supplied via
# fit_params at fit time) decides the actual number of boosting rounds.
clf = lgb.LGBMRegressor(
    max_depth=-1,
    random_state=314,
    silent=True,
    n_jobs=4,
    n_estimators=5000,
)

gs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='neg_mean_absolute_error',  # sklearn scorer used to rank candidates
    cv=3,
    refit=True,                         # retrain the best candidate on the full training fold
    random_state=314,
    verbose=True,
)
# Run the randomized search; fit_params supplies the eval set, metric and
# early stopping forwarded to each LGBMRegressor.fit() call.
gs.fit(X_tr, y_tr, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

# Persist the refit best estimator.
fin_model = gs.best_estimator_
files_name = '../readonly/models/lightgbm_model_1.sav'
# NOTE(review): path is under a 'readonly' tree — confirm this directory is writable.
# FIX: use a context manager; the previous open(...) handle was never closed.
with open(files_name, 'wb') as model_file:
    pickle.dump(fin_model, model_file)

# NOTE(review): test_df is not defined anywhere in this file — it must be
# loaded (presumably from the final-project test CSV) before this point,
# otherwise this line raises NameError. TODO confirm where test_df comes from.
preds = fin_model.predict(test_df.loc[:, ["date_block_num", "shop_id", "item_id", "cat_id", "item_price"]])
dt = {'ID': test_df.iloc[:, 0].values, 'item_cnt_month': preds}
res_df = pd.DataFrame(data=dt)
res_df.to_csv('../readonly/final_project_data/result.csv', index=False)
|