1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
# -*- coding: utf-8 -*-
"""
Train a regression model on the processed bikes dataset and persist it.

Usage:
    python train.py <processed_data_dir> <model_dir>

The script reads ``bikes_processed.tsv`` from the processed-data directory,
splits it into train and test sets, fits a random-forest regressor on the
training part, and writes ``train.tsv``, ``test.tsv`` and ``model.pkl`` into
the model directory (created if it does not exist).

@author: Jessica Cabral
"""
############################################################################
# Train Script
############################################################################
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor

try:
    import cPickle as pickle
except ImportError:
    import pickle

##############################
# Get Sys Paths
##############################
# Fail with a usage hint instead of a bare IndexError when an argument is
# missing. Extra trailing arguments are tolerated, as they were before.
if len(sys.argv) < 3:
    sys.stderr.write('Arguments error. Usage:\n')
    sys.stderr.write('\tpython train.py data/processed models\n')
    sys.exit(1)

PATH_PROCESSED_DATA = sys.argv[1]  # e.g. '../data/processed'
PATH_MODEL = sys.argv[2]  # e.g. '../models'

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(PATH_MODEL, exist_ok=True)

# Test data set split ratio
split = 0.33
# Single seed shared by the train/test split and the model.
seed = 201909

# NOTE: creating a per-run versioned output folder ('modelv<N>-<timestamp>')
# was sketched here earlier but is disabled; every artifact is written
# straight into PATH_MODEL.

##############################
# Features and Target
##############################
bike_processed = pd.read_csv(
    os.path.join(PATH_PROCESSED_DATA, 'bikes_processed.tsv'), sep='\t')

# Everything except the target ('cnt') and 'dteday' is used as a feature.
# Index.difference attempts to sort its result, so the feature order may
# differ from the column order in the file.
feature_columns = bike_processed.columns.difference(['cnt', 'dteday'])
X = bike_processed[feature_columns].values
y = bike_processed['cnt'].values

##############################
# Train Test Split
##############################
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split, random_state=seed)

# The splits are plain arrays, so the saved frames carry positional column
# labels (0..k-1) plus a 'target' column rather than the original names.
X_train_final = pd.DataFrame(X_train)
X_train_final['target'] = y_train
X_test_final = pd.DataFrame(X_test)
X_test_final['target'] = y_test

# Let pandas write the files itself instead of building the whole TSV as an
# in-memory string and pushing it through a text-mode handle.
X_train_final.to_csv(
    os.path.join(PATH_MODEL, 'train.tsv'), sep='\t', index=False,
    encoding='utf8')
X_test_final.to_csv(
    os.path.join(PATH_MODEL, 'test.tsv'), sep='\t', index=False,
    encoding='utf8')

##############################
# Train Model
##############################
print('Input matrix size {}'.format(bike_processed.shape))
print('X_train_final matrix size {}'.format(X_train_final.shape))
print('X_test_final matrix size {}'.format(X_test_final.shape))

# Fit on the training split directly; re-extracting it from X_train_final
# only reproduced these same arrays.
regr = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=seed)
regr.fit(X_train, y_train)
print('Modelo treinado com sucesso!')

##############################
# Save Model
##############################
model_path = os.path.join(PATH_MODEL, 'model.pkl')
with open(model_path, 'wb') as fd:
    pickle.dump(regr, fd)

# Report the full file path; the old format call had a single placeholder
# and silently dropped the file name.
print('Modelo salvo em: {}'.format(model_path))
|