Dean
/
Deans-bike-lending-prediction


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
            # -*- coding: utf-8 -*-
"""
@author: Jessica Cabral
"""
############################################################################
#       Data Preparation Script
############################################################################

import sys
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Exactly two positional arguments (input data, output location) must follow
# the script name; otherwise print the usage text to stderr and stop with
# exit status 1.
if len(sys.argv) != 3:
    sys.stderr.writelines([
        'Arguments error. Usage:\n',
        '\tpython prepare.py data output\n',
    ])
    raise SystemExit(1)


##############################
#    Default paths
##############################
# First argument: raw input handed to the CSV reader (e.g. '../data/raw').
# Second argument: folder that receives the processed file
# (e.g. '../data/processed').
PATH_RAW_DATA, PATH_PROCESSED_DATA = sys.argv[1], sys.argv[2]

# Echo both locations, one per line.
print(PATH_RAW_DATA, PATH_PROCESSED_DATA, sep='\n')

##############################
#     Data read
##############################
print('Reading Data...')
# Load the raw CSV into a DataFrame and report its (rows, columns) shape.
bikes = pd.read_csv(filepath_or_buffer=PATH_RAW_DATA)
raw_shape = bikes.shape
print('Data Shape: {}\n'.format(raw_shape))

##############################
#     Selecting features that we are going to use
##############################
print('Selection Features...')
features_to_be_removed = ['casual', 'registered', 'atemp', 'instant', 'yr']
# Index.difference returns the remaining labels sorted (when sortable), so
# the surviving columns come out reordered rather than in file order.
kept_columns = bikes.columns.difference(features_to_be_removed)
bikes = bikes[kept_columns]
reduced_shape = bikes.shape
print('Data Shape after feature selection: {}\n'.format(reduced_shape))

##############################
#     transform the "dteday" feature to date type
##############################
# Replace the "dteday" column with its parsed datetime representation.
date_column = "dteday"
bikes[date_column] = pd.to_datetime(bikes[date_column])

##############################
#     One-Hot-Encoding
##############################

def dummify_dataset(df, column):
    """Return a new frame where *column* is replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column]``,
    named ``<column>_<value>``. The indicators are appended after the
    remaining columns, and the input frame is left unmodified.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(columns=[column])
    return pd.concat([remaining, indicators], axis=1)

# Categorical columns expanded into indicator columns, one at a time.
# 'mnth' and 'weekday' are deliberately left out of the list.
columns_to_dummify = [
    'season',
    'holiday',
    'workingday',
    'weathersit',
]
for column in columns_to_dummify:
    bikes = dummify_dataset(bikes, column)

# Show the first row so the generated indicator columns can be inspected.
print(bikes.head(n=1))

##############################
#     Normalize features - scale
##############################
print('Feature Normalizarion...')
numerical_features = ["temp", "hum", "windspeed"]

# Preview the first five rows of the numeric columns before scaling.
print('Features before the normalization')
print(bikes[numerical_features].head(5))

# Standardize the numeric columns with sklearn's preprocessing.scale and
# write the result back into the same columns.
scaled_values = preprocessing.scale(bikes.loc[:, numerical_features])
bikes.loc[:, numerical_features] = scaled_values

# Preview the same rows again after scaling.
print('Features after the normalization')
print(bikes[numerical_features].head(5))

##############################
#     Feature engineering ideas (not implemented yet)
##############################
# Lets create a feature that indicates it is a workday
# bikes['isWorking'] = np.where((bikes['workingday'] == 1) & (bikes['holiday'] == 0), 1, 0)

# Add a feature with month quantities, it will help the model
#bikes <- month.count(bikes)

# Create an ordered factor for the day of the week, starting on Monday.
# This factor is converted to an ordered numeric so that it is compatible with the Azure ML data types.
#bikes$dayWeek <- as.factor(weekdays(bikes$dteday))


##############################
#     Save processed Dataset
##############################
print('\nSaving...')
# exist_ok=True creates the folder when needed and avoids the race between
# a separate existence check and the creation call.
os.makedirs(PATH_PROCESSED_DATA, exist_ok=True)

# Let pandas write the file itself instead of building the whole CSV as one
# in-memory string and pushing it through a text-mode handle: pandas then
# controls the line terminator, which avoids doubled carriage returns on
# platforms that translate newlines. The encoding is plain UTF-8 to keep the
# file identical to what was produced before; the 'utf-8-sig' that used to be
# passed to to_csv had no effect because no path was given to it.
output_file = os.path.join(PATH_PROCESSED_DATA, 'bikes_processed.tsv')
bikes.to_csv(output_file, sep='\t', index=False, encoding='utf-8')

print('\nBike Processed saved with sucess!')