Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

2-data-preparation.py 3.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: Jessica Cabral
  4. """
  5. ############################################################################
  6. # Data Preparation Script
  7. ############################################################################
  8. import sys
  9. import os
  10. import pandas as pd
  11. import numpy as np
  12. from sklearn import preprocessing
  13. if len(sys.argv) != 3:
  14. sys.stderr.write('Arguments error. Usage:\n')
  15. sys.stderr.write('\tpython prepare.py data output\n')
  16. sys.exit(1)
  17. ##############################
  18. # Default paths
  19. ##############################
  20. PATH_RAW_DATA = sys.argv[1] #'../data/raw'
  21. PATH_PROCESSED_DATA = sys.argv[2] #'../data/processed'
  22. print(PATH_RAW_DATA)
  23. print(PATH_PROCESSED_DATA)
  24. ##############################
  25. # Data read
  26. ##############################
  27. print('Reading Data...')
  28. bikes = pd.read_csv(PATH_RAW_DATA)
  29. print('Data Shape: {}\n'.format(bikes.shape))
  30. ##############################
  31. # Selecting features that we are going to use
  32. ##############################
  33. print('Selection Features...')
  34. features_to_be_removed = ['casual', 'registered', 'atemp', 'instant', 'yr']
  35. bikes = bikes[bikes.columns.difference(features_to_be_removed)]
  36. print('Data Shape after feature selection: {}\n'.format(bikes.shape))
  37. ##############################
  38. # transform the "dteday" feature to date type
  39. ##############################
  40. bikes["dteday"] = pd.to_datetime(bikes["dteday"])
  41. ##############################
  42. # One-Hot-Encoding
  43. ##############################
  44. def dummify_dataset(df, column):
  45. df = pd.concat([df, pd.get_dummies(df[column], prefix=column)], axis=1)
  46. df = df.drop([column], axis=1)
  47. return df
  48. columns_to_dummify = ['season', 'holiday', 'workingday', 'weathersit'] #'mnth', 'weekday'
  49. for column in columns_to_dummify:
  50. bikes= dummify_dataset(bikes, column)
  51. print(bikes.head(1))
  52. ##############################
  53. # Normalize features - scale
  54. ##############################
  55. print('Feature Normalizarion...')
  56. numerical_features = ["temp", "hum", "windspeed"]
  57. print('Features before the normalization')
  58. print(bikes.loc[:, numerical_features][:5])
  59. # Normalizing...
  60. bikes.loc[:, numerical_features] = preprocessing.scale(bikes.loc[:, numerical_features])
  61. print('Features after the normalization')
  62. print(bikes.loc[:, numerical_features][:5])
##############################
# Feature engineering (ideas, not implemented yet)
##############################
# Let's create a feature that indicates whether it is a working day.
# NOTE: 'workingday' and 'holiday' are one-hot encoded (and dropped) earlier
# in this script, so the line below would need the raw columns to work.
# bikes['isWorking'] = np.where((bikes['workingday'] == 1) & (bikes['holiday'] == 0), 1, 0)
# Add a feature with per-month counts; it will help the model.
#bikes <- month.count(bikes)
# Create an ordered factor for the day of the week, starting on Monday.
# The factor is converted to an ordered numeric value to be compatible with the Azure ML data types.
#bikes$dayWeek <- as.factor(weekdays(bikes$dteday))
  74. # Save processed Dataset
  75. ##############################
  76. print('\nSaving...')
  77. if not os.path.exists(PATH_PROCESSED_DATA):
  78. os.makedirs(PATH_PROCESSED_DATA)
  79. with open(os.path.join(PATH_PROCESSED_DATA, 'bikes_processed.tsv'), 'w', encoding='utf8') as fd_out:
  80. fd_out.write(bikes.to_csv(sep='\t', index=False, encoding='utf-8-sig'))
  81. print('\nBike Processed saved with sucess!')
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...