Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

Proc_func.py 13 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Apr 14 20:43:06 2021
  5. @author: tommasobassignana
  6. """
  7. #i need to import every module needed for the functions
  8. import numpy as np
  9. import pandas as pd
  10. def extract_data(data, pred_horizon):
  11. """
  12. Extract the input variables (x), the time (t), and the objective (y) from the data samples.
  13. WARNING : need to be modified to include additional data, or override the function within the models
  14. :param data: df
  15. pred_horizon: integer, same used in create_samples_v2
  16. :return:
  17. """
  18. y = data["y_t+" + str(pred_horizon)]#ATT: le y sono una series, non un df!
  19. x = data.drop(["y_t+" + str(pred_horizon),'datetime_t'], axis=1)
  20. return x, y
  21. def y_to_categorical(yseries_train, yseries_test):
  22. """
  23. function that transforms the rescaled y series in categorical values if the problem
  24. is a classification task
  25. yseries_train = yseries_test = pandas series object
  26. """
  27. conditions = [
  28. (yseries_train == 0),#verified_hypo
  29. (yseries_train == 0.25),#couscin_hypo
  30. (yseries_train == 0.5),#normal
  31. (yseries_train == 0.75),#couscin_hyper
  32. (yseries_train == 1)#verified_hyper
  33. ]
  34. values = ['verified_hypo','couscin_hypo','normal','couscin_hyper','verified_hyper']
  35. yseries_train = np.select(conditions, values)
  36. conditions = [
  37. (yseries_test == 0),#verified_hypo
  38. (yseries_test == 0.25),#couscin_hypo
  39. (yseries_test == 0.5),#normal
  40. (yseries_test == 0.75),#couscin_hyper
  41. (yseries_test == 1)#verified_hyper
  42. ]
  43. values = ['verified_hypo','couscin_hypo','normal','couscin_hyper','verified_hyper']
  44. yseries_test = np.select(conditions, values)
  45. return yseries_train, yseries_test
  46. def create_title(xtrainData, xtestData, model):
  47. col_list = str(list(xtrainData.columns))
  48. tr_rows = str(xtrainData.shape[0])
  49. tst_rows = str(xtrainData.shape[0])
  50. title = col_list + ';' + tr_rows + '-' + tst_rows + ';' + str(model)
  51. return title
  52. def create_samples_V2(df,number_lags,colonne_da_laggare,colonna_Y,pred_horizon):
  53. """
  54. function that takes a classic dataframes and create a new one with columns of lagged variables values for
  55. all columns in colonne_da_laggare. The number of lags are determined by number_lags variables. All the
  56. columns that are not lagged will be mantained with the new name column_t. colonna_y determines what is the target variables
  57. to predict. The numer of units of time*** in the future to shift the target variable is determined by the pred_horizon variable.
  58. ATT:tutte le colonne non laggate saranno portate nel nuovo df rinominate con _t
  59. colonne_da_laggare = list of str containing the names of the columns
  60. pred_horizon and number_lags are both expressed in units of time***
  61. colonna_Y = str
  62. ***the units of time are the frequency of the resampling of df. If df is resampled to 5 minutes, a pred_horizon = 6 means that
  63. i'm trying to predict colonna_Y 30 minutes into the future. if the data is resampled to half-hour intervals, with a pred_horizon of 6,
  64. i'm trying to predict colonna_Y 3h into the future.
  65. """
  66. new_df = pd.DataFrame()
  67. col_non_laggate = df.drop(colonne_da_laggare,axis=1).columns
  68. for feature in colonne_da_laggare:
  69. for lag in range(1, number_lags + 1):
  70. new_df[feature + '_t-' + str(lag)] = df[feature].shift(lag)
  71. new_df[feature+'_t'] = df[feature]
  72. new_df['y'+'_t+'+str(pred_horizon)]=df[colonna_Y].shift(-pred_horizon)
  73. for feature in col_non_laggate:
  74. new_df[feature + '_t'] = df[feature]
  75. #new_df.drop('datetime_t')perchè non va?
  76. return(new_df)
  77. def to_cat_meal_type(data):
  78. """
  79. for the resempling operation meal type must be converted to categorical encoding. nans are substituted with 0.
  80. data = pandas dataset
  81. """
  82. data["meal_type"] = data.meal_type.fillna('0')
  83. data["meal_type"] = np.where((data.meal_type == 'Breakfast'),'1',data.meal_type)
  84. data["meal_type"] = np.where((data.meal_type == 'Snack'),'2',data.meal_type)
  85. data["meal_type"] = np.where((data.meal_type == 'Lunch'),'3',data.meal_type)
  86. data["meal_type"] = np.where((data.meal_type == 'Dinner'),'4',data.meal_type)
  87. data["meal_type"] = np.where((data.meal_type == 'HypoCorrection'),'5',data.meal_type)
  88. data["meal_type"] = data["meal_type"].astype(int)
  89. return data
  90. def checkCarb(inpCarb, minCarb):
  91. """
  92. helper function for dummyCarbs
  93. """
  94. if(inpCarb>minCarb):
  95. return 1
  96. else:
  97. return 0
  98. def dummyCarbs(df ):
  99. """ function that add a new column 'dummyCarbs' with 1 in the period that the quantity of carbs is recorded
  100. df = pandas dataframe object containing a CHO column
  101. return the column!
  102. """
  103. #apply fa passare come primo argomento alla funzione checkCarb i dati della colonna CHO che rappresenta il quantitativo di carbs assunti, mentre attraverso ad args si passa il secondo argomento
  104. dummyCarbs = df["CHO"].apply(checkCarb, args=(1,))
  105. #df["dummyCarbs"] = dummyCarbs
  106. return dummyCarbs
  107. def mealZone(df ):
  108. """
  109. create a new column mealZone with 1 if the observations falls 50 min before or 30 min after a meal(assuming that the resampling is every 5 minutes).
  110. this numbers can be generalized for a mor flezible function:
  111. in the np.linspace line, i-n(8 in this case) indicate the periods before a meal; i+q(6 in this case) indicate the number of periods after a meal
  112. it is interesting to try n=0 to explicit the fact that for a window after a meal the glucose is being processed
  113. df = pandas dataframe object
  114. """
  115. mealZone = dummyCarbs(df).values
  116. mealIndex = np.nonzero(mealZone)[0]
  117. extendedMealIndex = []
  118. for i in mealIndex:
  119. to_append = np.linspace(i-8,i+6,6+8+1,dtype = int)
  120. extendedMealIndex.append(to_append)
  121. okExtendedIndex = []
  122. for sublist in extendedMealIndex:
  123. for element in sublist:
  124. okExtendedIndex.append(element)
  125. mealZone[okExtendedIndex] = 1
  126. df["mealZone"] = mealZone
  127. return df
  128. def Y_cat(data_resampled, values):
  129. """creating Y variables - values might be adjusted
  130. data_resampled = pandas df object with a colum named glucose
  131. values = array representing the class number
  132. """
  133. conditions = [
  134. (data_resampled['glucose'] <= 55),#verified_hypo
  135. (data_resampled['glucose'] > 55) & (data_resampled['glucose'] <= 80),#couscin_hypo
  136. (data_resampled['glucose'] > 80) & (data_resampled['glucose'] <= 170),#normal
  137. (data_resampled['glucose'] > 170) & (data_resampled['glucose'] <= 210),#couscin_hyper
  138. (data_resampled['glucose'] > 210)
  139. ]
  140. #values = [0,1,2,3,4]
  141. data_resampled['gluco_class'] = np.select(conditions, values)
  142. return data_resampled
  143. def time_dummy(df, match_timestamp, new_col):
  144. """
  145. This function puts on a new_col 1 in every occurrence of the specified timestamp, 0 otherwise
  146. df = dataframe object, must have a datetime column but not a datetime index
  147. match_timestamp = str, specified timestamp in the right format es 05:00:00
  148. new_col = str, name of the new column to add in the df
  149. """
  150. dfp = df.set_index('datetime')
  151. dfp.index = pd.to_datetime(dfp.index)
  152. dfp[new_col] = np.where(dfp.index.strftime("%H:%M:%S") == match_timestamp, 1, 0)
  153. df[new_col] = dfp[new_col].values
  154. #forse sarebbe meglio farmi ritornare la colonna
  155. return df
  156. def time_bool(df, match_timestamp, new_col):
  157. """
  158. This function puts True a new_col 1 in every occurrence of the specified timestamp, False otherwise
  159. df = dataframe object, must have a datetime column but not a datetime index
  160. match_timestamp = str, specified timestamp in the right format es 05:00:00
  161. new_col = str, name of the new column to add in the df
  162. """
  163. dfp = df.set_index('datetime')
  164. dfp.index = pd.to_datetime(dfp.index)
  165. dfp[new_col] = np.where(dfp.index.strftime("%H:%M:%S") == match_timestamp, True, False)
  166. df[new_col] = dfp[new_col].values
  167. #forse sarebbe meglio farmi ritornare la colonna
  168. return df
  169. #last blood glucose measure for fasting istance
  170. def get_past_BG(df, new_col, match_timestamp, glucose_col = 'glucose'):
  171. #per ogni riga che individuo essere la amisurazione del mattino mi serve un valore di glucosio da associare come ultimo valore registrato, ad esempio quello delle 10 di sera del giorno precedente.
  172. #uso time_dummy per selezionarmi solo le righe delle 22 pm, la chiamo dummy22
  173. #dove c'è 1 nella colonna dummy22 riporto nella colonna new_col il valore nella colonna glucose altrimenti imputo nan
  174. #faccio un ffill dei valori di lastBGMeasureFast in modo che l'ultimo valore registrato sarà propagato per tutte le 24h successive e andrà a ricadere nella fasting istance
  175. #ci sarà sicuramente un modo più svelto e furbo con iloc ecc
  176. #attenzione che se il valore di glucosio è nan viene propagato un nan
  177. """
  178. df = panda dataframe object
  179. new_col = str, name of the new column to add in the df containing the value that i want
  180. match_timestamp = str, specified timestamp in the right format es 05:00:00
  181. glucose_col = str, name of the column in db that contains all the BG values, default = glucose
  182. """
  183. dfp = df
  184. dfp = time_dummy(dfp, '22:00:00', 'dummy22')
  185. dfp = df.set_index('datetime')
  186. dfp.index = pd.to_datetime(dfp.index)
  187. dfp[new_col] = np.where(dfp['dummy22'] == 1, dfp[glucose_col], np.nan)
  188. dfp[new_col].fillna(method='ffill', inplace=True)
  189. return dfp.drop(["dummy22"], axis=1)
  190. def col_to_check (cols_to_check, dict1, dict2, dict3, dict4):
  191. """
  192. cols_to_check = ["datetime",'glucose','CHO','q'] or similar
  193. list of columns to check for their presence in every test and train dataset
  194. return pid (personal ids) number of unusable datasets
  195. """
  196. pid_to_remove1 = []
  197. pid_to_remove2 = []
  198. for num in ['559','563','570','575','588','591']:
  199. data = dict1.get(num)
  200. if set(cols_to_check).issubset(data.columns) == False:
  201. actual_diff = set(cols_to_check) - set(data.columns)
  202. col_to_print_tr1 = actual_diff.intersection(set(cols_to_check))
  203. print('for TRAIN dataset1 ' + str(num) + ' the following columns are not present: ' + str(col_to_print_tr1))
  204. pid_to_remove1.append(num)
  205. data = dict2.get(num)
  206. if set(cols_to_check).issubset(data.columns) == False:
  207. actual_diff = set(cols_to_check) - set(data.columns)
  208. col_to_print_te1 = actual_diff.intersection(set(cols_to_check))
  209. print('for TEST dataset1 ' + str(num) + ' the following columns are not present: ' + str(col_to_print_te1))
  210. pid_to_remove1.append(num)
  211. for num in ['540','544','552','567','584','596']:
  212. data = dict3.get(num)
  213. if set(cols_to_check).issubset(data.columns) == False:
  214. actual_diff = set(cols_to_check) - set(data.columns)
  215. col_to_print_tr2 = actual_diff.intersection(set(cols_to_check))
  216. print('for TRAIN dataset2 ' + str(num) + ' the following columns are not present: ' + str(col_to_print_tr2))
  217. pid_to_remove2.append(num)
  218. data = dict4.get(num)
  219. if set(cols_to_check).issubset(data.columns) == False:
  220. actual_diff = set(cols_to_check) - set(data.columns)
  221. col_to_print_te2 = actual_diff.intersection(set(cols_to_check))
  222. print('for TEST dataset2 ' + str(num) + ' the following columns are not present: ' + str(col_to_print_te2))
  223. pid_to_remove2.append(num)
  224. return pid_to_remove1, pid_to_remove2
  225. def get_valid_df(pid_to_remove1, pid_to_remove2):
  226. """
  227. """
  228. valid1 = set(['559','563','570','575','588','591']) - set(pid_to_remove1)
  229. valid2 = set(['540','544','552','567','584','596']) - set(pid_to_remove2)
  230. return list(valid1), list(valid2)
# TODO (not yet implemented):
# creating hour and minute features
#data_resampled["hour"] = data_resampled['datetime'].dt.hour
#data_resampled["minute"] = data_resampled['datetime'].dt.minute
# rolling features
# rolling mean of the past hour: pastHourRoll
#data_resampled['pastHourRoll'] = data_resampled['glucose'].rolling(window = 12, min_periods = 1).mean()
# rolling mean of the past day: pastDayRoll
#data_resampled['pastDayRoll'] = data_resampled['glucose'].rolling(window = 12*24, min_periods = 1).mean()
# Idea: measure how far a given observation is from the aggregation moment,
# e.g. how far the last glucose measurement is from the moment the fasting
# instance is created.
# For a given column COL:
# - optionally create a bool column only for the rows where this difference
#   actually needs to be computed
# - create a new column named after the column above plus a NEW_COL suffix
# - set a limit of periods for the search: e.g. with 5-minute periods and a
#   limit of 24, the check covers the previous 2 hours (LIMIT)
# - for every row of the df:
#   - compute the index i of the row
#   - check whether the value at index i-1 is NaN or populated
#   - if populated, the index difference is the distance in periods
#   - if NaN, go on and check i-2
#   - continue until the backward limit is reached, or stop at the first
#     populated value and move on to the next row
if __name__ == '__main__':
    print('main')
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...