bassignana
/
dags_repo
mirror of https://github.com/bassignana/dags_repo.git


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
130

	
131

	
132

	
133

	
134

	
135

	
136

	
137

	
138

	
139

	
140

	
141

	
142

	
143

	
144

	
145

	
146

	
147

	
148

	
149

	
150

	
151

	
152

	
153

	
154

	
155

	
156

	
157

	
158

	
159

	
160

	
161

	
162

	
163

	
164

	
165

	
166

	
167

	
168

	
169

	
170

	
171

	
172

	
173

	
174

	
175

	
176

	
177

	
178

	
179

	
180

	
181

	
182

	
183

	
184

	
185

	
186

	
187

	
188

	
189

	
190

	
191

	
192

	
193

	
194

	
195

	
196

	
197

	
198

	
199

	
200

	
201

	
202

	
203

	
204

	
205

	
206

	
207

	
208

	
209

	
210

	
211

	
212

	
213

	
214

	
215

	
216

	
217

	
218

	
219

	
220

	
221

	
222

	
223

	
224

	
225

	
226

	
227

	
228

	
229

	
230

	
231

	
232

	
233

	
234

	
235

	
236

	
237

	
238

	
239

	
240

	
241

	
242

	
243

	
244

	
245

	
246

	
247

	
248

	
249

	
250

	
251

	
252

	
253

	
254

	
255

	
256

	
257

	
258

	
259

	
260

	
261

	
262

	
263

	
264

	
265

	
266

	
267

	
268

	
269

	
270

	
271

	
272

	
273

	
274

	
275

	
276

	
277

	
278

	
279

	
280

	
281

	
282

	
283

	
284

	
285

	
286

	
287

	
288

	
289

	
290

	
291

	
292

	
293

	
294

	
295

	
296

	
            #!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 14 20:43:06 2021

@author: tommasobassignana
"""
#i need to import every module needed for the functions
import numpy as np
import pandas as pd

def extract_data(data, pred_horizon):
    """
    Split a samples dataframe into predictors (x) and target (y).

    WARNING: must be modified (or overridden inside the models) to handle
    additional data.

    :param data: dataframe holding a ``y_t+<pred_horizon>`` column and a
        ``datetime_t`` column
    :param pred_horizon: integer, the same value used in create_samples_V2
    :return: (x, y) where x is ``data`` without the target and ``datetime_t``
        columns, and y is the target column as a pandas Series (not a
        dataframe)
    """
    target_name = "y_t+" + str(pred_horizon)

    # Single-label selection yields a Series, not a DataFrame.
    y = data[target_name]
    x = data.drop(columns=[target_name, 'datetime_t'])
    return x, y


def y_to_categorical(yseries_train, yseries_test):
    """
    Turn the rescaled y series into categorical labels for a classification task.

    The rescaled levels 0, 0.25, 0.5, 0.75 and 1 are mapped to
    'verified_hypo', 'couscin_hypo', 'normal', 'couscin_hyper' and
    'verified_hyper' respectively. Any value that is not exactly one of
    those levels is labelled '0'.

    yseries_train, yseries_test = pandas series objects
    return: (train_labels, test_labels) as numpy arrays of strings
    """
    # The label spellings are runtime values that may be matched elsewhere,
    # so they are kept exactly as they were.
    levels = (
        (0, 'verified_hypo'),
        (0.25, 'couscin_hypo'),
        (0.5, 'normal'),
        (0.75, 'couscin_hyper'),
        (1, 'verified_hyper'),
    )
    labels = [label for _, label in levels]

    def _encode(yseries):
        conditions = [yseries == level for level, _ in levels]
        # np.select's implicit default is the integer 0, which cannot share a
        # dtype with string choices on recent NumPy releases. Passing the
        # string '0' keeps the historical fallback label and works everywhere.
        return np.select(conditions, labels, default='0')

    return _encode(yseries_train), _encode(yseries_test)

def create_title(xtrainData, xtestData, model):
    """
    Build a title string describing an experiment.

    Format: ``<train column list>;<train rows>-<test rows>;<model>``.

    xtrainData, xtestData = pandas dataframes with the train / test predictors
    model = any object; its str() representation is appended
    return: str
    """
    col_list = str(list(xtrainData.columns))
    tr_rows = str(xtrainData.shape[0])
    # The test row count must come from the test set (it was previously read
    # from the train set, so both numbers were always equal).
    tst_rows = str(xtestData.shape[0])
    return col_list + ';' + tr_rows + '-' + tst_rows + ';' + str(model)

def create_samples_V2(df,number_lags,colonne_da_laggare,colonna_Y,pred_horizon):
    """
    Build a samples dataframe with lagged copies of selected columns and a
    future-shifted target.

    For every column in colonne_da_laggare the result holds ``<col>_t-1`` ..
    ``<col>_t-<number_lags>`` followed by ``<col>_t`` (the unshifted values).
    The target ``y_t+<pred_horizon>`` is colonna_Y shifted pred_horizon rows
    into the future. Every column that is not lagged is carried over
    unchanged and renamed ``<col>_t``.

    df = pandas dataframe
    number_lags = int, number of lags, expressed in units of time***
    colonne_da_laggare = list of str with the names of the columns to lag
    colonna_Y = str, name of the target column
    pred_horizon = int, expressed in units of time***

    ***the unit of time is the resampling frequency of df. If df is resampled
    to 5 minutes, pred_horizon = 6 means predicting colonna_Y 30 minutes
    ahead; with half-hour resampling, pred_horizon = 6 means 3 hours ahead.
    """
    passthrough_names = df.drop(columns=colonne_da_laggare).columns

    # Collect the columns in output order, then build the frame in one go.
    columns = {}
    for name in colonne_da_laggare:
        source = df[name]
        for step in range(1, number_lags + 1):
            columns[name + '_t-' + str(step)] = source.shift(step)
        columns[name + '_t'] = source

    # A negative shift pulls future values back onto the current row.
    columns['y' + '_t+' + str(pred_horizon)] = df[colonna_Y].shift(-pred_horizon)

    for name in passthrough_names:
        columns[name + '_t'] = df[name]

    # 'datetime_t' (when present) is kept here; extract_data drops it later.
    return pd.DataFrame(columns)

def to_cat_meal_type(data):
    """
    Encode the meal_type column as integers, as needed by the resampling step.

    Missing values become 0; 'Breakfast', 'Snack', 'Lunch', 'Dinner' and
    'HypoCorrection' become 1..5. Any other value is left as it is and must
    be convertible by the final astype(int).

    data = pandas dataset with a meal_type column (modified in place)
    return: the same dataset
    """
    label_codes = (
        ('Breakfast', '1'),
        ('Snack', '2'),
        ('Lunch', '3'),
        ('Dinner', '4'),
        ('HypoCorrection', '5'),
    )

    data["meal_type"] = data.meal_type.fillna('0')
    # Swap each label for its digit string, one label at a time.
    for label, code in label_codes:
        data["meal_type"] = np.where((data.meal_type == label), code, data.meal_type)
    data["meal_type"] = data["meal_type"].astype(int)
    return data


def checkCarb(inpCarb, minCarb):
    """
    helper function for dummyCarbs

    return 1 when inpCarb is strictly greater than minCarb, 0 otherwise
    """
    return 1 if inpCarb > minCarb else 0

def dummyCarbs(df, minCarb=1):
    """
    Flag the periods in which a carbohydrate intake was recorded.

    df = pandas dataframe object containing a CHO column (quantity of carbs)
    minCarb = threshold; a period is flagged when CHO is strictly greater
        than it (default 1)
    return: the new column only, as a pandas Series of 0/1 integers; df
        itself is not modified
    """
    # Vectorised form of applying checkCarb(value, minCarb) to every row.
    # NaN compares as False, so periods with no recorded carbs are flagged 0.
    return (df["CHO"] > minCarb).astype(int)

def mealZone(df, periods_before=8, periods_after=6):
    """
    Add a 'mealZone' column that is 1 for every observation inside a window
    around a meal and 0 elsewhere.

    A meal is any row flagged by dummyCarbs(df). The window covers
    periods_before rows before the meal, the meal row itself, and
    periods_after rows after it. With the defaults and a 5-minute resampling
    that is 40 minutes before and 30 minutes after a meal. Windows are
    clipped to the bounds of the dataframe.

    Trying periods_before=0 is interesting: it makes explicit that glucose is
    being processed for a window after the meal.

    df = pandas dataframe object (modified in place)
    periods_before = int, rows before a meal that belong to the zone
    periods_after = int, rows after a meal that belong to the zone
    return: the same dataframe with the new 'mealZone' column
    """
    # Work on an independent integer array so the flags can be edited freely.
    meal_flags = np.array(dummyCarbs(df), dtype=int)
    zone = meal_flags.copy()
    last_position = len(zone) - 1

    for meal_position in np.flatnonzero(meal_flags):
        # Clip the window: without this, a meal near the start wraps around
        # to the end of the array through negative indices, and a meal near
        # the end raises IndexError.
        start = max(meal_position - periods_before, 0)
        stop = min(meal_position + periods_after, last_position)
        zone[start:stop + 1] = 1

    df["mealZone"] = zone
    return df


def Y_cat(data_resampled, values):
    """
    Create the categorical Y variable 'gluco_class' from the glucose column.
    The thresholds might be adjusted.

    Bands, in order: glucose <= 55 (verified hypo), (55, 80] (cautious hypo),
    (80, 170] (normal), (170, 210] (cautious hyper), > 210 (verified hyper).
    Rows that match no band (e.g. NaN glucose) receive np.select's default
    value, 0.

    data_resampled = pandas df object with a column named glucose (modified
        in place)
    values = array with the class assigned to each band, in the order above,
        e.g. [0, 1, 2, 3, 4]
    return: the same dataframe with the new 'gluco_class' column
    """
    glucose = data_resampled['glucose']

    # np.select keeps the first condition that holds, so cumulative upper
    # bounds select the same bands as explicit (low, high] intervals.
    band_tests = [glucose <= upper for upper in (55, 80, 170, 210)]
    band_tests.append(glucose > 210)

    data_resampled['gluco_class'] = np.select(band_tests, values)
    return data_resampled


def time_dummy(df, match_timestamp, new_col):
    """
    Add new_col to df: 1 on every row whose time of day equals
    match_timestamp, 0 otherwise.

    df = dataframe object, must have a datetime column but not a datetime
        index (modified in place)
    match_timestamp = str, time of day formatted as HH:MM:SS, e.g. 05:00:00
    new_col = str, name of the new column to add in the df
    return: the same dataframe
    """
    # Parse the datetime column and keep only its HH:MM:SS part for matching.
    time_of_day = pd.to_datetime(pd.Index(df['datetime'])).strftime("%H:%M:%S")
    df[new_col] = np.where(time_of_day == match_timestamp, 1, 0)

    # it might be better to return just the column
    return df

def time_bool(df, match_timestamp, new_col):
    """
    Add new_col to df: True on every row whose time of day equals
    match_timestamp, False otherwise.

    df = dataframe object, must have a datetime column but not a datetime
        index (modified in place)
    match_timestamp = str, time of day formatted as HH:MM:SS, e.g. 05:00:00
    new_col = str, name of the new column to add in the df
    return: the same dataframe
    """
    # Parse the datetime column and keep only its HH:MM:SS part for matching.
    time_of_day = pd.to_datetime(pd.Index(df['datetime'])).strftime("%H:%M:%S")
    is_match = time_of_day == match_timestamp
    df[new_col] = np.where(is_match, True, False)

    # it might be better to return just the column
    return df


# last blood glucose measurement for the fasting instance
def get_past_BG(df, new_col, match_timestamp, glucose_col = 'glucose'):
    """
    Attach to every row the most recent glucose value recorded at a given
    time of day.

    Motivation: each morning (fasting) measurement needs a "last recorded"
    glucose value, for example the one taken at 22:00 the previous evening.
    The glucose value of every row whose time of day equals match_timestamp
    is copied into new_col and then forward-filled, so it stays available on
    all following rows until the next matching row.

    Caveats: rows before the first match are NaN, and if the glucose value at
    a matching row is NaN, the previously propagated value keeps being
    propagated.

    df = pandas dataframe object with a 'datetime' column (not modified)
    new_col = str, name of the new column holding the propagated value
    match_timestamp = str, time of day formatted as HH:MM:SS, e.g. 22:00:00
    glucose_col = str, name of the column in df that contains all the BG
        values, default = glucose
    return: a new dataframe indexed by the parsed datetime, with new_col added
    """
    # set_index returns a new frame, so the caller's df is left untouched.
    dfp = df.set_index('datetime')
    dfp.index = pd.to_datetime(dfp.index)

    # Honour match_timestamp (the reference time used to be hard-coded to
    # '22:00:00' regardless of the argument).
    at_reference_time = dfp.index.strftime("%H:%M:%S") == match_timestamp

    # Keep glucose only on the matching rows, then carry it forward.
    dfp[new_col] = dfp[glucose_col].where(at_reference_time).ffill()
    return dfp


def _report_missing_columns(cols_to_check, datasets, pid, label):
    """Return True (after printing why) when datasets[pid] is absent or lacks any of cols_to_check."""
    data = datasets.get(pid)
    if data is None:
        # .get() yields None for an unknown pid; report it instead of
        # failing later with an AttributeError on `.columns`.
        print('for ' + label + ' ' + str(pid) + ' the dataset is missing')
        return True
    missing = set(cols_to_check) - set(data.columns)
    if missing:
        print('for ' + label + ' ' + str(pid) + ' the following columns are not present: ' + str(missing))
        return True
    return False


def col_to_check (cols_to_check, dict1, dict2, dict3, dict4):
    """
    Find the patients whose train or test dataset lacks a required column.

    cols_to_check = ["datetime",'glucose','CHO','q'] or similar
        list of columns that must be present in every train and test dataset
    dict1, dict2 = train / test datasets of the first group, keyed by pid
        ('559','563','570','575','588','591')
    dict3, dict4 = train / test datasets of the second group, keyed by pid
        ('540','544','552','567','584','596')

    A message is printed for every dataset that is missing or incomplete.

    return (pid_to_remove1, pid_to_remove2): the pids (personal ids) of the
    unusable datasets of each group; every pid appears at most once
    """
    groups = (
        (('559','563','570','575','588','591'), dict1, dict2, 'dataset1'),
        (('540','544','552','567','584','596'), dict3, dict4, 'dataset2'),
    )

    unusable = []
    for pids, train_sets, test_sets, suffix in groups:
        to_remove = []
        for num in pids:
            # Evaluate both checks before combining them so that both
            # messages are printed when train and test are both incomplete.
            train_bad = _report_missing_columns(cols_to_check, train_sets, num, 'TRAIN ' + suffix)
            test_bad = _report_missing_columns(cols_to_check, test_sets, num, 'TEST ' + suffix)
            if train_bad or test_bad:
                to_remove.append(num)
        unusable.append(to_remove)

    pid_to_remove1, pid_to_remove2 = unusable
    return pid_to_remove1, pid_to_remove2


def get_valid_df(pid_to_remove1, pid_to_remove2):
    """
    Return the pids (personal ids) of the datasets that are still usable.

    pid_to_remove1 = iterable of pids to exclude from the first group
        ('559','563','570','575','588','591')
    pid_to_remove2 = iterable of pids to exclude from the second group
        ('540','544','552','567','584','596')

    return (valid1, valid2): two lists with the remaining pids of each group,
    always in the order listed above
    """
    group1 = ['559','563','570','575','588','591']
    group2 = ['540','544','552','567','584','596']

    excluded1 = set(pid_to_remove1)
    excluded2 = set(pid_to_remove2)

    # Filtering the ordered lists (instead of converting a set difference to
    # a list) gives the same order on every run.
    valid1 = [pid for pid in group1 if pid not in excluded1]
    valid2 = [pid for pid in group2 if pid not in excluded2]
    return valid1, valid2


# to be implemented
#creating hour and minute feature
#data_resampled["hour"] = data_resampled['datetime'].dt.hour
#data_resampled["minute"] = data_resampled['datetime'].dt.minute

#rolling_features
#rolling mean of the past hour pastHourRoll
#data_resampled['pastHourRoll'] = data_resampled['glucose'].rolling(window = 12, min_periods = 1).mean()
#rolling mean of the past hour pastDayRoll
#data_resampled['pastDayRoll'] = data_resampled['glucose'].rolling(window = 12*24, min_periods = 1).mean()

# find how far a given observation is from the aggregation time, e.g. how far the last glucose measurement is from the moment the fasting instance is created
# for a given column: COL
# optionally, create a boolean column marking only the rows for which this difference actually has to be computed
# create a new column named after the column above plus a suffix: NEW_COL
# set a limit on the number of periods to search: e.g. with 5-minute periods and a limit of 24, the check covers the previous 2 hours: LIMIT
## for every row of the df
### compute the index i of the row: I
### check whether the value at index i-1 is NaN or populated
### if it is populated, compute the index difference; that is the distance in periods
### if it is NaN, go on to check i-2
### continue until the backward limit is reached; if a value is found, stop and move to the next row
# Script entry point: when the module is run directly it only prints a marker
# string; nothing is executed on import.
if __name__ == '__main__':
    print('main')