csinva
/
auxilin-prediction
mirror of https://github.com/Yu-Group/auxilin-prediction


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
130

	
131

	
132

	
133

	
134

	
135

	
136

	
137

	
138

	
139

	
140

	
141

	
142

	
143

	
144

	
145

	
146

	
147

	
148

	
149

	
150

	
151

	
152

	
153

	
154

	
155

	
156

	
157

	
158

	
159

	
160

	
161

	
162

	
163

	
164

	
165

	
166

	
167

	
168

	
169

	
170

	
171

	
172

	
173

	
174

	
175

	
176

	
177

	
178

	
179

	
180

	
181

	
182

	
183

	
184

	
185

	
186

	
187

	
188

	
189

	
190

	
191

	
192

	
193

	
194

	
195

	
196

	
197

	
198

	
199

	
200

	
201

	
202

	
203

	
204

	
205

	
206

	
207

	
208

	
209

	
210

	
211

	
212

	
213

	
214

	
215

	
216

	
217

	
218

	
219

	
220

	
221

	
222

	
223

	
224

	
225

	
226

	
227

	
            import numpy as np
import pandas as pd
from math import floor

pd.options.mode.chained_assignment = None  # default='warn' - caution: this turns off setting with copy warning
from viz import *
import config


def add_rule_based_label(df):
    """Add a hand-written rule-based label (``y_rule_based``) to every track.

    The frame is modified in place and also returned. Besides the label,
    these helper columns are written:

    - ``Y_peak_time_frac``: ``Y_peak_idx / lifetime``
    - ``y_z_score``: ``(Y_max - Y_mean) / Y_std``
    - ``X_max_around_Y_peak``: max of ``X`` in a window around the Y peak
    - ``X_max_after_Y_peak``: max of ``X`` after that window (falls back to
      the window max when nothing is left after it)
    - ``X_max_diff``: difference of the two maxima above

    Reads the columns ``Y_peak_idx``, ``lifetime``, ``Y_max``, ``Y_mean``,
    ``Y_std``, ``X``, ``y_consec_sig`` and ``y_conservative_thresh``.
    """
    df['Y_peak_time_frac'] = df['Y_peak_idx'].values / df['lifetime'].values
    df['y_z_score'] = (df['Y_max'].values - df['Y_mean'].values) / df['Y_std'].values

    peak_idxs = df['Y_peak_idx'].values
    lifetimes = df['lifetime'].values
    around_maxes = []
    after_maxes = []
    for peak, lifetime, x_track in zip(peak_idxs, lifetimes, df['X']):
        n_steps = floor(lifetime)
        # the window spans ~20% of the lifetime before the peak
        # and ~10% after it (30% of the lifetime in total)
        n_before = int(0.2 * n_steps) + 1
        n_after = int(0.1 * n_steps) + 1
        window_end = min(peak + n_after, n_steps)
        window = x_track[max(0, peak - n_before): window_end]
        tail = x_track[min(peak + n_after, n_steps - 1):]

        window_max = max(window)
        around_maxes.append(window_max)
        # with an empty tail, reuse the window max so the diff becomes 0
        after_maxes.append(max(tail) if len(tail) > 0 else window_max)

    df['X_max_around_Y_peak'] = around_maxes
    df['X_max_after_Y_peak'] = after_maxes
    df['X_max_diff'] = df['X_max_around_Y_peak'] - df['X_max_after_Y_peak']

    def rule_based_model(track):
        # Rules, applied in order:
        #   1. Y peaks within the first 20% of the lifetime -> negative
        #   2. y_consec_sig or y_conservative_thresh is set  -> positive
        #   3. X drops by more than 260 after the Y peak window
        #      and Y_max exceeds 560                         -> positive
        #   otherwise                                        -> negative
        if track['Y_peak_time_frac'] < 0.2:
            return 0
        strong_y = track['y_consec_sig'] or track['y_conservative_thresh']
        x_drop_with_y_peak = track['X_max_diff'] > 260 and track['Y_max'] > 560
        return 1 if (strong_y or x_drop_with_y_peak) else 0

    labels = [rule_based_model(df.iloc[row]) for row in range(len(df))]
    df['y_rule_based'] = np.array(labels)
    return df


def add_outcomes(df, LABELS=None, thresh=3.25, p_thresh=0.05,
                 aux_peak=642.375, aux_thresh=973, vps_data=False):
    '''Add binary outcome of whether spike happened and info on whether events were questionable

    The frame is modified in place and also returned.

    Params
    ------
    df: pd.DataFrame
        must contain Y_max, Y_mean, Y_std and Y_pvals (one sequence of p-values per row);
        also pid when LABELS is given
    LABELS: dict, optional
        manual labels; the keys 'pos', 'neg' and 'hotspots' each hold a collection of pids.
        'pos' / 'neg' override y_consec_thresh, 'hotspots' forces hotspots to True
    thresh: float
        y_thresh is 1 when Y_max > Y_mean + thresh * Y_std
    p_thresh: float
        a p-value counts as significant when it is strictly below this value
    aux_peak: float
        y is True when Y_max exceeds this value
    aux_thresh: float
        y_conservative_thresh is 1 when Y_max exceeds this value
    vps_data: bool
        when True, the rule-based label (add_rule_based_label) is not added
    '''
    df['y_score'] = df['Y_max'].values - (df['Y_mean'].values + thresh * df['Y_std'].values)
    df['y_thresh'] = (df['y_score'].values > 0).astype(int)  # Y_max was big
    df['y'] = df['Y_max'] > aux_peak

    # outcomes based on significant p-values
    num_sigs = [np.array(pvals) < p_thresh for pvals in df['Y_pvals']]  # one boolean mask per track
    sig_counts = np.array([sigs.sum() for sigs in num_sigs]).astype(int)
    df['y_num_sig'] = sig_counts
    df['y_single_sig'] = (sig_counts > 0).astype(int)
    df['y_double_sig'] = (sig_counts > 1).astype(int)
    df['y_conservative_thresh'] = (df['Y_max'].values > aux_thresh).astype(int)

    y_consec_sig = []
    y_sig_min_diff = []
    for sigs in num_sigs:
        idxs_sig = np.where(sigs)[0]  # indices of significance
        if len(idxs_sig) > 1:
            min_gap = np.min(np.diff(idxs_sig))
            y_sig_min_diff.append(min_gap)
            # a gap of 1 means two significant time points were adjacent
            y_consec_sig.append(int(min_gap == 1))
        else:
            y_sig_min_diff.append(np.nan)
            y_consec_sig.append(0)
    df['y_consec_sig'] = y_consec_sig
    df['y_sig_min_diff'] = y_sig_min_diff
    df['y_consec_thresh'] = np.logical_or(df['y_consec_sig'], df['y_conservative_thresh'])

    def add_hotspots(df, num_sigs, outcome_def='consec_sig'):
        '''Identify hotspots as any track which over its time course has multiple events
        events must meet the event definition, then for a time not meet it, then meet it again
        Example: two consecutive significant p-values, then non-significant p-value, then 2 more consecutive p-values
        '''
        if outcome_def != 'consec_sig':
            # only one event definition is implemented; fail loudly instead of
            # hitting an undefined variable further down
            raise ValueError(f'unknown outcome_def: {outcome_def!r}')

        hotspots = np.zeros(df.shape[0]).astype(int)
        for i, sigs in enumerate(num_sigs):
            idxs_sig = np.where(sigs)[0]  # indices of significance
            if idxs_sig.size < 5:
                continue  # too few significant points to hold two separate events
            diffs = np.diff(idxs_sig)
            consecs = np.where(diffs == 1)[0]  # diffs==1 means there were consecutive sigs
            consec_diffs = np.diff(consecs)
            # there were greater than 2 non-consec sigs between the consec sigs
            if consec_diffs.shape[0] > 0 and np.max(consec_diffs) > 2:
                hotspots[i] = 1
        df['sig_idxs'] = num_sigs
        df['hotspots'] = hotspots == 1

        return df

    df = add_hotspots(df, num_sigs)

    if LABELS is not None:
        # .loc writes into df itself; chained indexing (df[col][mask] = v) may
        # write to a temporary copy. Booleans keep both columns boolean.
        df.loc[df['pid'].isin(LABELS['pos']), 'y_consec_thresh'] = True  # add manual pos labels
        df.loc[df['pid'].isin(LABELS['neg']), 'y_consec_thresh'] = False  # add manual neg labels
        df.loc[df['pid'].isin(LABELS['hotspots']), 'hotspots'] = True  # add manual hotspot labels

    if not vps_data:
        df = add_rule_based_label(df)

    return df

def add_sig_mean(df, resp_tracks=('Y',)):
    """add response of regression problem: mean auxilin strength among significant observations

    For every track name in resp_tracks two columns are written:

    - ``{track}_sig_mean``: mean of the track's values at the time points whose
      p-value (column ``{track}_pvals``) is below 0.05; 0 when none is significant
    - ``{track}_sig_mean_normalized``: the same quantity z-scored within each cell
      (rows sharing a ``cell_num``), i.e. ``(y - mean(y)) / std(y)``

    The frame is modified in place and also returned. A cell whose values are
    all identical has a standard deviation of 0, so numpy produces NaN there.
    """
    cell_nums = df['cell_num'].values
    for track in resp_tracks:
        sig_mean = []
        for values, pvals in zip(df[track], df[f'{track}_pvals']):
            sigs = np.array(pvals) < 0.05
            if sigs.any():
                sig_mean.append(np.mean(np.array(values)[sigs]))
            else:
                sig_mean.append(0)
        df[f'{track}_sig_mean'] = sig_mean

        # Build the normalized values in a standalone float array and assign it
        # once. Writing through `df[col].values[...]` depends on pandas handing
        # out a writable view and on the column already being float.
        raw = np.asarray(sig_mean, dtype=float)
        normalized = raw.copy()
        for cell in set(cell_nums):
            in_cell = cell_nums == cell
            y = raw[in_cell]
            normalized[in_cell] = (y - np.mean(y)) / np.std(y)
        df[f'{track}_sig_mean_normalized'] = normalized
    return df

def _has_consec_run(sigs, min_run):
    """Return True as soon as `sigs` contains `min_run` consecutive truthy entries."""
    run = 0
    for is_sig in sigs:
        run = run + 1 if is_sig else 0
        if run >= min_run:
            return True
    return False


def add_aux_dyn_outcome(df, p_thresh=0.05, clath_thresh=1500, dyn_thresh=2000,
                        dyn_cons_thresh=5, clath_sig_frac=0.5, clath_consec_thresh_frac=0.15):
    """add success outcomes that combine the clathrin (X), auxilin (Y) and, if present, dynamin (Z) tracks

    The frame is modified in place and also returned.

    Params
    ------
    df: pd.DataFrame
        must contain X, X_pvals, X_max and y_consec_thresh. When a Z column is present,
        Z_pvals, Y, pid and hotspots are read as well
    p_thresh: float
        a p-value counts as significant when it is strictly below this value
    clath_thresh: float
        clath_conservative_thresh is 1 when X_max exceeds this value
    dyn_thresh: float
        z_thresh is True when the maximum of Z exceeds this value
    dyn_cons_thresh: int
        number of consecutive significant Z p-values required for z_consec_sig
    clath_sig_frac: float
        minimum fraction of significant X p-values required for clath_sig
    clath_consec_thresh_frac: float
        clath_sig also needs a run of consecutive significant X p-values of at least
        max(len(X) * clath_consec_thresh_frac, 5)

    Columns written
    ---------------
    clath_conservative_thresh, clath_sig, successful, successful_dynamin, successful_full;
    with a Z column also z_consec_sig, Z_max, z_thresh, z_consec_thresh, Y_peak_idx
    (overwritten), Z_peak_idx, z_peaked_first, z_peak, plus manual overrides taken from
    config.LABELS_DYNAMIN_NEW for successful_full and hotspots
    """

    # look for clathrin significance
    x_sigs = [np.array(pvals) < p_thresh for pvals in df['X_pvals']]
    lifetime_steps = np.array([len(x) for x in df['X']])  # get lifetimes
    x_frac_sig = [np.mean(sigs) >= clath_sig_frac for sigs in x_sigs]
    # the required run length grows with the track length but is never below 5
    x_consec_sig = [
        int(_has_consec_run(sigs, max(n_steps * clath_consec_thresh_frac, 5)))
        for sigs, n_steps in zip(x_sigs, lifetime_steps)
    ]

    # outcomes based on significant p-values
    df['clath_conservative_thresh'] = (df['X_max'].values > clath_thresh).astype(int)
    df['clath_sig'] = np.logical_and(x_consec_sig, x_frac_sig)
    df['successful'] = np.logical_and(df['y_consec_thresh'], df['clath_conservative_thresh'])
    df['successful_dynamin'] = df['successful']
    df['successful_full'] = np.logical_and(df['clath_sig'], df['successful_dynamin'])

    # look for dynamin peak
    if 'Z' in df.keys():
        z_sigs = [np.array(pvals) < p_thresh for pvals in df['Z_pvals']]
        df['z_consec_sig'] = [int(_has_consec_run(sigs, dyn_cons_thresh)) for sigs in z_sigs]
        df['Z_max'] = [np.max(z) for z in df['Z']]
        df['z_thresh'] = df['Z_max'] > dyn_thresh
        df['z_consec_thresh'] = np.logical_and(df['z_consec_sig'], df['z_thresh'])
        df['Y_peak_idx'] = np.nan_to_num(np.array([np.argmax(y) for y in df['Y']]))
        df['Z_peak_idx'] = np.nan_to_num(np.array([np.argmax(z) for z in df['Z']]))
        df['z_peaked_first'] = df['Z_peak_idx'] < df['Y_peak_idx']
        df['z_peak'] = np.logical_and(df['z_consec_thresh'], df['z_peaked_first'])

        # peaks must happen at end of track
        df['z_peak'] = np.logical_and(df['z_peak'], df['Z_peak_idx'] > lifetime_steps / 2)

        # a qualifying dynamin peak can stand in for the auxilin criterion
        df['successful_dynamin'] = np.logical_or(
            df['successful'],
            np.logical_and(df['clath_conservative_thresh'], df['z_peak'])
        )
        df['successful_full'] = np.logical_and(df['clath_sig'], df['successful_dynamin'])

        # add more manual labels
        # .loc writes into df itself; chained indexing (df[col][mask] = v) may
        # write to a temporary copy. Booleans keep the columns boolean.
        manual = config.LABELS_DYNAMIN_NEW
        df.loc[df['pid'].isin(manual['pos']), 'successful_full'] = True
        df.loc[df['pid'].isin(manual['neg']), 'successful_full'] = False
        df.loc[df['pid'].isin(manual['hotspots']), 'hotspots'] = True

    return df