Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

survival.py 2.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
  1. """Helper functions for survival analysis"""
  2. import pandas as pd
  3. from matplotlib import pyplot as plt
  4. from sksurv.nonparametric import kaplan_meier_estimator
  5. def df_augment_surv(lines_df):
  6. """Augment line survival dataframe with 'T' (time) and 'E' (event) columns
  7. Enhancing dataframe with data needed for survival analysis
  8. :param pd.DataFrame lines_df: data to augment, modified by function
  9. :return: modified input
  10. :rtype: pd.DataFrame
  11. """
  12. # 'has_next' will be column used to denote lack of censoring
  13. # the presence of not N/A **`next_commit`** will be used as _'event observed'_ column
  14. lines_df.loc[:, 'has_next'] = lines_df['next_commit'].notna()
  15. # convert timestamp to date
  16. lines_df.loc[:, 'Sha_committer_time'] = pd.to_datetime(lines_df['Sha_committer_timestamp'], unit='s')
  17. lines_df.loc[:, 'last_committer_time'] = pd.to_datetime(lines_df['last_committer_timestamp'], unit='s')
  18. lines_df.loc[:, 'next_committer_time'] = pd.to_datetime(lines_df['next_committer_timestamp'], unit='s')
  19. # event duration for survival analysis
  20. # - uncensored
  21. lines_df.loc[:, 'survival_duration'] = lines_df['next_committer_time'] - lines_df['Sha_committer_time']
  22. lines_df.loc[:, 'survival_duration_days'] = lines_df['survival_duration'].dt.total_seconds()/(60*60*24)
  23. # - right-censored
  24. lines_df.loc[:, 'unchanged_duration'] = lines_df['last_committer_time'] - lines_df['Sha_committer_time']
  25. lines_df.loc[:, 'unchanged_duration_days'] = lines_df['unchanged_duration'].dt.total_seconds()/(60*60*24)
  26. # - time to death or to end
  27. lines_df.loc[ lines_df['has_next'], 'observed_duration'] = lines_df['survival_duration']
  28. lines_df.loc[~lines_df['has_next'], 'observed_duration'] = lines_df['unchanged_duration']
  29. lines_df.loc[:, 'observed_duration_days'] = lines_df['observed_duration'].dt.total_seconds()/(60*60*24)
  30. # mnemonics
  31. lines_df.loc[:, 'T'] = lines_df['observed_duration_days']
  32. lines_df.loc[:, 'E'] = lines_df['has_next']
  33. return lines_df
  34. def compute_and_plot_KM_sksurv(E, T, label=None):
  35. """Plot Kaplan-Meier estimation of survival function, using sksurv
  36. :param pd.Series E: boolean valued series denoting which events happened
  37. :param pd.Series T: number valued series with event time
  38. ("death" or "end of observation")
  39. :param str or None label: label for the plot, optional
  40. :rtype: None
  41. """
  42. time, survival_prob, conf_int = kaplan_meier_estimator(
  43. E, T, conf_type="log-log"
  44. )
  45. plt.step(time, survival_prob, where="post", label=label)
  46. plt.fill_between(time, conf_int[0], conf_int[1], alpha=0.25, step="post")
  47. plt.ylim(0, 1)
  48. plt.ylabel("est. probability of survival $\hat{S}(t)$")
  49. plt.xlabel("change line timeline $t$ [days]")
  50. plt.title("KM estimate, via scikit-learn, log-log conf.")
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...