Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

create_folds.py 5.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
  1. import os
  2. import json
  3. import yaml
  4. import time
  5. import argparse
  6. import numpy as np
  7. import pandas as pd
  8. from typing import Tuple, Optional
  9. from skbio import Sequence
  10. import dask.dataframe as dd
  11. from sklearn.model_selection import StratifiedKFold
  12. def get_kmer_string(sequence, n_kmers, sep=' ', overlap=True):
  13. """
  14. Function to generate strings composed of the different kmers in the sequence with separator `sep`
  15. :param sequence: DNA sequence string
  16. :param n_kmers: kmers order to look for in the sequence
  17. :param sep: String to use to separate the kmers found. Usually ' ' works well with vectorizers
  18. :param overlap: Whether to use overlap when generating the kmers r not
  19. :return:
  20. """
  21. kmer_sequence = sep.join(
  22. [str(kmer) for kmer in Sequence(sequence=sequence).iter_kmers(n_kmers, overlap=overlap)]
  23. )
  24. return kmer_sequence
  25. def compute_kmer_string_parallel(X_ddf_: dd.Series, kmer_order: int):
  26. print(f'------------------Generate strings of order {kmer_order}------------------')
  27. start = time.time()
  28. kmer_sequence_string_ = X_ddf_ \
  29. .apply(lambda dna_seq: get_kmer_string(dna_seq, n_kmers=kmer_order), meta=('string', 'string')) \
  30. .compute(scheduler='processes')
  31. kmer_sequence_string_.rename(f'overlapping_{kmer_order}mers', inplace=True)
  32. end = time.time()
  33. print(f'total time to generate strings of {kmer_order}mers is: {end - start} \n\n')
  34. return kmer_sequence_string_
  35. def splice_and_save_folds(X_: pd.Series, y_: Optional[pd.Series], folds_idx_list: tuple,
  36. folds_data_dir_path_: str, name_suffix: str):
  37. print(f'-----------------Split {name_suffix} into folds and save them-----------------')
  38. for i, (train_fold_i_idx, holdout_i_idx) in enumerate(folds_idx_list):
  39. fold_n = f'fold_{i + 1}'
  40. X_train = X_.iloc[train_fold_i_idx]
  41. X_holdout = X_.iloc[holdout_i_idx]
  42. if y_ is not None:
  43. y_train = y_.iloc[train_fold_i_idx]
  44. y_holdout = y_.iloc[holdout_i_idx]
  45. train_dir = os.path.join(folds_data_dir_path_, fold_n, 'train')
  46. os.makedirs(train_dir, exist_ok=True)
  47. pd.DataFrame(X_train)\
  48. .to_parquet(os.path.join(train_dir, f'X_train{name_suffix}.parquet'),
  49. engine='pyarrow', compression='snappy', index=True)
  50. if y_ is not None:
  51. pd.DataFrame(y_train)\
  52. .to_parquet(os.path.join(train_dir, f'y_train{name_suffix}.parquet'),
  53. engine='pyarrow', compression='snappy', index=True)
  54. holdout_dir = os.path.join(folds_data_dir_path_, fold_n, 'holdout')
  55. os.makedirs(holdout_dir, exist_ok=True)
  56. pd.DataFrame(X_holdout)\
  57. .to_parquet(os.path.join(holdout_dir, f'X_holdout{name_suffix}.parquet'),
  58. engine='pyarrow', compression='snappy', index=True)
  59. if y_ is not None:
  60. pd.DataFrame(y_holdout)\
  61. .to_parquet(os.path.join(holdout_dir, f'y_holdout{name_suffix}.parquet'),
  62. engine='pyarrow', compression='snappy', index=True)
  63. def create_folds(train_data_path: str, folds_data_dir_path_: str, n_folds: int = 5,
  64. kmer_orders: Tuple[int] = (1, 2, 3, 4, 5, 6)):
  65. """
  66. Create and save splits in the specified folds_data_dir_path_
  67. :param kmer_orders:
  68. :param train_data_path: Path to the csv file where the training data is stored
  69. :param folds_data_dir_path_: path of where to store the data
  70. :param n_folds: Number of folds to create. Usually 5
  71. :return:
  72. """
  73. dfH_train = pd.read_csv(train_data_path, dtype={'sequences': pd.StringDtype(), 'labels': np.int8})
  74. dfH_train.index.rename('index', inplace=True)
  75. X_ = dfH_train['sequences'].copy()
  76. y_ = dfH_train['labels'].copy()
  77. folder = StratifiedKFold(n_splits=n_folds)
  78. folds_idx_tuple = tuple(folder.split(X_, y_))
  79. splice_and_save_folds(X_=X_, y_=y_, folds_idx_list=folds_idx_tuple, folds_data_dir_path_=folds_data_dir_path_,
  80. name_suffix='')
  81. X_ddf = dd.from_pandas(X_, npartitions=50)
  82. for kmer_order in kmer_orders:
  83. kmer_sequence_string = compute_kmer_string_parallel(X_ddf_=X_ddf, kmer_order=kmer_order)
  84. splice_and_save_folds(
  85. X_=kmer_sequence_string, y_=None, folds_idx_list=folds_idx_tuple, folds_data_dir_path_=folds_data_dir_path_,
  86. name_suffix='_' + kmer_sequence_string.name)
  87. def parse_arguments():
  88. parser = argparse.ArgumentParser()
  89. parser.add_argument('--config_file_path', type=str, default='config.yml',
  90. help='Config file with paths to input data')
  91. args_ = parser.parse_args()
  92. return args_
  93. if __name__ == '__main__':
  94. args = parse_arguments()
  95. with open(args.config_file_path) as yaml_file:
  96. config_params = yaml.load(yaml_file, Loader=yaml.FullLoader)
  97. input_data_params = config_params['input_data_params']
  98. training_params = config_params['training_params']
  99. preprocess_params = training_params['preprocesing_params']
  100. assert isinstance(input_data_params, dict)
  101. assert isinstance(training_params, dict)
  102. assert isinstance(preprocess_params, dict)
  103. start_time = time.time()
  104. print('Create and save folds')
  105. create_folds(
  106. train_data_path=os.path.join(input_data_params['input_data_dir_path'], input_data_params['human_train']),
  107. folds_data_dir_path_=preprocess_params['input_training_data_dir_path'],
  108. n_folds=training_params['n_folds'],
  109. kmer_orders=preprocess_params['kmer_orders']
  110. )
  111. end_time = time.time()
  112. with open('create_folds_metrics.json', 'w') as f:
  113. json.dump({'create_cv_folds_time': end_time - start_time}, f)
  114. print("done")
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...