Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

generate_kmers.py 5.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
  1. import os
  2. import time
  3. import json
  4. import yaml
  5. import argparse
  6. import pandas as pd
  7. from typing import Tuple
  8. from sklearn.feature_extraction.text import TfidfVectorizer
  9. from sklearn.pipeline import Pipeline
  10. from utils.preprocessing import KmerVectorizer, DropColumns
  11. def load_input_features(train_fold_dir: str, kmer_order: int):
  12. for fold_i in os.listdir(train_fold_dir):
  13. train_path = os.path.join(train_fold_dir, fold_i, 'train')
  14. holdout_path = os.path.join(train_fold_dir, fold_i, 'holdout')
  15. X_train = pd.read_parquet(os.path.join(train_path, f'X_train_overlapping_{kmer_order}mers.parquet'),
  16. engine='pyarrow')
  17. X_holdout = pd.read_parquet(os.path.join(holdout_path, f'X_holdout_overlapping_{kmer_order}mers.parquet'),
  18. engine='pyarrow')
  19. yield fold_i, X_train, X_holdout
  20. def create_kmer_representation(k_mer_orders: Tuple[int], input_train_fold_dir_: str, output_train_fold_dir_: str):
  21. metrics_ = dict()
  22. start_time_complete_pipeline = time.time()
  23. for kmer_order in k_mer_orders:
  24. print(f'-----------------Create feature vectors of {kmer_order}mers-----------------')
  25. if kmer_order == 1:
  26. vectorizer = KmerVectorizer(vectorizer=TfidfVectorizer(sublinear_tf=True, token_pattern=r"(?u)\b\w+\b"),
  27. n_kmers=kmer_order,
  28. kmer_string_precalculated_name=f'overlapping_{kmer_order}mers')
  29. else:
  30. vectorizer = KmerVectorizer(vectorizer=TfidfVectorizer(), n_kmers=kmer_order,
  31. kmer_string_precalculated_name=f'overlapping_{kmer_order}mers')
  32. vectorizing_pipeline = Pipeline([
  33. (f'{kmer_order}mer', vectorizer),
  34. ('drop', DropColumns([f'overlapping_{kmer_order}mers']))
  35. ])
  36. for fold_i, X_train, X_holdout in load_input_features(input_train_fold_dir_, kmer_order=kmer_order):
  37. print(f' -------------Generate features for {fold_i}------------ ')
  38. # Fit and transform the input features
  39. print(f' Fit to {fold_i} KmerVectorizer of order {kmer_order} ')
  40. start_time = time.time()
  41. vectorizing_pipeline.fit(X_train)
  42. end_time = time.time()
  43. print(f' time_fit_{fold_i}_{kmer_order}mer: {end_time - start_time} \n')
  44. metrics_[f'time_fit_{fold_i}_{kmer_order}mer'] = end_time - start_time
  45. print(f' Transform in {fold_i} X_train with KmerVectorizer of order {kmer_order}')
  46. start_time = time.time()
  47. X_train_features = vectorizing_pipeline.transform(X_train)
  48. end_time = time.time()
  49. print(f' time_tranform_train_{fold_i}_{kmer_order}mer: {end_time - start_time} \n')
  50. metrics_[f'time_tranform_train_{fold_i}_{kmer_order}mer'] = end_time - start_time
  51. print(f' Transform in {fold_i} X_holdout with KmerVectorizer of order {kmer_order}')
  52. start_time = time.time()
  53. X_holdout_features = vectorizing_pipeline.transform(X_holdout)
  54. end_time = time.time()
  55. print(f' time_tranform_holdout_{fold_i}_{kmer_order}mer: {end_time - start_time} \n')
  56. metrics_[f'time_tranform_holdout_{fold_i}_{kmer_order}mer'] = end_time - start_time
  57. # Save them in parquet format
  58. print(' Save in parquet format \n')
  59. train_data_path = os.path.join(output_train_fold_dir_, fold_i, 'train')
  60. os.makedirs(train_data_path, exist_ok=True)
  61. X_train_features.to_parquet(
  62. os.path.join(train_data_path, f'X_train_features_{kmer_order}mer.parquet'))
  63. holdout_data_path = os.path.join(output_train_fold_dir_, fold_i, 'holdout')
  64. os.makedirs(holdout_data_path, exist_ok=True)
  65. X_holdout_features.to_parquet(
  66. os.path.join(holdout_data_path, f'X_holdout_features_{kmer_order}mer.parquet'))
  67. end_time_complete_pipeline = time.time()
  68. metrics_[f'Total time of pipeline to transform and save features for all folds'] =\
  69. end_time_complete_pipeline - start_time_complete_pipeline
  70. return metrics_
  71. def parse_arguments():
  72. parser = argparse.ArgumentParser()
  73. parser.add_argument('--config_file_path', type=str, default='config.yml',
  74. help='Config file with paths to input data')
  75. args_ = parser.parse_args()
  76. return args_
  77. if __name__ == '__main__':
  78. args = parse_arguments()
  79. with open(args.config_file_path) as yaml_file:
  80. config_params = yaml.load(yaml_file, Loader=yaml.FullLoader)
  81. preprocesing_params = config_params['training_params']['preprocesing_params']
  82. assert isinstance(preprocesing_params, dict)
  83. metrics = create_kmer_representation(
  84. k_mer_orders=preprocesing_params['kmer_orders'],
  85. input_train_fold_dir_=preprocesing_params['input_training_data_dir_path'],
  86. output_train_fold_dir_=preprocesing_params['output_data_dir_path']
  87. )
  88. with open('generate_kmers_metrics.json', 'w') as f:
  89. json.dump(metrics, f)
  90. print('Done')
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...