1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
- import os
- import time
- import json
- import yaml
- import argparse
- import pandas as pd
- from typing import Tuple
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.pipeline import Pipeline
- from utils.preprocessing import KmerVectorizer, DropColumns
def load_input_features(train_fold_dir: str, kmer_order: int):
    """Yield per-fold train/holdout feature frames for a given k-mer order.

    Iterates over every fold directory under ``train_fold_dir`` (one
    subdirectory per fold, each containing ``train/`` and ``holdout/``
    subfolders) and loads the precomputed overlapping k-mer parquet files.

    Args:
        train_fold_dir: Root directory containing one subdirectory per fold.
        kmer_order: k-mer order used to build the expected parquet file names.

    Yields:
        Tuples ``(fold_name, X_train, X_holdout)`` where the frames are
        ``pandas.DataFrame`` objects read with the pyarrow engine.
    """
    # Sort the fold names: os.listdir order is arbitrary and
    # filesystem-dependent, so without this the folds would be processed
    # in a non-deterministic order across runs/machines.
    for fold_i in sorted(os.listdir(train_fold_dir)):
        train_path = os.path.join(train_fold_dir, fold_i, 'train')
        holdout_path = os.path.join(train_fold_dir, fold_i, 'holdout')
        X_train = pd.read_parquet(
            os.path.join(train_path, f'X_train_overlapping_{kmer_order}mers.parquet'),
            engine='pyarrow')
        X_holdout = pd.read_parquet(
            os.path.join(holdout_path, f'X_holdout_overlapping_{kmer_order}mers.parquet'),
            engine='pyarrow')
        yield fold_i, X_train, X_holdout
def create_kmer_representation(k_mer_orders: Tuple[int, ...], input_train_fold_dir_: str, output_train_fold_dir_: str) -> dict:
    """Fit TF-IDF k-mer vectorizers per fold and persist the feature matrices.

    For every requested k-mer order, builds a ``KmerVectorizer`` +
    ``DropColumns`` pipeline, then for each cross-validation fold fits the
    pipeline on the training frame, transforms both train and holdout
    frames, and writes the results as parquet files under
    ``output_train_fold_dir_/<fold>/{train,holdout}/``.

    Args:
        k_mer_orders: k-mer orders to generate features for (e.g. ``(1, 2, 3)``).
        input_train_fold_dir_: Root directory of the per-fold input parquet
            files, as consumed by ``load_input_features``.
        output_train_fold_dir_: Root directory where per-fold feature parquet
            files are written (created if missing).

    Returns:
        Dict mapping timing-metric names to elapsed seconds for each
        fit/transform step plus the total pipeline time.
    """
    metrics_ = dict()
    start_time_complete_pipeline = time.time()
    for kmer_order in k_mer_orders:
        print(f'-----------------Create feature vectors of {kmer_order}mers-----------------')
        if kmer_order == 1:
            # 1-mers are single characters; the default TfidfVectorizer token
            # pattern drops tokens shorter than 2 chars, so a custom
            # token_pattern matching single word characters is required here.
            vectorizer = KmerVectorizer(vectorizer=TfidfVectorizer(sublinear_tf=True, token_pattern=r"(?u)\b\w+\b"),
                                        n_kmers=kmer_order,
                                        kmer_string_precalculated_name=f'overlapping_{kmer_order}mers')
        else:
            vectorizer = KmerVectorizer(vectorizer=TfidfVectorizer(), n_kmers=kmer_order,
                                        kmer_string_precalculated_name=f'overlapping_{kmer_order}mers')
        # After vectorizing, drop the raw precomputed k-mer string column so
        # only the numeric features remain in the output frames.
        vectorizing_pipeline = Pipeline([
            (f'{kmer_order}mer', vectorizer),
            ('drop', DropColumns([f'overlapping_{kmer_order}mers']))
        ])
        for fold_i, X_train, X_holdout in load_input_features(input_train_fold_dir_, kmer_order=kmer_order):
            print(f'  -------------Generate features for {fold_i}------------  ')
            # Fit and transform the input features
            print(f'  Fit to {fold_i} KmerVectorizer of order {kmer_order}  ')
            start_time = time.time()
            # Fit on the training split only, so the holdout transform below
            # uses a vocabulary/IDF learned without holdout leakage.
            vectorizing_pipeline.fit(X_train)
            end_time = time.time()
            print(f'  time_fit_{fold_i}_{kmer_order}mer: {end_time - start_time} \n')
            metrics_[f'time_fit_{fold_i}_{kmer_order}mer'] = end_time - start_time
            print(f'  Transform in {fold_i} X_train with KmerVectorizer of order {kmer_order}')
            start_time = time.time()
            X_train_features = vectorizing_pipeline.transform(X_train)
            end_time = time.time()
            # NOTE(review): 'tranform' is a typo, but these strings are metric
            # keys consumed downstream (generate_kmers_metrics.json) — renaming
            # them would break consumers, so they are kept as-is.
            print(f'  time_tranform_train_{fold_i}_{kmer_order}mer: {end_time - start_time} \n')
            metrics_[f'time_tranform_train_{fold_i}_{kmer_order}mer'] = end_time - start_time
            print(f'  Transform in {fold_i} X_holdout with KmerVectorizer of order {kmer_order}')
            start_time = time.time()
            X_holdout_features = vectorizing_pipeline.transform(X_holdout)
            end_time = time.time()
            print(f'  time_tranform_holdout_{fold_i}_{kmer_order}mer: {end_time - start_time} \n')
            metrics_[f'time_tranform_holdout_{fold_i}_{kmer_order}mer'] = end_time - start_time
            # Save them in parquet format
            print('  Save in parquet format \n')
            train_data_path = os.path.join(output_train_fold_dir_, fold_i, 'train')
            os.makedirs(train_data_path, exist_ok=True)
            X_train_features.to_parquet(
                os.path.join(train_data_path, f'X_train_features_{kmer_order}mer.parquet'))
            holdout_data_path = os.path.join(output_train_fold_dir_, fold_i, 'holdout')
            os.makedirs(holdout_data_path, exist_ok=True)
            X_holdout_features.to_parquet(
                os.path.join(holdout_data_path, f'X_holdout_features_{kmer_order}mer.parquet'))
    end_time_complete_pipeline = time.time()
    # NOTE(review): this key is an f-string with no placeholders — harmless,
    # but the `f` prefix is unnecessary; key text kept byte-identical.
    metrics_[f'Total time of pipeline to transform and save features for all folds'] =\
        end_time_complete_pipeline - start_time_complete_pipeline
    return metrics_
def parse_arguments(argv=None):
    """Parse command-line arguments for the k-mer feature generation script.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse reads ``sys.argv[1:]`` — so existing callers
            that pass nothing are unaffected. Accepting an explicit list
            makes the function unit-testable.

    Returns:
        The parsed ``argparse.Namespace`` with a ``config_file_path`` field.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file_path', type=str, default='config.yml',
                        help='Config file with paths to input data')
    args_ = parser.parse_args(argv)
    return args_
if __name__ == '__main__':
    args = parse_arguments()
    # NOTE(review): yaml.load with FullLoader can instantiate a range of
    # Python objects from tagged YAML; if the config file could ever come
    # from an untrusted source, switch to yaml.safe_load.
    with open(args.config_file_path) as yaml_file:
        config_params = yaml.load(yaml_file, Loader=yaml.FullLoader)
    preprocesing_params = config_params['training_params']['preprocesing_params']
    # Validate the config explicitly: `assert` is stripped under `python -O`,
    # so it must not be relied on for input validation.
    if not isinstance(preprocesing_params, dict):
        raise TypeError(
            f"'preprocesing_params' must be a mapping, "
            f"got {type(preprocesing_params).__name__}")
    metrics = create_kmer_representation(
        k_mer_orders=preprocesing_params['kmer_orders'],
        input_train_fold_dir_=preprocesing_params['input_training_data_dir_path'],
        output_train_fold_dir_=preprocesing_params['output_data_dir_path']
    )
    # Persist the timing metrics next to the script for later inspection.
    with open('generate_kmers_metrics.json', 'w') as f:
        json.dump(metrics, f)
    print('Done')
|