# Master katuali configuration file template
# ------------------------------------------
#
# Having copied this file with the command:
#     katuali_config my_config.yaml
# it will be necessary to amend the `DATA:` dictionary in the first section below
# to define the dataset to be processed.
#
# The second section, containing software locations, should also be checked and
# amended if necessary.
#
# To run the predefined workflows `all_fast_assm_polish`,
# `all_standard_assm_polish`, or `all_medaka_train_features` starting from
# fast5 input, nothing else need be done. If starting a workflow from basecall
# data
# (https://nanoporetech.github.io/katuali/examples.html#starting-from-existing-basecalls),
# basecalls should be placed under the basecalling folder as defined in the
# pipelines config section. For the all_fast_assm_polish pipeline, for example,
# basecalls should be placed in
# "{DATA}/guppy/basecalls.fasta"
####################
# Input data options
#
# .fast5 files can be under top-level folders named by a RUNID (these need not
# be actual run UUIDs). Fast5s should be in RUNID/reads. The keys within this
# data dictionary are RUNIDs.
#
# REFERENCE is required for the medaka training workflows, and in general any
# time reads need to be aligned to a reference (e.g. for subsampling).
#
# MEDAKA_TRAIN_REGIONS and MEDAKA_EVAL_REGIONS define regions for
# training and evaluation within the medaka training workflow, and are
# otherwise not used.
#
# GENOME_SIZE is required only in workflows using the canu or flye assembler.
# When a reference file is given and a value is not directly specified,
# GENOME_SIZE will be calculated from the reference.
#
# As an illustration, one might train from a "minion" run using the "ecoli"
# and "yeast" contigs in the reference and evaluate on a "gridion" run using
# the contigs "ecoli", "yeast" and "na12878_chr21"; a commented-out sketch of
# such entries follows the DATA section below. The runs defined here are
# instead aligned to GRCh38 and split by chromosome.
DATA:
    '20200914_1356_6F_PAF26223_da14221a':
        'REFERENCE': 'ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'
        'SPLIT_FAST5_REGIONS':
            ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
             'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
             'chr21', 'chr22', 'chrX', 'chrY']
    '20200914_1358_2-E5-H5_PAF26161_2e3c81af':
        'REFERENCE': 'ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'
        'SPLIT_FAST5_REGIONS':
            ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
             'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
             'chr21', 'chr22', 'chrX', 'chrY']
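# A commented-out sketch of medaka-training entries as described above (all
# names and region lists are illustrative only):
#    'minion':
#        'REFERENCE': 'ref/training_refs.fasta'
#        'MEDAKA_TRAIN_REGIONS': ['ecoli', 'yeast']
#        'MEDAKA_EVAL_REGIONS': []
#    'gridion':
#        'REFERENCE': 'ref/training_refs.fasta'
#        'MEDAKA_TRAIN_REGIONS': []
#        'MEDAKA_EVAL_REGIONS': ['ecoli', 'yeast', 'na12878_chr21']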
############################
# Definitions of standard pipelines
#
# Pipeline target template definitions containing other config parameters that
# will be filled in with config variables to generate targets; see the
# documentation for full details.
PIPELINES:
    all_initial: [
        # do alignments, split bams and fast5s by regions
        "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/{SPLIT_FAST5_REGIONS}/fast5/",
        # calculate alignment stats
        "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/calls2ref_stats.txt",
        "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/{SPLIT_FAST5_REGIONS}/calls2ref_stats.txt"
    ]
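# As an illustration, given the DATA and SPLIT_FAST5_REGIONS entries above,
# the first template expands to one target per run and region, e.g.:
#     20200914_1356_6F_PAF26223_da14221a/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/chr1/fast5/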
# Runtime configuration
RUNTIME:
    # default number of threads/cores/slots to use per multi-threaded task.
    THREADS_PER_JOB: 32
    # To run medaka consensus on a GPU, set MEDAKA_CONSENSUS_NUM_GPU to 1.
    # More than a single GPU is not currently supported (by medaka).
    MEDAKA_CONSENSUS_NUM_GPU: 0
##############################
# Location of various software
#
# Configuration options for these programs are found toward the end of this
# config.
IN_MEDAKA: "~/git/medaka/venv/bin/activate"  # tensorflow-gpu should be installed in the venv for GPU processing.
IN_POMOXIS: "~/git/pomoxis/venv/bin/activate"
CANU_EXEC: "~/git/canu-1.8/Linux-amd64/bin/canu"
FLYE_EXEC: "~/git/Flye/bin/flye"
SHASTA_EXEC: "~/git/shasta-Linux-0.3.0"
GUPPY_EXEC: "/usr/bin/guppy_basecaller"
PORPITA_EXEC: "~/git/porpita/porpita"
SOURCE: "source"
LIFTOVER: "~/miniconda3/envs/liftover/bin/liftOver"
VCFCREATEMULTI: "~/git/vcflib/bin/vcfcreatemulti"
LD_LIBRARY_PATH: ""  # set this for e.g. the CUDA library path
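# As an illustration (the path is a common default, not a requirement), on a
# CUDA system this might be:
#     LD_LIBRARY_PATH: "/usr/local/cuda/lib64"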
#########################
# Compute/cluster options
#
# Location of scratch space on cluster nodes. This is currently only used for
# medaka training, where latency on a networked filesystem can significantly
# slow things down. To have katuali copy medaka features to scratch space
# before training, set SCRATCH and optionally TMPSCRATCH.
#
# If TMPSCRATCH is given, the resultant path will be:
#     /SCRATCH/$USER/TMPSCRATCH
# whereas if it is not given, the resultant path will be:
#     /SCRATCH/$USER/<generated unique name>
#
# TMPSCRATCH is useful in the case of restarting a training job on a node to
# which features have already been copied; in this case, simply set TMPSCRATCH
# to the directory under which the features were copied and katuali will skip
# copying the data and use the existing data.
#
# Note that katuali does not delete the data in scratch, so users must do this
# manually.
SCRATCH: ""
TMPSCRATCH: ""
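# For example, SCRATCH: "/scratch" with TMPSCRATCH: "medaka_feat_run1" (names
# illustrative) would place features under /scratch/$USER/medaka_feat_run1.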
# Snakemake checks for the existence of input files on the submission node. On
# systems with a high-latency distributed file system, the existence of a file
# on the submission node may not guarantee its presence on a compute node.
# Katuali checks that all input files exist on the machine where a rule is
# about to run. Note that if a file is not found, katuali will attempt to
# force an NFS cache update by changing the owner of the file to the current
# user (using chown). If the file is still not found, execution terminates.
# Verbose logging of the file checking can be enabled by setting
# CHECK_FILES_EXIST_OPTS to '--debug'.
#CHECK_FILES_EXIST_OPTS: '--debug'  # activate logging of file-checking
CHECK_FILES_EXIST_OPTS: '--quiet'  # check files silently
############################
# Misc. options for programs
#
# For advanced users. See
#     https://nanoporetech.github.io/katuali/examples.html#examples
# for help in constructing bespoke pipelines.
#
# For fully specified targets, rather than the predefined workflows, the below
# can be used to manipulate how each program is run. Each key acts as a
# shortcut for a specific parameter set (as well as functioning as a target
# name). Default parameters are given by the `""` targets.
# Guppy
GUPPY_OPTS:
    "": "-c dna_r9.4.1_450bps_hac.cfg"
    "_v4.0.11_r9.4.1_hac_prom": "-c dna_r9.4.1_450bps_hac.cfg"
    "_v4.0.11_r10.3_hac_prom": "-c dna_r10.3_450bps_hac_prom.cfg"
# subsample_bam
SUBSAMPLE_BAM_OPTS:
    "": "--all_fail"
    "_prop": "--proportional --all_fail"
    "_filtered": "--quality 6 --coverage 90 --all_fail"
# mini_align
MINI_ALIGN_OPTS:
    "": ""
    "_unfiltered": "-A"
# mini_assemble
MINI_ASSEMBLE_OPTS:
    "": ""
    "_ce": "-c -e 10"
    "_ces": "-c -e 10 -n 10"
# porpita
PORPITA_OPTS:
    "": ""
# Canu
CANU_EXEC_OPTS:
    "useGrid=False"  # this shouldn't be changed, snakemake will be confused
CANU_OPTS:
    "": ""
# Flye
FLYE_OPTS:
    "": ""
# Shasta
SHASTA_OPTS:
    "": "--Reads.minReadLength 2000"
# assess_assembly
ASSESS_ASSM_SUFFIXES: [""]
ASSESS_ASSM_OPTS:
    "": "-C -H"  # runs cataloguing of errors and homopolymer analysis
#############################
# Options for medaka training
#
# This section can be ignored if not running a medaka training workflow.
#
# Read depths at which to create assemblies for training; these should
# span the range of depths at which the model is to be used.
DEPTHS:
    [25, 50, 75, 100, 125, 150, 175, 200]
# If any medaka feature files are missing (e.g. due to insufficient coverage
# for some runs / contigs) the medaka training step will not find all the files
# it expects and will not run. To train on the feature files which were
# created, set this flag to true (after having already created the features
# with the flag set to false).
USE_ONLY_EXISTING_MEDAKA_FEAT:
    false
# Run multiple training replicates - output will be in medaka_train_{key}.
# Values should be a key of the PIPELINES dictionary in this file. In simple
# cases this allows running technical replicates of the training, but it also
# allows the pipeline to be changed to, for example, create features in a
# different manner. For the latter, change the value component to an
# alternative user-defined pipeline.
MEDAKA_TRAIN_REPLICATES:
    "_rep_1": "all_medaka_feat"
    "_rep_2": "all_medaka_feat"
    "_rep_3": "all_medaka_feat"
# Evaluation of trained models: entries in this list should be keys
# of MEDAKA_OPTS, the values of which need to specify the path of the
# trained model using the `-m` option.
MEDAKA_EVAL_SUFFIXES:
    ["_rep_1_best_val", "_rep_2_best_val", "_rep_3_best_val"]
MEDAKA_OPTS:
    "": "-m r941_min_high_g344"
    "_hac": "-m r941_min_high_g344"
    "_hac_prom": "-m r941_prom_high_g344"
    "_fast": "-m r941_min_fast_g303"
    "_fast_prom": "-m r941_prom_fast_g303"
    "_rep_1_best_val": "-m medaka_train_rep_1/model.best.val_cat_acc.hdf5"
    "_rep_2_best_val": "-m medaka_train_rep_2/model.best.val_cat_acc.hdf5"
    "_rep_3_best_val": "-m medaka_train_rep_3/model.best.val_cat_acc.hdf5"
# Medaka training features
MEDAKA_TRAIN_FEAT_OPTS:
    "": "--chunk_len 1000 --chunk_ovlp 0"
MEDAKA_TRAIN_OPTS:
    "_rep_1": "--mini_epochs 5 --validation_split 0.10"
    "_rep_2": "--mini_epochs 5 --validation_split 0.10"
    "_rep_3": "--mini_epochs 5 --validation_split 0.10"
# Empty VARIANT_RESOURCES stubs, required to enable medaka_train_variant.snake
# to be included in the Snakefile without raising config key errors.
# See config_variant.yaml for an example of a valid variant calling config.
VARIANT_RESOURCES:
    "SAMPLES": ""
    "VARIANTS_TO_ADD": ""