1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
|
- # Master katuali configuration file template
- # ------------------------------------------
- #
- # Having copied this file with the command:
- # katuali_config my_config.yaml
- # it will be necessary to amend the `DATA:` dictionary in the first section below
- # to define the dataset to be processed.
- #
- # The second section containing software locations should also be checked and
- # amended if necessary.
- #
- # To run the predefined workflows: `all_fast_assm_polish`, `all_standard_assm_polish`,
- # and `all_medaka_train_features` starting from fast5 input,
- # nothing else need be done. If starting a workflow from basecall data
- # (https://nanoporetech.github.io/katuali/examples.html#starting-from-existing-basecalls),
- # basecalls should be placed under the basecalling folder as defined in the pipelines config section.
- # For the all_fast_assm_polish pipeline for example, basecalls should be placed in
- # "{DATA}/guppy/basecalls.fasta"
- ####################
- # Input data options
- # .fast5 files can be under top-level folders named by a RUNID (these need not
- # be actual run UUIDs). Fast5s should be in RUNID/reads. The keys within this
- # data dictionary are RUNIDs.
- #
- # REFERENCE is required for the medaka training workflows, and in general any
- # time reads need to be aligned to a reference (e.g. for subsampling)
- #
- # MEDAKA_TRAIN_REGIONS and MEDAKA_EVAL_REGIONS define regions for
- # training and evaluation within the medaka training workflow, and are
- # otherwise not used.
- #
- # GENOME_SIZE is required only in workflows using the canu or flye assembler.
- # When a reference file is given, GENOME_SIZE will be calculated from it in
- # the case that a value is not directly specified.
- #
- # In the example below we train from the "minion" run using "ecoli" and "yeast"
- # contigs in the reference and evaluate on the "gridion" run using the contigs
- # "ecoli", "yeast" and "na12878_chr21" in the reference.
- DATA:
- '20200914_1356_6F_PAF26223_da14221a':
- 'REFERENCE': 'ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'
- 'SPLIT_FAST5_REGIONS':
- ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
- 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
- 'chr21', 'chr22', 'chrX', 'chrY']
- '20200914_1358_2-E5-H5_PAF26161_2e3c81af':
- 'REFERENCE': 'ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'
- 'SPLIT_FAST5_REGIONS':
- ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
- 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
- 'chr21', 'chr22', 'chrX', 'chrY']
- ############################
- # Definitions of standard pipelines
- #
- # Pipeline target template definitions containing other config parameters that
- # will be filled in with config variables to generate targets, see
- # documentation for full details.
- PIPELINES:
- all_initial: [
- # do alignments, split bams and fast5s by regions
- "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/{SPLIT_FAST5_REGIONS}/fast5/",
- # calculate alignment stats
- "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/calls2ref_stats.txt",
- "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/{SPLIT_FAST5_REGIONS}/calls2ref_stats.txt"
- ]
- # Runtime configuration
- RUNTIME:
- # default number of threads/cores/slots to use per multi-threaded task.
- THREADS_PER_JOB: 32
- # To run medaka consensus on GPU, set MEDAKA_CONSENSUS_NUM_GPU to 1.
- # More than a single GPU is not currently supported (by medaka).
- MEDAKA_CONSENSUS_NUM_GPU: 0
- ##############################
- # Location of various software
- #
- # Configuration for software is found toward the end of this config.
- IN_MEDAKA: "~/git/medaka/venv/bin/activate" # tensorflow-gpu should be installed in the venv for GPU processing.
- IN_POMOXIS: "~/git/pomoxis/venv/bin/activate"
- CANU_EXEC: "~/git/canu-1.8/Linux-amd64/bin/canu"
- FLYE_EXEC: "~/git/Flye/bin/flye"
- SHASTA_EXEC: "~/git/shasta-Linux-0.3.0"
- GUPPY_EXEC: "/usr/bin/guppy_basecaller"
- PORPITA_EXEC: "~/git/porpita/porpita"
- SOURCE: "source"
- LIFTOVER: "~/miniconda3/envs/liftover/bin/liftOver"
- VCFCREATEMULTI: "~/git/vcflib/bin/vcfcreatemulti"
- LD_LIBRARY_PATH: "" # set this for e.g. CUDA library path
- #########################
- # Compute/cluster options
- # Location of scratch space on cluster nodes. This is currently only used for
- # medaka training where latency on a networked filesystem can significantly
- # slow things down. To have katuali copy medaka features to scratch space
- # before training set SCRATCH and optionally TMPSCRATCH.
- #
- # If TMPSCRATCH is given the resultant path will be:
- # /SCRATCH/$USER/TMPSCRATCH
- # whereas if it is not given the resultant path will be:
- # /SCRATCH/$USER/<generated unique name>
- #
- # TMPSCRATCH is useful in the case of restarting a training job on a node to
- # which features have already been copied; in this case, simply set TMPSCRATCH
- # to the directory under which the features were copied and katuali will skip
- # copying the data and use the existing data.
- #
- # Note that katuali does not delete the data in scratch, so users must do this
- # manually.
- SCRATCH: ""
- TMPSCRATCH: ""
- # Snakemake checks for the existence of input files on the submission node. On
- # systems with a high-latency distributed file system, the existence of a file
- # on the submission node may not guarantee its presence on a compute node.
- # Katuali checks that all input files exist on the machine where a rule is
- # about to run. Note that if the file is not found, katuali will attempt to
- # force an NFS cache update by changing the owner of the file to the current
- # user (using chown). If the file is still not found, execution terminates.
- # Verbose logging of the file checking can be enabled by setting
- # CHECK_FILES_EXIST_OPTS to '--debug'.
- #CHECK_FILES_EXIST_OPTS: '--debug' # activate logging of file-checking
- CHECK_FILES_EXIST_OPTS: '--quiet' # check files silently
- ############################
- # Misc. options for programs
- #
- # For advanced users. See
- # https://nanoporetech.github.io/katuali/examples.html#examples
- # for help in constructing bespoke pipelines.
- #
- # For fully specified targets, rather than predefined workflows the below can
- # be used to manipulate how each program is run. Each key acts as a shortcut
- # for a specific parameter set (as well as functioning as a target name).
- # Default parameters are given by the `""` targets
- # Guppy
- GUPPY_OPTS:
- "": "-c dna_r9.4.1_450bps_hac.cfg"
- "_v4.0.11_r9.4.1_hac_prom": "-c dna_r9.4.1_450bps_hac.cfg"
- "_v4.0.11_r10.3_hac_prom": "-c dna_r10.3_450bps_hac_prom.cfg"
- # subsample_bam
- SUBSAMPLE_BAM_OPTS:
- "": "--all_fail"
- "_prop": "--proportional --all_fail"
- "_filtered": "--quality 6 --coverage 90 --all_fail"
- # mini_align
- MINI_ALIGN_OPTS:
- "": ""
- "_unfiltered": "-A"
- # mini_assemble
- MINI_ASSEMBLE_OPTS:
- "": ""
- "_ce": "-c -e 10"
- "_ces": "-c -e 10 -n 10"
- # porpita
- PORPITA_OPTS:
- "": ""
- # Canu
- CANU_EXEC_OPTS:
- "useGrid=False" # this shouldn't be changed, snakemake will be confused
- CANU_OPTS:
- "": ""
- # Flye
- FLYE_OPTS:
- "": ""
- # Shasta
- SHASTA_OPTS:
- "": "--Reads.minReadLength 2000"
- # assess_assembly
- ASSESS_ASSM_SUFFIXES: [""]
- ASSESS_ASSM_OPTS:
- "": "-C -H" # runs cataloguing of errors and homopolymer analysis
- #############################
- # Options for medaka training
- # This section can be ignored if not running a medaka training workflow
- # Read depths at which to create assemblies for training, this should
- # span the range of depths at which the model is to be used
- DEPTHS:
- [25, 50, 75, 100, 125, 150, 175, 200]
- # If any medaka feature files are missing (e.g. due to insufficient coverage
- # for some runs / contigs) the medaka training step will not find all the files
- # it expects and will not run. To train on the feature files which were
- # created, set this flag to true (after having already created the features
- # with the flag set to false)
- USE_ONLY_EXISTING_MEDAKA_FEAT:
- false
- # Run multiple training replicates - output will be in medaka_train_{key},
- # values should be a key of the PIPELINES dictionary in this file. In simple
- # cases this allows running technical replicates of the training, but also
- # allows the pipeline to be changed to for example create features in a
- # different manner. For the latter change the value component to an alternative
- # user defined pipeline.
- MEDAKA_TRAIN_REPLICATES:
- "_rep_1": "all_medaka_feat"
- "_rep_2": "all_medaka_feat"
- "_rep_3": "all_medaka_feat"
- # Evaluation of trained models, entries in this list should be keys
- # of MEDAKA_OPTS, the values of which need to specify the path of the
- # trained model using the `-m option`.
- MEDAKA_EVAL_SUFFIXES:
- ["_rep_1_best_val", "_rep_2_best_val", "_rep_3_best_val"]
- MEDAKA_OPTS:
- "": "-m r941_min_high_g344"
- "_hac": "-m r941_min_high_g344"
- "_hac_prom": "-m r941_prom_high_g344"
- "_fast": "-m r941_min_fast_g303"
- "_fast_prom": "-m r941_prom_fast_g303"
- "_rep_1_best_val": "-m medaka_train_rep_1/model.best.val_cat_acc.hdf5"
- "_rep_2_best_val": "-m medaka_train_rep_2/model.best.val_cat_acc.hdf5"
- "_rep_3_best_val": "-m medaka_train_rep_3/model.best.val_cat_acc.hdf5"
- # Medaka training features
- MEDAKA_TRAIN_FEAT_OPTS:
- "": "--chunk_len 1000 --chunk_ovlp 0"
- MEDAKA_TRAIN_OPTS:
- "_rep_1": "--mini_epochs 5 --validation_split 0.10"
- "_rep_2": "--mini_epochs 5 --validation_split 0.10"
- "_rep_3": "--mini_epochs 5 --validation_split 0.10"
- # Empty VARIANT_RESOURCES stubs, required to enable medaka_train_variant.snake
- # to be included in the Snakefile without raising config key errors.
- # See config_variant.yaml for an example of a valid variant calling config.
- VARIANT_RESOURCES:
- "SAMPLES" : ""
- "VARIANTS_TO_ADD": ""
|