1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
|
- # Master katuali configuration file template
- # ------------------------------------------
- #
- # Having copied this file with the command:
- # katuali_config my_config.yaml
- # it will be necessary to amend the `DATA:` dictionary in the first section below
- # to define the dataset to be processed.
- #
- # The second section containing software locations should also be checked and
- # amended if necessary.
- #
- # To run the predefined workflows: `all_fast_assm_polish`, `all_standard_assm_polish`,
- # and `all_medaka_train_features` starting from fast5 input,
- # nothing else need be done. If starting a workflow from basecall data
- # (https://nanoporetech.github.io/katuali/examples.html#starting-from-existing-basecalls),
- # basecalls should be placed under the basecalling folder as defined in the pipelines config section.
- # For the all_fast_assm_polish pipeline for example, basecalls should be placed in
- # "{DATA}/guppy/basecalls.fasta"
- ####################
- # Input data options
- # .fast5 files can be under top-level folders named by a RUNID (these need not
- # be actual run UUIDs). Fast5s should be in RUNID/reads. The keys within this
- # data dictionary are RUNIDs.
- #
- # REFERENCE is required for the medaka training workflows, and in general any
- # time reads need to be aligned to a reference (e.g. for subsampling)
- #
- # MEDAKA_TRAIN_REGIONS and MEDAKA_EVAL_REGIONS define regions for
- # training and evaluation within the medaka training workflow, and are
- # otherwise not used.
- #
- # GENOME_SIZE is required only in workflows using the canu or flye assembler.
- # When a reference file is given, GENOME_SIZE will be calculated from it in
- # the case that a value is not directly specified.
- #
- # In the example below we train from the "minion" run using "ecoli" and "yeast"
- # contigs in the reference and evaluate on the "gridion" run using the contigs
- # "ecoli", "yeast" and "na12878_chr21" in the reference.
- DATA:
- '20200914_1356_6F_PAF26223_da14221a':
- 'REFERENCE': 'ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'
- 'SPLIT_FAST5_REGIONS':
- ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
- 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
- 'chr21', 'chr22', 'chrX', 'chrY']
- '20200914_1358_2-E5-H5_PAF26161_2e3c81af':
- 'REFERENCE': 'ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'
- 'SPLIT_FAST5_REGIONS':
- ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
- 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
- 'chr21', 'chr22', 'chrX', 'chrY']
- ############################
- # Definitions of standard pipelines
- #
- # Pipeline target template definitions containing other config parameters that
- # will be filled in with config variables to generate targets, see
- # documentation for full details.
- PIPELINES:
- all_initial: [
- # do alignments, split bams and fast5s by regions
- "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/{SPLIT_FAST5_REGIONS}/fast5/",
- # calculate alignment stats
- "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/calls2ref_stats.txt",
- "{DATA}/guppy_v4.0.11_r10.3_hac_prom/align_unfiltered/{SPLIT_FAST5_REGIONS}/calls2ref_stats.txt"
- ]
- # Runtime configuration
- RUNTIME:
- # default number of threads/cores/slots to use per multi-threaded task.
- THREADS_PER_JOB: 32
- # To run medaka consensus on GPU, set MEDAKA_CONSENSUS_NUM_GPU to 1.
- # More than a single GPU is not currently supported (by medaka).
- MEDAKA_CONSENSUS_NUM_GPU: 0
- ##############################
- # Location of various software
- #
- # Configuration for software is found toward the end of this config.
- IN_MEDAKA: "~/git/medaka/venv/bin/activate" # tensorflow-gpu should be installed in the venv for GPU processing.
- IN_POMOXIS: "~/git/pomoxis/venv/bin/activate"
- CANU_EXEC: "~/git/canu-1.8/Linux-amd64/bin/canu"
- FLYE_EXEC: "~/git/Flye/bin/flye"
- SHASTA_EXEC: "~/git/shasta-Linux-0.3.0"
- GUPPY_EXEC: "/usr/bin/guppy_basecaller"
- PORPITA_EXEC: "~/git/porpita/porpita"
- SOURCE: "source"
- LIFTOVER: "~/miniconda3/envs/liftover/bin/liftOver"
- VCFCREATEMULTI: "~/git/vcflib/bin/vcfcreatemulti"
- LD_LIBRARY_PATH: "" # set this for e.g. CUDA library path
- #########################
- # Compute/cluster options
- # Location of scratch space on cluster nodes. This is currently only used for
- # medaka training where latency on a networked filesystem can significantly
- # slow things down. To have katuali copy medaka features to scratch space
- # before training set SCRATCH and optionally TMPSCRATCH.
- #
- # If TMPSCRATCH is given the resultant path will be:
- # /SCRATCH/$USER/TMPSCRATCH
- # whereas if it is not given the resultant path will be:
- # /SCRATCH/$USER/<generated unique name>
- #
- # TMPSCRATCH is useful in the case of restarting a training job on a node to
- # which features have already been copied; in this case, simply set TMPSCRATCH
- # to the directory under which the features were copied and katuali will skip
- # copying the data and use the existing data.
- #
- # Note that katuali does not delete the data in scratch, so users must do this
- # manually.
- SCRATCH: ""
- TMPSCRATCH: ""
- # Snakemake checks for the existence of input files on the submission node. On
- # systems with a high-latency distributed file system, the existence of a file
- # on the submission node may not guarantee its presence on a compute node.
- # Katuali checks that all input files exist on the machine where a rule is
- # about to run. Note that if the file is not found, katuali will attempt to
- # force an NFS cache update by changing the owner of the file to the current
- # user (using chown). If the file is still not found, execution terminates.
- # Verbose logging of the file checking can be enabled by setting
- # CHECK_FILES_EXIST_OPTS to '--debug'.
- #CHECK_FILES_EXIST_OPTS: '--debug' # activate logging of file-checking
- CHECK_FILES_EXIST_OPTS: '--quiet' # check files silently
- ############################
- # Misc. options for programs
- #
- # For advanced users. See
- # https://nanoporetech.github.io/katuali/examples.html#examples
- # for help in constructing bespoke pipelines.
- #
- # For fully specified targets, rather than predefined workflows the below can
- # be used to manipulate how each program is run. Each key acts as a shortcut
- # for a specific parameter set (as well as functioning as a target name).
- # Default parameters are given by the `""` targets
- # Guppy
- GUPPY_OPTS:
- "": "-c dna_r9.4.1_450bps_hac.cfg"
- "_v4.0.11_r9.4.1_hac_prom": "-c dna_r9.4.1_450bps_hac.cfg"
- "_v4.0.11_r10.3_hac_prom": "-c dna_r10.3_450bps_hac_prom.cfg"
- # subsample_bam
- SUBSAMPLE_BAM_OPTS:
- "": "--all_fail"
- "_prop": "--proportional --all_fail"
- "_filtered": "--quality 6 --coverage 90 --all_fail"
- # mini_align
- MINI_ALIGN_OPTS:
- "": ""
- "_unfiltered": "-A"
- # mini_assemble
- MINI_ASSEMBLE_OPTS:
- "": ""
- "_ce": "-c -e 10"
- "_ces": "-c -e 10 -n 10"
- # porpita
- PORPITA_OPTS:
- "": ""
- # Canu
- CANU_EXEC_OPTS:
- "useGrid=False" # this shouldn't be changed, snakemake will be confused
- CANU_OPTS:
- "": ""
- # Flye
- FLYE_OPTS:
- "": ""
- # Shasta
- SHASTA_OPTS:
- "": "--Reads.minReadLength 2000"
- # assess_assembly
- ASSESS_ASSM_SUFFIXES: [""]
- ASSESS_ASSM_OPTS:
- "": "-C -H" # runs cataloguing of errors and homopolymer analysis
- #############################
- # Options for medaka training
- # This section can be ignored if not running a medaka training workflow
- # Read depths at which to create assemblies for training, this should
- # span the range of depths at which the model is to be used
- DEPTHS:
- [25, 50, 75, 100, 125, 150, 175, 200]
- # If any medaka feature files are missing (e.g. due to insufficient coverage
- # for some runs / contigs) the medaka training step will not find all the files
- # it expects and will not run. To train on the feature files which were
- # created, set this flag to true (after having already created the features
- # with the flag set to false)
- USE_ONLY_EXISTING_MEDAKA_FEAT:
- false
- # Run multiple training replicates - output will be in medaka_train_{key},
- # values should be a key of the PIPELINES dictionary in this file. In simple
- # cases this allows running technical replicates of the training, but also
- # allows the pipeline to be changed to for example create features in a
- # different manner. For the latter change the value component to an alternative
- # user defined pipeline.
- MEDAKA_TRAIN_REPLICATES:
- "_rep_1": "all_medaka_feat"
- "_rep_2": "all_medaka_feat"
- "_rep_3": "all_medaka_feat"
- # Evaluation of trained models, entries in this list should be keys
- # of MEDAKA_OPTS, the values of which need to specify the path of the
- # trained model using the `-m option`.
- MEDAKA_EVAL_SUFFIXES:
- ["_rep_1_best_val", "_rep_2_best_val", "_rep_3_best_val"]
- MEDAKA_OPTS:
- "": "-m r941_min_high_g344"
- "_hac": "-m r941_min_high_g344"
- "_hac_prom": "-m r941_prom_high_g344"
- "_fast": "-m r941_min_fast_g303"
- "_fast_prom": "-m r941_prom_fast_g303"
- "_rep_1_best_val": "-m medaka_train_rep_1/model.best.val_cat_acc.hdf5"
- "_rep_2_best_val": "-m medaka_train_rep_2/model.best.val_cat_acc.hdf5"
- "_rep_3_best_val": "-m medaka_train_rep_3/model.best.val_cat_acc.hdf5"
- # Medaka training features
- MEDAKA_TRAIN_FEAT_OPTS:
- "": "--chunk_len 1000 --chunk_ovlp 0"
- MEDAKA_TRAIN_OPTS:
- "_rep_1": "--mini_epochs 5 --validation_split 0.10"
- "_rep_2": "--mini_epochs 5 --validation_split 0.10"
- "_rep_3": "--mini_epochs 5 --validation_split 0.10"
- # Empty VARIANT_RESOURCES stubs, required to enable medaka_train_variant.snake
- # to be included in the Snakefile without raising config key errors.
- # See config_variant.yaml for an example of a valid variant calling config.
- VARIANT_RESOURCES:
- "SAMPLES" : ""
- "VARIANTS_TO_ADD": ""
|