# pipeline_options_example_metazoa.yaml
---
###############################################################################
# OptimOTU metabarcoding bioinformatics pipeline #
# Configuration file #
###############################################################################
# General parameters
project_name: metabarcoding_project

# Files to work with. Specify the extension, e.g. fastq.gz / fq.gz / fastq / fq.
# The default selects any of these.
# Regular expression syntax is allowed.
# Only used if no custom sample table is given.
file_extension: "fastq.gz"

# Specify sequence orientation.
#   "fwd"    = all sequences are expected to be in 5'-3' orientation.
#   "rev"    = all sequences are expected to be in 3'-5' orientation.
#   "mixed"  = the orientation of seqs is expected to be mixed (5'-3' and 3'-5').
#   "custom" = the orientation of different files is given in a custom sample
#              table (see below). In this case there should be a column
#              "orient" which has values "fwd", "rev", or "mixed".
# If seqs are "mixed" but the "fwd" setting is used, some valid seqs (or
# samples) will be lost.
# If seqs are "fwd" but the "mixed" setting is used, this results in an ERROR.
orient: mixed

# Sample parameters
# Custom primer trimming parameters per sample can be given as columns in the
# sample table.
custom_sample_table: false
# Added reference sequences (leave blank or use null if none).
# "table" should be an Excel *.xlsx spreadsheet, with (at least) two columns in
# the first sheet:
#   Culture_ID: unique identifier for the sequence, must _exactly_ match the
#     sequence name in "fasta"
#   Protax_synonym: comma-separated taxonomy string for the new ID, starting
#     with kingdom. Be sure to match the taxonomy used by protax, including
#     spelling!
#     e.g.: Fungi,Basidiomycota,Agaricomycetes,Russulales,Stereaceae,Conferticium_17349,Conferticium_ravum_412824
added_reference:
  fasta: null
  table: null
############################
### parallelism settings ###
############################
# Maximum number of ASV sequences to process in a single batch.
max_batchsize: 10000
# For parallel execution (crew or clustermq): how many workers to launch for
# each sequencing run and orientation. 2 will yield maximum parallelism in most
# situations.
workers_per_seqrun: 2
# The maximum and minimum number of parallel jobs to run, regardless of the
# number of sequencing runs. For crew, the number of jobs actually launched may
# be less than min_jobs if additional jobs would have nothing to do.
max_jobs: 100
min_jobs: 1
################################
### primer trimming settings ###
################################
# Forward PCR primer (degenerate positions allowed, according to IUPAC codes).
# Currently supports only a single fwd primer.
forward_primer: "CCHGAYATRGCHTTYCCHCG" # BF3
# Reverse PCR primer (degenerate positions allowed, according to IUPAC codes).
# Currently supports only a single rev primer.
reverse_primer: "TCDGGRTGNCCRAARAAYCA" # BR2
# Primer trimming options; every key below belongs to the "trimming" section.
trimming:
  # Max error rate in the primer sequence (float; e.g. 0.2 = 20% error rate),
  # or number of mismatches (int; e.g. 1 = 1 mismatch).
  max_err: 0.2
  min_overlap: 10  # at least 10 bp of primer sequence must be present
  truncQ_R1: [2, 2]  # truncate ends of R1 at first base with quality score <= N
  truncQ_R2: [2, 2]  # truncate ends of R2 at first base with quality score <= N
  max_n: 0  # remove sequences which contain N (after truncation)
  min_length: 100  # min length after adapter/quality trimming
  cut_R1: 0  # remove N bases from start of R1
  cut_R2: 0  # remove N bases from start of R2
  # "trim"/"retain" the primer sequences.
  #   "trim"   = clip primers
  #   "retain" = do not clip the primer after primer has been found
  action: "trim"
##################################
### quality filtering settings ###
##################################
# Quality filtering thresholds; both keys belong to the "filtering" section.
filtering:
  maxEE_R1: 1  # max expected errors for R1 reads
  maxEE_R2: 1  # max expected errors for R2 reads
####################################
### tag-jumps filtering settings ###
####################################
# Comment out this section or set tag_jump to false to omit this step.
tag_jump:
  f: 0.05  # expected cross-talk rate
  p: 1  # power to raise the exponent
###############################
### Amplicon model settings ###
###############################
# Comment out this section or set model_type to "none" to skip this step.
# Statistical sequence models are used for several purposes:
#   1) aligning ASVs prior to use of protaxA and/or NuMt detection
#   2) filtering ASVs to remove spurious sequences
amplicon_model:
  model_type: HMM  # allowed values are CM, HMM, and none
  model_file: data/COI.hmm
  # Model filter will be skipped if amplicon_model$model_type is "none",
  # or if this section is commented out.
  # These values will need to be tuned for different amplicons/models.
  model_filter:
    max_model_start: 245  # the match must start at this point in the model or earlier
    min_model_end: 652  # the match must end at this point in the model or later
    min_model_score: 200  # the match bit score must be at least this
  # Producing aligned sequences will be skipped if the value is false.
  model_align: true
  # NuMt filter will be skipped if the value is false.
  # NuMt filter requires model_type == "HMM" and model_align == TRUE.
  numt_filter: true
###############################
### Feature-by-sample table ###
###############################
# Feature = ASV/OTU/x
# What to do with the samples that have the SAME NAME:
#   "sum"   = merging the replicates
#   "error" = STOPs the run if duplicated sample names are detected.
repeats: "sum"
dense_table: false
#########################
### Control sequences ###
#########################
# Two types of control sequences are supported:
# 1) spike-in sequences: sequences that are added to the samples before PCR
# These sequences are expected to be present in every sample, even
# most types of negative control.
# 2) positive control sequences: sequences that are added to only a few specific
# positive control samples. These sequences are expected to be present only
# in the positive control samples, and their presence in other samples is
# indicative of cross-contamination. (Either in the lab or "tag-switching").
# In practice both types are treated the same by the pipeline, they are just
# reported separately.
#
# The sequences should be in a fasta file.
# Either or both types of control can be omitted by leaving the value blank,
# omitting it completely, or setting it to a value that evaluates to false.
control:
  spike: false  # a false value omits the spike-in control
  positive: data/amptk_synmock_ITS_COI.fasta  # fasta file of positive control sequences
#######################
### Protax settings ###
#######################
protax:
  aligned: true  # true assumes all reference and query sequences are aligned
  location: "protaxAnimal"  # directory where protax is
  #### Ranks ####
  # Ranks should be listed from most inclusive to least inclusive.
  # The first rank(s) may be given with values (e.g. "- kingdom: Fungi" instead
  # of just "- kingdom"). This means that the rank(s) are not assigned by
  # Protax, but rather are assumed to already be known by Protax; i.e. they are
  # the root of the classification.
  ranks:
    - kingdom: Animalia
    - phylum: Arthropoda
    - class
    - order
    - family
    - subfamily
    - tribe
    - genus
    - species
###################################
### Outgroup reference settings ###
###################################
# The outgroup reference should be taxonomically annotated sequences which
# include not only the ingroup (i.e., those sequences which Protax can identify)
# but also (ideally) all other groups which could conceivably be encountered
# with the chosen marker.
# Optionally, the taxonomic annotations can be given in another file.
# (The formatting in this case is idiosyncratic, related to the release format
# of Unite.)
# If given, this is also the file used for reference-based chimera checking.
outgroup_reference:
  # Taxonomically annotated outgroup reference sequences.
  sequences: data/BOLD/BOLD_COI-5P_subset.fasta
  # Optional separate file with the taxonomic annotations; uncomment to use.
  # taxonomy:
###########################
### Clustering settings ###
###########################
# Path to the clustering thresholds file (presumably a tab-separated table,
# judging by the .tsv extension — confirm against the pipeline documentation).
cluster_thresholds: metadata/MBRAVE_thresholds.tsv
######################
### Guild settings ###
######################
# Boolean switch for the guild step (currently disabled).
guilds: false