comparison purge_dups.xml @ 4:a315c25dc813 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/purge_dups commit d2ef7bd6598695a681446eaf9c5b8c142e8a0199"
author iuc
date Tue, 12 Oct 2021 19:07:05 +0000
parents 76d4cbefff85
children
comparison
equal deleted inserted replaced
3:76d4cbefff85 4:a315c25dc813
1 <tool id="purge_dups" name="Purge overlaps" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01"> 1 <tool id="purge_dups" name="Purge overlaps" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
2 <description>and haplotigs in an assembly based on read depth (purge_dups)</description> 2 <description>and haplotigs in an assembly based on read depth (purge_dups)</description>
3 <macros> 3 <macros>
4 <token name="@TOOL_VERSION@">1.2.5</token> 4 <import>macros.xml</import>
5 <token name="@VERSION_SUFFIX@">3</token>
6 <xml name="trimmers">
7 <section name="section_hist" title="Histogram plot options" >
8 <!--<param name="cutoffs_his" type="data" optional="true" format="txt" label="Read depth cutoffs file" />-->
9 <param argument="--ymin" type="integer" optional="true" min="0" label="Specify a minimum for the Y axis"/>
10 <param argument="--ymax" type="integer" optional="true" label="Specify a maximum for the Y axis"/>
11 <param argument="--xmin" type="integer" optional="true" min="0" label="Specify a minimum for the X axis"/>
12 <param argument="--xmax" type="integer" optional="true" label="Specify a maximum for the X axis"/>
13 <param argument="--title" type="text" value="Read depth histogram plot" label="Histogram title"/>
14 </section>
15 </xml>
16 <token name="@HIST_PLOT@"><![CDATA[
17 python '$__tool_directory__/hist_plot.py'
18 --cutoffs cutoffs.tsv
19 #if $function_select.section_hist.ymin
20 --ymin $function_select.section_hist.ymin
21 #end if
22 #if $function_select.section_hist.ymax
23 --ymax $function_select.section_hist.ymax
24 #end if
25 #if $function_select.section_hist.xmin
26 --xmin $function_select.section_hist.xmin
27 #end if
28 #if $function_select.section_hist.xmax
29 --xmax $function_select.section_hist.xmax
30 #end if
31 #if $function_select.section_hist.title
32 --title '${function_select.section_hist.title}'
33 #end if
34 depth.stat hist.png
35 ]]></token>
36 <token name="@CALCUTS@"><![CDATA[
37 calcuts
38 #if $function_select.section_calcuts.min_depth:
39 -f $function_select.section_calcuts.min_depth
40 #end if
41 #if $function_select.section_calcuts.low_depth:
42 -l $function_select.section_calcuts.low_depth
43 #end if
44 #if $function_select.section_calcuts.transition:
45 -m $function_select.section_calcuts.transition
46 #end if
47 #if $function_select.section_calcuts.upper_depth:
48 -u $function_select.section_calcuts.upper_depth
49 #end if
50 $function_select.section_calcuts.ploidy
51 ]]></token>
52 <xml name="calcuts">
53 <section name="section_calcuts" title="Calcuts options">
54 <param name="min_depth" type="float" label="Minimum depth count fraction to maximum depth coun" min="0" max="1" argument="-f" optional="true" help="Default = 0.1"/>
55 <param name="low_depth" label="Lower bound for read depth" type="integer" argument="-l" optional="true"/>
56 <param name="transition" label="Transition between haploid and diploid" type="integer" argument="-m" optional="true"/>
57 <param name="upper_depth" label="Upper bound for read depth" type="integer" argument="-u" optional="true"/>
58 <param name="ploidy" argument="-d" type="select" label="Ploidy">
59 <option value="-d 0" selected="true">Diploid [0]</option>
60 <option value="-d 1">Haploid [1]</option>
61 </param>
62 </section>
63 </xml>
64 </macros> 5 </macros>
65 <requirements> 6 <expand macro="xrefs"/>
66 <requirement type="package" version="@TOOL_VERSION@">purge_dups</requirement> 7 <expand macro="requirements"/>
67 <requirement type="package" version="3.4.2">matplotlib-base</requirement>
68 </requirements>
69 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
70 #if $function_select.functions == 'purge_dups': 9 #if $function_select.functions == 'purge_dups':
71 #for $i, $file in enumerate($function_select.input): 10 #for $i, $file in enumerate($function_select.input):
72 #if $file.is_of_type("paf"): 11 #if $file.is_of_type("paf"):
73 gzip -c '${file}' > '${i}.gz' && 12 gzip -c '${file}' > '${i}.gz' &&
80 -c '${function_select.coverage}' 19 -c '${function_select.coverage}'
81 #end if 20 #end if
82 #if $function_select.cutoffs: 21 #if $function_select.cutoffs:
83 -T '${function_select.cutoffs}' 22 -T '${function_select.cutoffs}'
84 #end if 23 #end if
85 #if $function_select.min_bad: 24 -f $function_select.min_bad
86 -f $function_select.min_bad 25 -a $function_select.min_align
87 #end if 26 -b $function_select.min_match
88 #if $function_select.min_align: 27 -m $function_select.min_chain
89 -a $function_select.min_align 28 -M $function_select.max_gap
90 #end if
91 #if $function_select.min_match:
92 -b $function_select.min_match
93 #end if
94 #if $function_select.min_chain:
95 -m $function_select.min_chain
96 #end if
97 #if $function_select.max_gap:
98 -M $function_select.max_gap
99 #end if
100 #if $function_select.double_chain.chaining_rounds == 'two': 29 #if $function_select.double_chain.chaining_rounds == 'two':
101 -2 30 -2
102 #if $function_select.double_chain.max_gap_2: 31 -G $function_select.double_chain.max_gap_2
103 -G $function_select.double_chain.max_gap_2
104 #end if
105 #end if 32 #end if
106 #if $function_select.min_chain_score: 33 -l $function_select.min_chain_score
107 -l $function_select.min_chain_score 34 -E $function_select.max_extend
108 #end if
109 #if $function_select.max_extend:
110 -E $function_select.max_extend
111 #end if
112 #for $i, $file in enumerate($function_select.input): 35 #for $i, $file in enumerate($function_select.input):
113 '${i}.gz' 36 '${i}.gz'
114 #end for 37 #end for
115 > dups.bed 2> purge_dups.log 38 > dups.bed 2> purge_dups.log
116 #else if $function_select.functions == 'split_fa': 39 #else if $function_select.functions == 'split_fa':
117 split_fa 40 split_fa
118 #if $function_select.split:
119 -n $function_select.split
120 #end if
121 '${function_select.input}' > split.fasta 41 '${function_select.input}' > split.fasta
122 #else if $function_select.functions == 'pbcstat': 42 #else if $function_select.functions == 'pbcstat':
123 #for $i, $file in enumerate($function_select.input): 43 #for $i, $file in enumerate($function_select.input):
124 #if $file.is_of_type('paf'): 44 #if $file.is_of_type('paf'):
125 gzip -c '${file}' > '${i}.gz' && 45 gzip -c '${file}' > '${i}.gz' &&
126 #else 46 #else
127 ln -s '${file}' '${i}.gz' && 47 ln -s '${file}' '${i}.gz' &&
128 #end if 48 #end if
129 #end for 49 #end for
130 pbcstat 50 pbcstat
131 #if $function_select.max_cov: 51 -M $function_select.pbcstat_options.max_cov
132 -M $function_select.max_cov 52 -f $function_select.pbcstat_options.min_map_ratio
53 #if $function_select.pbcstat_options.min_map_qual:
54 -q $function_select.pbcstat_options.min_map_qual
133 #end if 55 #end if
134 #if $function_select.min_map_ratio: 56 -l $function_select.pbcstat_options.flank
135 -f $function_select.min_map_ratio 57 $function_select.pbcstat_options.primary_alignments
136 #end if
137 #if $function_select.min_map_qual:
138 -q $function_select.min_map_qual
139 #end if
140 #if $function_select.flank:
141 -l $function_select.flank
142 #end if
143 $function_select.primary_alignments
144
145 #for $i, $file in enumerate($function_select.input): 58 #for $i, $file in enumerate($function_select.input):
146 '${i}.gz' 59 '${i}.gz'
147 #end for 60 #end for
148 && mv PB.stat depth.stat 61 && mv PB.stat depth.stat
149 && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log 62 && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
150 && @HIST_PLOT@ 63 && @HIST_PLOT@
151 64
152 #else if $function_select.functions == 'ngscstat': 65 #else if $function_select.functions == 'ngscstat':
153 ngscstat 66 ngscstat
154 #if $function_select.min_align_qual: 67 -q $function_select.ngscstat_options.min_align_qual
155 -q $function_select.min_align_qual
156 #end if
157 ## #if $function_select.max_depth: 68 ## #if $function_select.max_depth:
158 ## -M $function_select.max_depth 69 ## -M $function_select.max_depth
159 ## #end if 70 ## #end if
160 #if $function_select.max_insert: 71 -L $function_select.ngscstat_options.max_insert
161 -L $function_select.max_insert
162 #end if
163 '${function_select.input}' 72 '${function_select.input}'
164 && mv TX.stat depth.stat 73 && mv TX.stat depth.stat
165 && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log 74 && @CALCUTS@ depth.stat > cutoffs.tsv 2>calcuts.log
166 && @HIST_PLOT@ 75 && @HIST_PLOT@
167 76
168 #else if $function_select.functions == 'calcuts': 77 #else if $function_select.functions == 'calcuts':
169 @CALCUTS@ '${function_select.input}' > cutoffs.tsv 2>calcuts.log 78 @CALCUTS@ '${function_select.input}' > cutoffs.tsv 2>calcuts.log
170 79
171 #else if $function_select.functions == 'get_seqs': 80 #else if $function_select.functions == 'get_seqs':
172 get_seqs 81 get_seqs
173 $function_select.coverage 82 $function_select.advanced_options.coverage
174 $function_select.haplotigs 83 $function_select.advanced_options.haplotigs
175 $function_select.end_trim 84 $function_select.advanced_options.end_trim
176 $function_select.split 85 $function_select.advanced_options.split
177 #if $function_select.length: 86 -l $function_select.advanced_options.length
178 -l $function_select.length 87 -m $function_select.advanced_options.min_ratio
179 #end if 88 -g $function_select.advanced_options.min_gap
180 #if $function_select.min_ratio:
181 -m $function_select.min_ratio
182 #end if
183 #if $function_select.min_gap:
184 -g $function_select.min_gap
185 #end if
186 '${function_select.bed_input}' '${function_select.fasta_input}' 89 '${function_select.bed_input}' '${function_select.fasta_input}'
187 #end if 90 #end if
188 ]]></command> 91 ]]></command>
189 <inputs> 92 <inputs>
190 <conditional name="function_select"> 93 <conditional name="function_select">
191 <param type="select" name="functions" label="Select the purge_dups function"> 94 <param type="select" name="functions" label="Function mode">
95 <option value="pbcstat">Calculate coverage cutoff, base-level read depth and create read depth histogram for PacBio data (calcuts+pbcstat)</option>
96 <option value="ngscstat">Calculate coverage cutoff, base-level read depth and create read depth histogram for Illumina data (calcuts+ngscstat)</option>
97 <option value="calcuts">Calculate coverage cutoffs (calcuts)</option>
98 <option value="split_fa">Split assembly FASTA files by 'N's (split_fa)</option>
192 <option value="purge_dups">Purge haplotigs and overlaps for an assembly (purge_dups)</option> 99 <option value="purge_dups">Purge haplotigs and overlaps for an assembly (purge_dups)</option>
193 <option value="split_fa">Split FASTA file by 'N's (split_fa)</option> 100 <option value="get_seqs">Obtain sequences after purging (get_seqs)</option>
194 <option value="pbcstat">Calculate coverage cutoff and create read depth histogram and base-levelread depth for PacBio data (calcuts+pbcstats)</option>
195 <option value="ngscstat">Calculate coverage cutoff and create read depth histogram and base-level read detph for Illumina data (calcuts+ngscstat)</option>
196 <option value="calcuts">calculate coverage cutoffs (calcuts)</option>
197 <option value="get_seqs">Obtain seqeuences after purging (get_seqs)</option>
198 </param> 101 </param>
199 <when value="purge_dups"> 102 <when value="purge_dups">
200 <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/> 103 <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
201 <param name="coverage" type="data" format="tabular" optional="true" argument="-c" label="Base-level coverage file" /> 104 <param argument="-c" name="coverage" type="data"
202 <param name="cutoffs" type="data" format="tabular" label ="Cutoffs file" optional="true" argument="-T"/> 105 format="tabular" optional="true" label="Base-level coverage file"
203 <param name="min_bad" type="float" min="0" max="1" argument="-f" optional="true" label="Minimum fraction of haploid/diploid/bad/repetitive bases in a sequence" help="Default = 0.8"/> 106 help="This file is generated with purge_dups by using the 'Calculate coverage cutoff, base-level
204 <param name="min_align" type="integer" label="Minimum alignment score" argument="-a" optional="true"/> 107 read depth and create read depth histogram' option"/>
205 <param name="min_match" type="integer" label="Minimum max match score" argument="-b" optional="true"/> 108 <param argument="-T" name="cutoffs" type="data"
206 <param name="min_chain" label="Minimum matching bases for chaining" type="integer" argument="-m" optional="true"/> 109 format="tabular" optional="true" label ="Cutoffs file"
207 <param name="max_gap" label="Maximum gap size for chaining" type="integer" argument="-M" optional="true"/> 110 help="This file is generated with purge_dups by using the 'Calculate coverage cutoff, base-level
111 read depth and create read depth histogram' option"/>
112 <param argument="-f" name="min_bad" type="float"
113 min="0" max="1" value="0.8" label="Minimum fraction of haploid/diploid/bad/repetitive bases in a sequence"
114 help="This parameter is set for a suspect haplotigs. If 80% (default value) of a scaffold is high coverage
115 (defined by the sixth column of the cutoff file), it's a repetitive contig. If 80% is low coverage (defined
116 in the first column of the cutoff file), it's a junk contig. If 80% is above diploid coverage(defined in
117 the fourth column of the cutoff file), it's a diploid, otherwise it's a suspect haplotig"/>
118 <param argument="-a" name="min_align" type="integer"
119 min="0" value="70" label="Minimum alignment score"
120 help="If its alignment score is larger than this parameter and max match score larger than
121 the 'mininum max match score' (-b), it is marked as a repeat; if the alignment score is larger than this parameter
122 and max match score no larger than the 'mininum max match score', it is marked as a haplotig.
123 Otherwise it is left as a candidate primary contig. If after purging, the complete genes reported by BUSCO are
124 too low, you can try to increase this parameter"/>
125 <param argument="-b" name="min_match" type="integer"
126 min="0" value="200" label="Minimum max match score"
127 help="If its alignment score is larger than the 'minimum align score' (-a) and max match score larger than
128 this parameter, it is marked as a repeat; if the alignment score is larger than the 'minimum
129 align' and max match score no larger than this parameter, it is marked as a haplotig.
130 Otherwise it is left as a candidate primary contig."/>
131 <param argument="-m" name="min_chain" type="integer"
132 min="0" value="500" label="Minimum matching bases for chaining"
133 help="In the first round, it will asset chains consistent alignments within this parameter value"/>
134 <param argument="-M" name="max_gap" type="integer"
135 min="0" value="20000" label="Maximum gap size for chaining"
136 help="In the first round, it will asset chains consistent alignments within this parameter value"/>
208 <conditional name="double_chain"> 137 <conditional name="double_chain">
209 <param type="select" name="chaining_rounds" label="Rounds of chaining"> 138 <param type="select" name="chaining_rounds" label="Rounds of chaining">
210 <option value="one">1 round</option> 139 <option value="one">1 round</option>
211 <option value="two">2 rounds</option> 140 <option value="two">2 rounds</option>
212 </param> 141 </param>
213 <when value="two"> 142 <when value="two">
214 <param name="max_gap_2" argument="-G" optional="true" label="Maximum gap size for second round of chaining" type="integer"/> 143 <param argument="-G" name="max_gap_2" type="integer"
144 min="0" value="50000" label="Maximum gap size for second round of chaining"
145 help="In the second round, it will asset chains consistent alignments within this parameter value"/>
215 </when> 146 </when>
216 <when value="one"/> 147 <when value="one"/>
217 </conditional> 148 </conditional>
218 <param name="min_chain_score" argument="-l" optional="true" label="Minimum chaining score for a match" type="integer" /> 149 <param argument="-l" name="min_chain_score" type="integer"
219 <param name="max_extend" argument="-E" optional="true" label="Maximum extension for contig ends" type="integer" /> 150 min="0" value="10000" label="Minimum chaining score for a match"
151 help="This parameter controls the overlap size. You should decrease its value to allow more overlaps"/>
152 <param argument="-E" name="max_extend" type="integer"
153 min="0" value="15000" label="Maximum extension for contig ends"
154 help="If the chained alignment is within this value to the contig ends, it will be extended to the ends"/>
155 <param name="log_file" type="boolean" truevalue="true" falsevalue="false" label="Generate log file"/>
220 </when> 156 </when>
221 <when value="split_fa"> 157 <when value="split_fa">
222 <param name="input" type="data" format="fasta" label="Base-level coverage file"/> 158 <param name="input" type="data" format="fasta,fasta.gz" label="Assembly FASTA file" help="The sequence will be cleaved in those position in which the nucleotides is an 'N' or an 'n'."/>
223 <param name="split" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Base-level coverage file" /> 159 <!-- This option disables the cleaving process, and yield the original sequence
160 <param argument="-n" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Block split by N" help="Enable this option if you do not want to break your scaffolds into contigs."/>
161 -->
224 </when> 162 </when>
225 <when value="pbcstat"> 163 <when value="pbcstat">
226 <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/> 164 <param name="input" type="data" format="paf,paf.gz" multiple="true" label="PAF input file"/>
227 <param name="max_cov" type="integer" label="Maximum coverage" argument="-M" optional="true"/> 165 <section name="pbcstat_options" title="PBCSTAT options" expanded="true">
228 <param name="min_map_ratio" argument="-f" type="float" min="0" max="1" value="0" label="Minimum mapping length ratio"/> 166 <param argument="-M" name="max_cov" type="integer" min="0" value="500" label="Maximum coverage"/>
229 <param name="min_map_qual" type="integer" argument="-q" optional="true" label="Minimum mapping quality"/> 167 <param argument="-f" name="min_map_ratio" type="float" min="0" max="1" value="0" label="Minimum mapping length ratio"/>
230 <param name="flank" type="integer" argument="-l" optional="true" label="Flanking space" /> 168 <param argument="-q" name="min_map_qual" type="integer" optional="true" label="Minimum mapping quality"/>
231 <param name="primary_alignments" argument="-p" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Use only primary alignments" /> 169 <param argument="-l" name="flank" type="integer" min="0" value="0" label="Flanking space" />
170 <param argument="-p" name="primary_alignments" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Use only primary alignments" />
171 </section>
232 <expand macro="calcuts" /> 172 <expand macro="calcuts" />
233 <expand macro="trimmers"/> 173 <expand macro="trimmers"/>
174 <expand macro="output_macro">
175 <option value="pbcstat_coverage" selected="true">PBCSTAT base coverage</option>
176 <option value="pbcstat_wig">PBCSTAT base coverage (WIG)</option>
177 <option value="depth_stats">PBCSTAT depths</option>
178 </expand>
234 </when> 179 </when>
235 <when value="ngscstat"> 180 <when value="ngscstat">
236 <param name="input" type="data" format="bam" label="BAM input file"/> 181 <param name="input" type="data" format="bam" label="BAM input file"/>
237 <param name="min_align_qual" type="integer" argument="-q" optional="true" label="Minimum alignment quality" /> 182 <section name="ngscstat_options" title="NGSCSTAT options" expanded="true">
238 <!-- Param exists in help text, but isn't actually part of the code. Maybe in the next release? --> 183 <param argument="-q" name="min_align_qual" type="integer" min="0" value="30" label="Minimum alignment quality" />
239 <!-- <param name="max_depth" type="integer" label="Maximum read depth" argument="-M" optional="true"/> --> 184 <!-- Param exists in help text, but isn't actually part of the code. Maybe in the next release? -->
240 <param name="max_insert" type="integer" argument="-L" optional="true" label="Maximum insert size"/> 185 <!-- <param name="max_depth" type="integer" label="Maximum read depth" argument="-M" optional="true"/> -->
186 <param argument="-L" name="max_insert" type="integer" min="0" value="1000" label="Maximum insert size"/>
187 </section>
241 <expand macro="calcuts" /> 188 <expand macro="calcuts" />
242 <expand macro="trimmers"/> 189 <expand macro="trimmers"/>
190 <expand macro="output_macro">
191 <option value="ngscstat_coverage" selected="true">NGSCSTAT base coverage</option>
192 </expand>
243 </when> 193 </when>
244 194
245 <when value="calcuts"> 195 <when value="calcuts">
246 <param name="input" type="data" format="tabular" label="STAT input file"/> 196 <param name="input" type="data" format="tabular" label="Depths input file"/>
247 <expand macro="calcuts" /> 197 <expand macro="calcuts" />
248 </when> 198 </when>
249
250 <when value="get_seqs"> 199 <when value="get_seqs">
251 <param name="fasta_input" type="data" format="fasta" label="Fasta input file"/> 200 <param name="fasta_input" type="data" format="fasta" label="Assembly FASTA file"/>
252 <param name="bed_input" type="data" format="bed" label="Bed input file"/> 201 <param name="bed_input" type="data" format="bed" label="BED input file" help="Generated by the 'purge_dups' function mode."/>
253 <param name="coverage" type="boolean" argument="-c" truevalue="-c" falsevalue="" checked="false" label="Keep high coverage contigs in the primary contig set"/> 202 <section name="advanced_options" title="Advanced options">
254 <param name="haplotigs" type="boolean" argument="-a" truevalue="-a" falsevalue="" checked="false" label="Do not add prefix to haplotigs"/> 203 <param argument="-c" name="coverage" type="boolean"
255 <param name="length" type="integer" argument="-l" optional="true" label="Minimum primary contig length" help="Default: 1000"/> 204 truevalue="-c" falsevalue="" checked="false" label="Keep high coverage contigs in the primary contig set"/>
256 <param name="min_ratio" type="float" min="0" max="1" argument="-m" optional="true" label="Minimum ratio of remaining primary contig length to the original contig length"/> 205 <param argument="-a" name="haplotigs" type="boolean" truevalue="-a" falsevalue="" checked="false" label="Do not add prefix to haplotigs"/>
257 <param name="end_trim" type="boolean" argument="-e" truevalue="-e" falsevalue="" checked="true" label="Trim end sequences" help="Only remove sequences at end of halplotigs If you also want to remove the duplications in the middle, set to false, however that may delete false positive duplications."/> 206 <param argument="-l" name="length" type="integer" min="0" value="10000" label="Minimum primary contig length" />
258 <param name="split" type="boolean" argument="-s" truevalue="-s" falsevalue="" checked="false" label="Split contigs"/> 207 <param argument="-m" name="min_ratio" type="float"
259 <param name="min_gap" type="integer" argument="-g" optional="true" help="default=10k" label="Minimum gap size between duplications" /> 208 min="0" max="1" value="0.05" label="Minimum ratio of remaining primary contig length to the original contig length"/>
209 <param argument="-e" name="end_trim" type="boolean"
210 truevalue="-e" falsevalue="" checked="true" label="Trim end sequences"
211 help="Only remove sequences at end of haplotigs. If you also want to remove the duplications in the middle,
212 set to false, however that may delete false positive duplications."/>
213 <param argument="-s" name="split" type="boolean" truevalue="-s" falsevalue="" checked="false" label="Split contigs"/>
214 <param argument="-g" name="min_gap" type="integer" min="0" value="10000" label="Minimum gap size between duplications" />
215 </section>
260 </when> 216 </when>
261 </conditional> 217 </conditional>
262 </inputs> 218 </inputs>
263 <outputs> 219 <outputs>
264 <!-- Get Seqs --> 220 <!-- Get Seqs -->
265 <data name="get_seqs_hap" format="fasta" from_work_dir="hap.fa" label="${tool.name} on ${on_string}: get seqs haplotype fasta" > 221 <data name="get_seqs_hap" format="fasta" from_work_dir="hap.fa" label="${tool.name} on ${on_string}: get_seqs haplotype" >
266 <filter>function_select['functions'] == 'get_seqs'</filter> 222 <filter>function_select['functions'] == 'get_seqs'</filter>
267 </data> 223 </data>
268 <data name="get_seqs_purged" format="fasta" from_work_dir="purged.fa" label="${tool.name} on ${on_string}: get seqs purged fasta"> 224 <data name="get_seqs_purged" format="fasta" from_work_dir="purged.fa" label="${tool.name} on ${on_string}: get_seqs purged sequences">
269 <filter>function_select['functions'] == 'get_seqs'</filter> 225 <filter>function_select['functions'] == 'get_seqs'</filter>
270 </data> 226 </data>
271 <!-- Split FA --> 227 <!-- Split FA -->
272 <data name="split_fasta" format="fasta" from_work_dir="split.fasta" label="${tool.name} on ${on_string}: split fasta"> 228 <data name="split_fasta" format="fasta" from_work_dir="split.fasta" label="${tool.name} on ${on_string}: split FASTA">
273 <filter>function_select['functions'] == 'split_fa'</filter> 229 <filter>function_select['functions'] == 'split_fa'</filter>
274 </data> 230 </data>
275 <!-- Ngscstat --> 231 <!-- Ngscstat -->
276 <data name="ngscstat_cov" format="tabular" from_work_dir="TX.base.cov" label="${tool.name} on ${on_string}: ngscstat base coverage file"> 232 <data name="ngscstat_cov" format="tabular" from_work_dir="TX.base.cov" label="${tool.name} on ${on_string}: NGSCSTAT base coverage">
277 <filter>function_select['functions'] == 'ngscstat'</filter> 233 <filter>function_select['functions'] == 'ngscstat'</filter>
278 </data> 234 <filter>'ngscstat_coverage' in function_select['output_options']</filter>
279 <data name="stat_file" format="tabular" from_work_dir="depth.stat" label="${tool.name} on ${on_string}: stat file"> 235 </data>
236 <data name="stat_file" format="tabular" from_work_dir="depth.stat" label="${tool.name} on ${on_string}: depths">
280 <filter>function_select['functions'] == 'ngscstat' or function_select['functions'] == 'pbcstat'</filter> 237 <filter>function_select['functions'] == 'ngscstat' or function_select['functions'] == 'pbcstat'</filter>
238 <filter>'depth_stats' in function_select['output_options']</filter>
281 </data> 239 </data>
282 <!-- Pbcstat --> 240 <!-- Pbcstat -->
283 <data name="pbcstat_cov" format="tabular" from_work_dir="PB.base.cov" label="${tool.name} on ${on_string}: pbcstat base coverage file"> 241 <data name="pbcstat_cov" format="tabular" from_work_dir="PB.base.cov" label="${tool.name} on ${on_string}: PBCSTAT base coverage">
284 <filter>function_select['functions'] == 'pbcstat'</filter> 242 <filter>function_select['functions'] == 'pbcstat'</filter>
285 </data> 243 <filter>'pbcstat_coverage' in function_select['output_options']</filter>
286 <data name="pbcstat_wig" format="wig" from_work_dir="PB.cov.wig" label="${tool.name} on ${on_string}: pbcstat base wig file"> 244 </data>
245 <data name="pbcstat_wig" format="wig" from_work_dir="PB.cov.wig" label="${tool.name} on ${on_string}: PBCSTAT base coverage (WIG)">
287 <filter>function_select['functions'] == 'pbcstat'</filter> 246 <filter>function_select['functions'] == 'pbcstat'</filter>
247 <filter>'pbcstat_wig' in function_select['output_options']</filter>
288 </data> 248 </data>
289 249
290 <data name="hist" format="png" from_work_dir="hist.png" label="${tool.name} on ${on_string}: histogram plot"> 250 <data name="hist" format="png" from_work_dir="hist.png" label="${tool.name} on ${on_string}: histogram plot">
291 <filter>function_select['functions'] == 'pbcstat' or function_select['functions'] == 'ngscstat'</filter> 251 <filter>function_select['functions'] == 'pbcstat' or function_select['functions'] == 'ngscstat'</filter>
252 <filter>'histogram' in function_select['output_options']</filter>
292 </data> 253 </data>
293 254
294 <!-- Calcuts --> 255 <!-- Calcuts -->
295 <data name="calcuts_log" format="txt" from_work_dir="calcuts.log" label="${tool.name} on ${on_string}: calcuts log file"> 256 <data name="calcuts_log" format="txt" from_work_dir="calcuts.log" label="${tool.name} on ${on_string}: calcuts log">
296 <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter> 257 <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
297 </data> 258 <filter>'calcuts_log' in function_select['output_options']</filter>
298 <data name="calcuts_tab" format="tabular" from_work_dir="cutoffs.tsv" label="${tool.name} on ${on_string}: calcuts cutoff file"> 259 </data>
260 <data name="calcuts_cutoff" format="tabular" from_work_dir="cutoffs.tsv" label="${tool.name} on ${on_string}: calcuts cutoff">
299 <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter> 261 <filter>function_select['functions'] in ('pbcstat', 'ngscstat', 'calcuts')</filter>
262 <filter>'calcuts_cutoff' in function_select['output_options']</filter>
300 </data> 263 </data>
301 <!-- Purge dups --> 264 <!-- Purge dups -->
302 <data name="purge_dups_log" format="txt" from_work_dir="purge_dups.log" label="${tool.name} on ${on_string}: purge_dups log file"> 265 <data name="purge_dups_log" format="txt" from_work_dir="purge_dups.log" label="${tool.name} on ${on_string}: purge_dups log">
303 <filter>function_select['functions'] == 'purge_dups'</filter> 266 <filter>function_select['functions'] == 'purge_dups'</filter>
304 </data> 267 <filter>function_select['log_file']</filter>
305 <data name="purge_dups_bed" format="bed" from_work_dir="dups.bed" label="${tool.name} on ${on_string}: purge_dups bed file"> 268 </data>
269 <data name="purge_dups_bed" format="bed" from_work_dir="dups.bed" label="${tool.name} on ${on_string}: purge_dups BED">
306 <filter>function_select['functions'] == 'purge_dups'</filter> 270 <filter>function_select['functions'] == 'purge_dups'</filter>
307 </data> 271 </data>
308 </outputs> 272 </outputs>
309 <tests> 273 <tests>
310 <!-- Purge dups --> 274 <!-- Test 1 Purge dups -->
311 <test expect_num_outputs="2"> 275 <test expect_num_outputs="1">
312 <conditional name="function_select"> 276 <conditional name="function_select">
313 <param name="functions" value="purge_dups"/> 277 <param name="functions" value="purge_dups"/>
314 <param name="input" value="test.paf"/> 278 <param name="input" value="assembly_test.paf"/>
315 <param name="coverage" value="test.cov" ftype="tabular"/> 279 <param name="coverage" value="test.cov" ftype="tabular"/>
316 <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/> 280 <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
317 <param name="min_bad" value="0.01"/> 281 <param name="min_bad" value="0.01"/>
318 <param name="min_align" value="10"/> 282 <param name="min_align" value="10"/>
319 <param name="min_match" value="100"/> 283 <param name="min_match" value="100"/>
324 <param name="max_gap_2" value="1001"/> 288 <param name="max_gap_2" value="1001"/>
325 </conditional> 289 </conditional>
326 <param name="min_chain_score" value="1"/> 290 <param name="min_chain_score" value="1"/>
327 <param name="max_extend" value="100"/> 291 <param name="max_extend" value="100"/>
328 </conditional> 292 </conditional>
329 <output name="purge_dups_bed" value="purge_dups_out.bed"/> 293 <output name="purge_dups_bed" value="purge_dups_01.bed" ftype="bed"/>
330 </test> 294 </test>
331 <!-- Purge dups gzip --> 295 <!-- Test 2 Purge dups gzip -->
332 <test expect_num_outputs="2"> 296 <test expect_num_outputs="2">
333 <conditional name="function_select"> 297 <conditional name="function_select">
334 <param name="functions" value="purge_dups"/> 298 <param name="functions" value="purge_dups"/>
335 <param name="input" value="test.paf.gz" ftype="paf.gz"/> 299 <param name="input" value="assembly_test.paf.gz" ftype="paf.gz"/>
336 <param name="coverage" value="test.cov" ftype="tabular"/> 300 <param name="coverage" value="test.cov" ftype="tabular"/>
337 <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/> 301 <param name="cutoffs" value="cutoffs.tsv" ftype="tabular"/>
338 <param name="min_bad" value="0.01"/> 302 <param name="min_bad" value="0.01"/>
339 <param name="min_align" value="10"/> 303 <param name="min_align" value="10"/>
340 <param name="min_match" value="100"/> 304 <param name="min_match" value="100"/>
344 <param name="chaining_rounds" value="two"/> 308 <param name="chaining_rounds" value="two"/>
345 <param name="max_gap_2" value="1001"/> 309 <param name="max_gap_2" value="1001"/>
346 </conditional> 310 </conditional>
347 <param name="min_chain_score" value="1"/> 311 <param name="min_chain_score" value="1"/>
348 <param name="max_extend" value="100"/> 312 <param name="max_extend" value="100"/>
349 </conditional> 313 <param name="log_file" value="true"/>
350 <output name="purge_dups_bed" value="purge_dups_out.bed"/> 314 </conditional>
351 </test> 315 <output name="purge_dups_bed" value="purge_dups_02.bed" ftype="bed"/>
352 <!-- Purge dups multiple input --> 316 <output name="purge_dups_log" value="purge_dups_log_02.txt" ftype="txt"/>
353 <test expect_num_outputs="2"> 317
318 </test>
319 <!-- Test 3 Purge dups multiple input -->
320 <test expect_num_outputs="1">
354 <conditional name="function_select"> 321 <conditional name="function_select">
355 <param name="functions" value="purge_dups"/> 322 <param name="functions" value="purge_dups"/>
356 <param name="input" value="test.paf,test2.paf.gz"/> 323 <param name="input" value="assembly_test.paf,test2.paf.gz"/>
357 </conditional> 324 </conditional>
358 <output name="purge_dups_bed" value="purge_dups_out_2.bed"/> 325 <output name="purge_dups_bed" value="purge_dups_03.bed" ftype="bed"/>
359 </test> 326 </test>
360 <!-- Split fa --> 327 <!-- Test 4 Split fa -->
361 <test expect_num_outputs="1"> 328 <test expect_num_outputs="1">
362 <conditional name="function_select"> 329 <conditional name="function_select">
363 <param name="functions" value="split_fa"/> 330 <param name="functions" value="split_fa"/>
364 <param name="input" value="test.fasta"/> 331 <param name="input" value="assembly_test.fasta"/>
365 <param name="split" value="-n"/> 332 </conditional>
366 </conditional> 333 <output name="split_fasta" value="split_04.fasta" ftype="fasta"/>
367 <output name="split_fasta" value="split_out.fasta"/> 334 </test>
368 </test> 335 <!-- Test 5 pbcstat -->
369 <!-- pbcstat -->
370 <test expect_num_outputs="6"> 336 <test expect_num_outputs="6">
371 <conditional name="function_select"> 337 <conditional name="function_select">
372 <param name="functions" value="pbcstat"/> 338 <param name="functions" value="pbcstat"/>
373 <param name="input" value="test.paf"/> 339 <param name="input" value="assembly_test.paf"/>
374 <param name="max_cov" value="1000"/> 340 <section name="pbcstat_options">
375 <param name="min_map_ratio" value="0.01"/> 341 <param name="max_cov" value="1000"/>
376 <param name="min_map_qual" value="1"/> 342 <param name="min_map_ratio" value="0.01"/>
377 <param name="flank" value="1"/> 343 <param name="min_map_qual" value="1"/>
378 <param name="primary_alignments" value="-p"/> 344 <param name="flank" value="1"/>
345 <param name="primary_alignments" value="-p"/>
346 </section>
379 <section name="section_calcuts"> 347 <section name="section_calcuts">
380 <param name="min_depth" value="0.01"/> 348 <param name="min_depth" value="0.01"/>
381 <param name="low_depth" value="1"/> 349 <param name="low_depth" value="1"/>
382 <param name="transition" value="1"/> 350 <param name="transition" value="1"/>
383 <param name="upper_depth" value="100"/> 351 <param name="upper_depth" value="100"/>
384 <param name="ploidy" value="-d 0"/> 352 <param name="ploidy" value="-d 0"/>
385 </section> 353 </section>
386 </conditional> 354 <param name="output_options" value="pbcstat_coverage,pbcstat_wig,depth_stats,histogram,calcuts_cutoff,calcuts_log"/>
387 <output name="calcuts_tab" value="calcuts_out.tsv"/> 355 </conditional>
388 <output name="pbcstat_cov" value="out.cov"/> 356 <output name="calcuts_cutoff" value="calcuts_cutoff_05.tabular" ftype="tabular"/>
389 <output name="pbcstat_wig" value="out.wig"/> 357 <output name="calcuts_log" value="calcuts_log_05.txt" ftype="txt"/>
390 <output name="stat_file" value="pbcstats.tabular"/> 358 <output name="pbcstat_cov" value="pbcstat_cov_05.tabular" ftype="tabular"/>
391 <output name="hist" value="hist.png" ftype="png" compare="sim_size"/> 359 <output name="pbcstat_wig" value="pbcstat_cov_05.wig" ftype="wig"/>
392 </test> 360 <output name="stat_file" value="pbcstats_05.tabular" ftype="tabular"/>
393 <!-- pbcstat gzip --> 361 <output name="hist" value="hist_05.png" ftype="png" compare="sim_size"/>
394 <test expect_num_outputs="6"> 362 </test>
363 <!-- Test 6 pbcstat gzip -->
364 <test expect_num_outputs="2">
395 <conditional name="function_select"> 365 <conditional name="function_select">
396 <param name="functions" value="pbcstat"/> 366 <param name="functions" value="pbcstat"/>
397 <param name="input" value="test.paf.gz" ftype="paf.gz"/> 367 <param name="input" value="assembly_test.paf.gz" ftype="paf.gz"/>
398 <param name="max_cov" value="1000"/> 368 <section name="pbcstat_options">
399 <param name="min_map_ratio" value="0.01"/> 369 <param name="max_cov" value="1000"/>
400 <param name="min_map_qual" value="1"/> 370 <param name="min_map_ratio" value="0.01"/>
401 <param name="flank" value="1"/> 371 <param name="min_map_qual" value="1"/>
402 <param name="primary_alignments" value="-p"/> 372 <param name="flank" value="1"/>
373 <param name="primary_alignments" value="-p"/>
374 </section>
403 <section name="section_calcuts"> 375 <section name="section_calcuts">
404 <param name="min_depth" value="0.01"/> 376 <param name="min_depth" value="0.01"/>
405 <param name="low_depth" value="1"/> 377 <param name="low_depth" value="1"/>
406 <param name="transition" value="1"/> 378 <param name="transition" value="1"/>
407 <param name="upper_depth" value="100"/> 379 <param name="upper_depth" value="100"/>
408 <param name="ploidy" value="-d 0"/> 380 <param name="ploidy" value="-d 0"/>
409 </section> 381 </section>
410 </conditional> 382 </conditional>
411 <output name="calcuts_tab" value="calcuts_out.tsv"/> 383 <param name="output_options" value="pbcstat_coverage,calcuts_cutoff"/>
412 <output name="pbcstat_cov" value="out.cov"/> 384 <output name="calcuts_cutoff" value="calcuts_cutoff_06.tabular" ftype="tabular"/>
413 <output name="pbcstat_wig" value="out.wig"/> 385 <output name="pbcstat_cov" value="pbcstat_cov_06.tabular" ftype="tabular"/>
414 </test> 386 </test>
415 <!-- Pbcstat multiple input --> 387 <!-- Test 7 Pbcstat multiple input -->
416 <test expect_num_outputs="6"> 388 <test expect_num_outputs="2">
417 <conditional name="function_select"> 389 <conditional name="function_select">
418 <param name="functions" value="pbcstat"/> 390 <param name="functions" value="pbcstat"/>
419 <param name="input" value="test.paf,test2.paf.gz"/> 391 <param name="input" value="assembly_test.paf,test2.paf.gz"/>
420 <section name="section_calcuts"> 392 <section name="section_calcuts">
421 <param name="min_depth" value="0.01"/> 393 <param name="min_depth" value="0.01"/>
422 <param name="low_depth" value="1"/> 394 <param name="low_depth" value="1"/>
423 <param name="transition" value="1"/> 395 <param name="transition" value="1"/>
424 <param name="upper_depth" value="100"/> 396 <param name="upper_depth" value="100"/>
425 <param name="ploidy" value="-d 0"/> 397 <param name="ploidy" value="-d 0"/>
426 </section> 398 </section>
427 </conditional> 399 </conditional>
428 <output name="calcuts_tab" value="calcuts_out.tsv"/> 400 <param name="output_options" value="pbcstat_coverage,calcuts_cutoff"/>
429 <output name="pbcstat_cov" value="out2.cov"/> 401 <output name="calcuts_cutoff" value="calcuts_cutoff_07.tabular" ftype="tabular"/>
430 <output name="stat_file" value="pbcstats2.tabular"/> 402 <output name="pbcstat_cov" value="pbcstat_cov_07.tabular" ftype="tabular"/>
431 <output name="pbcstat_wig" value="out2.wig"/> 403 </test>
432 </test> 404 <!-- Test 8 ngscstat -->
433 <!-- ngscstat --> 405 <test expect_num_outputs="2">
434 <test expect_num_outputs="5">
435 <conditional name="function_select"> 406 <conditional name="function_select">
436 <param name="functions" value="ngscstat"/> 407 <param name="functions" value="ngscstat"/>
437 <param name="input" value="test.bam"/> 408 <param name="input" value="test.bam"/>
438 <param name="min_align_qual" value="10"/> 409 <section name="ngscstat_options">
439 <param name="max_insert" value="100"/> 410 <param name="min_align_qual" value="10"/>
411 <param name="max_insert" value="100"/>
412 </section>
440 <section name="section_calcuts"> 413 <section name="section_calcuts">
441 <param name="min_depth" value="0.01"/> 414 <param name="min_depth" value="0.01"/>
442 <param name="low_depth" value="1"/> 415 <param name="low_depth" value="1"/>
443 <param name="transition" value="1"/> 416 <param name="transition" value="1"/>
444 <param name="upper_depth" value="100"/> 417 <param name="upper_depth" value="100"/>
445 <param name="ploidy" value="-d 0"/> 418 <param name="ploidy" value="-d 0"/>
446 </section> 419 </section>
447 </conditional> 420 </conditional>
448 <output name="calcuts_tab" value="calcuts_out.tsv"/> 421 <param name="output_options" value="ngscstat_coverage,calcuts_cutoff"/>
449 <output name="ngscstat_cov" value="ngsc_out.cov"/> 422 <output name="calcuts_cutoff" value="calcuts_cutoff_08.tabular" ftype="tabular"/>
450 <output name="stat_file" value="tx_stats.tabular"/> 423 <output name="ngscstat_cov" value="ngsc_cov_08.tabular" ftype="tabular"/>
451 <output name="hist" value="hist.png" ftype="png" compare="sim_size"/> 424 </test>
452 </test> 425 <!-- Test 9 Calcuts -->
453 <!-- Calcuts -->
454 <test expect_num_outputs="2"> 426 <test expect_num_outputs="2">
455 <conditional name="function_select"> 427 <conditional name="function_select">
456 <param name="functions" value="calcuts"/> 428 <param name="functions" value="calcuts"/>
457 <param name="input" value="test.stat"/> 429 <param name="input" value="test.stat"/>
458 <section name="section_calcuts"> 430 <section name="section_calcuts">
461 <param name="transition" value="1"/> 433 <param name="transition" value="1"/>
462 <param name="upper_depth" value="100"/> 434 <param name="upper_depth" value="100"/>
463 <param name="ploidy" value="-d 0"/> 435 <param name="ploidy" value="-d 0"/>
464 </section> 436 </section>
465 </conditional> 437 </conditional>
466 <output name="calcuts_tab" value="calcuts_out.tsv"/> 438 <output name="calcuts_cutoff" value="calcuts_cutoff_09.tabular" ftype="tabular"/>
467 </test> 439 <output name="calcuts_log" value="calcuts_log_09.txt" ftype="txt"/>
468 <!-- Get seqs --> 440 </test>
441 <!-- Test 10 Get seqs -->
469 <test expect_num_outputs="2"> 442 <test expect_num_outputs="2">
470 <conditional name="function_select"> 443 <conditional name="function_select">
471 <param name="functions" value="get_seqs"/> 444 <param name="functions" value="get_seqs"/>
472 <param name="fasta_input" value="split_out.fasta"/> 445 <param name="fasta_input" value="split_out.fasta"/>
473 <param name="bed_input" value="dups.bed"/> 446 <param name="bed_input" value="dups.bed"/>
474 <param name="coverage" value="-c"/> 447 <section name="advanced_options">
475 <param name="length" value="10"/> 448 <param name="coverage" value="-c"/>
476 <param name="haplotigs" value="-a"/> 449 <param name="length" value="10"/>
477 <param name="min_ratio" value=".01"/> 450 <param name="haplotigs" value="-a"/>
478 <param name="end_trim" value="-e"/> 451 <param name="min_ratio" value=".01"/>
479 <param name="split" value="-s"/> 452 <param name="end_trim" value="-e"/>
480 <param name="min_gap" value="100000"/> 453 <param name="split" value="-s"/>
481 </conditional> 454 <param name="min_gap" value="100000"/>
482 <output name="get_seqs_purged" value="purged_out.fa"/> 455 </section>
483 </test> 456 </conditional>
484 <!-- pbcstat histogram options--> 457 <output name="get_seqs_purged" value="get_seqs_purged_10.fa" ftype="fasta"/>
485 <test expect_num_outputs="6"> 458 <output name="get_seqs_hap" value="get_seqs_hap_10.fa" ftype="fasta"/>
459 </test>
460 <!-- Test 11 pbcstat histogram options-->
461 <test expect_num_outputs="1">
486 <conditional name="function_select"> 462 <conditional name="function_select">
487 <param name="functions" value="pbcstat"/> 463 <param name="functions" value="pbcstat"/>
488 <param name="input" value="test.paf"/> 464 <param name="input" value="assembly_test.paf"/>
489 <param name="max_cov" value="1000"/> 465 <section name="pbcstat_options">
490 <param name="min_map_ratio" value="0.01"/> 466 <param name="max_cov" value="1000"/>
491 <param name="min_map_qual" value="1"/> 467 <param name="min_map_ratio" value="0.01"/>
492 <param name="flank" value="1"/> 468 <param name="min_map_qual" value="1"/>
493 <param name="primary_alignments" value="-p"/> 469 <param name="flank" value="1"/>
470 <param name="primary_alignments" value="-p"/>
471 </section>
494 <section name="section_calcuts"> 472 <section name="section_calcuts">
495 <param name="min_depth" value="0.01"/> 473 <param name="min_depth" value="0.01"/>
496 <param name="low_depth" value="1"/> 474 <param name="low_depth" value="1"/>
497 <param name="transition" value="1"/> 475 <param name="transition" value="1"/>
498 <param name="upper_depth" value="100"/> 476 <param name="upper_depth" value="100"/>
502 <param name="ymax" value="100"/> 480 <param name="ymax" value="100"/>
503 <param name="xmax" value="100"/> 481 <param name="xmax" value="100"/>
504 <param name="cutoffs_his" value="calcuts_out.tsv"/> 482 <param name="cutoffs_his" value="calcuts_out.tsv"/>
505 </section> 483 </section>
506 </conditional> 484 </conditional>
507 <output name="calcuts_tab" value="calcuts_out.tsv"/> 485 <param name="output_options" value="histogram"/>
508 <output name="pbcstat_cov" value="out_hist_options.cov"/> 486 <output name="hist" value="hist_11.png" ftype="png" compare="sim_size"/>
509 <output name="pbcstat_wig" value="out_hist_options.wig"/>
510 <output name="stat_file" value="pbcstats_hist_options.tabular"/>
511 <output name="hist" value="hist_options.png" ftype="png" compare="sim_size"/>
512 </test> 487 </test>
513 </tests> 488 </tests>
514 <help><![CDATA[ 489 <help><![CDATA[
515 .. class:: infomark 490 .. class:: infomark
516 491
517 **What it does** 492 **Purpose**
518 493
519 The purge_dups tools are designed to remove haplotigs and contig overlaps in a de novo assembly based on read depth. 494 The purge_dups tools are designed to remove haplotigs and contig overlaps in a de novo assembly based on read depth.
495 purge_dups can significantly improve genome assemblies by removing overlaps and haplotigs caused by sequence divergence
496 in heterozygous regions. This removes false duplications in primary draft assemblies while retaining completeness and sequence
497 integrity, and can improve scaffolding.
498
499 ----
500
501 .. class:: infomark
502
503 **Pipeline Guide**
504
505 Given a primary assembly, and an alternative assembly (optional, if you have one), follow the steps shown below to build your
506 own purge_dups pipeline; steps with the same number can be run simultaneously. Although step 5 is optional,
507 we highly recommend running it, because assemblers may produce overrepresented sequences. In such a case,
508 step 5 can be applied to remove those sequences.
509
510 - Step 1: Calculate the coverage cutoffs and base coverages.
511 - Step 2: Split an assembly with the **split_fa** function and do a self-self alignment by using minimap2.
512 - Step 3: Purge haplotigs and overlaps with the **purge_dups** function.
513 - Step 4: Get purged primary and haplotig sequences from the draft assembly with the **get_seqs** function.
514 - Step 5: Merge the hap.fa file generated in the previous step with the alternate assembly, and redo the above steps to get a decent haplotig set.
515
516 ----
517
518 .. class:: infomark
519
520 **Limitations**
521
522 - Read depth cutoffs calculation: the coverage cutoffs can be larger for a low heterozygosity species, which causes the purged assembly to be smaller than expected. In such a case, please use script/hist_plot.py to make the histogram plot and set coverage cutoffs manually.
523 - Repeats: purge_dups has a limited ability to process repeats.
524
525 ----
526
527 .. class:: infomark
528
529 **Purged assembly validation**
530
531 There are many ways to validate the purged assembly. One way is to make a coverage plot for it; a second way is to run `BUSCO <https://busco.ezlab.org/>`_. A third option is to use `Merqury <https://github.com/marbl/merqury>`_.
532
533
520 534
521 ]]></help> 535 ]]></help>
522 <citations> 536 <expand macro="citations"/>
523 <citation type="doi">10.1093/bioinformatics/btaa025</citation>
524 </citations>
525 </tool> 537 </tool>