comparison tools/mira4_0/mira4_de_novo.xml @ 2:4eb32a3d67d1 draft

v0.0.8 - renamed folder, added note about mirabait
author peterjc
date Wed, 02 Sep 2015 07:46:29 -0400
parents
children a4f602cc3aa9
comparison
equal deleted inserted replaced
1:70248e6e3efc 2:4eb32a3d67d1
1 <tool id="mira_4_0_de_novo" name="MIRA v4.0 de novo assembler" version="0.0.8">
2 <description>Takes Sanger, Roche 454, Solexa/Illumina, Ion Torrent and PacBio reads</description>
3 <requirements>
4 <requirement type="binary">mira</requirement>
5 <requirement type="binary">miraconvert</requirement>
6 <requirement type="package" version="4.0.2">MIRA</requirement>
7 <requirement type="binary">samtools</requirement>
8 <requirement type="package" version="0.1.19">samtools</requirement>
9 </requirements>
10 <code file="mira4_validator.py" />
11 <stdio>
12 <!-- Assume anything other than zero is an error: the two ranges below (1 and above, and -1 and below) together match every non-zero exit code -->
13 <exit_code range="1:" />
14 <exit_code range=":-1" />
15 </stdio>
16 <version_command interpreter="python">mira4.py --version</version_command>
17 <command interpreter="python">mira4.py
18 --manifest "$manifest"
19 #if str($maf_wanted)=="true":
20 --maf "$out_maf"
21 #end if
22 #if str($bam_wanted)=="true":
23 --bam "$out_bam"
24 #end if
25 --fasta "$out_fasta"
26 --log "$out_log"
27 </command>
28 <configfiles>
29 <configfile name="manifest">
30 project = MIRA
31 job = denovo,${job_type},${job_quality}
32 parameters = -NW:cmrnl=no -DI:trt=/tmp -OUT:orc=no
33 ## -GE:not is short for -GENERAL:number_of_threads and using one (1)
34 ## can be useful for repeatability of assemblies and bug hunting.
35 ## This is overridden by the command line -t switch which is easier
36 ## to set from within Galaxy (-GE:not is not set in the parameters above).
37 ##
38 ## -NW:cmrnl is short for -NAG_AND_WARN:check_maxreadnamelength
39 ## and without this MIRA aborts with read names over 40 characters
40 ## due to limitations of some downstream tools.
41 ##
42 ## -DI:trt is short for -DIRECTORY:tmp_redirected_to and should
43 ## point to a local hard drive (not something like NFS on network).
44 ## We replace /tmp with an environment variable via mira4.py
45 ##
46 ## -OUT:orc=no is short for -OUTPUT:output_result_caf=no
47 ## which turns off an output file we don't want anyway.
48
49 #for $rg in $read_group
50
51 ##This bar goes into the manifest as a comment line
52 #------------------------------------------------------------------------------
53
54 readgroup
55 technology = ${rg.technology}
56 ##Record the segment placement (if any)
57 #if str($rg.segments.type) == "paired"
58 segment_placement = ${rg.segments.placement}
59 segment_naming = ${rg.segments.naming}
60 #if str($rg.segments.min_size) != "" or str($rg.segments.max_size) != ""
61 ##If our min/max validation failed I trust MIRA to give an error message...
62 template_size = $rg.segments.min_size $rg.segments.max_size
63 #end if
64 #end if
65 ##if str($rg.segments.type) == "none"
66 ##MIRA4 manual says use segment_placement = unknown or ? for unpaired data
67 ##but this stopped working in MIRA 4.0 RC5 and 4.0 (final). See:
68 ##http://www.freelists.org/post/mira_talk/Unpaired-reads-and-segment-placement--or-unknown
69 ##segment_placement = ?
70 ##end if
71 ##MIRA will accept multiple filenames on one data line, or multiple data lines
72 #for $f in $rg.filenames
73 ##Must now map Galaxy datatypes to MIRA file types...
74 #if $f.ext.startswith("fastq")
75 ##MIRA doesn't like fastqsanger etc, just plain old fastq:
76 data = fastq::$f
77 #elif $f.ext == "mira"
78 ##We're calling *.maf the "mira" format in Galaxy (name space collision)
79 data = maf::$f
80 #else
81 ##MIRA is happy with the Galaxy name (e.g. fasta), so use it as is:
82 data = ${f.ext}::$f
83 #end if
84 #end for
85 #end for
86 </configfile>
87 </configfiles>
88 <inputs>
89 <param name="job_type" type="select" label="Assembly type">
90 <option value="genome">Genome</option>
91 <option value="est">EST (transcriptome)</option>
92 </param>
93 <param name="job_quality" type="select" label="Assembly quality grade">
94 <option value="accurate">Accurate</option>
95 <option value="draft">Draft</option>
96 </param>
97 <repeat name="read_group" title="Read Group" min="1">
98 <param name="technology" type="select" label="Read technology"> <!-- the selected option value is written verbatim to the manifest "technology =" line -->
99 <option value="solexa">Solexa/Illumina</option>
100 <option value="sanger">Sanger capillary sequencing</option>
101 <option value="454">Roche 454</option>
102 <option value="iontor">Ion Torrent</option>
103 <option value="pcbiolq">PacBio low quality (raw)</option>
104 <option value="pcbiohq">PacBio high quality (corrected)</option>
105 <option value="text">Synthetic reads (database entries, consensus sequences, artificial reads, etc)</option>
106 <!-- TODO reference/backbone as an entry here? -->
107 </param>
108 <conditional name="segments">
109 <param name="type" type="select" label="Are these paired reads?">
110 <option value="paired">Paired reads</option>
111 <option value="none">Single reads or not relevant (e.g. primer walking with Sanger capillary sequencing)</option>
112 </param>
113 <when value="paired">
114 <param name="placement" type="select" label="Pairing type (segment placing)">
115 <option value="FR">---&gt; &lt;--- (e.g. Sanger capillary or Solexa/Illumina paired-end library)</option>
116 <option value="RF">&lt;--- ---&gt; (e.g. Solexa/Illumina mate-pair library)</option>
117 <option value="SB">2---&gt; 1---&gt; (e.g. Roche 454 paired-end libraries or IonTorrent long-mate; see note)</option>
118 </param>
119 <!-- min/max validation is done via the <code> tag -->
120 <param name="min_size" type="integer" optional="true" min="0" value=""
121 label="Minimum size of 'good' DNA templates in the library preparation"
122 help="Optional, but if used you must also supply a maximum value." />
123 <param name="max_size" type="integer" optional="true" min="0" value=""
124 label="Maximum size of 'good' DNA templates in the library preparation"
125 help="Optional, but if used you must also supply a minimum value." />
126 <param name="naming" type="select" label="Pair naming convention">
127 <option value="solexa">Solexa/Illumina (using '/1' and '/2' suffixes, or later Illumina colon system)</option>
128 <option value="FR">Forward/Reverse scheme (using '.f*' and '.r*' suffixes)</option>
129 <option value="tigr">TIGR scheme (using 'TF*' and 'TR*' suffixes)</option>
130 <option value="sanger">Sanger scheme (see notes)</option>
131 <option value="stlouis">St. Louis scheme (see notes)</option>
132 </param>
133 </when>
134 <when value="none" /><!-- no further questions -->
135 </conditional>
136 <param name="filenames" type="data" format="fastq,mira" multiple="true" required="true" label="Read file(s)"
137 help="Multiple files allowed, for example paired reads can be given as two files (MIRA looks at read names to identify pairs)." />
138 </repeat>
139 <param name="maf_wanted" type="boolean" label="Output assembly in MIRA's own format?" checked="False" />
140 <param name="bam_wanted" type="boolean" label="Convert assembly into BAM format?" checked="True" />
141 </inputs>
142 <outputs> <!-- out_fasta and out_log are always declared; out_bam and out_maf are filtered on the matching bam_wanted / maf_wanted checkbox inputs -->
143 <data name="out_fasta" format="fasta" label="MIRA de novo contigs (FASTA)" />
144 <data name="out_bam" format="bam" label="MIRA de novo assembly (BAM)">
145 <filter>bam_wanted is True</filter>
146 </data>
147 <data name="out_maf" format="mira" label="MIRA de novo assembly">
148 <filter>maf_wanted is True</filter>
149 </data>
150 <!-- TODO?
151 <data name="out_contigstats" format="tabular" label="MIRA contig stats" />
152 -->
153 <data name="out_log" format="txt" label="MIRA de novo log" />
154 </outputs>
155 <tests>
156 <!-- Tiger mitochondria, selected paired end Illumina reads from SRR639755
157 Note we're using just one repeat group, and only the filenames parameter
158 within it, so this should work with current test framework limitations:
159 TODO: Revise example and/or -NW:cac=warn and -NW:acv=80 settings
160 MIRA 4.0 complains as coverage is about x93 which is over 80 limit.
161 Also MIRA 4.0 gives three contigs as output.
162 <test>
163 <param name="job_type" value="genome" />
164 <param name="job_quality" value="accurate" />
165 <param name="filenames" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
166 <output name="out_fasta" file="SRR639755_mito_pairs.mira4_de_novo.fasta" ftype="fasta" />
167 </test>
168 -->
169 <!-- Simple assembly based on MIRA's minidemo/demo4 example
170 Note we're using just one repeat group,
171 but several parameters within the repeat
172 -->
173 <test>
174 <param name="job_type" value="genome" />
175 <param name="job_quality" value="accurate" />
176 <param name="technology" value="sanger" />
177 <param name="type" value="none" />
178 <param name="filenames" value="U13small_m.fastq" ftype="fastqsanger" />
179 <param name="maf_wanted" value="true"/>
180 <param name="bam_wanted" value="true"/>
181 <output name="out_fasta" file="U13small_m.mira4_de_novo.fasta" ftype="fasta" />
182 <output name="out_bam" file="empty_file.dat" compare="contains" />
183 <!-- TODO: Suggest startswith as a compare method? -->
184 <output name="out_maf" file="header.mira" compare="contains" />
185 <output name="out_log" file="empty_file.dat" compare="contains" />
186 </test>
187 <!-- Simple assembly based on MIRA's minidemo/solexa1 example
188 Note we're using just one repeat group,
189 but two parameters within the repeat (filename, no pairing)
190 -->
191 <test>
192 <param name="job_type" value="genome" />
193 <param name="job_quality" value="accurate" />
194 <param name="type" value="none" />
195 <param name="filenames" value="ecoli.fastq" ftype="fastqsanger" />
196 <param name="maf_wanted" value="false"/>
197 <param name="bam_wanted" value="false"/>
198 <output name="out_fasta" file="ecoli.mira4_de_novo.fasta" ftype="fasta" />
199 <output name="out_log" file="empty_file.dat" compare="contains" />
200 </test>
201 </tests>
202 <help>
203
204 **What it does**
205
206 Runs MIRA v4.0 in de novo mode, collects the output, generates a sorted BAM
207 file, and then throws away all the temporary files.
208
209 MIRA is an open source assembly tool capable of handling sequence data from
210 a range of platforms (Sanger capillary, Solexa/Illumina, Roche 454, Ion Torrent
211 and also PacBio).
212
213 It is particularly suited to small genomes such as bacteria.
214
215
216 **Notes on paired reads**
217
218 .. class:: warningmark
219
220 MIRA uses read naming conventions to identify paired read partners
221 (and does not care about their order in the input files). In most cases,
222 the Solexa/Illumina setting is fine. For Sanger capillary sequencing,
223 you may need to rename your reads to match one of the standard conventions
224 supported by MIRA. For Roche 454 or Ion Torrent the appropriate settings
225 depend on how the FASTQ file was produced:
226
227 * If using Roche's ``sffinfo`` or older versions of ``sff_extract``
228 to convert SFF files to FASTQ, your reads will probably have the
229 ``---&gt; &lt;---`` orientation and use the ``.f`` and ``.r``
230 suffixes (FR naming).
231
232 * If using a recent version of ``sff_extract``, then the ``/1`` and ``/2``
233 suffixes are used (Solexa/Illumina style naming) and the original
234 ``2---&gt; 1---&gt;`` orientation is preserved.
235
236 The reason for this is that Roche 454 and Ion Torrent paired-end libraries
237 are sequenced from a circularised fragment, such that the raw data begins
238 with the end of the fragment, then a linker, then the start of the fragment.
239 This means both the start and end are sequenced from the same strand, and
240 have the orientation ``2---&gt; 1---&gt;``. However, in order to use the data
241 with traditional tools expecting Sanger capillary style ``---&gt; &lt;---``
242 orientation it was common to reverse complement one of the pair to mimic this.
243
244
245 **Citation**
246
247 If you use this Galaxy tool in work leading to a scientific publication please
248 cite the following papers:
249
250 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
251 Galaxy tools and workflows for sequence analysis with applications
252 in molecular plant pathology. PeerJ 1:e167
253 http://dx.doi.org/10.7717/peerj.167
254
255 Bastien Chevreux, Thomas Wetter and Sándor Suhai (1999).
256 Genome Sequence Assembly Using Trace Signals and Additional Sequence Information.
257 Computer Science and Biology: Proceedings of the German Conference on Bioinformatics (GCB) 99, pp. 45-56.
258 http://www.bioinfo.de/isb/gcb99/talks/chevreux/main.html
259
260 This wrapper is available to install into other Galaxy Instances via the Galaxy
261 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/mira4_assembler
262 </help>
263 <citations> <!-- Machine-readable forms of the two papers named in the help text; BibTeX fields must be comma-separated -->
264 <citation type="doi">10.7717/peerj.167</citation>
265 <citation type="bibtex">@ARTICLE{Chevreux1999-mira3,
266 author = {B. Chevreux and T. Wetter and S. Suhai},
267 year = {1999},
268 title = {Genome Sequence Assembly Using Trace Signals and Additional Sequence Information},
269 journal = {Computer Science and Biology: Proceedings of the German Conference on Bioinformatics (GCB)},
270 volume = {99},
271 pages = {45-56},
272 url = {http://www.bioinfo.de/isb/gcb99/talks/chevreux/main.html}
273 }</citation>
274 </citations>
275 </tool>