comparison tools/mira4_0/mira4_mapping.xml @ 2:4eb32a3d67d1 draft

v0.0.8 - renamed folder, added note about mirabait
author peterjc
date Wed, 02 Sep 2015 07:46:29 -0400
parents
children a4f602cc3aa9
comparison
equal deleted inserted replaced
1:70248e6e3efc 2:4eb32a3d67d1
1 <tool id="mira_4_0_mapping" name="MIRA v4.0 mapping" version="0.0.8">
2 <description>Maps Sanger, Roche 454, Solexa/Illumina, Ion Torrent and PacBio reads</description>
3 <requirements>
4 <requirement type="binary">mira</requirement>
5 <requirement type="binary">miraconvert</requirement>
6 <requirement type="package" version="4.0.2">MIRA</requirement>
7 <requirement type="binary">samtools</requirement>
8 <requirement type="package" version="0.1.19">samtools</requirement>
9 </requirements>
10 <stdio>
11 <!-- Assume anything other than zero is an error -->
12 <exit_code range="1:" />
13 <exit_code range=":-1" />
14 </stdio>
15 <version_command interpreter="python">mira4.py --version</version_command>
16 <command interpreter="python">mira4.py
17 --manifest "$manifest"
18 #if str($maf_wanted) == "true":
19 --maf "$out_maf"
20 #end if
21 #if str($bam_wanted) == "true":
22 --bam "$out_bam"
23 #end if
24 --fasta "$out_fasta"
25 --log "$out_log"
26 </command>
27 <configfiles>
28 <configfile name="manifest">
29 project = MIRA
30 job = mapping,${job_type},${job_quality}
31 parameters = -NW:cmrnl=no -DI:trt=/tmp -OUT:orc=no
32 ## -GE:not is short for -GENERAL:number_of_threads and using one (1)
33 ## can be useful for repeatability of assemblies and bug hunting.
34 ## This is overridden by the command line -t switch which is easier
35 ## to set from within Galaxy.
36 ##
37 ## -NW:cmrnl is short for -NAG_AND_WARN:check_maxreadnamelength
38 ## and without this MIRA aborts with read names over 40 characters
39 ## due to limitations of some downstream tools.
40 ##
41 ## -DI:trt is short for -DIRECTORY:tmp_redirected_to and should
42 ## point to a local hard drive (not something like NFS on network).
43 ## We replace /tmp with an environment variable via mira4.py
44 ##
45 ## -OUT:orc=no is short for -OUTPUT:output_result_caf=no
46 ## which turns off an output file we don't want anyway.
47
48 ##This bar goes into the manifest as a comment line
49 #------------------------------------------------------------------------------
50
51 readgroup
52 is_reference
53 #if str($strain_setup)=="same"
54 strain = StrainX
55 #end if
56 #for $f in $references
57 ##Must now map Galaxy datatypes to MIRA file types...
58 #if $f.ext.startswith("fastq")
59 ##MIRA doesn't like fastqsanger etc, just plain old fastq:
60 data = fastq::$f
61 #elif $f.ext == "mira"
62 ##We're calling *.maf the "mira" format in Galaxy (name space collision)
63 data = maf::$f
64 #elif $f.ext == "fasta"
65 ##We're calling MIRA with the file type as "fna" as otherwise it wants quals
66 data = fna::$f
67 #else
68 ##Currently don't expect anything else...
69 data = ${f.ext}::$f
70 #end if
71 #end for
72 #for $rg in $read_group
73
74 ##This bar goes into the manifest as a comment line
75 #------------------------------------------------------------------------------
76
77 readgroup
78 technology = ${rg.technology}
79 #if str($strain_setup)=="same"
80 ##This is perhaps redundant as MIRA defaults to StrainX for the reads:
81 strain = StrainX
82 #end if
83 ##Record the segment placement (if any)
84 #if str($rg.segments.type) == "paired"
85 segment_placement = ${rg.segments.placement}
86 segment_naming = ${rg.segments.naming}
87 #end if
88 ##if str($rg.segments.type) == "none"
89 ##MIRA4 manual says use segment_placement = unknown or ? for unpaired data
90 ##but this stopped working in MIRA 4.0 RC5 and 4.0 (final). See:
91 ##http://www.freelists.org/post/mira_talk/Unpaired-reads-and-segment-placement--or-unknown
92 ##segment_placement = ?
93 ##end if
94 ##MIRA will accept multiple filenames on one data line, or multiple data lines
95 #for $f in $rg.filenames
96 ##Must now map Galaxy datatypes to MIRA file types...
97 #if $f.ext.startswith("fastq")
98 ##MIRA doesn't like fastqsanger etc, just plain old fastq:
99 data = fastq::$f
100 #elif $f.ext == "mira"
101 ##We're calling *.maf the "mira" format in Galaxy (name space collision)
102 data = maf::$f
103 #else
104 ##Currently don't expect anything else...
105 data = ${f.ext}::$f
106 #end if
107 #end for
108 #end for
109 </configfile>
110 </configfiles>
111 <inputs>
112 <param name="job_type" type="select" label="Assembly type">
113 <option value="genome">Genome</option>
114 <option value="est">EST (transcriptome)</option>
115 </param>
116 <param name="job_quality" type="select" label="Assembly quality grade">
117 <option value="accurate">Accurate</option>
118 <option value="draft">Draft</option>
119 </param>
120 <!-- TODO? Allow technology type for references? -->
121 <!-- TODO? Allow strain settings for reference(s) and reads? -->
122 <!-- TODO? Use a repeat to allow for multi-strain references? -->
123 <!-- TODO? Add strain to the mapping read groups? -->
124 <param name="references" type="data" format="fasta,fastq,mira" multiple="true" required="true" label="Backbone reference file(s)"
125 help="Multiple files allowed, for example one FASTA file per chromosome or plasmid." />
126 <param name="strain_setup" type="select" label="Strain configuration (reference vs reads)">
127 <option value="default">Different strains - mapping reads onto a related reference ('StrainX' vs 'ReferenceStrain')</option>
128 <option value="same">Same strain - mapping reads from same reference (all 'StrainX')</option>
129 </param>
130 <repeat name="read_group" title="Read Group" min="1">
131 <param name="technology" type="select" label="Read technology">
132 <option value="solexa">Solexa/Illumina</option>
133 <option value="sanger">Sanger capillary sequencing</option>
134 <option value="454">Roche 454</option>
135 <option value="iontor">Ion Torrent</option>
136 <option value="pcbiolq">PacBio low quality (raw)</option>
137 <option value="pcbiohq">PacBio high quality (corrected)</option>
138 <option value="text">Synthetic reads (database entries, consensus sequences, artificial reads, etc)</option>
139 </param>
140 <conditional name="segments">
141 <param name="type" type="select" label="Are these paired reads?">
142 <option value="paired">Paired reads</option>
143 <option value="none">Single reads or not relevant (e.g. primer walking with Sanger capillary sequencing)</option>
144 </param>
145 <when value="paired">
146 <param name="placement" type="select" label="Pairing type (segment placing)">
147 <option value="FR">---&gt; &lt;--- (e.g. Sanger capillary or Solexa/Illumina paired-end library)</option>
148 <option value="RF">&lt;--- ---&gt; (e.g. Solexa/Illumina mate-pair library)</option>
149 <option value="SB">2---&gt; 1---&gt; (e.g. Roche 454 paired-end libraries or IonTorrent long-mate; see note)</option>
150 </param>
151 <param name="naming" type="select" label="Pair naming convention">
152 <option value="solexa">Solexa/Illumina (using '/1' and '/2' suffixes, or later Illumina colon system)</option>
153 <option value="FR">Forward/Reverse scheme (using '.f*' and '.r*' suffixes)</option>
154 <option value="tigr">TIGR scheme (using 'TF*' and 'TR*' suffixes)</option>
155 <option value="sanger">Sanger scheme (see notes)</option>
156 <option value="stlouis">St. Louis scheme (see notes)</option>
157 </param>
158 </when>
159 <when value="none" /><!-- no further questions -->
160 </conditional>
161 <param name="filenames" type="data" format="fastq,mira" multiple="true" required="true" label="Read file(s)"
162 help="Multiple files allowed, for example paired reads can be given as two files (MIRA looks at read names to identify pairs)." />
163 </repeat>
164 <param name="maf_wanted" type="boolean" label="Output mapping in MIRA's own format?" checked="False" />
165 <param name="bam_wanted" type="boolean" label="Convert mapping into BAM format?" checked="True" />
166 </inputs>
167 <outputs>
168 <data name="out_fasta" format="fasta" label="MIRA #if str($strain_setup)=='same' then 'same strain' else 'reference' # mapping contigs (FASTA)" />
169 <data name="out_bam" format="bam" label="MIRA #if str($strain_setup)=='same' then 'same strain' else 'reference' # mapping assembly (BAM)">
170 <filter>bam_wanted is True</filter>
171 </data>
172 <data name="out_maf" format="mira" label="MIRA #if str($strain_setup)=='same' then 'same strain' else 'reference' # mapping assembly">
173 <filter>maf_wanted is True</filter>
174 </data>
175 <data name="out_log" format="txt" label="MIRA #if str($strain_setup)=='same' then 'same strain' else 'reference' # mapping log" />
176 </outputs>
177 <tests>
178 <test>
179 <param name="job_type" value="genome" />
180 <param name="job_quality" value="accurate" />
181 <param name="references" value="tvc_contigs.fasta" ftype="fasta" />
182 <param name="strain_setup" value="default" />
183 <param name="type" value="none" />
184 <param name="filenames" value="tvc_mini.fastq" ftype="fastqsanger" />
185 <param name="maf_wanted" value="true"/>
186 <param name="bam_wanted" value="true"/>
187 <output name="out_fasta" file="tvc_map_ref_strain.fasta" ftype="fasta" />
188 <output name="out_bam" file="empty_file.dat" compare="contains" />
189 <!-- TODO: Suggest startswith as a compare method? -->
190 <output name="out_maf" file="header.mira" compare="contains" />
191 <output name="out_log" file="empty_file.dat" compare="contains" />
192 </test>
193 <test>
194 <param name="job_type" value="genome" />
195 <param name="job_quality" value="accurate" />
196 <param name="references" value="tvc_contigs.fasta" ftype="fasta" />
197 <param name="strain_setup" value="same" />
198 <param name="type" value="none" />
199 <param name="filenames" value="tvc_mini.fastq" ftype="fastqsanger" />
200 <param name="maf_wanted" value="false"/>
201 <param name="bam_wanted" value="false"/>
202 <output name="out_fasta" file="tvc_map_same_strain.fasta" ftype="fasta" />
203 <output name="out_log" file="empty_file.dat" compare="contains" />
204 </test>
205 </tests>
206 <help>
207
208 **What it does**
209
210 Runs MIRA v4.0 in mapping mode, collects the output, generates a sorted BAM
211 file, and throws away all the temporary files.
212
213 MIRA is an open source assembly tool capable of handling sequence data from
214 a range of platforms (Sanger capillary, Solexa/Illumina, Roche 454, Ion Torrent
215 and also PacBio).
216
217 It is particularly suited to small genomes such as bacteria.
218
219
220 **Notes on paired reads**
221
222 .. class:: warningmark
223
224 MIRA uses read naming conventions to identify paired read partners
225 (and does not care about their order in the input files). In most cases,
226 the Solexa/Illumina setting is fine. For Sanger capillary sequencing,
227 you may need to rename your reads to match one of the standard conventions
228 supported by MIRA. For Roche 454 or Ion Torrent the appropriate settings
229 depend on how the FASTQ file was produced:
230
231 * If using Roche's ``sffinfo`` or older versions of ``sff_extract``
232 to convert SFF files to FASTQ, your reads will probably have the
233 ``---&gt; &lt;---`` orientation and use the ``.f`` and ``.r``
234 suffixes (FR naming).
235
236 * If using a recent version of ``sff_extract``, then the ``/1`` and ``/2``
237 suffixes are used (Solexa/Illumina style naming) and the original
238 ``2---&gt; 1---&gt;`` orientation is preserved.
239
240 The reason for this is that Roche 454 and Ion Torrent paired-end libraries
241 sequence a circularised fragment, such that the raw data begins
242 with the end of the fragment, a linker, then the start of the fragment.
243 This means both the start and end are sequenced from the same strand, and
244 have the orientation ``2---&gt; 1---&gt;``. However, in order to use the data
245 with traditional tools expecting Sanger capillary style ``---&gt; &lt;---``
246 orientation it was common to reverse complement one of the pair to mimic this.
247
248
249 **Citation**
250
251 If you use this Galaxy tool in work leading to a scientific publication please
252 cite the following papers:
253
254 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
255 Galaxy tools and workflows for sequence analysis with applications
256 in molecular plant pathology. PeerJ 1:e167
257 http://dx.doi.org/10.7717/peerj.167
258
259 Bastien Chevreux, Thomas Wetter and Sándor Suhai (1999).
260 Genome Sequence Assembly Using Trace Signals and Additional Sequence Information.
261 Computer Science and Biology: Proceedings of the German Conference on Bioinformatics (GCB) 99, pp. 45-56.
262 http://www.bioinfo.de/isb/gcb99/talks/chevreux/main.html
263
264 This wrapper is available to install into other Galaxy Instances via the Galaxy
265 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/mira4_assembler
266 </help>
267 <citations>
268 <citation type="doi">10.7717/peerj.167</citation>
269 <citation type="bibtex">@ARTICLE{Chevreux1999-mira3,
270 author = {B. Chevreux and T. Wetter and S. Suhai},
271 year = {1999},
272 title = {Genome Sequence Assembly Using Trace Signals and Additional Sequence Information},
273 journal = {Computer Science and Biology: Proceedings of the German Conference on Bioinformatics (GCB)},
274 volume = {99},
275 pages = {45-56},
276 url = {http://www.bioinfo.de/isb/gcb99/talks/chevreux/main.html}
277 }</citation>
278 </citations>
279 </tool>