Mercurial > repos > bonsai > sortmerna
comparison sortmerna_wrapper.xml @ 0:2e7f0da431e3 draft default tip
Uploaded version 1.0
| author | bonsai |
|---|---|
| date | Tue, 30 Apr 2013 13:12:35 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:2e7f0da431e3 |
|---|---|
| 1 <?xml version="1.0" encoding="utf-8"?> | |
| 2 <tool id="sortmerna_wrapper" version="1.0" name="Filter with SortMeRNA"> | |
| 3 <requirements> <!-- declares the sortmerna 1.7 package as a dependency of this tool --> | |
| 4 <requirement type="package" version="1.7">sortmerna</requirement> | |
| 5 </requirements> | |
| 6 <description>Fast and accurate filtering of ribosomal RNAs in metatranscriptomic data</description> | |
| 7 <!-- Runs sortmerna_wrapper.py with python: the SortMeRNA options are assembled into one double-quoted string after the sortmerna flag, then the databases follow, either history datasets preceded by the buildtrie flag or paths looked up in the data table. --> <command interpreter="python"> | |
| 8 sortmerna_wrapper.py | |
| 9 --sortmerna " | |
| 10 $strand_search | |
| 11 #if str( $read_family.read_family_selector ) == 'other': | |
| 12 --I $input_reads -r $read_family.ratio_parameter | |
| 13 #else: | |
| 14 $read_family.read_family_selector $input_reads | |
| 15 #end if | |
| 16 #if str( $sequencing_type.sequencing_type_selector ) == 'paired': | |
| 17 $sequencing_type.paired_type | |
| 18 #end if | |
| 19 | |
| 20 #if $outputs_selected: | |
| 21 #if 'accept' in $outputs_selected.value: | |
| 22 --accept accept_file | |
| 23 #end if | |
| 24 #if 'other' in $outputs_selected.value: | |
| 25 --other other_file | |
| 26 #end if | |
| 27 #end if | |
| 28 $log | |
| 29 #if str( $options.options_type_selector ) == 'more': | |
| 30 -a $options.number_of_threads | |
| 31 #end if | |
| 32 " | |
| 33 #if str( $databases_type.databases_selector ) == 'history': | |
| 34 --buildtrie | |
| 35 #for $db in $databases_type.input_databases | |
| 36 $db.database_name | |
| 37 #end for | |
| 38 #else: | |
| 39 ## databases path is not directly accessible, must match by hand with LOC file contents | |
| 40 ${' '.join([dict([(x[0], x[2]) for x in $databases_type.input_databases.input.options.tool_data_table.data])[y] | |
| 41 for y in $databases_type.input_databases.value])} | |
| 42 #end if | |
| 43 </command> | |
| 44 <inputs> <!-- parameter names declared here are referenced by the command template, the output filters and the tests --> | |
| 45 <conditional name="read_family"> | |
| 46 <param name="read_family_selector" type="select" format="text" | |
| 47 help="The Illumina platform is more common for large scale metatranscriptomic projects requiring a high throughput."> | |
| 48 <label>Sequencing technology of querying sequences (reads)</label> | |
| 49 <option value="--I">Illumina Solexa</option> | |
| 50 <option value="--454">454 Roche</option> | |
| 51 <option value="other">Other</option> | |
| 52 </param> | |
| 53 <when value="other"> | |
| 54 <param name="ratio_parameter" type="float" value="1" min="0" max="1" | |
| 55 label="Ratio parameter (the number of hits on the read / read length)" | |
| 56 help="The ratio parameter for SortMeRNA has been set to r=0.25 for Illumina Solexa reads and to r=0.15 for 454 Roche reads. | |
| 57 For other read types, if the sequencing technology produces high quality reads with a low substitution error rate | |
| 58 (0.1 substitutions per 100 bases, such as Illumina), then the ratio parameter can be set to r=[0.23,0.27]. | |
| 59 If the sequencing technology has a high indel error rate (1-2 indels per 100 bases, such as 454 or Ion Torrent), | |
| 60 then the ratio parameter can be set to r=[0.13,0.17]."/> | |
| 61 </when> | |
| 62 </conditional> | |
| 63 <param format="fasta,fastq" name="input_reads" type="data" label="Querying sequences (reads)" help=""/> | |
| 64 | |
| 65 <conditional name="sequencing_type"> | |
| 66 <param name="sequencing_type_selector" type="select" label="Sequencing type"> | |
| 67 <option value="not_paired">Reads are not paired</option> | |
| 68 <option value="paired">Reads are paired</option> | |
| 69 </param> | |
| 70 <when value="paired"> | |
| 71 <param name="paired_type" type="select" label="If one read of a pair is accepted and the other not, output both reads" display="radio" | |
| 72 help="SortMeRNA does not use the pairing information for filtering RNA, | |
| 73 however if one read of a pair is accepted and the other is not, | |
| 74 the resulting output may break apart the pair into two separate files. | |
| 75 The purpose of 'Reads are paired' option is to preserve the pairing of the reads."> | |
| 76 <option value="--paired-in">to accepted file</option> | |
| 77 <option value="--paired-out">to rejected file</option> | |
| 78 </param> | |
| 79 </when> | |
| 80 </conditional> | |
| 81 | |
| 82 <param name="strand_search" type="select" label="Which strands to search" display="radio"> | |
| 83 <option value="">Search both strands</option> | |
| 84 <option value="-F">Search only the forward strand</option> | |
| 85 <option value="-R">Search only the reverse-complementary strand</option> | |
| 86 </param> | |
| 87 | |
| 88 <conditional name="databases_type"> | |
| 89 <param name="databases_selector" type="select" label="Databases to query" | |
| 90 help="Public rRNA databases provided with SortMeRNA have been indexed. | |
| 91 On the contrary, personal databases must be indexed each time SortMeRNA is launched. | |
| 92 Please be patient, this may take some time depending on the size of the given database."> | |
| 93 <option value="cached" selected="true">Public ribosomal databases</option> | |
| 94 <option value="history">Databases from your history</option> | |
| 95 </param> | |
| 96 <when value="cached"> | |
| 97 <param name="input_databases" label="rRNA database" | |
| 98 type="select" display="checkboxes" multiple="true"> | |
| 99 <options from_data_table="rRNA_databases" /> | |
| 100 <validator type="no_options" message="Select at least one database"/> | |
| 101 </param> | |
| 102 </when> | |
| 103 <when value="history"> | |
| 104 <repeat name="input_databases" title="Database" min="1"> | |
| 105 <param name="database_name" type="data" format="fasta" label="rRNA database" | |
| 106 help="Your database will be indexed first, which may take up to several minutes."/> | |
| 107 </repeat> | |
| 108 </when> | |
| 109 </conditional> | |
| 110 | |
| 111 <!-- Outputs: which read files to produce (accepted and/or rejected reads), plus an optional statistics log --> | |
| 112 <param name="outputs_selected" type="select" display="checkboxes" multiple="true" label="Output options"> | |
| 113 <option value="accept" selected="True">Reads matching to at least one database</option> | |
| 114 <option value="other">Reads not found in any database</option> | |
| 115 </param> | |
| 116 <param name="log" type="boolean" checked="False" truevalue="--log log_file" falsevalue="" label="Statistics file" | |
| 117 help="Generates statistics for the rRNA content of reads, as well as rRNA subunit distribution."> | |
| 118 </param> | |
| 119 | |
| 120 <!-- Advanced options: choosing "more" exposes the thread count, "less" adds no parameters --> | |
| 121 <conditional name="options"> | |
| 122 <param name="options_type_selector" type="select" label="Advanced Options"> | |
| 123 <option value="less" selected="True">Less options</option> | |
| 124 <option value="more">More options</option> | |
| 125 </param> | |
| 126 <when value="less"> | |
| 127 <!-- no additional parameters for this choice --> | |
| 128 </when> | |
| 129 <when value="more"> | |
| 130 <param name="number_of_threads" type="integer" label="Number of threads to use" value="1" min="1"/> | |
| 131 </when> | |
| 132 </conditional> | |
| 133 </inputs> | |
| 134 <outputs> <!-- each dataset below is only produced when its filter expression holds --> | |
| 135 <data name="output_accept" format="input" format_source="input_reads" from_work_dir="accept_file.dat" | |
| 136 label="Matching reads on ${on_string} (${input_reads.datatype.file_ext})"> | |
| 137 <filter>outputs_selected and 'accept' in outputs_selected</filter> | |
| 138 </data> | |
| 139 <data name="output_other" format="input" format_source="input_reads" from_work_dir="other_file.dat" | |
| 140 label="Reads not found on ${on_string} (${input_reads.datatype.file_ext})"> | |
| 141 <filter>outputs_selected and 'other' in outputs_selected</filter> | |
| 142 </data> | |
| 143 <data name="output_log" format="txt" from_work_dir="log_file.log" label="${tool.name} statistics (txt)"> | |
| 144 <filter>log</filter> | |
| 145 </data> | |
| 146 </outputs> | |
| 147 <stdio> <!-- mark the job as failed when either pattern below appears on stdout or stderr --> | |
| 148 <regex match="This program builds a Burst trie on an input rRNA database" | |
| 149 source="both" | |
| 150 level="fatal" | |
| 151 description="Buildtrie program failed to execute." /> | |
| 152 <regex match="The database name" | |
| 153 source="both" | |
| 154 level="fatal" | |
| 155 description="The selected database has not been preprocessed using buildtrie before using SortMeRNA." /> | |
| 156 </stdio> | |
| 157 <tests> | |
| 158 <test> <!-- unpaired Illumina reads against two cached Rfam databases, with both read outputs requested --> | |
| 159 <param name="read_family_selector" value="--I" /> | |
| 160 <param name="input_reads" value="sortmerna_wrapper_in1.fastq" /> | |
| 161 <param name="sequencing_type_selector" value="not_paired" /> | |
| 162 <param name="strand_search" value="" /> | |
| 163 <param name="databases_selector" value="cached" /> | |
| 164 <param name="input_databases" value="rfam-5.8s,rfam-5s" /> | |
| 165 <param name="outputs_selected" value="accept,other" /> | |
| 166 <param name="log" value="" /> | |
| 167 <param name="options_type_selector" value="less" /> | |
| 168 <output name="output_accept" file="sortmerna_wrapper_accept1.fastq" /> | |
| 169 <output name="output_other" file="sortmerna_wrapper_other1.fastq" /> | |
| 170 </test> | |
| 171 </tests> | |
| 172 <help> | |
| 173 **Overview** | |
| 174 | |
| 175 SortMeRNA_ is software designed to rapidly filter ribosomal RNA fragments | |
| 176 from metatranscriptomic data produced by next-generation sequencers. | |
| 177 It is capable of handling large RNA databases and sorting out all fragments | |
| 178 matching to the database with high accuracy and specificity. | |
| 179 | |
| 180 .. _SortMeRNA: http://bioinfo.lifl.fr/RNA/sortmerna/ | |
| 181 | |
| 182 If you use this tool, please cite Kopylova E., Noé L. and Touzet H., | |
| 183 `"SortMeRNA: Fast and accurate filtering of ribosomal RNAs in metatranscriptomic data"`__, | |
| 184 Bioinformatics (2012), doi: 10.1093/bioinformatics/bts611. | |
| 185 | |
| 186 .. __: http://bioinformatics.oxfordjournals.org/content/28/24/3211 | |
| 187 | |
| 188 ------ | |
| 189 | |
| 190 **Input** | |
| 191 | |
| 192 The input is one file of reads in FASTA or FASTQ format and any number of rRNA databases to search against. | |
| 193 If the user has two forward-reverse paired-sequencing reads files, they may use | |
| 194 the script "merge_paired_reads.sh" to interleave the reads into one file, preserving their order. | |
| 195 | |
| 196 If the sequencing type for the reads is paired-end, the user has two options under | |
| 197 "Sequencing type" to filter the reads and preserve their order in the file. | |
| 198 For a further example of each option, please refer to Section 4.2.3 in the `SortMeRNA User Manual`_. | |
| 199 | |
| 200 .. _sortmerna user manual: http://bioinfo.lifl.fr/RNA/sortmerna/code/SortMeRNA-user-manual-v1.7.pdf | |
| 201 | |
| 202 ------ | |
| 203 | |
| 204 **Output** | |
| 205 | |
| 206 The output will follow the same format (FASTA or FASTQ) as the reads. | |
| 207 | |
| 208 In the standalone version of SortMeRNA, the user may output the matching reads in a separate file per database (--bydbs option). This option will be made available in a future version of Galaxy. | |
| 209 | |
| 210 ------ | |
| 211 | |
| 212 **rRNA databases** | |
| 213 | |
| 214 SortMeRNA is distributed with 8 representative rRNA databases, which were | |
| 215 all constructed from the SILVA SSU,LSU (version 111) and the RFAM 5/5.8S | |
| 216 (version 11.0) databases using the tool UCLUST. | |
| 217 | |
| 218 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 219 | Representative database | id % | average id % | # seq | Origin | # seq | filtered to remove | | |
| 220 +==========================+======+==============+=======+========================+========+====================+ | |
| 221 | SILVA 16S bacteria | 85 | 91.6 | 8174 | SILVA SSU Ref NR v.111 | 244077 | 23s | | |
| 222 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 223 | SILVA 16S archaea | 95 | 96.7 | 3845 | SILVA SSU Ref NR v.111 | 10919 | 23s | | |
| 224 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 225 | SILVA 18S eukarya | 95 | 96.7 | 4512 | SILVA SSU Ref NR v.111 | 31862 | 26s,28s,23s | | |
| 226 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 227 | | | |
| 228 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 229 | SILVA 23S bacteria | 98 | 99.4 | 3055 | SILVA LSU Ref v.111 | 19580 | 16s,26s,28s | | |
| 230 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 231 | SILVA 23S archaea | 98 | 99.5 | 164 | SILVA LSU Ref v.111 | 405 | 16s,26s,28s | | |
| 232 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 233 | SILVA 28S eukarya | 98 | 99.1 | 4578 | SILVA LSU Ref v.111 | 9321 | 18s | | |
| 234 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 235 | | | |
| 236 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 237 | Rfam 5S archaea/bacteria | 98 | 99.2 | 59513 | RFAM | 116760 | | | |
| 238 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 239 | Rfam 5.8S eukarya | 98 | 98.9 | 13034 | RFAM | 225185 | | | |
| 240 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+ | |
| 241 | |
| 242 | |
| 243 id % : | |
| 244 members of the cluster must have at least 'id %' identity with the representative sequence | |
| 245 | |
| 246 average id % : | |
| 247 average identity of a cluster member to the representative sequence | |
| 248 | |
| 249 The user may also choose to use their own rRNA databases. | |
| 250 | |
| 251 .. class:: warningmark | |
| 252 | |
| 253 Note that your personal databases are indexed each time, and that | |
| 254 this may take some time depending on the size of the given database. | |
| 255 | |
| 256 ------ | |
| 257 | |
| 258 **SortMeRNA parameter list** | |
| 259 | |
| 260 The standalone, command-line version of SortMeRNA uses the following parameters. | |
| 261 | |
| 262 For indexing (buildtrie): | |
| 263 | |
| 264 This program builds a Burst trie on an input rRNA database file in fasta format | |
| 265 and stores the material in binary files under the folder '/automata':: | |
| 266 | |
| 267 ./buildtrie --db [path to rrnas database file name {.fasta}] {OPTIONS} | |
| 268 | |
| 269 The list of OPTIONS can be left blank, the default values will be used:: | |
| 270 | |
| 271 -L length of the sliding window (the seed) | |
| 272 (default: 18) | |
| 273 | |
| 274 -F search only the forward strand | |
| 275 -R search only the reverse-complementary strand | |
| 276 (default: both strands are searched) | |
| 277 | |
| 278 -h help | |
| 279 | |
| 280 | |
| 281 | |
| 282 | |
| 283 For sorting (sortmerna): | |
| 284 | |
| 285 To run SortMeRNA, type in any order after 'sortmerna':: | |
| 286 | |
| 287 --I [illumina reads file name {fasta/fastq}] | |
| 288 | |
| 289 --454 [roche 454 reads file name {fasta/fastq}] | |
| 290 | |
| 291 -n number of databases to use (must precede --db) | |
| 292 | |
| 293 --db [rrnas database name(s)] | |
| 294 | |
| 295 One database, | |
| 296 ex 1. -n 1 --db /path1/database1.fasta | |
| 297 | |
| 298 Multiple databases, | |
| 299 ex 2. -n 2 --db /path2/database2.fasta /path3/database3.fasta | |
| 300 | |
| 301 {OPTIONS} | |
| 302 | |
| 303 The list of OPTIONS can be left blank, the default values will be used:: | |
| 304 | |
| 305 --accept [accepted reads file name] | |
| 306 --other [rejected reads file name] | |
| 307 (default: no output file is created) | |
| 308 | |
| 309 --bydbs output the accepted reads by database | |
| 310 (default: concatenated file of reads) | |
| 311 | |
| 312 --log [overall statistics file name] | |
| 313 (default: no statistics file created) | |
| 314 | |
| 315 --paired-in put both paired-end reads into --accept file | |
| 316 --paired-out put both paired-end reads into --other file | |
| 317 (default: if one read is accepted and the other is not, | |
| 318 separate the reads into --accept and --other files) | |
| 319 | |
| 320 -r ratio of the number of hits on the read / read length | |
| 321 (default Illumina: 0.25, Roche 454: 0.15) | |
| 322 | |
| 323 -F search only the forward strand | |
| 324 -R search only the reverse-complementary strand | |
| 325 (default: both strands are searched) | |
| 326 | |
| 327 -a number of threads to use | |
| 328 (default: 1) | |
| 329 | |
| 330 -m (m x 4096 bytes) for loading the reads into memory | |
| 331 ex. '-m 4' means 4*4096 = 16384 bytes will be allocated for the reads | |
| 332 note: maximum -m is 1020039 | |
| 333 (default: m = 262144 = 1GB) | |
| 334 | |
| 335 -v verbose | |
| 336 (default: deactivated) | |
| 337 | |
| 338 -h help | |
| 339 | |
| 340 --version version number | |
| 341 | |
| 342 ------ | |
| 343 | |
| 344 **Bibliography** | |
| 345 | |
| 346 [1] Quast C, Pruesse E, Yilmaz P, Gerken J, Schweer T, Yarza P, Peplies J, Glöckner FO (2013) The SILVA ribosomal RNA gene database project: improved data processing and web-based tools, Nucleic Acids Research, 41 (D1): D590-D596. | |
| 347 | |
| 348 [2] Rfam 11.0: 10 years of RNA families. S.W. Burge, J. Daub, R. Eberhardt, J. Tate, L. Barquist, E.P. Nawrocki, S.R. Eddy, P.P. Gardner, A. Bateman. Nucleic Acids Research (2012), doi: 10.1093/nar/gks1005 | |
| 349 | |
| 350 [3] Edgar, R.C. (2010) Search and clustering orders of magnitude faster than BLAST, Bioinformatics 26(19), 2460-2461, doi: 10.1093/bioinformatics/btq461 | |
| 351 | |
| 352 [4] Loman, N. J. and Misra, Raju V and Dallman, Timothy J and Constantinidou, Chrystala and Gharbia, Saheer E and Wain, John and Pallen, Mark J., Performance comparison of benchtop high-throughput sequencing platforms (2012), Nature Biotechnology, 30 (5). pp. 434-439 | |
| 353 </help> | |
| 354 </tool> |
