comparison tools/filters/get_orfs_or_cdss.xml @ 0:9cff9a1176ea

Uploaded v0.0.1
author peterjc
date Thu, 19 Jan 2012 10:17:10 -0500
parents
children 922d69bd5258
comparison
equal deleted inserted replaced
-1:000000000000 0:9cff9a1176ea
1 <tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.1">
2 <description>e.g. to get peptides from ESTs</description>
3 <command interpreter="python">
4 get_orfs_or_cdss.py '$input_file' $input_file.ext $table $ftype $ends $mode $min_len $strand '$out_nuc_file' '$out_prot_file'
5 </command> <!-- The three dataset paths are single-quoted so the shell hands each one to the script as a single argument even if it contains spaces or metacharacters. -->
6 <inputs>
7 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." /> <!-- Nucleotide sequences to scan; the command line passes both this dataset and its extension ($input_file.ext) to the script. -->
8 <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons"> <!-- Each option value is the table number shown at the start of its label; numbers 7, 8 and 17 to 20 are not offered. -->
9 <option value="1">1. Standard</option>
10 <option value="2">2. Vertebrate Mitochondrial</option>
11 <option value="3">3. Yeast Mitochondrial</option>
12 <option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
13 <option value="5">5. Invertebrate Mitochondrial</option>
14 <option value="6">6. Ciliate Macronuclear and Dasycladacean</option>
15 <option value="9">9. Echinoderm Mitochondrial</option>
16 <option value="10">10. Euplotid Nuclear</option>
17 <option value="11">11. Bacterial</option>
18 <option value="12">12. Alternative Yeast Nuclear</option>
19 <option value="13">13. Ascidian Mitochondrial</option>
20 <option value="14">14. Flatworm Mitochondrial</option>
21 <option value="15">15. Blepharisma Macronuclear</option>
22 <option value="16">16. Chlorophycean Mitochondrial</option>
23 <option value="21">21. Trematode Mitochondrial</option>
24 <option value="22">22. Scenedesmus obliquus</option>
25 <option value="23">23. Thraustochytrium Mitochondrial</option>
26 </param>
27 <param name="ftype" type="select" label="Look for ORFs or CDSs"> <!-- Chooses ORF or CDS mode; the value is passed to the script as $ftype and also prefixes both output labels. The former value="True" matched neither option and was removed. -->
28 <option value="ORF" selected="true">Look for ORFs (check for stop codons only, ignore start codons)</option>
29 <option value="CDS">Look for CDSs (with start and stop codons)</option>
30 </param>
31 <param name="ends" type="select" value="open" label="Sequence end treatment"> <!-- Passed to the script as $ends. NOTE(review): value="open" names the first option; confirm whether Galaxy honours a value attribute on select parameters. -->
32 <option value="open">Open ended (will allow missing start/stop codons at the ends)</option>
33 <option value="closed">Complete (will check for start/stop codons at the ends)</option>
34 <!-- TODO? Circular, for using this on finished bacteria etc -->
35 </param>
36
37 <param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF."> <!-- Decides how many hits per input sequence are kept; the chosen value (all, top or one) is passed to the script as $mode. -->
38 <option value="all">All ORFs/CDSs from each sequence</option>
39 <option value="top">All ORFs/CDSs from each sequence with the maximum length</option>
40 <option value="one">First ORF/CDS from each sequence with the maximum length</option>
41 </param>
42 <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)"> <!-- Length threshold, default 30, passed to the script as $min_len; the label states the unit is amino acids. -->
43 </param>
44 <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing)."> <!-- Selects which strand(s) are searched; the value is passed to the script as $strand. -->
45 <option value="both">Search both the forward and reverse strand</option>
46 <option value="forward">Only search the forward strand</option>
47 <option value="reverse">Only search the reverse strand</option>
48 </param>
49 </inputs>
50 <outputs> <!-- Two FASTA datasets, one of nucleotides and one of amino acids; each label starts with the selected ftype value (ORF or CDS) followed by "s". -->
51 <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" />
52 <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" />
53 </outputs>
54 <tests> <!-- All three tests use get_orf_input.fasta in CDS mode on the forward strand with mode "all" and min_len 10; they differ in genetic code table and end treatment. -->
55 <test> <!-- Table 1 (Standard), open ends. -->
56 <param name="input_file" value="get_orf_input.fasta" />
57 <param name="table" value="1" />
58 <param name="ftype" value="CDS" />
59 <param name="ends" value="open" />
60 <param name="mode" value="all" />
61 <param name="min_len" value="10" />
62 <param name="strand" value="forward" />
63 <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" />
64 <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" />
65 </test>
66 <test> <!-- Table 11 (Bacterial), closed ends. -->
67 <param name="input_file" value="get_orf_input.fasta" />
68 <param name="table" value="11" />
69 <param name="ftype" value="CDS" />
70 <param name="ends" value="closed" />
71 <param name="mode" value="all" />
72 <param name="min_len" value="10" />
73 <param name="strand" value="forward" />
74 <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" />
75 <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" />
76 </test>
77 <test> <!-- Table 11 (Bacterial), open ends. -->
78 <param name="input_file" value="get_orf_input.fasta" />
79 <param name="table" value="11" />
80 <param name="ftype" value="CDS" />
81 <param name="ends" value="open" />
82 <param name="mode" value="all" />
83 <param name="min_len" value="10" />
84 <param name="strand" value="forward" />
85 <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" />
86 <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" />
87 </test>
88 </tests>
89 <requirements> <!-- Declares the Python module "Bio" as this tool's only listed dependency; the help text identifies it as Biopython. -->
90 <requirement type="python-module">Bio</requirement>
91 </requirements>
92 <help>
93
94 **What it does**
95
96 Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ
97 and Standard Flowgram Format (SFF) are supported), and searches each sequence
98 for open reading frames (ORFs) or potential coding sequences (CDSs) of the
99 given minimum length. These are returned as FASTA files of nucleotides and
100 protein sequences.
101
102 You can choose to have all the ORFs/CDSs above the minimum length for each
103 sequence (similar to the EMBOSS getorf tool), all those sharing the maximum
104 length, or the first ORF/CDS with the maximum length (in the special case
105 where a sequence encodes two or more long ORFs/CDSs of the same length). The
106 last option is a reasonable choice when the input sequences represent EST or
107 mRNA sequences, where only one ORF/CDS is expected.
108
109 Note that if no ORFs/CDSs in a sequence match the criteria, there will be no
110 output for that sequence.
111
112 Also note that the ORFs/CDSs are assigned modified identifiers to distinguish
113 them from the original full length sequences, by appending a suffix.
114
115 The start and stop codons are taken from the `NCBI Genetic Codes
116 &lt;http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi&gt;`_.
117 When searching for ORFs, the sequences will run from stop codon to stop
118 codon, and any start codons are ignored. When searching for CDSs, the first
119 potential start codon will be used, giving the longest possible CDS within
120 each ORF, and thus the longest possible protein sequence. This is useful
121 for things like BLAST or domain searching, but since this may not be the
122 correct start codon, it may not be appropriate for signal peptide detection
123 etc.
124
125 **Example Usage**
126
127 Given some EST sequences (Sanger capillary reads) assembled into unigenes,
128 or a transcriptome assembly from some RNA-Seq, each of your nucleotide
129 sequences should (barring sequencing, assembly errors, frame-shifts etc)
130 encode one protein as a single ORF/CDS, which you wish to extract (and
131 perhaps translate into amino acids).
132
133 If your RNA-Seq data was strand specific, and assembled taking this into
134 account, you should only search for ORFs/CDSs on the forward strand.
135
136 **Citation**
137
138 This tool uses Biopython. If you use this tool in scientific work leading
139 to a publication, please cite the Biopython application note (and Galaxy
140 too of course):
141
142 Cock et al 2009. Biopython: freely available Python tools for computational
143 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
144 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
145
146 </help>
147 </tool>