comparison interproscan5/interproscan.xml @ 9:1d6b3be059c8 draft default tip

Name change. Removed redundant tool def.
author mkh
date Mon, 08 Feb 2016 12:10:24 -0500
parents f2153ec1ccfd
children
comparison
equal deleted inserted replaced
8:f2153ec1ccfd 9:1d6b3be059c8
1 <tool id="interproscan" name="Interproscan functional predictions of ORFs" version="5.0.0a">
2 <description>Interproscan functional predictions of ORFs</description>
<!-- NOTE(review): the description repeats the tool name verbatim. -->
3 <requirements>
<!-- The package requirements below are disabled. The matching SignalP, Phobius and TMHMM
     applications are still offered in the "appl" parameter, but are not selected by default. -->
4 <!--
5 <requirement type="package">signalp</requirement>
6 <requirement type="package">phobius</requirement>
7 <requirement type="package">tmhmm</requirement>
8 -->
<!-- Environment variable the command reads to locate create_index.py.
     NOTE(review): the command also reads IPS_HOME and TEMP, which are not declared here. -->
9 <requirement type="set_environment">INTERPROSCAN_SCRIPT_PATH</requirement>
10 </requirements>
11
12 <command>
13 #import os
## Run the InterProScan launcher found under the IPS_HOME environment variable.
## The backslash-escaped dollar signs are left for the shell to expand at run time.
14 \$IPS_HOME/interproscan.sh
15 ## disables the precalculated lookup service, all calculation will be run locally
16 -dp
17 --input $infile
18 --seqtype $seqtype
19 -f $oformat
20 --applications $appl
21 --tempdir \$TEMP
22
## Each of these parameters expands to its option string, or to an empty string when unchecked.
23 $pathways
24 $goterms
25 $iprlookup
26 $mode
27
## SVG and HTML: InterProScan is given an output base name, and the resulting
## (outfile).(format).tar.gz archive is unpacked into the dataset's files_path directory.
## create_index.py is then called with the dataset and that directory
## (presumably to write an index page into the dataset; confirm against the script),
## and the archive is removed. stderr is merged into stdout for the InterProScan call.
28 #if str($oformat) in ['SVG', 'HTML']:
29 --output-file-base $outfile
30 2>&#38;1;
31 mkdir -p $outfile.files_path;
32 #set temp_archive_file = str($outfile) + '.' + str($oformat).lower() + '.tar.gz'
33 tar -C $outfile.files_path -xvmzf $temp_archive_file;
34 python \$INTERPROSCAN_SCRIPT_PATH/create_index.py $outfile $outfile.files_path;
35 rm $temp_archive_file
## Every other format is written straight to the output dataset.
36 #else:
37 -o $outfile
38 2>&#38;1
39 #end if
40 </command>
41
42 <inputs>
<!-- Every parameter in this section is substituted by name into the command template. -->
<!-- Sequences to analyse; "seqtype" below states whether they are protein or nucleotide. -->
43 <param name="infile" type="data" format="fasta" label="Protein or nucleotide FASTA file"/>
44
45 <param name="seqtype" type="select" label="Type of the input sequences" help="">
46 <option value="p" selected="true">Protein</option>
47 <option value="n">DNA / RNA</option>
48 </param>
49
<!-- When checked, adds the cluster mode options with a fixed cluster run id to the command line. -->
50 <param name="mode" type="boolean" label="Run on cluster?" help="Check to submit job to cluster."
51 truevalue="--mode=cluster --clusterrunid=gtdi-ips-analysis"
52 falsevalue=""/>
53
<!-- Analyses to run. SignalP, Phobius and TMHMM are offered but not selected by default;
     their package requirements are commented out at the top of the tool. -->
54 <param name="appl" type="select" multiple="True" display="checkboxes" label="Applications to run"
55 help="Select the analyses to run.">
56 <option value="TIGRFAM"
57 selected="true">TIGRFAM: protein families based on Hidden Markov Models or HMMs</option>
58 <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a hierarchical order (evolutionary relationships)</option>
59 <option value="ProDom"
60 selected="true">ProDom: set of protein domain families generated from the UniProtKB</option>
61 <option value="Panther"
62 selected="true">Panther: Protein ANalysis THrough Evolutionary Relationships</option>
63 <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on Hidden Markov Models or HMMs</option>
64 <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional sites as well as associated profiles to identify them</option>
65 <option value="PrositePatterns" selected="true">PROSITE Patterns: protein domains, families and functional sites as well as associated patterns to identify them</option>
66 <option value="HAMAP"
67 selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes</option>
68 <option value="PfamA" selected="true">PfamA: protein families, each represented by multiple sequence alignments and hidden Markov models</option>
69 <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise a protein family</option>
70 <option value="SuperFamily"
71 selected="true">SUPERFAMILY: database of structural and functional annotation</option>
72 <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option>
73 <option value="Gene3d" selected="true">Gene3d: Structural assignment for whole genes and genomes using the CATH domain structure database</option>
74 <option value="SignalP-GRAM_POSITIVE" selected="false">SignalP Gram Positive Bacteria</option>
75 <option value="SignalP-GRAM_NEGATIVE" selected="false">SignalP Gram Negative Bacteria</option>
76 <option value="SignalP-EUK" selected="false">SignalP Eukaryotes</option>
77 <option value="Phobius" selected="false">Phobius: combined transmembrane topology and signal peptide predictor</option>
78 <option value="TMHMM" selected="false">TMHMM: Prediction of transmembrane helices in proteins</option>
79 </param>
80
<!-- Optional annotation switches; each one contributes its truevalue to the command line when checked. -->
81 <param name="pathways" truevalue="--pathways" falsevalue="" checked="True" type="boolean"
82 label="Include pathway information"
83 help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries. (--pathways)"/>
84 <param name="goterms" truevalue="--goterms" falsevalue="" checked="True" type="boolean"
85 label="Include Gene Ontology (GO) mappings"
86 help="Look up of corresponding Gene Ontology annotation. Implies --iprlookup option. (--goterms)"/>
87 <param name="iprlookup" truevalue="--iprlookup" falsevalue="" checked="False" type="boolean"
88 label="Provide additional mappings"
89 help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into (--iprlookup)"/>
90
<!-- Output format. SVG and HTML take the archive-unpacking branch of the command,
     and the output dataset format is switched on this value in the outputs section. -->
91 <param name="oformat" type="select" label="Output format" help="Please select an output format.">
92 <option value="TSV" selected="true">Tab-separated values format (TSV)</option>
93 <option value="GFF3">GFF3</option>
94 <option value="SVG">SVG</option>
95 <option value="HTML">HTML</option>
96 <option value="XML">XML</option>
97 </param>
98
99 </inputs>
100
101 <outputs>
<!-- Single output dataset. It is tabular by default (the TSV choice) and is retyped
     according to the value selected for the "oformat" parameter. -->
102 <data name="outfile" format="tabular" label="Interproscan calculation on ${on_string}">
103 <change_format>
104 <when value="GFF3" input="oformat" format="gff"/>
105 <when value="XML" input="oformat" format="xml"/>
106 <when value="HTML" input="oformat" format="html"/>
107 <when value="SVG" input="oformat" format="html"/>
108 </change_format>
109 </data>
110 </outputs>
111
112 <!-- Tool requirements are declared once, in the requirements element near the top of this file. -->
113
114
115 <help>
116
117 **What it does**
118
119 Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches of profile and other functional databases.
120
121
122 #####
123 Input
124 #####
125
126 A FASTA file containing protein or nucleotide sequences is required.
127
128
129 ######
130 Output
131 ######
132
133 In this version of InterProScan_, you can retrieve output in any of the following five formats:
134
135 * TSV: a simple tab-delimited file format
136 * XML: the new "IMPACT" XML format (XSD available here_).
137 * GFF: The `GFF 3.0`_ format
138 * HTML: An HTML representation of the protein matches
139 * SVG: A Scalable Vector Graphics representation of the protein matches
140
141
142 .. _`GFF 3.0`: http://gmod.org/wiki/GFF#GFF3_Format
143 .. _here: http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5
144
145
146
147 Tab-separated values format (TSV)
148 =================================
149
150 Basic tab delimited format.
151
152
153 Example Output
154 --------------
155
156 ::
157
158 P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain 1 2670 2799 7.9E-43 T 15-03-2013
159 P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302
160 P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013
161 ...
162
163
164 The TSV format presents the match data in columns as follows:
165
166 - Protein Accession (e.g. P51587)
167 - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
168 - Sequence Length (e.g. 3418)
169 - Analysis (e.g. Pfam / PRINTS / Gene3D)
170 - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
171 - Signature Description (e.g. BRCA2 repeat profile)
172 - Start location
173 - Stop location
174 - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52)
175 - Status - is the status of the match (T: true)
176 - Date - is the date of the run
177 - (InterPro annotations - accession (e.g. IPR002093) - optional column; only displayed if --iprlookup option is switched on)
178 - (InterPro annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if --iprlookup option is switched on)
179 - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on)
180 - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on)
181
182
183 Extensible Markup Language (XML)
184 ================================
185
186 XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is available here_.
187
188 Example Output
189 --------------
190
191 .. image:: $PATH_TO_IMAGES/example_xml_output.png
192
193
194
195 Generic Feature Format Version 3 (GFF3)
196 =======================================
197
198 The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find documentation of all the columns and attributes used at http://www.sequenceontology.org/gff3.shtml
199
200 Example Output
201 --------------
202
203 ::
204
205 ##gff-version 3
206 ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
207 ##sequence-region AACH01000027 1 1347
208 ##seqid|source|type|start|end|score|strand|phase|attributes
209 AACH01000027 provided_by_user nucleic_acid 1 1347 . + . Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
210 AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
211 AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
212 AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
213 ##sequence-region 2
214 ...
215 >pep_AACH01000027_1_1347
216 LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
217 LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
218 GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
219 LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
220 ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
221 TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
222 DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
223 RSQKAKGVLIYRDDWISITPEIQLLFTEF
224 ...
225 >match$8_84_314
226 KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
227 RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
228 LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
229 AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS
230
231
232 Scalable Vector Graphics (SVG) and HyperText Markup Language (HTML)
233 ====================================================================
234
235 InterProScan_ 5 outputs a single HTML/SVG file for each protein sequence analysed.
236
237
238 Example Output
239 --------------
240
241 .. image:: $PATH_TO_IMAGES/P51587.svg.png
242
243 .. _InterProScan: http://www.ebi.ac.uk/interpro
244
245
246 ----------
247 References
248 ----------
249
250
251 If you use this Galaxy tool in work leading to a scientific publication please
252 cite the following papers:
253
254 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
255 Galaxy tools and workflows for sequence analysis with applications
256 in molecular plant pathology. PeerJ 1:e167
257 http://dx.doi.org/10.7717/peerj.167
258
259 Zdobnov EM, Apweiler R (2001)
260 InterProScan an integration platform for the signature-recognition methods in InterPro.
261 Bioinformatics 17, 847-848.
262 http://dx.doi.org/10.1093/bioinformatics/17.9.847
263
264 Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005)
265 InterProScan: protein domains identifier.
266 Nucleic Acids Research 33 (Web Server issue), W116-W120.
267 http://dx.doi.org/10.1093/nar/gki442
268
269 Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009)
270 InterPro: the integrative protein signature database.
271 Nucleic Acids Research 37 (Database Issue), D224-228.
272 http://dx.doi.org/10.1093/nar/gkn785
273
274
275 This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at
276 http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5
277
278
279 **Galaxy Wrapper Author**::
280
281 * Bjoern Gruening, University of Freiburg
282 * Konrad Paszkiewicz, University of Exeter
283
284 </help>
285 </tool>