comparison interproscan5/interproscan.xml @ 0:f41c8f299270 draft default tip

Untested version
author mkh
date Sat, 16 Jan 2016 12:30:10 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f41c8f299270
1 <tool id="interproscan" name="Interproscan functional predictions of ORFs" version="5.0.1">
2 <description>Interproscan functional predictions of ORFs</description>
3 <requirements> <!-- external packages and environment settings this wrapper declares as dependencies -->
4 <requirement type="package">signalp</requirement>
5 <requirement type="package">phobius</requirement>
6 <requirement type="package">tmhmm</requirement>
7 <requirement type="set_environment">INTERPROSCAN_SCRIPT_PATH</requirement> <!-- read by the command template to locate create_index.py -->
8 </requirements>
9
10 <command>
11 ## Run InterProScan 5 locally. For SVG/HTML the result archive is unpacked into the extra-files directory of the output dataset.
12 interproscan.sh
13 ## disables the precalculated lookup service, all calculation will be run locally
14 -dp
15 --input '$infile'
16 --seqtype $seqtype
17 -f $oformat
18 --applications $appl
19 --tempdir "\${TEMP:-/tmp}"
20 ## each of the following expands to its command-line flag or to an empty string
21 $pathways
22 $goterms
23 $iprlookup
24
25 #if str($oformat) in ['SVG', 'HTML']:
26 --output-file-base '$outfile'
27 2>&amp;1 &amp;&amp;
28 mkdir -p '$outfile.files_path' &amp;&amp;
29 #set temp_archive_file = str($outfile) + '.' + str($oformat).lower() + '.tar.gz'
30 tar -C '$outfile.files_path' -xvmzf '$temp_archive_file' &amp;&amp;
31 python "\$INTERPROSCAN_SCRIPT_PATH/create_index.py" '$outfile' '$outfile.files_path' &amp;&amp;
32 rm '$temp_archive_file'
33 #else:
34 -o '$outfile'
35 2>&amp;1
36 #end if
37 </command>
38
39 <inputs>
40 <param name="infile" type="data" format="fasta" label="Protein or nucleotide FASTA file" help="The sequences must match the sequence type selected below."/>
41
42 <param type="select" name="seqtype" help="" label="Type of the input sequences"> <!-- the chosen value is handed to interproscan.sh as the sequence type -->
43 <option selected="true" value="p">Protein</option>
44 <option value="n">DNA / RNA</option>
45 </param>
46
47 <param name="appl" type="select" multiple="True" display="checkboxes" label="Applications to run"
48 help="Select the analyses to run."> <!-- the selection is passed to interproscan.sh through the applications option -->
49 <option value="TIGRFAM" selected="true">TIGRFAM: protein families based on Hidden Markov Models or HMMs
50 </option>
51 <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a
52 hierarchical order (evolutionary relationships)
53 </option>
54 <option value="ProDom" selected="true">ProDom: set of protein domain families generated from the UniProtKB
55 </option>
56 <option value="Panther" selected="true">Panther: Protein ANalysis THrough Evolutionary Relationships
57 </option>
58 <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on
59 Hidden Markov Models or HMMs
60 </option>
61 <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional
62 sites as well as associated profiles to identify them
63 </option>
64 <option value="PrositePatterns" selected="true">PROSITE Pattern: protein domains, families and functional
65 sites as well as associated patterns to identify them
66 </option>
67 <option value="HAMAP" selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes
68 </option>
69 <option value="PfamA" selected="true">PfamA: protein families, each represented by multiple sequence
70 alignments and hidden Markov models
71 </option>
72 <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise
73 a protein family
74 </option>
75 <option value="SuperFamily" selected="true">SUPERFAMILY: database of structural and functional annotation
76 </option>
77 <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option>
78 <option value="Gene3d" selected="true">Gene3d: Structural assignment for whole genes and genomes using the
79 CATH domain structure database
80 </option>
81 <option value="SignalP-GRAM_POSITIVE" selected="false">SignalP Gram Positive Bacteria</option>
82 <option value="SignalP-GRAM_NEGATIVE" selected="false">SignalP Gram Negative Bacteria</option>
83 <option value="SignalP-EUK" selected="true">SignalP Eukaryotes</option>
84 <option value="Phobius" selected="true">Phobius: combined transmembrane topology and signal peptide
85 predictor
86 </option>
87 <option value="TMHMM" selected="true">TMHMM: Prediction of transmembrane helices in proteins</option>
88 </param>
89
90 <param name="pathways" type="boolean" checked="True" truevalue="--pathways" falsevalue=""
91 label="Include pathway information"
92 help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries. (--pathways)"/>
93 <param name="goterms" type="boolean" checked="True" truevalue="--goterms" falsevalue=""
94 label="Include Gene Ontology (GO) mappings"
95 help="Look up of corresponding Gene Ontology annotation. Implies -iprlookup option. (--goterms)"/>
96 <param name="iprlookup" type="boolean" checked="False" truevalue="--iprlookup" falsevalue=""
97 label="Provide additional mappings"
98 help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into (--iprlookup)"/>
99
100 <param name="oformat" type="select" label="Output format" help="Please select an output format."> <!-- SVG and HTML take the archive-unpacking branch of the command template -->
101 <option value="TSV" selected="true">Tab-separated values format (TSV)</option>
102 <option value="GFF3">GFF3</option>
103 <option value="SVG">SVG</option>
104 <option value="HTML">HTML</option>
105 <option value="XML">XML</option>
106 </param>
107 </inputs>
108
109 <outputs> <!-- one dataset; its datatype follows the selected output format, with tabular as the default for TSV -->
110 <data format="tabular" name="outfile" label="Interproscan calculation on ${on_string}">
111 <change_format>
112 <when input="oformat" value="HTML" format="html"/>
113 <when input="oformat" value="XML" format="xml"/>
114 <when input="oformat" value="SVG" format="html"/>
115 <when input="oformat" value="GFF3" format="gff3"/>
116 </change_format>
117 </data>
118
119 </outputs>
120
121
122
123
124 <help>
125 **What it does and does not do**
126
127 Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches
128 of profile and other functional databases.
129
130
131 #####
132 Input
133 #####
134
135 Required is a FASTA file containing protein or nucleotide sequences.
136
137
138 ######
139 Output
140 ######
141
142 In this version of InterProScan_, you can retrieve output in any of the following five formats:
143
144 * TSV: a simple tab-delimited file format
145 * XML: the new "IMPACT" XML format (XSD available here_).
146 * GFF: The `GFF 3.0`_ format
147 * HTML: An HTML representation of the protein matches
148 * SVG: A Scalable Vector Graphics representation of the protein matches
149
150
151 .. _`GFF 3.0`: http://gmod.org/wiki/GFF#GFF3_Format
152 .. _here: http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5
153
154
155 Tab-separated values format (TSV)
156 =================================
157
158 Basic tab delimited format.
159
160
161 Example Output
162 --------------
163
164 ::
165
166 P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain
167 1 2670 2799 7.9E-43 T 15-03-2013
168 P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T
169 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302
170 P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013
171 ...
172
173
174 The TSV format presents the match data in columns as follows:
175
176 - Protein Accession (e.g. P51587)
177 - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
178 - Sequence Length (e.g. 3418)
179 - Analysis (e.g. Pfam / PRINTS / Gene3D)
180 - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
181 - Signature Description (e.g. BRCA2 repeat profile)
182 - Start location
183 - Stop location
184 - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52)
185 - Status - is the status of the match (T: true)
186 - Date - is the date of the run
187 - (InterProScan_ annotations - accession (e.g. IPR002093) - optional column; only displayed if --iprlookup option
188 is switched on)
189 - (InterProScan_ annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if --iprlookup
190 option is switched on)
191 - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on)
192 - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on)
193
194
195 Extensible Markup Language (XML)
196 ================================
197
198 XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is
199 available here_.
200
201 Example Output
202 --------------
203
204 .. image:: $PATH_TO_IMAGES/example_xml_output.png
205
206
207 Generic Feature Format Version 3 (GFF3)
208 =======================================
209
210 The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to
211 trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format
212 representation of the predicted protein sequences and their matches. You will find documentation of all the
213 columns and attributes used at http://www.sequenceontology.org/gff3.shtml.
214
215 Example Output
216 --------------
217
218 ::
219
220 ##gff-version 3
221 ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
222 ##sequence-region AACH01000027 1 1347
223 ##seqid|source|type|start|end|score|strand|phase|attributes
224 AACH01000027 provided_by_user nucleic_acid 1 1347 . + .
225 Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
226 AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1
227 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
228 AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
229 AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase
230 family;Target=null 84
231 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
232 ##sequence-region 2
233 ...
234 >pep_AACH01000027_1_1347
235 LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
236 LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
237 GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
238 LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
239 ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
240 TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
241 DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
242 RSQKAKGVLIYRDDWISITPEIQLLFTEF
243 ...
244 >match$8_84_314
245 KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
246 RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
247 LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
248 AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS
249
250
251 Scalable Vector Graphics (SVG) and HyperText Markup Language (HTML)
252 ====================================================================
253
254 InterProScan_ 5 outputs a single HTML/SVG file for each protein sequence analysed.
255
256
257 Example Output
258 --------------
259
260 .. image:: $PATH_TO_IMAGES/P51587.svg.png
261
262 .. _InterProScan: http://www.ebi.ac.uk/interpro
263
264
265 ----------
266 References
267 ----------
268
269
270 If you use this Galaxy tool in work leading to a scientific publication please
271 cite the following papers:
272
273 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
274 Galaxy tools and workflows for sequence analysis with applications
275 in molecular plant pathology. PeerJ 1:e167
276 http://dx.doi.org/10.7717/peerj.167
277
278 Zdobnov EM, Apweiler R (2001)
279 InterProScan an integration platform for the signature-recognition methods in InterPro.
280 Bioinformatics 17, 847-848.
281 http://dx.doi.org/10.1093/bioinformatics/17.9.847
282
283 Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005)
284 InterProScan: protein domains identifier.
285 Nucleic Acids Research 33 (Web Server issue), W116-W120.
286 http://dx.doi.org/10.1093/nar/gki442
287
288 Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn
289 RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J,
290 McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ,
291 Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009)
292 InterPro: the integrative protein signature database.
293 Nucleic Acids Research 37 (Database Issue), D224-228.
294 http://dx.doi.org/10.1093/nar/gkn785
295
296
297 This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at
298 http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5
299
300
301 **Galaxy Wrapper Author**::
302
303 * Bjoern Gruening, University of Freiburg
304 * Konrad Paszkiewicz, University of Exeter
305
306 </help>
307 </tool>