Mercurial > repos > mkh > play
comparison interproscan5/interproscan.xml @ 0:f41c8f299270 draft default tip
Untested version
| author | mkh |
|---|---|
| date | Sat, 16 Jan 2016 12:30:10 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f41c8f299270 |
|---|---|
| 1 <tool id="interproscan" name="Interproscan functional predictions of ORFs" version="5.0.1"> | |
| 2 <description>Interproscan functional predictions of ORFs</description> | |
| 3 <requirements> | |
| 4 <requirement type="package">signalp</requirement> | |
| 5 <requirement type="package">phobius</requirement> | |
| 6 <requirement type="package">tmhmm</requirement> | |
| 7 <requirement type="set_environment">INTERPROSCAN_SCRIPT_PATH</requirement> | |
| 8 </requirements> | |
| 9 | |
| 10 <command> | |
| 11 #import os | |
| 12 interproscan.sh | |
| 13 ## disables the precalculated lookup service, all calculation will be run locally | |
| 14 -dp | |
| 15 --input $infile | |
| 16 --seqtype $seqtype | |
| 17 -f $oformat | |
| 18 --applications $appl | |
| 19 --tempdir \$TEMP | |
| 20 | |
| 21 $pathways | |
| 22 $goterms | |
| 23 $iprlookup | |
| 24 | |
| 25 #if str($oformat) in ['SVG', 'HTML']: | |
| 26 --output-file-base $outfile | |
| 27 2>&1; | |
| 28 mkdir -p $outfile.files_path; | |
| 29 #set temp_archive_file = str($outfile) + '.' + str($oformat).lower() + '.tar.gz' | |
| 30 tar -C $outfile.files_path -xvmzf $temp_archive_file; | |
| 31 python \$INTERPROSCAN_SCRIPT_PATH/create_index.py $outfile $outfile.files_path; | |
| 32 rm $temp_archive_file | |
| 33 #else: | |
| 34 -o $outfile | |
| 35 2>&1 | |
| 36 #end if | |
| 37 </command> | |
| 38 | |
| 39 <inputs> | |
| 40 <param name="infile" type="data" format="fasta" label="Protein or nucleotide FASTA file"/> | |
| 41 | |
| 42 <param name="seqtype" type="select" label="Type of the input sequences" help=""> | |
| 43 <option value="p" selected="true">Protein</option> | |
| 44 <option value="n">DNA / RNA</option> | |
| 45 </param> | |
| 46 | |
| 47 <param name="appl" type="select" multiple="True" display="checkboxes" label="Applications to run" | |
| 48 help="Select the applications to run."> | |
| 49 <option value="TIGRFAM" selected="true">TIGRFAM: protein families based on Hidden Markov Models or HMMs | |
| 50 </option> | |
| 51 <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a | |
| 52 hierarchical order (evolutionary relationships) | |
| 53 </option> | |
| 54 <option value="ProDom" selected="true">ProDom: set of protein domain families generated from the UniProtKB | |
| 55 </option> | |
| 56 <option value="Panther" selected="true">Panther: Protein ANalysis THrough Evolutionary Relationships | |
| 57 </option> | |
| 58 <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on | |
| 59 Hidden Markov Models or HMMs | |
| 60 </option> | |
| 61 <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional | |
| 62 sites as well as associated profiles to identify them | |
| 63 </option> | |
| 64 <option value="PrositePatterns" selected="true">PROSITE Pattern: protein domains, families and functional | |
| 65 sites as well as associated patterns to identify them | |
| 66 </option> | |
| 67 <option value="HAMAP" selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes | |
| 68 </option> | |
| 69 <option value="PfamA" selected="true">PfamA: protein families, each represented by multiple sequence | |
| 70 alignments and hidden Markov models | |
| 71 </option> | |
| 72 <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise | |
| 73 a protein family | |
| 74 </option> | |
| 75 <option value="SuperFamily" selected="true">SUPERFAMILY: database of structural and functional annotation | |
| 76 </option> | |
| 77 <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option> | |
| 78 <option value="Gene3d" selected="true">Gene3d: Structural assignment for whole genes and genomes using the | |
| 79 CATH domain structure database | |
| 80 </option> | |
| 81 <option value="SignalP-GRAM_POSITIVE" selected="false">SignalP Gram Positive Bacteria</option> | |
| 82 <option value="SignalP-GRAM_NEGATIVE" selected="false">SignalP Gram Negative Bacteria</option> | |
| 83 <option value="SignalP-EUK" selected="true">SignalP Eukaryotes</option> | |
| 84 <option value="Phobius" selected="true">Phobius: combined transmembrane topology and signal peptide | |
| 85 predictor | |
| 86 </option> | |
| 87 <option value="TMHMM" selected="true">TMHMM: Prediction of transmembrane helices in proteins</option> | |
| 88 </param> | |
| 89 | |
| 90 <param name="pathways" truevalue="--pathways" falsevalue="" checked="True" type="boolean" | |
| 91 label="Include pathway information" | |
| 92 help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries. (--pathways)"/> | |
| 93 <param name="goterms" truevalue="--goterms" falsevalue="" checked="True" type="boolean" | |
| 94 label="Include Gene Ontology (GO) mappings" | |
| 95 help="Lookup of corresponding Gene Ontology annotation. Implies the --iprlookup option. (--goterms)"/> | |
| 96 <param name="iprlookup" truevalue="--iprlookup" falsevalue="" checked="False" type="boolean" | |
| 97 label="Provide additional mappings" | |
| 98 help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into (--iprlookup)"/> | |
| 99 | |
| 100 <param name="oformat" type="select" label="Output format" help="Please select an output format."> | |
| 101 <option value="TSV" selected="true">Tab-separated values format (TSV)</option> | |
| 102 <option value="GFF3">GFF3</option> | |
| 103 <option value="SVG">SVG</option> | |
| 104 <option value="HTML">HTML</option> | |
| 105 <option value="XML">XML</option> | |
| 106 </param> | |
| 107 </inputs> | |
| 108 | |
| 109 <outputs> | |
| 110 <data format="tabular" name="outfile" label="Interproscan calculation on ${on_string}"> | |
| 111 <change_format> | |
| 112 <when input="oformat" value="HTML" format="html"/> | |
| 113 <when input="oformat" value="XML" format="xml"/> | |
| 114 <when input="oformat" value="SVG" format="html"/> | |
| 115 <when input="oformat" value="GFF3" format="gff"/> | |
| 116 </change_format> | |
| 117 </data> | |
| 118 | |
| 119 </outputs> | |
| 120 | |
| 121 <requirements> | |
| 122 </requirements> | |
| 123 | |
| 124 <help> | |
| 125 **What it does and does not do** | |
| 126 | |
| 127 InterProScan is a batch tool to query the InterPro database. It provides annotations based on multiple searches | |
| 128 of profile and other functional databases. | |
| 129 | |
| 130 | |
| 131 ##### | |
| 132 Input | |
| 133 ##### | |
| 134 | |
| 135 Required is a FASTA file containing protein or nucleotide sequences. | |
| 136 | |
| 137 | |
| 138 ###### | |
| 139 Output | |
| 140 ###### | |
| 141 | |
| 142 In this version of InterProScan_, you can retrieve output in any of the following five formats: | |
| 143 | |
| 144 * TSV: a simple tab-delimited file format | |
| 145 * XML: the new "IMPACT" XML format (XSD available here_). | |
| 146 * GFF: The `GFF 3.0`_ format | |
| 147 * HTML: An HTML representation of the protein matches | |
| 148 * SVG: A Scalable Vector Graphics representation of the protein matches | |
| 149 | |
| 150 | |
| 151 .. _`GFF 3.0`: http://gmod.org/wiki/GFF#GFF3_Format | |
| 152 .. _here: http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5 | |
| 153 | |
| 154 | |
| 155 Tab-separated values format (TSV) | |
| 156 ================================= | |
| 157 | |
| 158 Basic tab delimited format. | |
| 159 | |
| 160 | |
| 161 Example Output | |
| 162 -------------- | |
| 163 | |
| 164 :: | |
| 165 | |
| 166 P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain | |
| 167 1 2670 2799 7.9E-43 T 15-03-2013 | |
| 168 P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T | |
| 169 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302 | |
| 170 P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013 | |
| 171 ... | |
| 172 | |
| 173 | |
| 174 The TSV format presents the match data in columns as follows: | |
| 175 | |
| 176 - Protein Accession (e.g. P51587) | |
| 177 - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579) | |
| 178 - Sequence Length (e.g. 3418) | |
| 179 - Analysis (e.g. Pfam / PRINTS / Gene3D) | |
| 180 - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140) | |
| 181 - Signature Description (e.g. BRCA2 repeat profile) | |
| 182 - Start location | |
| 183 - Stop location | |
| 184 - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52) | |
| 185 - Status - is the status of the match (T: true) | |
| 186 - Date - is the date of the run | |
| 187 - (InterProScan_ annotations - accession (e.g. IPR002093) - optional column; only displayed if --iprlookup option | |
| 188 is switched on) | |
| 189 - (InterProScan_ annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if --iprlookup | |
| 190 option is switched on) | |
| 191 - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on) | |
| 192 - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on) | |
| 193 | |
| 194 | |
| 195 Extensible Markup Language (XML) | |
| 196 ================================ | |
| 197 | |
| 198 XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is | |
| 199 available here_. | |
| 200 | |
| 201 Example Output | |
| 202 -------------- | |
| 203 | |
| 204 .. image:: $PATH_TO_IMAGES/example_xml_output.png | |
| 205 | |
| 206 | |
| 207 Generic Feature Format Version 3 (GFF3) | |
| 208 ======================================= | |
| 209 | |
| 210 The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to | |
| 211 trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format | |
| 212 representation of the predicted protein sequences and their matches. You will find documentation of all the | |
| 213 columns and attributes used at http://www.sequenceontology.org/gff3.shtml. | |
| 214 | |
| 215 Example Output | |
| 216 -------------- | |
| 217 | |
| 218 :: | |
| 219 | |
| 220 ##gff-version 3 | |
| 221 ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269 | |
| 222 ##sequence-region AACH01000027 1 1347 | |
| 223 ##seqid|source|type|start|end|score|strand|phase|attributes | |
| 224 AACH01000027 provided_by_user nucleic_acid 1 1347 . + . | |
| 225 Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027 | |
| 226 AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 | |
| 227 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347 | |
| 228 AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347 | |
| 229 AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase | |
| 230 family;Target=null 84 | |
| 231 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13" | |
| 232 ##sequence-region 2 | |
| 233 ... | |
| 234 >pep_AACH01000027_1_1347 | |
| 235 LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV | |
| 236 LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA | |
| 237 GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI | |
| 238 LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ | |
| 239 ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA | |
| 240 TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV | |
| 241 DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML | |
| 242 RSQKAKGVLIYRDDWISITPEIQLLFTEF | |
| 243 ... | |
| 244 >match$8_84_314 | |
| 245 KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK | |
| 246 RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL | |
| 247 LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR | |
| 248 AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS | |
| 249 | |
| 250 | |
| 251 Scalable Vector Graphics (SVG) and HyperText Markup Language (HTML) | |
| 252 ==================================================================== | |
| 253 | |
| 254 InterProScan_ 5 outputs a single HTML/SVG file for each protein sequence analysed. | |
| 255 | |
| 256 | |
| 257 Example Output | |
| 258 -------------- | |
| 259 | |
| 260 .. image:: $PATH_TO_IMAGES/P51587.svg.png | |
| 261 | |
| 262 .. _InterProScan: http://www.ebi.ac.uk/interpro | |
| 263 | |
| 264 | |
| 265 ---------- | |
| 266 References | |
| 267 ---------- | |
| 268 | |
| 269 | |
| 270 If you use this Galaxy tool in work leading to a scientific publication please | |
| 271 cite the following papers: | |
| 272 | |
| 273 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). | |
| 274 Galaxy tools and workflows for sequence analysis with applications | |
| 275 in molecular plant pathology. PeerJ 1:e167 | |
| 276 http://dx.doi.org/10.7717/peerj.167 | |
| 277 | |
| 278 Zdobnov EM, Apweiler R (2001) | |
| 279 InterProScan an integration platform for the signature-recognition methods in InterPro. | |
| 280 Bioinformatics 17, 847-848. | |
| 281 http://dx.doi.org/10.1093/bioinformatics/17.9.847 | |
| 282 | |
| 283 Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005) | |
| 284 InterProScan: protein domains identifier. | |
| 285 Nucleic Acids Research 33 (Web Server issue), W116-W120. | |
| 286 http://dx.doi.org/10.1093/nar/gki442 | |
| 287 | |
| 288 Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn | |
| 289 RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, | |
| 290 McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, | |
| 291 Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009) | |
| 292 InterPro: the integrative protein signature database. | |
| 293 Nucleic Acids Research 37 (Database Issue), D224-228. | |
| 294 http://dx.doi.org/10.1093/nar/gkn785 | |
| 295 | |
| 296 | |
| 297 This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at | |
| 298 http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5 | |
| 299 | |
| 300 | |
| 301 **Galaxy Wrapper Author**:: | |
| 302 | |
| 303 * Bjoern Gruening, University of Freiburg | |
| 304 * Konrad Paszkiewicz, University of Exeter | |
| 305 | |
| 306 </help> | |
| 307 </tool> |
