# HG changeset patch # User mkh # Date 1452965410 18000 # Node ID f41c8f299270f63be0c8318ffb703936fcfdd29b Untested version diff -r 000000000000 -r f41c8f299270 interproscan5/create_index.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan5/create_index.py Sat Jan 16 12:30:10 2016 -0500 @@ -0,0 +1,14 @@ +#!/usr/bin/env python + +import os +import sys + +o = open(sys.argv[1], 'w+') + +o.write('

InterProScan result summary page

') +o.close() diff -r 000000000000 -r f41c8f299270 interproscan5/interproscan.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan5/interproscan.xml Sat Jan 16 12:30:10 2016 -0500 @@ -0,0 +1,307 @@ + + Interproscan functional predictions of ORFs + + signalp + phobius + tmhmm + INTERPROSCAN_SCRIPT_PATH + + + + #import os + interproscan.sh + ## disables the precalculated lookup service, all calculation will be run locally + -dp + --input $infile + --seqtype $seqtype + -f $oformat + --applications $appl + --tempdir \$TEMP + + $pathways + $goterms + $iprlookup + + #if str($oformat) in ['SVG', 'HTML']: + --output-file-base $outfile + 2>&1; + mkdir -p $outfile.files_path; + #set temp_archive_file = str($outfile) + '.' + str($oformat).lower() + '.tar.gz' + tar -C $outfile.files_path -xvmzf $temp_archive_file; + python \$INTERPROSCAN_SCRIPT_PATH/create_index.py $outfile $outfile.files_path; + rm $temp_archive_file + #else: + -o $outfile + 2>&1 + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + **What it does and does not do** + + Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches + of profile and other functional databases. + + + ##### + Input + ##### + + Required is a FASTA file containing protein or nucleotide sequences. + + + ###### + Output + ###### + + In this version of InterProScan_, you can retrieve output in any of the following five formats: + + * TSV: a simple tab-delimited file format + * XML: the new "IMPACT" XML format (XSD available here_). + * GFF: The `GFF 3.0`_ format + * HTML: An HTML representation of the protein matches + * SVG: An Scalable Vector Graphics representation of the protein matches + + + .. _`GFF 3.0`: http://gmod.org/wiki/GFF#GFF3_Format + .. _here: http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5 + + + Tab-separated values format (TSV) + ================================= + + Basic tab delimited format. + + + Example Output + -------------- + + :: + + P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain + 1 2670 2799 7.9E-43 T 15-03-2013 + P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T + 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302 + P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013 + ... + + + The TSV format presents the match data in columns as follows: + + - Protein Accession (e.g. P51587) + - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579) + - Sequence Length (e.g. 3418) + - Analysis (e.g. Pfam / PRINTS / Gene3D) + - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140) + - Signature Description (e.g. BRCA2 repeat profile) + - Start location + - Stop location + - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52) + - Status - is the status of the match (T: true) + - Date - is the date of the run + - (InterProScan_ annotations - accession (e.g. IPR002093) - optional column; only displayed if -iprscan option + is switched on) + - (InterProScan_ annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if -iprscan + option is switched on) + - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on) + - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on) + + + Extensible Markup Language (XML) + ================================ + + XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is + available [http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5 here]. + + Example Output + -------------- + + .. image:: $PATH_TO_IMAGES/example_xml_output.png + + + Generic Feature Format Version 3 (GFF3) + ======================================= + + The GFF3 format is a flat tab-delimited file, which is much richer then the TSV output format. It allows you to + trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format + representation of the predicted protein sequences and their matches. You will find a documentation of all the + columns and attributes used on [http://www.sequenceontology.org/gff3.shtml]. + + Example Output + -------------- + + :: + + ##gff-version 3 + ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269 + ##sequence-region AACH01000027 1 1347 + ##seqid|source|type|start|end|score|strand|phase|attributes + AACH01000027 provided_by_user nucleic_acid 1 1347 . + . + Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027 + AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 + 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347 + AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347 + AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase + family;Target=null 84 + 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13" + ##sequence-region 2 + ... + >pep_AACH01000027_1_1347 + LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV + LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA + GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI + LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ + ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA + TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV + DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML + RSQKAKGVLIYRDDWISITPEIQLLFTEF + ... + >match$8_84_314 + KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK + RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL + LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR + AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS + + + Scalable Vector Graphics (SVG) and HyperText Markup Language (HTML) + ==================================================================== + + InterProScan_ 5 outputs a single HTML/SVG file for each protein sequence analysed. + + + Example Output + -------------- + + .. image:: $PATH_TO_IMAGES/P51587.svg.png + + .. _InterProScan: http://www.ebi.ac.uk/interpro + + + ---------- + References + ---------- + + + If you use this Galaxy tool in work leading to a scientific publication please + cite the following papers: + + Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). + Galaxy tools and workflows for sequence analysis with applications + in molecular plant pathology. PeerJ 1:e167 + http://dx.doi.org/10.7717/peerj.167 + + Zdobnov EM, Apweiler R (2001) + InterProScan an integration platform for the signature-recognition methods in InterPro. + Bioinformatics 17, 847-848. + http://dx.doi.org/10.1093/bioinformatics/17.9.847 + + Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005) + InterProScan: protein domains identifier. + Nucleic Acids Research 33 (Web Server issue), W116-W120. + http://dx.doi.org/10.1093/nar/gki442 + + Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn + RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, + McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, + Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009) + InterPro: the integrative protein signature database. + Nucleic Acids Research 37 (Database Issue), D224-228. + http://dx.doi.org/10.1093/nar/gkn785 + + + This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at + http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5 + + + **Galaxy Wrapper Author**:: + + * Bjoern Gruening, University of Freiburg + * Konrad Paszkiewicz, University of Exeter + + + diff -r 000000000000 -r f41c8f299270 interproscan5/readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan5/readme.rst Sat Jan 16 12:30:10 2016 -0500 @@ -0,0 +1,83 @@ +================================================== +Galaxy wrapper for InterProScan 5 prediction tools +================================================== + +InterProScan is a tool that combines different protein signature recognition methods native to the InterPro +member databases into one resource with look up of corresponding InterPro and GO annotation. + +This wrapper is copyright 2013 by: + * Bjoern Gruening + * Konrad Paszkiewicz + + +This repository contains a wrapper for the InterProScan_ command line tool. + +.. _InterProScan: http://www.ebi.ac.uk/interpro/interproscan.html + + +Quevillon E., Silventoinen V., Pillai S., Harte N., Mulder N., Apweiler R., Lopez R. (2005). InterProScan: protein domains identifier. Nucleic Acids Res. 33 (Web Server issue): W116-W120 + + +============ +Installation +============ + +Please download install InterProScan according to: + +https://code.google.com/p/interproscan/wiki/HowToDownload + + +======== +Citation +======== + +If you use this Galaxy tool in work leading to a scientific +publication, in addition to citing the invididual underlying tools, please cite: + +Peter Cock, Bjoern Gruening, Konrad Paszkiewicz and Leighton Pritchard (2013). +Galaxy tools and workflows for sequence analysis with applications +in molecular plant pathology. PeerJ 1:e167 +http://dx.doi.org/10.7717/peerj.167 + +Full reference information is included in the help text. + + +============= +Input formats +============= + +The standard interproscan input is either genomic or protein sequences. +In the case of genomic sequences Interproscan will run an ORF prediction tool. + + +======= +History +======= + +interproscan: + + - v5.0: Initial public release of version 5.0 + + +============= +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff -r 000000000000 -r f41c8f299270 interproscan5/static/images/P51587.svg.png Binary file interproscan5/static/images/P51587.svg.png has changed diff -r 000000000000 -r f41c8f299270 interproscan5/static/images/example_xml_output.png Binary file interproscan5/static/images/example_xml_output.png has changed diff -r 000000000000 -r f41c8f299270 interproscan5/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan5/tool_dependencies.xml Sat Jan 16 12:30:10 2016 -0500 @@ -0,0 +1,9 @@ + + + + $REPOSITORY_INSTALL_DIR + + + + +