Mercurial > repos > mkh > play
changeset 0:f41c8f299270 draft default tip
Untested version
author | mkh |
---|---|
date | Sat, 16 Jan 2016 12:30:10 -0500 |
parents | |
children | |
files | interproscan5/create_index.py interproscan5/interproscan.xml interproscan5/readme.rst interproscan5/static/images/P51587.svg.png interproscan5/static/images/example_xml_output.png interproscan5/tool_dependencies.xml |
diffstat | 6 files changed, 413 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""interproscan5/create_index.py

Write an HTML summary page that links to every InterProScan result file.

Usage::

    create_index.py OUTPUT_HTML RESULT_DIR

``OUTPUT_HTML`` is the file to (over)write with the index page and
``RESULT_DIR`` is the directory whose regular files are listed. Links are
relative file names, so the page is meant to be served next to RESULT_DIR's
content.
"""

import os
import sys
from xml.sax.saxutils import escape

try:  # Python 3
    from urllib.parse import quote
except ImportError:  # Python 2
    from urllib import quote


def list_result_files(directory):
    """Return the names of the regular files in ``directory``, sorted.

    Sub-directories are skipped. ``os.listdir`` returns entries in arbitrary
    order, so the names are sorted to make the generated page reproducible.
    """
    return sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )


def render_index(filenames):
    """Build the HTML index page for ``filenames`` and return it as a string.

    Each entry links to the file name and is labelled with the name minus its
    extension. The link target is URL-quoted and the label is HTML-escaped so
    that names containing characters such as ``&``, ``<`` or spaces cannot
    break the markup.
    """
    parts = ['<html> <body> <h1> InterProScan result summary page </h1> <ul>']
    for filename in filenames:
        # quote() leaves ordinary names (letters, digits, '.', '_', '-') as-is.
        href = escape(quote(filename), {'"': '&quot;'})
        label = escape(os.path.splitext(filename)[0])
        parts.append('<li><a href="%s"> %s </a></li>' % (href, label))
    parts.append('</ul></body></html>')
    return ''.join(parts)


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 3:
        sys.stderr.write('Usage: %s OUTPUT_HTML RESULT_DIR\n' % argv[0])
        return 2
    page = render_index(list_result_files(argv[2]))
    # The context manager closes the file even if the write fails.
    with open(argv[1], 'w') as handle:
        handle.write(page)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
<!-- interproscan5/interproscan.xml: Galaxy tool wrapper around the InterProScan 5 command line. -->
<tool id="interproscan" name="Interproscan functional predictions of ORFs" version="5.0.1">
    <description>Interproscan functional predictions of ORFs</description>

    <requirements>
        <requirement type="package">signalp</requirement>
        <requirement type="package">phobius</requirement>
        <requirement type="package">tmhmm</requirement>
        <!-- Directory containing create_index.py; exported by tool_dependencies.xml. -->
        <requirement type="set_environment">INTERPROSCAN_SCRIPT_PATH</requirement>
    </requirements>

    <!--
        The command is a Cheetah template. It is wrapped in CDATA so that shell
        operators such as the stderr redirect and the "and" chaining can be written
        literally without breaking XML well-formedness.
    -->
    <command><![CDATA[
        interproscan.sh
        ## disables the precalculated lookup service, all calculation will be run locally
        -dp
        --input '$infile'
        --seqtype $seqtype
        -f $oformat
        --applications $appl
        ## fall back to TMPDIR, then /tmp, so an unset TEMP cannot swallow the next option
        --tempdir "\${TEMP:-\${TMPDIR:-/tmp}}"

        $pathways
        $goterms
        $iprlookup

        #if str($oformat) in ['SVG', 'HTML']:
            ## SVG/HTML results arrive as <output-file-base>.<format>.tar.gz with one file
            ## per protein; unpack them next to the dataset and write an index page into
            ## the dataset itself. Steps are chained with && so a failed run stops here.
            #set temp_archive_file = str($outfile) + '.' + str($oformat).lower() + '.tar.gz'
            --output-file-base '$outfile'
            2>&1
            && mkdir -p '$outfile.files_path'
            && tar -C '$outfile.files_path' -xvmzf '$temp_archive_file'
            && python "\$INTERPROSCAN_SCRIPT_PATH/create_index.py" '$outfile' '$outfile.files_path'
            && rm '$temp_archive_file'
        #else:
            -o '$outfile'
            2>&1
        #end if
    ]]></command>

    <inputs>
        <param name="infile" type="data" format="fasta" label="Protein or nucleotide FASTA file"/>

        <param name="seqtype" type="select" label="Type of the input sequences">
            <option value="p" selected="true">Protein</option>
            <option value="n">DNA / RNA</option>
        </param>

        <!-- Option labels are kept on one line each: whitespace inside them is data. -->
        <param name="appl" type="select" multiple="true" display="checkboxes" label="Applications to run"
               help="Select the analyses to run.">
            <option value="TIGRFAM" selected="true">TIGRFAM: protein families based on Hidden Markov Models or HMMs</option>
            <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a hierarchical order (evolutionary relationships)</option>
            <option value="ProDom" selected="true">ProDom: set of protein domain families generated from the UniProtKB</option>
            <option value="Panther" selected="true">Panther: Protein ANalysis THrough Evolutionary Relationships</option>
            <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on Hidden Markov Models or HMMs</option>
            <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional sites as well as associated profiles to identify them</option>
            <option value="PrositePatterns" selected="true">PROSITE Pattern: protein domains, families and functional sites as well as associated patterns to identify them</option>
            <option value="HAMAP" selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes</option>
            <option value="PfamA" selected="true">PfamA: protein families, each represented by multiple sequence alignments and hidden Markov models</option>
            <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise a protein family</option>
            <option value="SuperFamily" selected="true">SUPERFAMILY: database of structural and functional annotation</option>
            <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option>
            <option value="Gene3d" selected="true">Gene3d: Structural assignment for whole genes and genomes using the CATH domain structure database</option>
            <option value="SignalP-GRAM_POSITIVE" selected="false">SignalP Gram Positive Bacteria</option>
            <option value="SignalP-GRAM_NEGATIVE" selected="false">SignalP Gram Negative Bacteria</option>
            <option value="SignalP-EUK" selected="true">SignalP Eukaryotes</option>
            <option value="Phobius" selected="true">Phobius: combined transmembrane topology and signal peptide predictor</option>
            <option value="TMHMM" selected="true">TMHMM: Prediction of transmembrane helices in proteins</option>
        </param>

        <param name="pathways" type="boolean" truevalue="--pathways" falsevalue="" checked="true"
               label="Include pathway information"
               help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries. (--pathways)"/>
        <param name="goterms" type="boolean" truevalue="--goterms" falsevalue="" checked="true"
               label="Include Gene Ontology (GO) mappings"
               help="Look up of corresponding Gene Ontology annotation. Implies the --iprlookup option. (--goterms)"/>
        <param name="iprlookup" type="boolean" truevalue="--iprlookup" falsevalue="" checked="false"
               label="Provide additional mappings"
               help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into (--iprlookup)"/>

        <param name="oformat" type="select" label="Output format" help="Please select an output format.">
            <option value="TSV" selected="true">Tab-separated values format (TSV)</option>
            <option value="GFF3">GFF3</option>
            <option value="SVG">SVG</option>
            <option value="HTML">HTML</option>
            <option value="XML">XML</option>
        </param>
    </inputs>

    <outputs>
        <!-- Default datatype is tabular (TSV); the other formats switch it below.
             SVG is typed as html because the dataset itself is the generated index page. -->
        <data format="tabular" name="outfile" label="Interproscan calculation on ${on_string}">
            <change_format>
                <when input="oformat" value="HTML" format="html"/>
                <when input="oformat" value="XML" format="xml"/>
                <when input="oformat" value="SVG" format="html"/>
                <when input="oformat" value="GFF3" format="gff3"/>
            </change_format>
        </data>
    </outputs>

    <!-- reStructuredText; kept at column 0 inside CDATA so indentation is not read as markup. -->
    <help><![CDATA[
**What it does and does not do**

Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches
of profile and other functional databases.


#####
Input
#####

Required is a FASTA file containing protein or nucleotide sequences.


######
Output
######

In this version of InterProScan_, you can retrieve output in any of the following five formats:

* TSV: a simple tab-delimited file format
* XML: the new "IMPACT" XML format (XSD available here_).
* GFF: The `GFF 3.0`_ format
* HTML: An HTML representation of the protein matches
* SVG: A Scalable Vector Graphics representation of the protein matches


.. _`GFF 3.0`: http://gmod.org/wiki/GFF#GFF3_Format
.. _here: http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5


Tab-separated values format (TSV)
=================================

Basic tab delimited format.


Example Output
--------------

::

    P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain 1 2670 2799 7.9E-43 T 15-03-2013
    P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302
    P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013
    ...


The TSV format presents the match data in columns as follows:

- Protein Accession (e.g. P51587)
- Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
- Sequence Length (e.g. 3418)
- Analysis (e.g. Pfam / PRINTS / Gene3D)
- Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
- Signature Description (e.g. BRCA2 repeat profile)
- Start location
- Stop location
- Score - is the e-value of the match reported by member database method (e.g. 3.1E-52)
- Status - is the status of the match (T: true)
- Date - is the date of the run
- (InterPro annotations - accession (e.g. IPR002093) - optional column; only displayed if --iprlookup option
  is switched on)
- (InterPro annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if --iprlookup
  option is switched on)
- (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on)
- (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on)


Extensible Markup Language (XML)
================================

XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is
available here_.

Example Output
--------------

.. image:: $PATH_TO_IMAGES/example_xml_output.png


Generic Feature Format Version 3 (GFF3)
=======================================

The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to
trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format
representation of the predicted protein sequences and their matches. You will find documentation of all the
columns and attributes used at http://www.sequenceontology.org/gff3.shtml.

Example Output
--------------

::

    ##gff-version 3
    ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
    ##sequence-region AACH01000027 1 1347
    ##seqid|source|type|start|end|score|strand|phase|attributes
    AACH01000027 provided_by_user nucleic_acid 1 1347 . + . Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
    AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
    AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
    AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
    ##sequence-region 2
    ...
    >pep_AACH01000027_1_1347
    LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
    LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
    GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
    LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
    ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
    TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
    DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
    RSQKAKGVLIYRDDWISITPEIQLLFTEF
    ...
    >match$8_84_314
    KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
    RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
    LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
    AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS


Scalable Vector Graphics (SVG) and HyperText Markup Language (HTML)
====================================================================

InterProScan_ 5 outputs a single HTML/SVG file for each protein sequence analysed.


Example Output
--------------

.. image:: $PATH_TO_IMAGES/P51587.svg.png

.. _InterProScan: http://www.ebi.ac.uk/interpro


----------
References
----------


If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
Galaxy tools and workflows for sequence analysis with applications
in molecular plant pathology. PeerJ 1:e167
http://dx.doi.org/10.7717/peerj.167

Zdobnov EM, Apweiler R (2001)
InterProScan an integration platform for the signature-recognition methods in InterPro.
Bioinformatics 17, 847-848.
http://dx.doi.org/10.1093/bioinformatics/17.9.847

Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005)
InterProScan: protein domains identifier.
Nucleic Acids Research 33 (Web Server issue), W116-W120.
http://dx.doi.org/10.1093/nar/gki442

Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn
RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J,
McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ,
Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009)
InterPro: the integrative protein signature database.
Nucleic Acids Research 37 (Database Issue), D224-228.
http://dx.doi.org/10.1093/nar/gkn785


This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at
http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5


**Galaxy Wrapper Author**::

    * Bjoern Gruening, University of Freiburg
    * Konrad Paszkiewicz, University of Exeter

    ]]></help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan5/readme.rst Sat Jan 16 12:30:10 2016 -0500 @@ -0,0 +1,83 @@ +================================================== +Galaxy wrapper for InterProScan 5 prediction tools +================================================== + +InterProScan is a tool that combines different protein signature recognition methods native to the InterPro +member databases into one resource with look up of corresponding InterPro and GO annotation. + +This wrapper is copyright 2013 by: + * Bjoern Gruening + * Konrad Paszkiewicz + + +This repository contains a wrapper for the InterProScan_ command line tool. + +.. _InterProScan: http://www.ebi.ac.uk/interpro/interproscan.html + + +Quevillon E., Silventoinen V., Pillai S., Harte N., Mulder N., Apweiler R., Lopez R. (2005). InterProScan: protein domains identifier. Nucleic Acids Res. 33 (Web Server issue): W116-W120 + + +============ +Installation +============ + +Please download and install InterProScan according to: + +https://code.google.com/p/interproscan/wiki/HowToDownload + + +======== +Citation +======== + +If you use this Galaxy tool in work leading to a scientific +publication, in addition to citing the individual underlying tools, please cite: + +Peter Cock, Bjoern Gruening, Konrad Paszkiewicz and Leighton Pritchard (2013). +Galaxy tools and workflows for sequence analysis with applications +in molecular plant pathology. PeerJ 1:e167 +http://dx.doi.org/10.7717/peerj.167 + +Full reference information is included in the help text. + + +============= +Input formats +============= + +The standard interproscan input is either genomic or protein sequences. +In the case of genomic sequences Interproscan will run an ORF prediction tool. 
+ + +======= +History +======= + +interproscan: + + - v5.0: Initial public release of version 5.0 + + +============= +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +
<?xml version="1.0" encoding="UTF-8"?>
<!-- interproscan5/tool_dependencies.xml: environment set up when this repository is installed. -->
<tool_dependency>
    <set_environment version="1.0">
        <!--
            Sets INTERPROSCAN_SCRIPT_PATH to the repository install directory.
            interproscan.xml reads this variable to locate create_index.py.
        -->
        <environment_variable
            name="INTERPROSCAN_SCRIPT_PATH"
            action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>
    </set_environment>
</tool_dependency>