Mercurial > repos > bgruening > interproscan

<tool id="interproscan" name="InterProScan" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
    <!-- Short summary of the tool. -->
    <description>functional annotation</description>
    <!-- Shared definitions are imported from macros.xml (the @...@ tokens used in this file are presumably defined there). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Cross-reference to the bio.tools entry for this software. -->
    <xrefs>
        <xref type="bio.tools">interproscan_4</xref>
    </xrefs>
    <!-- The interproscan package is required at the version given by the @TOOL_VERSION@ token. -->
    <requirements>
        <requirement version="@TOOL_VERSION@" type="package">interproscan</requirement>
    </requirements>
    <!-- Command used to report the version of the underlying program. -->
    <version_command>interproscan.sh --version</version_command>
    <command><![CDATA[
## Adapt properties file to use data from data table:
## copy the interproscan.properties shipped next to interproscan.sh into
## \$HOME/.interproscan-5, rewriting data.directory to point at the
## selected data table entry.
mkdir -p \$HOME/.interproscan-5
&&
sed 's|^\(data.directory=\).*$|\1${database.fields.path}/data|' \$(dirname \$(readlink -f \$(command -v interproscan.sh)))/interproscan.properties > \$HOME/.interproscan-5/interproscan.properties
&&
## Java doesn't read the HOME env var
export _JAVA_OPTIONS=-Duser.home=\$HOME
&&

## Now run interproscan
interproscan.sh

## disables the precalculated lookup service, all calculation will be run locally
-dp
--input '$input'
--seqtype $seqtype
-f ${','.join($oformat)}

## Merge the freely available and the (optional) licensed applications into
## a single list. Each select is only read when it has a value, so an empty
## selection can neither break the template nor produce a malformed list
## such as ",Phobius".
#set $selected_apps = []
#if $applications:
    #for $app in $applications:
        #silent $selected_apps.append(str($app))
    #end for
#end if
#if $licensed.use == 'true' and $licensed.applications_licensed:
    #for $app in $licensed.applications_licensed:
        #silent $selected_apps.append(str($app))
    #end for
#end if
## When nothing is selected the option is left out entirely
## NOTE(review): InterProScan is expected to run all available analyses
## in that case - confirm against the InterProScan documentation.
#if $selected_apps:
    --applications ${','.join($selected_apps)}
#end if
--tempdir \${TEMP:-\$_GALAXY_JOB_TMP_DIR}

$pathways
$goterms
$iprlookup

--cpu \${GALAXY_SLOTS:-4}

## All result files are written as output.<ext> in the working directory
--output-file-base 'output'
    ]]></command>
    <inputs>
        <!-- Sequences to annotate; protein or nucleotide, as declared by the seqtype parameter below. -->
        <param argument="--input" type="data" format="fasta" label="Input FASTA file (protein or nucleotide sequences)"/>

        <param argument="--seqtype" type="select" label="Type of the input sequences">
            <option value="p" selected="true">Protein</option>
            <option value="n">DNA / RNA</option>
        </param>

        <!-- Entries come from the "interproscan" data table, limited to rows whose column 2 equals @TOOL_VERSION@.
             Column 3 supplies the path that the command template reads as database.fields.path. -->
        <param name="database" label="InterProScan database" type="select">
            <options from_data_table="interproscan">
                <column name="value" index="0" />
                <column name="name" index="1" />
                <column name="path" index="3" />
                <filter type="sort_by" column="0" />
                <filter type="static_value" column="2" value="@TOOL_VERSION@" />
            </options>
        </param>

        <!-- Applications that need no extra licence; all are selected by default. -->
        <param name="applications" type="select" multiple="true" label="Applications to run" help="Select your program">
            <option value="TIGRFAM" selected="true">TIGRFAM: protein families based on hidden Markov models (HMMs)</option>
            <option value="SFLD" selected="true">SFLD: a database of protein families based on hidden Markov models (HMMs)</option>
            <option value="SUPERFAMILY" selected="true">SUPERFAMILY: database of structural and functional annotation for all proteins and genomes</option>
            <option value="PANTHER" selected="true">PANTHER: Protein ANalysis THrough Evolutionary Relationships</option>
            <option value="Gene3D" selected="true">Gene3d: Structural assignment for whole genes and genomes using the CATH domain structure database</option>
            <option value="Hamap" selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes</option>
            <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional sites as well as associated profiles to identify them</option>
            <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option>
            <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on Hidden Markov Models or HMMs</option>
            <option value="CDD" selected="true">CDD: protein domains and families based on well-annotated multiple sequence alignment models</option>
            <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise a protein family</option>
            <option value="PIRSR" selected="true">PIRSR: protein families based on hidden Markov models (HMMs) and Site Rules</option>
            <option value="PrositePatterns" selected="true">PROSITE Pattern: protein domains, families and functional sites as well as associated patterns to identify them</option>
            <option value="AntiFam" selected="true">AntiFam: a resource of profile-HMMs designed to identify spurious protein predictions.</option>
            <option value="Pfam" selected="true">Pfam: protein families, each represented by multiple sequence alignments and hidden Markov models</option>
            <option value="MobiDBLite" selected="true">MobiDBLite: Prediction of intrinsically disordered regions in proteins</option>
            <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a hierarchical order (evolutionary relationships)</option>
        </param>

        <!-- Licence-restricted applications are only offered after the user opts in. -->
        <conditional name="licensed">
            <param name="use" type="select" label="Use applications with restricted license, only for non-commercial use?" help="The corresponding tools must be installed manually by the administrator of this Galaxy instance" >
                <option value="false" selected="true">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="false" />
            <when value="true">
                <param name="applications_licensed" type="select" multiple="true" label="Licensed applications to run" help="Select your program.">
                    <option value="Phobius" selected="true">Phobius: combined transmembrane topology and signal peptide predictor</option>
                    <option value="SignalP_GRAM_NEGATIVE" selected="false">SignalP (gram-negative): signal peptide cleavage sites in amino acid sequences for gram-negative prokaryotes</option>
                    <option value="SignalP_EUK" selected="true">SignalP (eukaryotes): signal peptide cleavage sites in amino acid sequences for eukaryotes</option>
                    <option value="SignalP_GRAM_POSITIVE" selected="false">SignalP (Gram Positive Bacteria): signal peptide cleavage sites in amino acid sequences for gram-positive prokaryotes</option>
                    <option value="TMHMM" selected="true">TMHMM: Prediction of transmembrane helices in proteins</option>
                </param>
            </when>
        </conditional>

        <!-- Each boolean contributes its flag to the command line when checked and nothing otherwise. -->
        <param argument="--pathways" truevalue="--pathways" falsevalue="" checked="true" type="boolean" label="Include pathway information"
            help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries."/>
        <param argument="--goterms" truevalue="--goterms" falsevalue="" checked="true" type="boolean" label="Include Gene Ontology (GO) mappings"
            help="Look up of corresponding Gene Ontology annotation. Implies -iprlookup option."/>
        <param argument="--iprlookup" truevalue="--iprlookup" falsevalue="" checked="false" type="boolean"
            label="Provide additional mappings" help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into"/>

        <!-- Every selected format yields one output dataset (see the filters in the outputs section). -->
        <param name="oformat" type="select" multiple="true" label="Output format" help="Please select an output format (JSON output can be visualised on https://www.ebi.ac.uk/interpro/result/InterProScan/).">
            <option value="TSV" selected="true">Tab-separated values format (TSV)</option>
            <option value="GFF3">GFF3</option>
            <option value="XML">XML</option>
            <option value="JSON">JSON</option>
        </param>
    </inputs>

    <!-- One dataset per output format; each is kept only when its format was selected in oformat.
         The files are collected from the job working directory as output.<ext>. -->
    <outputs>
        <data name="outfile_tsv" format="tabular" label="InterProScan on ${on_string} (tsv)" from_work_dir="output.tsv">
            <filter>oformat and 'TSV' in oformat</filter>
        </data>
        <data name="outfile_xml" format="xml" label="InterProScan on ${on_string} (xml)" from_work_dir="output.xml">
            <filter>oformat and 'XML' in oformat</filter>
        </data>
        <data name="outfile_gff3" format="gff3" label="InterProScan on ${on_string} (gff3)" from_work_dir="output.gff3">
            <filter>oformat and 'GFF3' in oformat</filter>
        </data>
        <data name="outfile_json" format="json" label="InterProScan on ${on_string} (json)" from_work_dir="output.json">
            <filter>oformat and 'JSON' in oformat</filter>
        </data>
    </outputs>

    <tests>
        <!-- Protein input, MobiDBLite only, TSV as the single requested output. -->
        <test expect_num_outputs="1">
            <param name="input" value="prots.fa"/>
            <param name="seqtype" value="p"/>
            <param name="database" value="@TOOL_VERSION@"/>
            <param name="applications" value="MobiDBLite"/>
            <param name="oformat" value="TSV"/>
            <output name="outfile_tsv">
                <assert_contents>
                    <has_text text="FUN_000011-T1"/>
                    <has_text text="ea9924e11f7decc417e8d9ed8b9c682e"/>
                    <has_text text="FUN_000012-T1"/>
                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071"/>
                </assert_contents>
            </output>
        </test>
        <!-- Protein input, MobiDBLite only, all four output formats requested. -->
        <test expect_num_outputs="4">
            <param name="input" value="prots.fa"/>
            <param name="seqtype" value="p"/>
            <param name="database" value="@TOOL_VERSION@"/>
            <param name="applications" value="MobiDBLite"/>
            <param name="oformat" value="TSV,GFF3,XML,JSON"/>
            <output name="outfile_tsv">
                <assert_contents>
                    <has_text text="FUN_000011-T1"/>
                    <has_text text="ea9924e11f7decc417e8d9ed8b9c682e"/>
                    <has_text text="FUN_000012-T1"/>
                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071"/>
                </assert_contents>
            </output>
            <output name="outfile_xml">
                <assert_contents>
                    <has_text text="mobidblite-location"/>
                    <has_text text="Polyampholyte"/>
                    <has_text text="consensus disorder prediction"/>
                    <has_text text="FUN_000011-T1 FUN_000011"/>
                </assert_contents>
            </output>
            <output name="outfile_gff3">
                <assert_contents>
                    <has_text text="protein_match"/>
                    <has_text text="ID=FUN_000011-T1;md5="/>
                    <has_text text="MobiDBLite"/>
                </assert_contents>
            </output>
            <output name="outfile_json">
                <assert_contents>
                    <has_text text="signatureLibraryRelease"/>
                    <has_text text="disorder_prediction"/>
                    <has_text text="Polyampholyte"/>
                </assert_contents>
            </output>
        </test>
        <!-- Nucleotide input (seqtype n), MobiDBLite only, all four output formats requested. -->
        <test expect_num_outputs="4">
            <param name="input" value="transcripts.fa"/>
            <param name="seqtype" value="n"/>
            <param name="database" value="@TOOL_VERSION@"/>
            <param name="applications" value="MobiDBLite"/>
            <param name="oformat" value="TSV,GFF3,XML,JSON"/>
            <output name="outfile_tsv">
                <assert_contents>
                    <has_text text="FUN_000018-T1_orf336"/>
                    <has_text text="0b28fe115d4cc09260b038b19fb0b21d"/>
                    <has_text text="FUN_000012-T1_orf133"/>
                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071"/>
                </assert_contents>
            </output>
            <output name="outfile_xml">
                <assert_contents>
                    <has_text text="mobidblite-location"/>
                    <has_text text="Polyampholyte"/>
                    <has_text text="consensus disorder prediction"/>
                    <has_text text="orf355"/>
                </assert_contents>
            </output>
            <output name="outfile_gff3">
                <assert_contents>
                    <has_text text="protein_match"/>
                    <has_text text="ID=FUN_000012-T1;"/>
                    <has_text text="MobiDBLite"/>
                </assert_contents>
            </output>
            <output name="outfile_json">
                <assert_contents>
                    <has_text text="signatureLibraryRelease"/>
                    <has_text text="disorder_prediction"/>
                    <has_text text="Polyampholyte"/>
                </assert_contents>
            </output>
        </test>
        <!-- Requesting licensed applications must fail, because they are not installed by default.
             No output count is asserted: a test that expects failure should not make assumptions
             about the number of outputs. -->
        <test expect_failure="true">
            <param name="input" value="prots.fa" />
            <param name="seqtype" value="p" />
            <param name="database" value="@TOOL_VERSION@" />
            <param name="applications" value="MobiDBLite" />
            <conditional name="licensed">
                <param name="use" value="true" />
                <param name="applications_licensed" value="Phobius,TMHMM" />
            </conditional>
            <param name="oformat" value="TSV" />
            <assert_stdout>
                <!-- expected to be "deactivated" as they are not installed by default -->
                <has_text text="Analysis Phobius does not exist or is deactivated" />
                <has_text text="Analysis TMHMM does not exist or is deactivated" />
            </assert_stdout>
        </test>
    </tests>

    <help><![CDATA[

**What it does**

InterProScan is a batch tool to query the InterPro database. It provides annotations based on multiple searches of profile and other functional databases.

Phobius (licensed software), SignalP, SMART (licensed components) and TMHMM use
licensed code and data provided by third parties. If you wish to run these
analyses it will be necessary for you to obtain a licence from the vendor and
configure the Galaxy server InterProScan installation to use them.

**Input**

Required is a FASTA file containing protein or nucleotide sequences.

**Output**

In this version of InterProScan, you can retrieve output in any of the following four formats:

 * TSV: tab-separated values format
 * XML: XML format
 * GFF3: The GFF 3.0 format
 * JSON: A JSON representation of the protein matches that can be visualised on https://www.ebi.ac.uk/interpro/result/InterProScan/

**Example Output**


::

  P51587  14086411a2cdf1c4cba63020e1622579        3418    Pfam    PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain 1        2670    2799    7.9E-43 T       15-03-2013
  P51587  14086411a2cdf1c4cba63020e1622579        3418    ProSiteProfiles PS50138 BRCA2 repeat profile.   1002    1036    0.0     T       18-03-2013      IPR002093       BRCA2 repeat    GO:0005515|GO:0006302
  P51587  14086411a2cdf1c4cba63020e1622579        3418    Gene3D  G3DSA:2.40.50.140               2966    3051    3.1E-52 T       15-03-2013
  ...


The TSV format presents the match data in columns as follows:

  - Protein Accession (e.g. P51587)
  - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
  - Sequence Length (e.g. 3418)
  - Analysis (e.g. Pfam / PRINTS / Gene3D)
  - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
  - Signature Description (e.g. BRCA2 repeat profile)
  - Start location
  - Stop location
  - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52)
  - Status - is the status of the match (T: true)
  - Date - is the date of the run
  - (InterPro annotations - accession (e.g. IPR002093) - optional column; only displayed if --iprlookup option is switched on)
  - (InterPro annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if --iprlookup option is switched on)
  - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on)
  - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on)


**Extensible Markup Language (XML)**

XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is available `here <http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5>`_.

**Generic Feature Format Version 3 (GFF3)**

The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find documentation of all the columns and attributes used at https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md.

**Example Output**


::

  ##gff-version 3
  ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
  ##sequence-region AACH01000027 1 1347
  ##seqid|source|type|start|end|score|strand|phase|attributes
  AACH01000027    provided_by_user        nucleic_acid    1       1347    .       +       .       Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
  AACH01000027    getorf  ORF     1       1347    .       +       .       Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
  AACH01000027    getorf  polypeptide     1       449     .       +       .       md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
  AACH01000027    Pfam    protein_match   84      314     1.2E-45 +       .       Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
  ##sequence-region 2
  ...
  >pep_AACH01000027_1_1347
  LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
  LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
  GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
  LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
  ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
  TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
  DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
  RSQKAKGVLIYRDDWISITPEIQLLFTEF
  ...
  >match$8_84_314
  KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
  RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
  LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
  AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS

]]></help>

    <!-- Citation entries are supplied by the "citations" macro (presumably defined in macros.xml). -->
    <expand macro="citations"/>
</tool>
author	iuc
date	Fri, 29 Apr 2022 20:14:21 +0000
parents	c55643c3d813
children	74810db257cc