Mercurial > repos > galaxyp > dbbuilder

<tool id="dbbuilder" name="Protein Database Downloader" version="0.3.4">
    <description></description>
    <!-- Runtime packages: wget performs the plain downloads in <command>; python runs
         the bundled uniprotkb.py helper for UniProtKB queries. requests is presumably
         imported by uniprotkb.py (TODO confirm against that script). -->
    <requirements>
        <requirement type="package" version="1.20.1">wget</requirement>
        <requirement type="package" version="3.8">python</requirement>
        <requirement type="package" version="2.20.1">requests</requirement>
    </requirements>
    <!-- Failure detection: any exit status of 1 or above, or the text ERROR on
         stderr, is reported at level "fatal". -->
    <stdio>
        <exit_code level="fatal" range="1:" description="Error downloading database."/>
        <regex source="stderr" match="ERROR" level="fatal" description="Error downloading database."/>
    </stdio>
    <!-- TODO: escape quotes. -->
    <!-- TODO: add NCBI and MaxQuant contaminants: -->
    <!--   http://maxquant.org/contaminants.zip -->
    <!--   ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.protein.faa.gz -->
    <command>
<![CDATA[
        ## Step 1: from the selected source, work out the download URL (url) and how
        ## the payload is packaged (type).
        #if $source.from == "uniprot"
            ## Collect the UniProtKB query clauses: taxonomy first (an explicit
            ## taxon_id replaces the taxon select), then the optional reviewed and
            ## proteome-set filters. Empty select values add no clause.
            #set $query_fields = []
            #if $source.taxon_id
                #silent $query_fields.append('taxonomy_id:' + str($source.taxon_id))
            #else
                #silent $query_fields.append('taxonomy_id:' + str($source.taxon))
            #end if
            #if $source.reviewed:
                #silent $query_fields.append(str($source.reviewed))
            #end if
            #if $source.set:
                #silent $query_fields.append(str($source.set))
            #end if
            ## Each clause is parenthesised and the clauses are AND-ed together.
            #set $query  = '(' + ')+AND+('.join($query_fields) + ')'
            ## include_isoform contributes either an empty string or an extra URL argument.
            #set $url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta' + str($source.include_isoform) + '&query=' + $query
            #set $type = "uniprotkb_stream"
        #elif $source.from == "cRAP"
            ##set $url = "ftp://ftp.thegpm.org/fasta/cRAP/crap.fasta"
            #set $url = "https://raw.githubusercontent.com/pravs3683/cRAP/master/cRAP_protein_database.fasta"
            #set $type = "direct"
        #elif $source.from == "HMP"
            #set $url = 'http://downloads.hmpdacc.org/data/reference_genomes/body_sites/' + str($source.site) + '.pep.fsa'
            #set $type = "direct"
        #elif $source.from == "HOMD"
            ## The archive suffix of the selected file decides how it is unpacked.
            #set $url = 'ftp://ftp.homd.org/human_oral_microbial_genomic_sequences/current/' + str($source.annotation)
            #if str($source.annotation).endswith('.tar.gz'):
                #set $type = "tgz"
            #elif str($source.annotation).endswith('.zip'):
                #set $type = "zip"
            #end if
        #elif $source.from == "url"
            #set $url = $source.url
            #set $type = $source.archive_type
        #end if
        ## Step 2: download, and decompress when needed, into the output dataset.
        #if $type =="uniprotkb_stream"
            ## The stream URL requests a compressed response, so the helper output is gunzipped.
            python '$__tool_directory__/uniprotkb.py' --url '$url' -o 'tmp.gz' && gzip -dc 'tmp.gz' > '${output_database}'
        #elif $type =="direct"
            wget -nv '$url' -O '${output_database}' --no-check-certificate
        #elif $type =="zip"
            wget -nv '$url' -O tmp.zip --no-check-certificate && zcat -c tmp.zip > '${output_database}'
        #elif $type =="gzip"
            ## Prefer gzcat when it is on PATH, otherwise fall back to zcat.
            wget -nv '$url' -O tmp.gz --no-check-certificate && (if command -v gzcat > /dev/null;  then gzcat tmp.gz; else zcat tmp.gz ; fi) > '${output_database}'
        #elif $type =="bzip2"
            wget -nv '$url' -O tmp.bz2 --no-check-certificate && bzcat tmp.bz2 > '${output_database}'
        ## NOTE(review): the two tar branches call wget without the no-check-certificate
        ## flag that every other wget call here passes; confirm whether that is intentional.
        #elif $type =="tgz"
            wget -nv '$url' -O tmp.tar.gz && tar zxfO tmp.tar.gz > '${output_database}'
        #elif $type =="tbz"
            wget -nv '$url' -O tmp.tar.bz && tar jxfO tmp.tar.bz > '${output_database}'
        #end if
]]>
    </command>
    <inputs>
        <conditional name="source">
            <!-- Source selector: each option value is matched by a source.from test in
                 <command> and by a <when> block of this conditional. -->
            <param name="from" type="select" label="Download from"
                help="Select database source. cRAP acts as a database for common MS contaminants. UniProtKB is a cross species collection of functional protein databases">
                <option value="uniprot">UniProtKB</option>
                <option value="cRAP">cRAP (contaminants)</option>
                <option value="HMP">Human Microbiome Project body sites</option>
                <option value="HOMD">Human Oral Microbiome Database (HOMD)</option>
                <option value="url">Custom URL</option>
            </param>
            <!-- UniProtKB options. Each value below becomes part of the REST URL that
                 <command> assembles. -->
            <when value="uniprot">
                <!-- Species list read from uniprot_taxons.loc (column 0 = name, column 1 = value),
                     with E. coli K12 always appended as an extra option. -->
                <param name="taxon" type="select" label="Taxonomy" help="select species for protein database">
                    <options from_file="uniprot_taxons.loc">
                        <column name="name" index="0" />
                        <column name="value" index="1" />
                        <filter type="add_value" name="Escherichia coli (strain K12)" value="83333" />
                    </options>
                </param>
                <!-- Optional; when set, <command> uses it instead of the taxon select. -->
                <param name="taxon_id" type="integer" value="" min="1" optional="true" label="NCBI taxon ID" help="Specify a NCBI taxon id to override species selection"/>
                <!-- Option values are query clauses; the empty value adds no clause.
                     The sanitizer lets the ":" in the clause through. -->
                <param name="reviewed" type="select" label="Reviewed status" help="UniProtKB/TrEMBL (unreviewed) is a large, automatically annotated database- may contain redundant sequences, but there is a higher chance peptides will be identified. UniProtKB/Swiss-Prot (reviewed) is a smaller, manually annotated database- less of a chance peptides will be identified but less sequence redundancy">
                    <option value="">UniProtKB</option>
                    <option value="reviewed:true">UniProtKB/Swiss-Prot (reviewed only)</option>
                    <option value="reviewed:false">UniProtKB/TrEMBL (unreviewed only)</option>
                    <sanitizer>
                        <valid>
                            <add value=":"/>
                        </valid>
                    </sanitizer>
                </param>
                <param name="set" type="select" label="Proteome Set">
                    <option value="">Any</option>
                    <option value="keyword:KW-1185" selected="true">Reference Proteome Set</option>
                    <sanitizer>
                        <valid>
                            <add value=":"/>
                        </valid>
                    </sanitizer>
                </param>
                <!-- The true value is a ready-made URL argument that <command> appends verbatim. -->
                <param name="include_isoform" type="boolean" truevalue="&amp;includeIsoform=true" falsevalue=""
                    label="Include isoform data" help="several different forms of a given protein are incorporated into database" />
            </when>
            <!-- cRAP takes no extra parameters: its download URL is fixed in <command>. -->
            <when value="cRAP" />
            <!-- The selected value is inserted into the HMP download URL in <command>,
                 followed by ".pep.fsa". -->
            <when value="HMP">
                <param name="site" type="select" label="Proteome for body site">
                    <option value="Airways">HMP airways</option>
                    <option value="Blood">HMP Blood</option>
                    <option value="Gastrointestinal_tract">HMP Gastro-intestinal tract</option>
                    <option value="Oral">HMP Oral</option>
                    <option value="Skin">HMP Skin</option>
                    <option value="Urogenital_tract">HMP urogenital Tract</option>
                </param>
            </when>
            <!-- The selected value is the archive file name appended to the HOMD FTP path in
                 <command>; its suffix (.zip or .tar.gz) selects how the archive is unpacked. -->
            <when value="HOMD">
                <param name="annotation" type="select" label="Human Oral Microbiome Proteome">
                    <option value="oral_microbiome_dynamic.aa.zip">HOMD with dynamic annotation</option>
                    <option value="oral_microbiome.aa.tar.gz">HOMD with static annotation</option>
                </param>
            </when>
            <!-- Custom download: the URL is passed to wget as given and archive_type selects
                 the matching decompression branch in <command>. -->
            <when value="url">
                <param name="url" value="" type="text" label="URL (http, ftp) of Fasta sequences">
                    <!-- Allow "%" and "~" in addition to the default character set. -->
                    <sanitizer>
                        <valid>
                            <add value="%"/>
                            <add value="~"/>
                        </valid>
                    </sanitizer>
                    <!-- Reject an empty URL up front instead of letting wget fail on it. -->
                    <validator type="empty_field" message="A URL is required"/>
                </param>
                <param name="archive_type" type="select" label="Fasta source compression type">
                    <option value="direct" selected="true">fasta file (uncompressed)</option>
                    <option value="gzip">fasta.gz (gzip compressed)</option>
                    <option value="bzip2">fasta.bz2 (bzip2 compressed)</option>
                    <option value="zip">fasta.zip or fasta.Z (Zip compressed)</option>
                    <option value="tgz">fasta.tgz or fasta.tar.gz (tar archive gzip compressed)</option>
                    <option value="tbz">fasta.tbz or fasta.tar.bz (tar archive bzip2 compressed)</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <!-- One FASTA dataset; its history label includes the chosen source value. -->
    <outputs>
        <data name="output_database" format="fasta" label="Protein Database ${source.from}"/>
    </outputs>
    <!-- Functional tests. Every test sets its parameters inside the "source" conditional,
         mirroring the <inputs> structure, and checks the downloaded FASTA for marker entries. -->
    <tests>
        <!-- cRAP: fixed contaminant database. -->
        <test>
            <conditional name="source">
                <param name="from" value="cRAP" />
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text="KKA1_ECOLX" />
                </assert_contents>
            </output>
        </test>
        <!-- UniProtKB with both taxon and taxon_id set: the expected entry belongs to the
             taxon_id, showing that taxon_id takes precedence. -->
        <test>
            <conditional name="source">
                <param name="from" value="uniprot" />
                <param name="taxon" value="83333"/>
                <param name="taxon_id" value="2697049"/>
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text="SPIKE_SARS2" />
                </assert_contents>
            </output>
        </test>

        <!-- Reviewed only: an "sp" entry is present and a "tr" entry is absent. -->
        <test>
            <conditional name="source">
                <param name="from" value="uniprot" />
                <param name="taxon_id" value="2697049"/>
                <param name="reviewed" value="reviewed:true"/>
                <param name="set" value=""/>
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text=">sp|P0DTC1|R1A_SARS2" />
                    <not_has_text text=">tr|A0A679G4D8|A0A679G4D8_SARS2" />
                </assert_contents>
            </output>
        </test>

        <!-- Unreviewed only: the inverse of the previous test. -->
        <test>
            <conditional name="source">
                <param name="from" value="uniprot" />
                <param name="taxon_id" value="2697049"/>
                <param name="reviewed" value="reviewed:false"/>
                <param name="set" value=""/>
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text=">tr|A0A679G4D8|A0A679G4D8_SARS2" />
                    <not_has_text text=">sp|P0DTC1|R1A_SARS2" />
                </assert_contents>
            </output>
        </test>

        <!-- Isoforms included: an isoform accession (P06710-2) must appear. -->
        <test>
            <conditional name="source">
                <param name="from" value="uniprot" />
                <param name="taxon" value="83333"/>
                <param name="reviewed" value="reviewed:true"/>
                <param name="set" value=""/>
                <param name="include_isoform" value="True"/>
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text="sp|P06615|REDF_ECOLI" />
                    <has_text text=">sp|P06710-2|DPO3X_ECOLI" />
                </assert_contents>
            </output>
        </test>
        <!-- Isoforms excluded: the same isoform accession must be absent. -->
        <test>
            <conditional name="source">
                <param name="from" value="uniprot" />
                <param name="taxon" value="83333"/>
                <param name="reviewed" value="reviewed:true"/>
                <param name="set" value=""/>
                <param name="include_isoform" value="False"/>
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text="sp|P06615|REDF_ECOLI" />
                    <has_text text="REDF_ECOLI" />
                    <not_has_text text=">sp|P06710-2|DPO3X_ECOLI" />
                </assert_contents>
            </output>
        </test>

        <!-- Custom URL, uncompressed FASTA. -->
        <test>
            <conditional name="source">
                <param name="from" value="url" />
                <param name="url" value="https://raw.githubusercontent.com/pravs3683/cRAP/master/cRAP_protein_database.fasta" />
                <param name="archive_type" value="direct" />
            </conditional>
            <output name="output_database">
                <assert_contents>
                    <has_text text="KKA1_ECOLX" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
**Output**

Creates a FASTA file of the specified protein sequences, for comparison with experimental MS/MS data by a search algorithm.

**External Links**

  - Galaxy-P_101_ shows usage of the Protein Database Downloader tool in the creation of a workflow
  - UniProtKB_ provides additional information about the UniProt Knowledgebase


.. _Galaxy-P_101: http://msi-galaxy-p.readthedocs.org/en/latest/sections/galaxyp_101.html
.. _UniProtKB: http://www.uniprot.org/help/uniprotkb


**Additional Protein Fasta URLs**

  *HUMAN GUT METAPROTEOME:*

    *  61MB gzip http://www.bork.embl.de/~arumugam/Qin_et_al_2010/frequent_microbe_proteins.fasta.gz


  *MOUSE GUT MICROBIOTA:*

    * See: http://gigadb.org/dataset/view/id/100114/token/mZlMYJIF04LshpgP


]]>
    </help>
    <!-- Citations by DOI. Element text holds the bare identifier, with no surrounding
         whitespace. -->
    <citations>
        <citation type="doi">10.1093/nar/gkw1099</citation>
        <citation type="doi">10.1093/nar/gkv1195</citation>
        <citation type="doi">10.1093/database/baq013</citation>
        <citation type="doi">10.1038/nature11209</citation>
        <citation type="doi">10.1038/nature11234</citation>
    </citations>
</tool>
author	galaxyp
date	Tue, 27 Sep 2022 21:57:37 +0000
parents	8e637098a8ab
children