Repository 'dbbuilder'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/dbbuilder

Changeset 11:8e637098a8ab (2022-09-27)
Previous changeset 10:e9df53a75f3c (2020-11-25) Next changeset 12:983bf725dfc2 (2022-09-27)
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/dbbuilder commit 16ba4570b04301b774ee0420694f379cc640744b
modified:
dbbuilder.xml
added:
uniprotkb.py
b
diff -r e9df53a75f3c -r 8e637098a8ab dbbuilder.xml
--- a/dbbuilder.xml Wed Nov 25 17:43:51 2020 +0000
+++ b/dbbuilder.xml Tue Sep 27 13:22:04 2022 +0000
[
b'@@ -1,7 +1,9 @@\n-<tool id="dbbuilder" name="Protein Database Downloader" version="0.3.2">\n+<tool id="dbbuilder" name="Protein Database Downloader" version="0.3.3">\n     <description></description>\n     <requirements>\n         <requirement type="package" version="1.20.1">wget</requirement>\n+        <requirement type="package" version="3.8">python</requirement>\n+        <requirement type="package" version="2.20.1">requests</requirement>\n     </requirements>\n     <stdio>\n         <exit_code range="1:"  level="fatal" description="Error downloading database." />\n@@ -14,8 +16,18 @@\n     <command>\n <![CDATA[\n         #if $source.from == "uniprot"\n-            #set $url = \'http://www.uniprot.org/uniprot/?query=taxonomy:"\' + str($source.taxon) + \'"\' + str($source.set) + str($source.reviewed) + \'&force=yes&format=fasta\' + str($source.include_isoform)\n-            #set $type = "direct"\n+            #if $source.set:\n+                #set $modified_set = \'&\' + str($source.set) \n+            #else\n+                #set $modified_set = \'\'\n+            #end if\n+            #if $source.taxon_id\n+                #set $taxon_id = $source.taxon_id\n+            #else\n+                #set $taxon_id = $source.taxon\n+            #end if\n+            #set $url = \'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=taxonomy_id:"\' + str($taxon_id) + \'"\' + str($modified_set) + str($source.reviewed) + str($source.include_isoform)\n+            #set $type = "uniprotkb_stream"\n         #elif $source.from == "cRAP"\n             ##set $url = "ftp://ftp.thegpm.org/fasta/cRAP/crap.fasta"\n             #set $url = "https://raw.githubusercontent.com/pravs3683/cRAP/master/cRAP_protein_database.fasta"\n@@ -34,7 +46,9 @@\n             #set $url = $source.url\n             #set $type = $source.archive_type\n         #end if\n-        #if $type =="direct"\n+        #if $type =="uniprotkb_stream"\n+            python 
\'$__tool_directory__/uniprotkb.py\' --url \'$url\' -o \'tmp.gz\' && gzip -dc \'tmp.gz\' > \'${output_database}\' \n+        #elif $type =="direct"\n             wget -nv \'$url\' -O \'${output_database}\' --no-check-certificate\n         #elif $type =="zip"\n             wget -nv \'$url\' -O tmp.zip --no-check-certificate && zcat -c tmp.zip > \'${output_database}\'\n@@ -51,7 +65,8 @@\n     </command>\n     <inputs>\n         <conditional name="source">\n-            <param name="from" type="select" label="Download from" help="Select database source. cRAP acts as a database for common MS contaminants. UniProtKB is a cross species collection of functional protein databases">\n+            <param name="from" type="select" label="Download from"\n+                help="Select database source. cRAP acts as a database for common MS contaminants. UniProtKB is a cross species collection of functional protein databases">\n                 <option value="uniprot">UniProtKB</option>\n                 <option value="cRAP">cRAP (contaminants)</option>\n                 <option value="HMP">Human Microbiome Project body sites</option>\n@@ -64,12 +79,14 @@\n                     <options from_file="uniprot_taxons.loc">\n                         <column name="name" index="0" />\n                         <column name="value" index="1" />\n+                        <filter type="add_value" name="Escherichia coli (strain K12)" value="83333" />\n                     </options>\n                 </param>\n+                <param name="taxon_id" type="integer" value="" min="1" optional="true" help="Specify a NCBI taxon id to override species selection"/>\n                 <param name="reviewed" type="select" help="UniProtKB/TrEMBL (unreviewed)is a large, automatically annotated database- may contain redundant sequences, but there is a higher chance peptides will be identified. 
UniProtKB/Swiss-Prot (reviewed) is a smaller, manually annotated database- less of a chance peptides will be identified but less sequence redundancy">\n-                    <option value="+">UniProtKB</option>\n-                    '..b'ue="+reviewed%3Ano">UniProtKB/TrEMBL (unreviewed only)</option>\n+                    <option value="">UniProtKB</option>\n+                    <option value="+reviewed%3Atrue">UniProtKB/Swiss-Prot (reviewed only)</option>\n+                    <option value="+reviewed%3Afalse">UniProtKB/TrEMBL (unreviewed only)</option>\n                     <sanitizer>\n                         <valid>\n                             <add value="%"/>\n@@ -77,15 +94,16 @@\n                     </sanitizer>\n                 </param>\n                 <param name="set" type="select" label="Proteome Set">\n-                    <option value="+">Any</option>\n-                    <option value="+keyword%3a1185" selected="true">Reference Proteome Set</option>\n+                    <option value="">Any</option>\n+                    <option value="keyword%3aKW-1185" selected="true">Reference Proteome Set</option>\n                     <sanitizer>\n                         <valid>\n                             <add value="%"/>\n                         </valid>\n                     </sanitizer>\n                 </param>\n-                <param name="include_isoform" type="boolean" truevalue="&amp;include=yes" falsevalue="" label="Include isoform data" help="several different forms of a given protein are incorporated into database" />\n+                <param name="include_isoform" type="boolean" truevalue="&amp;includeIsoform=true" falsevalue="" \n+                    label="Include isoform data" help="several different forms of a given protein are incorporated into database" />\n             </when>\n             <when value="cRAP" />\n             <when value="HMP">\n@@ -129,7 +147,9 @@\n     </outputs>\n     <tests>\n         <test>\n-            
<param name="from" value="cRAP" />\n+            <conditional name="source">\n+                <param name="from" value="cRAP" />\n+            </conditional>\n             <output name="output_database">\n                 <assert_contents>\n                     <has_text text="KKA1_ECOLX" />\n@@ -137,6 +157,47 @@\n             </output>\n         </test>\n         <test>\n+            <conditional name="source">\n+                <param name="from" value="uniprot" />\n+                <param name="taxon" value="83333"/>\n+                <param name="taxon_id" value="2697049"/>\n+            </conditional>\n+            <output name="output_database">\n+                <assert_contents>\n+                    <has_text text="SPIKE_SARS2" />\n+                </assert_contents>\n+            </output>\n+        </test>\n+        <test>\n+            <conditional name="source">\n+                <param name="from" value="uniprot" />\n+                <param name="taxon_id" value="2697049"/>\n+                <param name="reviewed" value="+reviewed%3Atrue"/>\n+                <param name="set" value=""/>\n+            </conditional>\n+            <output name="output_database">\n+                <assert_contents>\n+                    <has_text text=">sp|P0DTC1|R1A_SARS2" />\n+                    <not_has_text text=">tr|A0A679G4D8|A0A679G4D8_SARS2" />\n+                </assert_contents>\n+            </output>\n+        </test>\n+        <test>\n+            <conditional name="source">\n+                <param name="from" value="uniprot" />\n+                <param name="taxon_id" value="2697049"/>\n+                <param name="reviewed" value="+reviewed%3Afalse"/>\n+                <param name="set" value=""/>\n+            </conditional>\n+            <output name="output_database">\n+                <assert_contents>\n+                    <has_text text=">tr|A0A679G4D8|A0A679G4D8_SARS2" />\n+                    <not_has_text text=">sp|P0DTC1|R1A_SARS2" />\n+      
          </assert_contents>\n+            </output>\n+        </test>\n+\n+        <test>\n             <param name="from" value="url" />\n             <param name="url" value="https://raw.githubusercontent.com/pravs3683/cRAP/master/cRAP_protein_database.fasta" />\n             <param name="archive_type" value="direct" />\n'
b
diff -r e9df53a75f3c -r 8e637098a8ab uniprotkb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/uniprotkb.py Tue Sep 27 13:22:04 2022 +0000
b
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+"""Stream a UniProtKB REST API response to a file or to standard output."""
+
+import argparse
+import sys
+
+import requests
+
+uniprotkb_url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query='
+
+
+def __main__():
+    """Download a UniProtKB REST response and write the bytes out.
+
+    The request URL is ``--url`` when given; otherwise ``--query`` is appended
+    to ``uniprotkb_url``.  The body is copied to ``--output`` in chunks of up
+    to 1 MiB, exactly as ``iter_content`` yields them.
+
+    Raises:
+        requests.HTTPError: the server replied with an error status.
+        SystemExit: neither ``--url`` nor ``--query`` was supplied.
+    """
+    parser = argparse.ArgumentParser(
+        description='Retrieve Uniprot data using streaming')
+    parser.add_argument('-u', '--url', help="Uniprot rest api URL")
+    parser.add_argument('-q', '--query', help="UniprotKB Query")
+    parser.add_argument('-o', '--output', type=argparse.FileType('wb'), default=sys.stdout, help='data')
+    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
+    args = parser.parse_args()
+    if args.url:
+        url = args.url
+    elif args.query:
+        url = uniprotkb_url + args.query
+    else:
+        # args.query is None here; concatenating it would raise TypeError.
+        parser.error('one of --url or --query is required')
+    if args.debug:
+        print('Fetching: ' + url, file=sys.stderr)
+    # Chunks are bytes; a text-mode stdout must be swapped for its binary
+    # buffer, while files opened with 'wb' have no such attribute.
+    output = getattr(args.output, 'buffer', args.output)
+    try:
+        with requests.get(url, stream=True) as request:
+            request.raise_for_status()
+            for chunk in request.iter_content(chunk_size=2**20):
+                output.write(chunk)
+    finally:
+        # Flush the shared stdout stream, but close a file we opened.
+        if output is getattr(sys.stdout, 'buffer', None):
+            output.flush()
+        else:
+            output.close()
+
+
+if __name__ == "__main__":
+    __main__()