Galaxy |

Changeset 11:8e637098a8ab (2022-09-27)

Previous changeset 10:e9df53a75f3c (2020-11-25) Next changeset 12:983bf725dfc2 (2022-09-27)

Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/dbbuilder commit 16ba4570b04301b774ee0420694f379cc640744b

modified:
dbbuilder.xml

added:
uniprotkb.py

diff -r e9df53a75f3c -r 8e637098a8ab dbbuilder.xml
--- a/dbbuilder.xml Wed Nov 25 17:43:51 2020 +0000
+++ b/dbbuilder.xml Tue Sep 27 13:22:04 2022 +0000

[

b'@@ -1,7 +1,9 @@\n-<tool id="dbbuilder" name="Protein Database Downloader" version="0.3.2">\n+<tool id="dbbuilder" name="Protein Database Downloader" version="0.3.3">\n <description></description>\n <requirements>\n <requirement type="package" version="1.20.1">wget</requirement>\n+ <requirement type="package" version="3.8">python</requirement>\n+ <requirement type="package" version="2.20.1">requests</requirement>\n </requirements>\n <stdio>\n <exit_code range="1:" level="fatal" description="Error downloading database." />\n@@ -14,8 +16,18 @@\n <command>\n <![CDATA[\n #if $source.from == "uniprot"\n- #set $url = \'http://www.uniprot.org/uniprot/?query=taxonomy:"\' + str($source.taxon) + \'"\' + str($source.set) + str($source.reviewed) + \'&force=yes&format=fasta\' + str($source.include_isoform)\n- #set $type = "direct"\n+ #if $source.set:\n+ #set $modified_set = \'&\' + str($source.set) \n+ #else\n+ #set $modified_set = \'\'\n+ #end if\n+ #if $source.taxon_id\n+ #set $taxon_id = $source.taxon_id\n+ #else\n+ #set $taxon_id = $source.taxon\n+ #end if\n+ #set $url = \'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=taxonomy_id:"\' + str($taxon_id) + \'"\' + str($modified_set) + str($source.reviewed) + str($source.include_isoform)\n+ #set $type = "uniprotkb_stream"\n #elif $source.from == "cRAP"\n ##set $url = "ftp://ftp.thegpm.org/fasta/cRAP/crap.fasta"\n #set $url = "https://raw.githubusercontent.com/pravs3683/cRAP/master/cRAP_protein_database.fasta"\n@@ -34,7 +46,9 @@\n #set $url = $source.url\n #set $type = $source.archive_type\n #end if\n- #if $type =="direct"\n+ #if $type =="uniprotkb_stream"\n+ python \'$__tool_directory__/uniprotkb.py\' --url \'$url\' -o \'tmp.gz\' && gzip -dc \'tmp.gz\' > \'${output_database}\' \n+ #elif $type =="direct"\n wget -nv \'$url\' -O \'${output_database}\' --no-check-certificate\n #elif $type =="zip"\n wget -nv \'$url\' -O tmp.zip --no-check-certificate && zcat -c tmp.zip > \'${output_database}\'\n@@ 
-51,7 +65,8 @@\n </command>\n <inputs>\n <conditional name="source">\n- <param name="from" type="select" label="Download from" help="Select database source. cRAP acts as a database for common MS contaminants. UniProtKB is a cross species collection of functional protein databases">\n+ <param name="from" type="select" label="Download from"\n+ help="Select database source. cRAP acts as a database for common MS contaminants. UniProtKB is a cross species collection of functional protein databases">\n <option value="uniprot">UniProtKB</option>\n <option value="cRAP">cRAP (contaminants)</option>\n <option value="HMP">Human Microbiome Project body sites</option>\n@@ -64,12 +79,14 @@\n <options from_file="uniprot_taxons.loc">\n <column name="name" index="0" />\n <column name="value" index="1" />\n+ <filter type="add_value" name="Escherichia coli (strain K12)" value="83333" />\n </options>\n </param>\n+ <param name="taxon_id" type="integer" value="" min="1" optional="true" help="Specify a NCBI taxon id to override species selection"/>\n <param name="reviewed" type="select" help="UniProtKB/TrEMBL (unreviewed)is a large, automatically annotated database- may contain redundant sequences, but there is a higher chance peptides will be identified. 
UniProtKB/Swiss-Prot (reviewed) is a smaller, manually annotated database- less of a chance peptides will be identified but less sequence redundancy">\n- <option value="+">UniProtKB</option>\n- '..b'ue="+reviewed%3Ano">UniProtKB/TrEMBL (unreviewed only)</option>\n+ <option value="">UniProtKB</option>\n+ <option value="+reviewed%3Atrue">UniProtKB/Swiss-Prot (reviewed only)</option>\n+ <option value="+reviewed%3Afalse">UniProtKB/TrEMBL (unreviewed only)</option>\n <sanitizer>\n <valid>\n <add value="%"/>\n@@ -77,15 +94,16 @@\n </sanitizer>\n </param>\n <param name="set" type="select" label="Proteome Set">\n- <option value="+">Any</option>\n- <option value="+keyword%3a1185" selected="true">Reference Proteome Set</option>\n+ <option value="">Any</option>\n+ <option value="keyword%3aKW-1185" selected="true">Reference Proteome Set</option>\n <sanitizer>\n <valid>\n <add value="%"/>\n </valid>\n </sanitizer>\n </param>\n- <param name="include_isoform" type="boolean" truevalue="&include=yes" falsevalue="" label="Include isoform data" help="several different forms of a given protein are incorporated into database" />\n+ <param name="include_isoform" type="boolean" truevalue="&includeIsoform=true" falsevalue="" \n+ label="Include isoform data" help="several different forms of a given protein are incorporated into database" />\n </when>\n <when value="cRAP" />\n <when value="HMP">\n@@ -129,7 +147,9 @@\n </outputs>\n <tests>\n <test>\n- <param name="from" value="cRAP" />\n+ <conditional name="source">\n+ <param name="from" value="cRAP" />\n+ </conditional>\n <output name="output_database">\n <assert_contents>\n <has_text text="KKA1_ECOLX" />\n@@ -137,6 +157,47 @@\n </output>\n </test>\n <test>\n+ <conditional name="source">\n+ <param name="from" value="uniprot" />\n+ <param name="taxon" value="83333"/>\n+ <param name="taxon_id" value="2697049"/>\n+ </conditional>\n+ <output name="output_database">\n+ <assert_contents>\n+ <has_text text="SPIKE_SARS2" />\n+ </assert_contents>\n+ 
</output>\n+ </test>\n+ <test>\n+ <conditional name="source">\n+ <param name="from" value="uniprot" />\n+ <param name="taxon_id" value="2697049"/>\n+ <param name="reviewed" value="+reviewed%3Atrue"/>\n+ <param name="set" value=""/>\n+ </conditional>\n+ <output name="output_database">\n+ <assert_contents>\n+ <has_text text=">sp|P0DTC1|R1A_SARS2" />\n+ <not_has_text text=">tr|A0A679G4D8|A0A679G4D8_SARS2" />\n+ </assert_contents>\n+ </output>\n+ </test>\n+ <test>\n+ <conditional name="source">\n+ <param name="from" value="uniprot" />\n+ <param name="taxon_id" value="2697049"/>\n+ <param name="reviewed" value="+reviewed%3Afalse"/>\n+ <param name="set" value=""/>\n+ </conditional>\n+ <output name="output_database">\n+ <assert_contents>\n+ <has_text text=">tr|A0A679G4D8|A0A679G4D8_SARS2" />\n+ <not_has_text text=">sp|P0DTC1|R1A_SARS2" />\n+ </assert_contents>\n+ </output>\n+ </test>\n+\n+ <test>\n <param name="from" value="url" />\n <param name="url" value="https://raw.githubusercontent.com/pravs3683/cRAP/master/cRAP_protein_database.fasta" />\n <param name="archive_type" value="direct" />\n'

diff -r e9df53a75f3c -r 8e637098a8ab uniprotkb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/uniprotkb.py Tue Sep 27 13:22:04 2022 +0000

@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+"""Stream a UniProtKB REST API response to a file or to standard output."""
+
+import argparse
+import sys
+
+import requests
+
+# Base of the UniProtKB streaming endpoint; a --query value is appended as given.
+uniprotkb_url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query='
+
+
+def __main__():
+    """Fetch a UniProtKB REST API URL and write the response body to the output.
+
+    Command-line options:
+      -u/--url     complete REST API URL; takes precedence over --query
+      -q/--query   query appended as given to ``uniprotkb_url``
+      -o/--output  destination, opened in binary mode (default: stdout)
+      -d/--debug   print the requested URL on stderr
+
+    The script does not decompress the payload itself. Exits through
+    ``parser.error`` when neither --url nor --query is supplied, and lets
+    ``raise_for_status`` raise on an HTTP error status.
+    """
+    parser = argparse.ArgumentParser(
+        description='Retrieve Uniprot data using streaming')
+    parser.add_argument('-u', '--url', help="Uniprot rest api URL")
+    parser.add_argument('-q', '--query', help="UniprotKB Query")
+    # The payload is bytes, so the stdout fallback must be the binary buffer.
+    parser.add_argument('-o', '--output', type=argparse.FileType('wb'), default=sys.stdout.buffer, help='data')
+    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
+    args = parser.parse_args()
+    if args.url:
+        url = args.url
+    elif args.query is not None:
+        url = uniprotkb_url + args.query
+    else:
+        # Without this guard the concatenation above fails with a TypeError.
+        parser.error('one of --url or --query is required')
+    if args.debug:
+        print(url, file=sys.stderr)
+    try:
+        with requests.get(url, stream=True) as request:
+            request.raise_for_status()
+            for chunk in request.iter_content(chunk_size=2**20):
+                args.output.write(chunk)
+    finally:
+        # Flush always; close only files this script opened, never stdout.
+        args.output.flush()
+        if args.output not in (sys.stdout, sys.stdout.buffer):
+            args.output.close()
+
+
+if __name__ == "__main__":
+    __main__()