Mercurial > repos > galaxyp > uniprotxml_downloader
changeset 7:4ddc8da62671 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 91705a9789b30878a55d1044c654e39a7726cf60
author | galaxyp |
---|---|
date | Wed, 11 Dec 2024 13:34:54 +0000 |
parents | a371252a2cf6 |
children | |
files | macros.xml uniprotxml_downloader.py uniprotxml_downloader.xml |
diffstat | 3 files changed, 221 insertions(+), 108 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Thu Jul 06 21:15:39 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -<macros> - <xml name="query_field"> - <param name="field" type="select" label="Field"> - <option value="taxonomy_name">Taxonomy Name</option> - <option value="taxonomy_id">Taxonomy ID</option> - <option value="accession">Accession</option> - </param> - </xml> -</macros>
--- a/uniprotxml_downloader.py Thu Jul 06 21:15:39 2023 +0000 +++ b/uniprotxml_downloader.py Wed Dec 11 13:34:54 2024 +0000 @@ -17,31 +17,7 @@ from urllib import parse import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -DEFAULT_TIMEOUT = 5 # seconds -retry_strategy = Retry( - total=5, - backoff_factor=2, - status_forcelist=[429, 500, 502, 503, 504], - allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] -) - - -class TimeoutHTTPAdapter(HTTPAdapter): - def __init__(self, *args, **kwargs): - self.timeout = DEFAULT_TIMEOUT - if "timeout" in kwargs: - self.timeout = kwargs["timeout"] - del kwargs["timeout"] - super().__init__(*args, **kwargs) - - def send(self, request, **kwargs): - timeout = kwargs.get("timeout") - if timeout is None: - kwargs["timeout"] = self.timeout - return super().send(request, **kwargs) +from requests.adapters import HTTPAdapter, Retry def __main__(): @@ -51,9 +27,10 @@ parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') - parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') + parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format') parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') + parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') (options, args) = parser.parse_args() search_ids = set(options.search_id) @@ -75,25 +52,35 @@ dest_path = "uniprot_%s.xml" % '_'.join(search_ids) reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' try: - url = 'https://rest.uniprot.org/uniprotkb/stream' - query = "%s%s" % (search_query, reviewed) - params = {'query': query, 'format': options.format} - if options.debug: - print("%s ? %s" % (url, params), file=sys.stderr) - data = parse.urlencode(params) - print(f"Retrieving: {url}?{data}") - adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) + re_next_link = re.compile(r'<(.+)>; rel="next"') + retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retries)) + + def get_next_link(headers): + if "Link" in headers: + match = re_next_link.match(headers["Link"]) + if match: + return match.group(1) - http = requests.Session() - http.mount("https://", adapter) - response = http.get(url, params=params) - http.close() + def get_batch(batch_url): + while batch_url: + response = session.get(batch_url) + response.raise_for_status() + total = response.headers["x-total-results"] + release = response.headers["x-uniprot-release"] + yield response, total, release + batch_url = get_next_link(response.headers) - if response.status_code != 200: - exit(f"Request failed with status code {response.status_code}:\n{response.text}") + params = {'size': 500, 'format': options.format, 'query': search_query + reviewed} + if options.output_columns: + params['fields'] = options.output_columns + url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}' + print(f"Downloading from:{url}") with open(dest_path, 'w') as fh: - fh.write(response.text) + for batch, total, release in get_batch(url): + fh.write(batch.text) if options.format == 'xml': with open(dest_path, 'r') as contents: @@ -112,11 +99,9 @@ else: print("failed: Not a uniprot xml file", file=sys.stderr) exit(1) - print("Search IDs:%s" % search_ids, file=sys.stdout) - if 'X-UniProt-Release' in response.headers: - print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) - if 'X-Total-Results' in response.headers: - print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) + print(f"Search IDs:{search_ids}") + print(f"UniProt-Release:{release}") + print(f"Entries:{total}") except Exception as e: exit("%s" % e)
--- a/uniprotxml_downloader.xml Thu Jul 06 21:15:39 2023 +0000 +++ b/uniprotxml_downloader.xml Wed Dec 11 13:34:54 2024 +0000 @@ -1,7 +1,13 @@ -<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="21.01"> +<tool id="uniprotxml_downloader" name="UniProt" version="2.5.0" profile="23.1"> <description>download proteome as XML or fasta</description> <macros> - <import>macros.xml</import> + <xml name="query_field"> + <param name="field" type="select" label="Field"> + <option value="taxonomy_name">Taxonomy Name</option> + <option value="taxonomy_id">Taxonomy ID</option> + <option value="accession">Accession</option> + </param> + </xml> </macros> <requirements> <requirement type="package" version="2.25.1">requests</requirement> @@ -28,7 +34,10 @@ --input='${input_method.id_file}' --column=#echo int(str($input_method.column)) - 1# #end if ---format $format +--format $format_cond.format +#if $format_cond.format == "tsv" + --output_columns #echo ','.join($format_cond.columns) +#end if --output '${proteome}' ]]> </command> @@ -36,8 +45,8 @@ <conditional name="input_method"> <param name="input_choice" type="select" label="Select"> <option value="common">A Common Organism</option> - <option value="enter_ids">A manually entered list of Uniprot IDs</option> - <option value="history">A history dataset with a column containing Uniprot IDs</option> + <option value="enter_ids">A manually entered list of accessions or taxonomy IDs/names</option> + <option value="history">A history dataset with a column containing accessions or taxonomy IDs/names</option> </param> <when value="common"> <param name="organism" type="select" label="Common Organisms" @@ -72,46 +81,91 @@ <expand macro="query_field"/> </when> </conditional> - <param name="format" type="select" label="uniprot output format"> - <option value="xml">xml</option> - <option value="fasta">fasta</option> - </param> + <conditional name="format_cond"> + <param name="format" type="select" label="uniprot output format"> + <option value="fasta">fasta</option> + <option value="tsv">TSV</option> + <option value="xml">xml</option> + </param> + <when value="fasta"/> + <when value="xml"/> + <when value="tsv"> + <param name="columns" type="select" multiple="true"> + <options from_url="https://rest.uniprot.org/configure/uniprotkb/result-fields"> + <postprocess_expression type="ecma5.1"><![CDATA[${ + var options = []; + inputs.forEach(function(group) { + var groupName = group.groupName; + group.fields.forEach(function(field) { + var D = ["accession", "id", "reviewed", "protein_name", "gene_names", "organism_name", "length"]; + var selected = D.includes(field.name); + options.push([group.groupName + " - " + field.label, field.name, selected]); + }); + }); + return options; + }]]></postprocess_expression> + </options> + </param> + </when> + </conditional> </inputs> <outputs> - <data format="uniprotxml" name="proteome" label="UniProt.${format}"> + <data format="uniprotxml" name="proteome"> <change_format> - <when input="format" value="fasta" format="fasta" /> + <when input="format_cond.format" value="fasta" format="fasta" /> + <when input="format_cond.format" value="tsv" format="tsv" /> </change_format> </data> </outputs> <tests> <test> - <param name="input_choice" value="enter_ids"/> - <param name="ids" value="1566990"/> - <param name="format" value="xml"/> - <output name="proteome"> + <conditional name="input_method"> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="1566990"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="xml"/> + </conditional> + <output name="proteome" ftype="uniprotxml"> <assert_contents> <has_text text="</uniprot>" /> </assert_contents> </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_line line="Entries:0"/> <!-- searching by name using an ID --> + </assert_stdout> </test> <test> - <param name="input_choice" value="enter_ids"/> - <param name="ids" value="765963,512562"/> - <param name="field" value="taxonomy_id"/> - <param name="format" value="fasta"/> - <output name="proteome"> + <conditional name="input_method"> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="765963,512562"/> + <param name="field" value="taxonomy_id"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="fasta"/> + </conditional> + <output name="proteome" ftype="fasta"> <assert_contents> <has_text text="Shi470" /> <has_text text="PeCan4" /> </assert_contents> </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> </test> <test> - <param name="input_choice" value="enter_ids"/> - <param name="ids" value="Shi470,PeCan4"/> - <param name="field" value="taxonomy_name"/> - <param name="format" value="fasta"/> + <conditional name="input_method"> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="Shi470,PeCan4"/> + <param name="field" value="taxonomy_name"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="fasta" ftype="fasta"/> + </conditional> <output name="proteome"> <assert_contents> <has_text text="Shi470" /> @@ -120,55 +174,138 @@ </output> </test> <test> - <param name="input_choice" value="enter_ids"/> - <param name="ids" value="E1Q2I0,E1Q3C4"/> - <param name="field" value="accession"/> - <param name="format" value="fasta"/> - <output name="proteome"> + <conditional name="input_method"> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="E1Q2I0,E1Q3C4"/> + <param name="field" value="accession"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="fasta"/> + </conditional> + <output name="proteome" ftype="fasta"> <assert_contents> <has_text text="E1Q2I0" /> <has_text text="E1Q3C4" /> </assert_contents> </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> </test> <test> - <param name="input_choice" value="history"/> - <param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/> - <param name="column" value="1"/> - <param name="field" value="taxonomy_name"/> + <conditional name="input_method"> + <param name="input_choice" value="history"/> + <param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/> + <param name="column" value="1"/> + <param name="field" value="taxonomy_name"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="fasta"/> + </conditional> + <output name="proteome" ftype="fasta"> + <assert_contents> + <has_text text="Shi470" /> + <has_text text="PeCan4" /> + </assert_contents> + </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> + </test> + <test> + <conditional name="input_method"> + <param name="input_choice" value="history"/> + <param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/> + <param name="column" value="2"/> + <param name="field" value="taxonomy_id"/> + </conditional> + <conditional name="format_cond"> <param name="format" value="fasta"/> - <output name="proteome"> + </conditional> + <output name="proteome" ftype="fasta"> <assert_contents> <has_text text="Shi470" /> <has_text text="PeCan4" /> </assert_contents> </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> </test> <test> - <param name="input_choice" value="history"/> - <param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/> - <param name="column" value="2"/> - <param name="field" value="taxonomy_id"/> - <param name="format" value="fasta"/> - <output name="proteome"> - <assert_contents> - <has_text text="Shi470" /> - <has_text text="PeCan4" /> - </assert_contents> - </output> - </test> - <test> - <param name="input_choice" value="history"/> - <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/> - <param name="column" value="1"/> - <param name="field" value="accession"/> - <param name="format" value="fasta"/> - <output name="proteome"> + <conditional name="input_method"> + <param name="input_choice" value="history"/> + <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/> + <param name="column" value="1"/> + <param name="field" value="accession"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="fasta"/> + </conditional> + <output name="proteome" ftype="fasta"> <assert_contents> <has_text text="E1Q2I0" /> <has_text text="E1Q3C4" /> </assert_contents> </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> + </test> + <!-- tsv output --> + <test> + <conditional name="input_method"> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="765963,512562"/> + <param name="field" value="taxonomy_id"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="tsv"/> + </conditional> + <output name="proteome" ftype="tsv"> + <assert_contents> + <has_n_columns n="7" /> + <has_text text="Shi470" /> + <has_text text="PeCan4" /> + </assert_contents> + </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> + </test> + <!-- tsv output non default columns--> + <test> + <conditional name="input_method"> + <param name="input_choice" value="enter_ids"/> + <param name="ids" value="765963,512562"/> + <param name="field" value="taxonomy_id"/> + </conditional> + <conditional name="format_cond"> + <param name="format" value="tsv"/> + <param name="columns" value="accession,sequence"/> + </conditional> + <output name="proteome" ftype="tsv"> + <assert_contents> + <has_n_columns n="2" /> + <has_text text="Shi470" negate="true"/> + <has_text text="B2US14" /> + </assert_contents> + </output> + <assert_stdout> + <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/> + <has_text_matching expression="Entries:\d+"/> + <has_line line="Entries:0" negate="true"/> + </assert_stdout> </test> </tests> <help>