| Previous changeset 6:a371252a2cf6 (2023-07-06) |
|
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 91705a9789b30878a55d1044c654e39a7726cf60 |
|
modified:
uniprotxml_downloader.py uniprotxml_downloader.xml |
|
removed:
macros.xml |
| b |
| diff -r a371252a2cf6 -r 4ddc8da62671 macros.xml --- a/macros.xml Thu Jul 06 21:15:39 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| b |
| @@ -1,9 +0,0 @@ -<macros> - <xml name="query_field"> - <param name="field" type="select" label="Field"> - <option value="taxonomy_name">Taxonomy Name</option> - <option value="taxonomy_id">Taxonomy ID</option> - <option value="accession">Accession</option> - </param> - </xml> -</macros> |
| b |
| diff -r a371252a2cf6 -r 4ddc8da62671 uniprotxml_downloader.py --- a/uniprotxml_downloader.py Thu Jul 06 21:15:39 2023 +0000 +++ b/uniprotxml_downloader.py Wed Dec 11 13:34:54 2024 +0000 |
| [ |
| @@ -17,31 +17,7 @@ from urllib import parse import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -DEFAULT_TIMEOUT = 5 # seconds -retry_strategy = Retry( - total=5, - backoff_factor=2, - status_forcelist=[429, 500, 502, 503, 504], - allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] -) - - -class TimeoutHTTPAdapter(HTTPAdapter): - def __init__(self, *args, **kwargs): - self.timeout = DEFAULT_TIMEOUT - if "timeout" in kwargs: - self.timeout = kwargs["timeout"] - del kwargs["timeout"] - super().__init__(*args, **kwargs) - - def send(self, request, **kwargs): - timeout = kwargs.get("timeout") - if timeout is None: - kwargs["timeout"] = self.timeout - return super().send(request, **kwargs) +from requests.adapters import HTTPAdapter, Retry def __main__(): @@ -51,9 +27,10 @@ parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') - parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') + parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format') parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') + parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') (options, args) = parser.parse_args() search_ids = set(options.search_id) @@ -75,25 +52,35 @@ dest_path = "uniprot_%s.xml" % '_'.join(search_ids) reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' try: - url = 'https://rest.uniprot.org/uniprotkb/stream' - query = "%s%s" % (search_query, reviewed) - params = {'query': query, 'format': options.format} - if options.debug: - print("%s ? %s" % (url, params), file=sys.stderr) - data = parse.urlencode(params) - print(f"Retrieving: {url}?{data}") - adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) + re_next_link = re.compile(r'<(.+)>; rel="next"') + retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retries)) + + def get_next_link(headers): + if "Link" in headers: + match = re_next_link.match(headers["Link"]) + if match: + return match.group(1) - http = requests.Session() - http.mount("https://", adapter) - response = http.get(url, params=params) - http.close() + def get_batch(batch_url): + while batch_url: + response = session.get(batch_url) + response.raise_for_status() + total = response.headers["x-total-results"] + release = response.headers["x-uniprot-release"] + yield response, total, release + batch_url = get_next_link(response.headers) - if response.status_code != 200: - exit(f"Request failed with status code {response.status_code}:\n{response.text}") + params = {'size': 500, 'format': options.format, 'query': search_query + reviewed} + if options.output_columns: + params['fields'] = options.output_columns + url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}' + print(f"Downloading from:{url}") with open(dest_path, 'w') as fh: - fh.write(response.text) + for batch, total, release in get_batch(url): + fh.write(batch.text) if options.format == 'xml': with open(dest_path, 'r') as contents: @@ -112,11 +99,9 @@ else: print("failed: Not a uniprot xml file", file=sys.stderr) exit(1) - print("Search IDs:%s" % search_ids, file=sys.stdout) - if 'X-UniProt-Release' in response.headers: - print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) - if 'X-Total-Results' in response.headers: - print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) + print(f"Search IDs:{search_ids}") + print(f"UniProt-Release:{release}") + print(f"Entries:{total}") except Exception as e: exit("%s" % e) |
| b |
| diff -r a371252a2cf6 -r 4ddc8da62671 uniprotxml_downloader.xml --- a/uniprotxml_downloader.xml Thu Jul 06 21:15:39 2023 +0000 +++ b/uniprotxml_downloader.xml Wed Dec 11 13:34:54 2024 +0000 |
| [ |
| b'@@ -1,7 +1,13 @@\n-<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="21.01">\n+<tool id="uniprotxml_downloader" name="UniProt" version="2.5.0" profile="23.1">\n <description>download proteome as XML or fasta</description>\n <macros>\n- <import>macros.xml</import>\n+ <xml name="query_field">\n+ <param name="field" type="select" label="Field">\n+ <option value="taxonomy_name">Taxonomy Name</option>\n+ <option value="taxonomy_id">Taxonomy ID</option>\n+ <option value="accession">Accession</option>\n+ </param>\n+ </xml>\n </macros>\n <requirements>\n <requirement type="package" version="2.25.1">requests</requirement>\n@@ -28,7 +34,10 @@\n --input=\'${input_method.id_file}\'\n --column=#echo int(str($input_method.column)) - 1#\n #end if\n---format $format\n+--format $format_cond.format\n+#if $format_cond.format == "tsv"\n+ --output_columns #echo \',\'.join($format_cond.columns)\n+#end if\n --output \'${proteome}\'\n ]]>\n </command>\n@@ -36,8 +45,8 @@\n <conditional name="input_method">\n <param name="input_choice" type="select" label="Select">\n <option value="common">A Common Organism</option>\n- <option value="enter_ids">A manually entered list of Uniprot IDs</option>\n- <option value="history">A history dataset with a column containing Uniprot IDs</option>\n+ <option value="enter_ids">A manually entered list of accessions or taxonomy IDs/names</option>\n+ <option value="history">A history dataset with a column containing accessions or taxonomy IDs/names</option>\n </param>\n <when value="common">\n <param name="organism" type="select" label="Common Organisms"\n@@ -72,46 +81,91 @@\n <expand macro="query_field"/>\n </when>\n </conditional>\n- <param name="format" type="select" label="uniprot output format">\n- <option value="xml">xml</option>\n- <option value="fasta">fasta</option>\n- </param>\n+ <conditional name="format_cond">\n+ <param name="format" type="select" label="uniprot output format">\n+ <option value="fasta">fasta</option>\n+ <option value="tsv">TSV</option>\n+ <option value="xml">xml</option>\n+ </param>\n+ <when value="fasta"/>\n+ <when value="xml"/>\n+ <when value="tsv">\n+ <param name="columns" type="select" multiple="true">\n+ <options from_url="https://rest.uniprot.org/configure/uniprotkb/result-fields">\n+ <postprocess_expression type="ecma5.1"><![CDATA[${\n+ var options = [];\n+ inputs.forEach(function(group) {\n+ var groupName = group.groupName;\n+ group.fields.forEach(function(field) {\n+ var D = ["accession", "id", "reviewed", "protein_name", "gene_names", "organism_name", "length"];\n+ var selected = D.includes(field.name);\n+ options.push([group.groupName + " - " + field.label, field.name, selected]);\n+ });\n+ });\n+ return options;\n+ }]]></postprocess_expression>\n+ </options>\n+ </param>\n+ </when>\n+ </conditional>\n </inputs>\n <outputs>\n- <data format="uniprotxml" name="proteome" label="UniProt.${format}">\n+ <data format="uniprotxml" name="proteome">\n <change_format>\n- <when input="format" value="fasta" format="fasta" />\n+ <when input="format_cond.format" value="fasta" format="fasta" />\n+ <when input="format_cond'..b' <param name="input_choice" value="history"/>\n- <param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>\n- <param name="column" value="2"/>\n- <param name="field" value="taxonomy_id"/>\n- <param name="format" value="fasta"/>\n- <output name="proteome">\n- <assert_contents>\n- <has_text text="Shi470" />\n- <has_text text="PeCan4" />\n- </assert_contents>\n- </output>\n- </test>\n- <test>\n- <param name="input_choice" value="history"/>\n- <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/>\n- <param name="column" value="1"/>\n- <param name="field" value="accession"/>\n- <param name="format" value="fasta"/>\n- <output name="proteome">\n+ <conditional name="input_method">\n+ <param name="input_choice" value="history"/>\n+ <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/>\n+ <param name="column" value="1"/>\n+ <param name="field" value="accession"/>\n+ </conditional>\n+ <conditional name="format_cond">\n+ <param name="format" value="fasta"/>\n+ </conditional>\n+ <output name="proteome" ftype="fasta">\n <assert_contents>\n <has_text text="E1Q2I0" />\n <has_text text="E1Q3C4" />\n </assert_contents>\n </output>\n+ <assert_stdout>\n+ <has_text_matching expression="UniProt-Release:\\d{4}_\\d{2}"/>\n+ <has_text_matching expression="Entries:\\d+"/>\n+ <has_line line="Entries:0" negate="true"/>\n+ </assert_stdout>\n+ </test>\n+ <!-- tsv output -->\n+ <test>\n+ <conditional name="input_method">\n+ <param name="input_choice" value="enter_ids"/>\n+ <param name="ids" value="765963,512562"/>\n+ <param name="field" value="taxonomy_id"/>\n+ </conditional>\n+ <conditional name="format_cond">\n+ <param name="format" value="tsv"/>\n+ </conditional>\n+ <output name="proteome" ftype="tsv">\n+ <assert_contents>\n+ <has_n_columns n="7" />\n+ <has_text text="Shi470" />\n+ <has_text text="PeCan4" />\n+ </assert_contents>\n+ </output>\n+ <assert_stdout>\n+ <has_text_matching expression="UniProt-Release:\\d{4}_\\d{2}"/>\n+ <has_text_matching expression="Entries:\\d+"/>\n+ <has_line line="Entries:0" negate="true"/>\n+ </assert_stdout>\n+ </test>\n+ <!-- tsv output non default columns-->\n+ <test>\n+ <conditional name="input_method">\n+ <param name="input_choice" value="enter_ids"/>\n+ <param name="ids" value="765963,512562"/>\n+ <param name="field" value="taxonomy_id"/>\n+ </conditional>\n+ <conditional name="format_cond">\n+ <param name="format" value="tsv"/>\n+ <param name="columns" value="accession,sequence"/>\n+ </conditional>\n+ <output name="proteome" ftype="tsv">\n+ <assert_contents>\n+ <has_n_columns n="2" />\n+ <has_text text="Shi470" negate="true"/>\n+ <has_text text="B2US14" />\n+ </assert_contents>\n+ </output>\n+ <assert_stdout>\n+ <has_text_matching expression="UniProt-Release:\\d{4}_\\d{2}"/>\n+ <has_text_matching expression="Entries:\\d+"/>\n+ <has_line line="Entries:0" negate="true"/>\n+ </assert_stdout>\n </test>\n </tests>\n <help>\n' |