# HG changeset patch # User galaxyp # Date 1733924094 0 # Node ID 4ddc8da6267151b72fcb0452a00e6028ed4aa3a2 # Parent a371252a2cf6970bb9e84d3cce48e226626f9cb0 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 91705a9789b30878a55d1044c654e39a7726cf60 diff -r a371252a2cf6 -r 4ddc8da62671 macros.xml --- a/macros.xml Thu Jul 06 21:15:39 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ - - - - - - - - - diff -r a371252a2cf6 -r 4ddc8da62671 uniprotxml_downloader.py --- a/uniprotxml_downloader.py Thu Jul 06 21:15:39 2023 +0000 +++ b/uniprotxml_downloader.py Wed Dec 11 13:34:54 2024 +0000 @@ -17,31 +17,7 @@ from urllib import parse import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -DEFAULT_TIMEOUT = 5 # seconds -retry_strategy = Retry( - total=5, - backoff_factor=2, - status_forcelist=[429, 500, 502, 503, 504], - allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] -) - - -class TimeoutHTTPAdapter(HTTPAdapter): - def __init__(self, *args, **kwargs): - self.timeout = DEFAULT_TIMEOUT - if "timeout" in kwargs: - self.timeout = kwargs["timeout"] - del kwargs["timeout"] - super().__init__(*args, **kwargs) - - def send(self, request, **kwargs): - timeout = kwargs.get("timeout") - if timeout is None: - kwargs["timeout"] = self.timeout - return super().send(request, **kwargs) +from requests.adapters import HTTPAdapter, Retry def __main__(): @@ -51,9 +27,10 @@ parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') - parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') + parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format') parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') + parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') (options, args) = parser.parse_args() search_ids = set(options.search_id) @@ -75,25 +52,35 @@ dest_path = "uniprot_%s.xml" % '_'.join(search_ids) reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' try: - url = 'https://rest.uniprot.org/uniprotkb/stream' - query = "%s%s" % (search_query, reviewed) - params = {'query': query, 'format': options.format} - if options.debug: - print("%s ? %s" % (url, params), file=sys.stderr) - data = parse.urlencode(params) - print(f"Retrieving: {url}?{data}") - adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) + re_next_link = re.compile(r'<(.+)>; rel="next"') + retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retries)) + + def get_next_link(headers): + if "Link" in headers: + match = re_next_link.match(headers["Link"]) + if match: + return match.group(1) - http = requests.Session() - http.mount("https://", adapter) - response = http.get(url, params=params) - http.close() + def get_batch(batch_url): + while batch_url: + response = session.get(batch_url) + response.raise_for_status() + total = response.headers["x-total-results"] + release = response.headers["x-uniprot-release"] + yield response, total, release + batch_url = get_next_link(response.headers) - if response.status_code != 200: - exit(f"Request failed with status code {response.status_code}:\n{response.text}") + params = {'size': 500, 'format': options.format, 'query': search_query + reviewed} + if options.output_columns: + params['fields'] = options.output_columns + url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}' + print(f"Downloading from:{url}") with open(dest_path, 'w') as fh: - fh.write(response.text) + for batch, total, release in get_batch(url): + fh.write(batch.text) if options.format == 'xml': with open(dest_path, 'r') as contents: @@ -112,11 +99,9 @@ else: print("failed: Not a uniprot xml file", file=sys.stderr) exit(1) - print("Search IDs:%s" % search_ids, file=sys.stdout) - if 'X-UniProt-Release' in response.headers: - print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) - if 'X-Total-Results' in response.headers: - print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) + print(f"Search IDs:{search_ids}") + print(f"UniProt-Release:{release}") + print(f"Entries:{total}") except Exception as e: exit("%s" % e) diff -r a371252a2cf6 -r 4ddc8da62671 uniprotxml_downloader.xml --- a/uniprotxml_downloader.xml Thu Jul 06 21:15:39 2023 +0000 +++ b/uniprotxml_downloader.xml Wed Dec 11 13:34:54 2024 +0000 @@ -1,7 +1,13 @@ - + download proteome as XML or fasta - macros.xml + + + + + + + requests @@ -28,7 +34,10 @@ --input='${input_method.id_file}' --column=#echo int(str($input_method.column)) - 1# #end if ---format $format +--format $format_cond.format +#if $format_cond.format == "tsv" + --output_columns #echo ','.join($format_cond.columns) +#end if --output '${proteome}' ]]> @@ -36,8 +45,8 @@ - - + + - - - - + + + + + + + + + + + + + + + + - + - + + - - - - + + + + + + + + + + + + - - - - - + + + + + + + + + + + + + + - - - - + + + + + + + + @@ -120,55 +174,138 @@ - - - - - + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +