# HG changeset patch
# User bgruening
# Date 1716412695 0
# Node ID 468c71dac78a9dbc65882c58d9af75f55ae1d951
# Parent af5eccf8360579dd585674047be53e27adbc3b94
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit da476148d1c609f5c26e880a3e593f0fa71ff2f6
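This changeset ports uniprot.py from the legacy.uniprot.org endpoint to the
current UniProt REST API (https://rest.uniprot.org): ID mapping becomes an
asynchronous job that is submitted, polled, and then paged through via Link
headers, and uniprot.xml gains an auto-generated from/to database conditional.
A minimal sketch of the new flow, using only the helpers added below (the
accession IDs are illustrative):

    # submit the mapping job, wait for it, then collect all result batches
    job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB",
                               ids=["P05067", "P12345"])
    if check_id_mapping_results_ready(job_id):        # polls /idmapping/status
        link = get_id_mapping_results_link(job_id)    # /idmapping/details redirectURL
        results = get_id_mapping_results_search(f"{link}?format=tsv")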
diff -r af5eccf83605 -r 468c71dac78a macros.xml
--- a/macros.xml Mon Nov 21 22:02:41 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,226 +0,0 @@
[226 deleted lines of macros.xml elided: the XML markup was stripped during extraction; only the version token "1.0" survives. The file is removed entirely by this changeset.]
diff -r af5eccf83605 -r 468c71dac78a test-data/test2_map.tab
--- a/test-data/test2_map.tab Mon Nov 21 22:02:41 2022 +0000
+++ b/test-data/test2_map.tab Wed May 22 21:18:15 2024 +0000
@@ -1,4 +1,4 @@
-From To
-NM_001087 AAMP_HUMAN
-NM_130786 A1BG_HUMAN
-NM_130786 V9HWD8_HUMAN
+From Entry Entry Name Reviewed Protein names Gene Names Organism Length
+NM_001087 Q13685 AAMP_HUMAN reviewed Angio-associated migratory cell protein AAMP Homo sapiens (Human) 434
+NM_130786 P04217 A1BG_HUMAN reviewed Alpha-1B-glycoprotein (Alpha-1-B glycoprotein) A1BG Homo sapiens (Human) 495
+NM_130786 V9HWD8 V9HWD8_HUMAN unreviewed Epididymis secretory sperm binding protein Li 163pA HEL-S-163pA Homo sapiens (Human) 495
diff -r af5eccf83605 -r 468c71dac78a uniprot.py
--- a/uniprot.py Mon Nov 21 22:02:41 2022 +0000
+++ b/uniprot.py Wed May 22 21:18:15 2024 +0000
@@ -1,92 +1,266 @@
-#!/usr/bin/env python
-"""
-uniprot python interface
-to access the uniprot database
-
-Based on work from Jan Rudolph: https://github.com/jdrudolph/uniprot
-available services:
- map
- retrieve
-
-rewitten using inspiration form: https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
-"""
import argparse
+import json
+import re
import sys
+import time
+import zlib
+from urllib.parse import (
+ parse_qs,
+ urlencode,
+ urlparse,
+)
+from xml.etree import ElementTree
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
-
-
-DEFAULT_TIMEOUT = 5 # seconds
-URL = 'https://legacy.uniprot.org/'
-
-retry_strategy = Retry(
- total=5,
- backoff_factor=2,
- status_forcelist=[429, 500, 502, 503, 504],
- allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
+from requests.adapters import (
+ HTTPAdapter,
+ Retry,
)
-class TimeoutHTTPAdapter(HTTPAdapter):
- def __init__(self, *args, **kwargs):
- self.timeout = DEFAULT_TIMEOUT
- if "timeout" in kwargs:
- self.timeout = kwargs["timeout"]
- del kwargs["timeout"]
- super().__init__(*args, **kwargs)
+POLLING_INTERVAL = 3
+API_URL = "https://rest.uniprot.org"
+
+
+retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+session = requests.Session()
+session.mount("https://", HTTPAdapter(max_retries=retries))
+
+
+def check_response(response):
+ try:
+ response.raise_for_status()
+ except requests.HTTPError:
+ print(response.json())
+ raise
+
+
+def submit_id_mapping(from_db, to_db, ids):
+ print(f"{from_db} {to_db}")
+ request = requests.post(
+ f"{API_URL}/idmapping/run",
+ data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
+ )
+ check_response(request)
+ return request.json()["jobId"]
+
+
+def get_next_link(headers):
+ re_next_link = re.compile(r'<(.+)>; rel="next"')
+ if "Link" in headers:
+ match = re_next_link.match(headers["Link"])
+ if match:
+ return match.group(1)
+
+
+def check_id_mapping_results_ready(job_id):
+ while True:
+ request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+ check_response(request)
+ j = request.json()
+ if "jobStatus" in j:
+ if j["jobStatus"] == "RUNNING":
+ print(f"Retrying in {POLLING_INTERVAL}s")
+ time.sleep(POLLING_INTERVAL)
+ else:
+ raise Exception(j["jobStatus"])
+ else:
+ return bool(j["results"] or j["failedIds"])
+
+
+def get_batch(batch_response, file_format, compressed):
+ batch_url = get_next_link(batch_response.headers)
+ while batch_url:
+ batch_response = session.get(batch_url)
+ batch_response.raise_for_status()
+ yield decode_results(batch_response, file_format, compressed)
+ batch_url = get_next_link(batch_response.headers)
- def send(self, request, **kwargs):
- timeout = kwargs.get("timeout")
- if timeout is None:
- kwargs["timeout"] = self.timeout
- return super().send(request, **kwargs)
+
+def combine_batches(all_results, batch_results, file_format):
+ if file_format == "json":
+ for key in ("results", "failedIds"):
+ if key in batch_results and batch_results[key]:
+ all_results[key] += batch_results[key]
+ elif file_format == "tsv":
+ return all_results + batch_results[1:]
+ else:
+ return all_results + batch_results
+ return all_results
+
+
+def get_id_mapping_results_link(job_id):
+ url = f"{API_URL}/idmapping/details/{job_id}"
+ request = session.get(url)
+ check_response(request)
+ return request.json()["redirectURL"]
+
+
+def decode_results(response, file_format, compressed):
+ if compressed:
+ decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+ if file_format == "json":
+ j = json.loads(decompressed.decode("utf-8"))
+ return j
+ elif file_format == "tsv":
+ return [line for line in decompressed.decode("utf-8").split("\n") if line]
+ elif file_format == "xlsx":
+ return [decompressed]
+ elif file_format == "xml":
+ return [decompressed.decode("utf-8")]
+ else:
+ return decompressed.decode("utf-8")
+ elif file_format == "json":
+ return response.json()
+ elif file_format == "tsv":
+ return [line for line in response.text.split("\n") if line]
+ elif file_format == "xlsx":
+ return [response.content]
+ elif file_format == "xml":
+ return [response.text]
+ return response.text
+
+
+def get_xml_namespace(element):
+ m = re.match(r"\{(.*)\}", element.tag)
+ return m.groups()[0] if m else ""
+
+
+def merge_xml_results(xml_results):
+ merged_root = ElementTree.fromstring(xml_results[0])
+ for result in xml_results[1:]:
+ root = ElementTree.fromstring(result)
+ for child in root.findall("{http://uniprot.org/uniprot}entry"):
+ merged_root.insert(-1, child)
+ ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+ return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
-def _map(query, f, t, format='tab', chunk_size=100):
- """ _map is not meant for use with the python interface, use `map` instead
- """
- tool = 'uploadlists/'
- data = {'format': format, 'from': f, 'to': t}
+def print_progress_batches(batch_index, size, total):
+ n_fetched = min((batch_index + 1) * size, total)
+ print(f"Fetched: {n_fetched} / {total}")
+
- req = []
- for i in range(0, len(query), chunk_size):
- q = query[i:i + chunk_size]
- req.append(dict([("url", URL + tool),
- ('data', data),
- ("files", {'file': ' '.join(q)})]))
- return req
- response = requests.post(URL + tool, data=data)
- response.raise_for_status()
- page = response.text
- if "The service is temporarily unavailable" in page:
- exit("The UNIPROT service is temporarily unavailable. Please try again later.")
- return page
+def get_id_mapping_results_search(url):
+ parsed = urlparse(url)
+ query = parse_qs(parsed.query)
+ file_format = query["format"][0] if "format" in query else "json"
+ if "size" in query:
+ size = int(query["size"][0])
+ else:
+ size = 500
+ query["size"] = size
+ compressed = (
+ query["compressed"][0].lower() == "true" if "compressed" in query else False
+ )
+ parsed = parsed._replace(query=urlencode(query, doseq=True))
+ url = parsed.geturl()
+ request = session.get(url)
+ check_response(request)
+ results = decode_results(request, file_format, compressed)
+ total = int(request.headers["x-total-results"])
+ print_progress_batches(0, size, total)
+ for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+ results = combine_batches(results, batch, file_format)
+ print_progress_batches(i, size, total)
+ if file_format == "xml":
+ return merge_xml_results(results)
+ return results
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='retrieve uniprot mapping')
- subparsers = parser.add_subparsers(dest='tool')
+# print(results)
+# {'results': [{'from': 'P05067', 'to': 'CHEMBL2487'}], 'failedIds': ['P12345']}
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="retrieve uniprot mapping")
+ subparsers = parser.add_subparsers(dest="tool")
- mapping = subparsers.add_parser('map')
- mapping.add_argument('f', help='from')
- mapping.add_argument('t', help='to')
- mapping.add_argument('inp', nargs='?', type=argparse.FileType('r'),
- default=sys.stdin, help='input file (default: stdin)')
- mapping.add_argument('out', nargs='?', type=argparse.FileType('w'),
- default=sys.stdout, help='output file (default: stdout)')
- mapping.add_argument('--format', default='tab', help='output format')
+ mapping = subparsers.add_parser("map")
+ mapping.add_argument("f", help="from")
+ mapping.add_argument("t", help="to")
+ mapping.add_argument(
+ "inp",
+ nargs="?",
+ type=argparse.FileType("r"),
+ default=sys.stdin,
+ help="input file (default: stdin)",
+ )
+ mapping.add_argument(
+ "out",
+ nargs="?",
+ type=argparse.FileType("w"),
+ default=sys.stdout,
+ help="output file (default: stdout)",
+ )
+ mapping.add_argument("--format", default="tab", help="output format")
- retrieve = subparsers.add_parser('retrieve')
- retrieve.add_argument('inp', metavar='in', nargs='?', type=argparse.FileType('r'),
- default=sys.stdin, help='input file (default: stdin)')
- retrieve.add_argument('out', nargs='?', type=argparse.FileType('w'),
- default=sys.stdout, help='output file (default: stdout)')
- retrieve.add_argument('-f', '--format', help='specify output format', default='txt')
+ retrieve = subparsers.add_parser("retrieve")
+ retrieve.add_argument(
+ "inp",
+ metavar="in",
+ nargs="?",
+ type=argparse.FileType("r"),
+ default=sys.stdin,
+ help="input file (default: stdin)",
+ )
+ retrieve.add_argument(
+ "out",
+ nargs="?",
+ type=argparse.FileType("w"),
+ default=sys.stdout,
+ help="output file (default: stdout)",
+ )
+ retrieve.add_argument("-f", "--format", help="specify output format", default="txt")
+ mapping = subparsers.add_parser("menu")
args = parser.parse_args()
+    # code for auto-generating the from/to conditional
+ if args.tool == "menu":
+ from lxml import etree
+
+ request = session.get("https://rest.uniprot.org/configure/idmapping/fields")
+ check_response(request)
+ fields = request.json()
+
+ tos = dict()
+ from_cond = etree.Element("conditional", name="from_cond")
+ from_select = etree.SubElement(
+ from_cond, "param", name="from", type="select", label="Source database:"
+ )
+
+ rules = dict()
+ for rule in fields["rules"]:
+ rules[rule["ruleId"]] = rule["tos"]
+
+ for group in fields["groups"]:
+ group_name = group["groupName"]
+ group_name = group_name.replace("databases", "DBs")
+ for item in group["items"]:
+ if item["to"]:
+ tos[item["name"]] = f"{group_name} - {item['displayName']}"
+
+ for group in fields["groups"]:
+ group_name = group["groupName"]
+ group_name = group_name.replace("databases", "DBs")
+ for item in group["items"]:
+ if not item["from"]:
+ continue
+ option = etree.SubElement(from_select, "option", value=item["name"])
+ option.text = f"{group_name} - {item['displayName']}"
+ when = etree.SubElement(from_cond, "when", value=item["name"])
+
+ to_select = etree.SubElement(
+ when, "param", name="to", type="select", label="Target database:"
+ )
+ ruleId = item["ruleId"]
+ for to in rules[ruleId]:
+ option = etree.SubElement(to_select, "option", value=to)
+ option.text = tos[to]
+ etree.indent(from_cond, space=" ")
+ print(etree.tostring(from_cond, pretty_print=True, encoding="unicode"))
+ sys.exit(0)
+
# get the IDs from the file as sorted list
# (sorted is convenient for testing)
query = set()
@@ -94,15 +268,19 @@
query.add(line.strip())
query = sorted(query)
- if args.tool == 'map':
- pload = _map(query, args.f, args.t, chunk_size=100)
- elif args.tool == 'retrieve':
- pload = _map(query, 'ACC+ID', 'ACC', args.format, chunk_size=100)
+ if args.tool == "map":
+ job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query)
+ elif args.tool == "retrieve":
+ job_id = submit_id_mapping(
+ from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query
+ )
- adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
- http = requests.Session()
- http.mount("https://", adapter)
- for i, p in enumerate(pload):
- response = http.post(**p)
- args.out.write(response.text)
- http.close()
+ if check_id_mapping_results_ready(job_id):
+ link = get_id_mapping_results_link(job_id)
+ link = f"{link}?format={args.format}"
+ print(link)
+ results = get_id_mapping_results_search(link)
+
+ if not isinstance(results, str):
+ results = "\n".join(results)
+ args.out.write(f"{results}\n")
diff -r af5eccf83605 -r 468c71dac78a uniprot.xml
--- a/uniprot.xml Mon Nov 21 22:02:41 2022 +0000
+++ b/uniprot.xml Wed May 22 21:18:15 2024 +0000
@@ -1,197 +1,824 @@
-
+
ID mapping and retrieval
-
- macros.xml
-
requests
echo "UniProt ID mapping for Galaxy in version 0.1"
id_file.tabular &&
- '$__tool_directory__/uniprot.py'
-
+ python '$__tool_directory__/uniprot.py'
#if $tool.tool_choice == "retrieve":
retrieve -f $tool.format id_file.tabular ./output
#elif $tool.tool_choice == "map":
map
-
- #if $tool.from.category_FROM == "uniprot"
- '${tool.from.db_uniprot_FROM}'
- #elif $tool.from.category_FROM == "oseqdb"
- ${tool.from.db_oseqdb}
- #elif $tool.from.category_FROM == "3Dstrdb"
- ${tool.from.db_3Dstrdb}
- #elif $tool.from.category_FROM == "ppidb"
- ${tool.from.db_ppidb}
- #elif $tool.from.category_FROM == "chemistry"
- ${tool.from.db_chemistry}
- #elif $tool.from.category_FROM == "protfgdb"
- ${tool.from.db_protfgdb}
- #elif $tool.from.category_FROM == "polymorphismANDmutation"
- ${tool.from.db_polymorphismANDmutation}
- #elif $tool.from.category_FROM == "2DgelDB"
- ${tool.from.db_2DgelDB}
- #elif $tool.from.category_FROM == "ProtocolsMaterialsDB"
- ${tool.from.db_ProtocolsMaterialsDB}
- #elif $tool.from.category_FROM == "GenomeAnnotationDB"
- ${tool.from.db_GenomeAnnotationDB}
- #elif $tool.from.category_FROM == "OrganismSpecificGeneDB"
- ${tool.from.db_OrganismSpecificGeneDB}
- #elif $tool.from.category_FROM == "phylogenomic"
- ${tool.from.db_phylogenomic}
- #elif $tool.from.category_FROM == "EnzymePathwayDB"
- ${tool.from.db_EnzymePathwayDB}
- #elif $tool.from.category_FROM == "GeneExpression"
- ${tool.from.db_GeneExpression}
- #elif $tool.from.category_FROM == "other"
- ${tool.from.db_other}
- #end if
-
- #if $tool.to.category_TO == "uniprot"
- ${tool.to.db_uniprot_TO}
- #elif $tool.to.category_TO == "oseqdb"
- ${tool.to.db_oseqdb}
- #elif $tool.to.category_TO == "3Dstrdb"
- ${tool.to.db_3Dstrdb}
- #elif $tool.to.category_TO == "ppidb"
- ${tool.to.db_ppidb}
- #elif $tool.to.category_TO == "chemistry"
- ${tool.to.db_chemistry}
- #elif $tool.to.category_TO == "protfgdb"
- ${tool.to.db_protfgdb}
- #elif $tool.to.category_TO == "polymorphismANDmutation"
- ${tool.to.db_polymorphismANDmutation}
- #elif $tool.to.category_TO == "2DgelDB"
- ${tool.to.db_2DgelDB}
- #elif $tool.to.category_TO == "ProtocolsMaterialsDB"
- ${tool.to.db_ProtocolsMaterialsDB}
- #elif $tool.to.category_TO == "GenomeAnnotationDB"
- ${tool.to.db_GenomeAnnotationDB}
- #elif $tool.to.category_TO == "OrganismSpecificGeneDB"
- ${tool.to.db_OrganismSpecificGeneDB}
- #elif $tool.to.category_TO == "phylogenomic"
- ${tool.to.db_phylogenomic}
- #elif $tool.to.category_TO == "EnzymePathwayDB"
- ${tool.to.db_EnzymePathwayDB}
- #elif $tool.to.category_TO == "GeneExpression"
- ${tool.to.db_GeneExpression}
- #elif $tool.to.category_TO == "other"
- ${tool.to.db_other}
- #end if
-
+ --format tsv
+ "$from_cond.from"
+ "$from_cond.to"
id_file.tabular
./output
#end if
-
]]>
[~840 changed lines of parameter XML elided: the markup was stripped during extraction. Per the command section above, the hand-maintained from/to category conditionals are replaced by an auto-generated <conditional name="from_cond"> that pairs each source database with its valid targets (see the "menu" subcommand added in uniprot.py).]
@@ -206,58 +833,54 @@
+ label="${tool.name} on ${on_string}: fasta">
tool['tool_choice'] == 'retrieve'
tool['format'] == 'fasta'
+ label="${tool.name} on ${on_string}: gff">
tool['tool_choice'] == 'retrieve'
tool['format'] == 'gff'
+ label="${tool.name} on ${on_string}: txt">
tool['tool_choice'] == 'retrieve'
tool['format'] == 'txt'
+ label="${tool.name} on ${on_string}: mapping">
tool['tool_choice'] == 'map'
[remaining changed lines of the tool XML elided: markup stripped during extraction]
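Example invocations of the updated script (positional arguments follow the
argparse definitions above; "RefSeq_Nucleotide" is an assumed source-database
name from the REST API vocabulary, not taken from this patch):

    python uniprot.py map RefSeq_Nucleotide UniProtKB ids.txt out.tsv --format tsv
    python uniprot.py retrieve -f fasta ids.txt out.fasta
    python uniprot.py menu    # prints the generated from_cond <conditional> XML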