# HG changeset patch # User bgruening # Date 1722955785 0 # Node ID 95fb5712344f316b7a95efddc97b802094609ee0 # Parent 468c71dac78a9dbc65882c58d9af75f55ae1d951 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7 diff -r 468c71dac78a -r 95fb5712344f test-data/test1_map.tab --- a/test-data/test1_map.tab Wed May 22 21:18:15 2024 +0000 +++ b/test-data/test1_map.tab Tue Aug 06 14:49:45 2024 +0000 @@ -2,8 +2,6 @@ A0A077Z587 TTRE_0000309301 A0A077ZFY8 TTRE_0000758701 A0A077ZHN8 TTRE_0000819801 -M5B8V9 CMN_01519 -M5BAG7 cydC O14639 ABLIM1 Q0P8A9 fdhC Q13685 AAMP diff -r 468c71dac78a -r 95fb5712344f test-data/test2_retrieve.gff --- a/test-data/test2_retrieve.gff Wed May 22 21:18:15 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -##gff-version 3 -##sequence-region M5BAG7 1 563 -M5BAG7 UniProtKB Transmembrane 21 43 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5BAG7 UniProtKB Transmembrane 49 71 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5BAG7 UniProtKB Transmembrane 132 153 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5BAG7 UniProtKB Transmembrane 159 181 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5BAG7 UniProtKB Transmembrane 236 259 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5BAG7 UniProtKB Transmembrane 274 296 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5BAG7 UniProtKB Domain 20 301 . . . Note=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929 -M5BAG7 UniProtKB Domain 345 559 . . . Note=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893 -M5BAG7 UniProtKB Nucleotide binding 379 386 . . . Note=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434 -M5BAG7 UniProtKB Region 317 337 . . . Note=Disordered;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:MobiDB-lite -##sequence-region A0A077ZHN8 1 634 -A0A077ZHN8 UniProtKB Transmembrane 14 36 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077ZHN8 UniProtKB Transmembrane 56 80 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077ZHN8 UniProtKB Transmembrane 113 132 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077ZHN8 UniProtKB Transmembrane 290 310 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077ZHN8 UniProtKB Domain 312 364 . . . Note=HAMP;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50885 -A0A077ZHN8 UniProtKB Domain 369 598 . . . Note=Methyl-accepting transducer;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50111 -A0A077ZHN8 UniProtKB Coiled coil 170 204 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils -A0A077ZHN8 UniProtKB Coiled coil 569 607 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils -##sequence-region M5B8V9 1 582 -M5B8V9 UniProtKB Transmembrane 20 43 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5B8V9 UniProtKB Transmembrane 55 77 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5B8V9 UniProtKB Transmembrane 134 154 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5B8V9 UniProtKB Transmembrane 161 180 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5B8V9 UniProtKB Transmembrane 236 260 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -M5B8V9 UniProtKB Domain 20 302 . . . Note=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929 -M5B8V9 UniProtKB Domain 340 570 . . . Note=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893 -M5B8V9 UniProtKB Nucleotide binding 372 379 . . . Note=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434 -##sequence-region S0DS17 1 369 -S0DS17 UniProtKB Chain 1 369 . . . ID=PRO_0000437163;Note=Cytochrome P450 monooxygenase apf8 -S0DS17 UniProtKB Metal binding 303 303 . . . Note=Iron (heme axial ligand);Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:P04798 -##sequence-region A0A077Z587 1 772 -A0A077Z587 UniProtKB Transmembrane 593 617 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077Z587 UniProtKB Transmembrane 637 656 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077Z587 UniProtKB Transmembrane 668 692 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077Z587 UniProtKB Transmembrane 704 727 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077Z587 UniProtKB Transmembrane 733 755 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -A0A077Z587 UniProtKB Domain 20 94 . . . Note=PDZ;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50106 -A0A077Z587 UniProtKB Domain 552 761 . . . Note=Cytochrome b561;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50939 -##sequence-region Q0P8A9 1 310 -Q0P8A9 UniProtKB Transmembrane 55 78 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -Q0P8A9 UniProtKB Transmembrane 99 124 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -Q0P8A9 UniProtKB Transmembrane 136 156 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -Q0P8A9 UniProtKB Transmembrane 195 216 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -Q0P8A9 UniProtKB Transmembrane 244 264 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius -Q0P8A9 UniProtKB Domain 93 274 . . . Note=Ni_hydr_CYTB;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF01292 -##sequence-region O14639 1 778 -O14639 UniProtKB Chain 1 778 . . . ID=PRO_0000075697;Note=Actin-binding LIM protein 1 -O14639 UniProtKB Domain 97 156 . . . Note=LIM zinc-binding 1;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125 -O14639 UniProtKB Domain 156 216 . . . Note=LIM zinc-binding 2;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125 -O14639 UniProtKB Domain 224 283 . . . Note=LIM zinc-binding 3;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125 -O14639 UniProtKB Domain 283 343 . . . Note=LIM zinc-binding 4;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125 -O14639 UniProtKB Domain 710 778 . . . Note=HP;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00595 -O14639 UniProtKB Coiled coil 590 614 . . . Ontology_term=ECO:0000255;evidence=ECO:0000255 -O14639 UniProtKB Modified residue 216 216 . . . Note=Phosphoserine;Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:Q8K4G5 -O14639 UniProtKB Modified residue 367 367 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:19690332;Dbxref=PMID:19690332 -O14639 UniProtKB Modified residue 373 373 . . . Note=Phosphotyrosine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:19690332;Dbxref=PMID:19690332 -O14639 UniProtKB Modified residue 396 396 . . . Note=Phosphotyrosine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:15592455;Dbxref=PMID:15592455 -O14639 UniProtKB Modified residue 422 422 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:23186163;Dbxref=PMID:23186163 -O14639 UniProtKB Modified residue 426 426 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:19690332,ECO:0000244|PubMed:24275569;Dbxref=PMID:19690332,PMID:24275569 -O14639 UniProtKB Modified residue 431 431 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:19690332,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:19690332,PMID:23186163 -O14639 UniProtKB Modified residue 433 433 . . . Note=Phosphothreonine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:19690332;Dbxref=PMID:19690332 -O14639 UniProtKB Modified residue 435 435 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244,ECO:0000244,ECO:0000244,ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:19690332,ECO:0000244|PubMed:20068231,ECO:0000244|PubMed:21406692,ECO:0000244|PubMed:23186163,ECO:0000244|PubMed:24275569;Dbxref=PMID:18669648,PMID:19690332,PMID:20068231,PMID:21406692,PMID:23186163,PMID:24275569 -O14639 UniProtKB Modified residue 439 439 . . . Note=Phosphotyrosine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:15144186;Dbxref=PMID:15144186 -O14639 UniProtKB Modified residue 452 452 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:23186163;Dbxref=PMID:23186163 -O14639 UniProtKB Modified residue 455 455 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163,ECO:0000244|PubMed:24275569;Dbxref=PMID:18669648,PMID:23186163,PMID:24275569 -O14639 UniProtKB Modified residue 458 458 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:23186163 -O14639 UniProtKB Modified residue 498 498 . . . Note=Phosphoserine;Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:Q8K4G5 -O14639 UniProtKB Modified residue 587 587 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:23186163 -O14639 UniProtKB Modified residue 640 640 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:20068231;Dbxref=PMID:18669648,PMID:20068231 -O14639 UniProtKB Modified residue 655 655 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:23186163 -O14639 UniProtKB Modified residue 677 677 . . . Note=Phosphoserine;Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:Q8K4G5 -O14639 UniProtKB Modified residue 706 706 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569 -O14639 UniProtKB Cross-link 620 620 . . . Note=Glycyl lysine isopeptide (Lys-Gly) (interchain with G-Cter in SUMO2);Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:28112733;Dbxref=PMID:28112733 -O14639 UniProtKB Alternative sequence 1 316 . . . ID=VSP_012099;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005 -O14639 UniProtKB Alternative sequence 1 81 . . . ID=VSP_012100;Note=In isoform 2 and isoform 6. MPAFLGLKCLGKLCSSEKSKVTSSERTSARGSNRKRLIVEDRRVSGTSFTAHRRATITHLLYLCPKDYCPRGRVCNSVDPF->MLMTLEMTELTDPHHTMGDYK;Ontology_term=ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:7584044;Dbxref=PMID:14702039,PMID:7584044 -O14639 UniProtKB Alternative sequence 347 347 . . . ID=VSP_041185;Note=In isoform 5 and isoform 6. R->RLPNIRRSSSDFFYSKSLIRRTGRSPSLQ;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039 -O14639 UniProtKB Alternative sequence 348 373 . . . ID=VSP_012101;Note=In isoform 4. Missing;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:15489334;Dbxref=PMID:15489334 -O14639 UniProtKB Alternative sequence 480 514 . . . ID=VSP_012102;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005 -O14639 UniProtKB Alternative sequence 531 531 . . . ID=VSP_057209;Note=In isoform 6. H->HDA;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039 -O14639 UniProtKB Natural variant 434 434 . . . ID=VAR_050141;Note=P->T;Dbxref=dbSNP:rs11593544 -O14639 UniProtKB Natural variant 637 637 . . . ID=VAR_050142;Note=R->G;Dbxref=dbSNP:rs7091419 -O14639 UniProtKB Sequence conflict 499 499 . . . Note=R->L;Ontology_term=ECO:0000305;evidence=ECO:0000305 -O14639 UniProtKB Sequence conflict 532 532 . . . Note=A->R;Ontology_term=ECO:0000305;evidence=ECO:0000305 -O14639 UniProtKB Sequence conflict 563 563 . . . Note=K->E;Ontology_term=ECO:0000305;evidence=ECO:0000305 -O14639 UniProtKB Sequence conflict 578 578 . . . Note=V->I;Ontology_term=ECO:0000305;evidence=ECO:0000305 -##sequence-region A0A077ZFY8 1 973 -A0A077ZFY8 UniProtKB Domain 1 89 . . . Note=Mur_ligase;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF01225 -A0A077ZFY8 UniProtKB Domain 96 279 . . . Note=Mur_ligase_M;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF08245 -A0A077ZFY8 UniProtKB Domain 300 349 . . . Note=Mur_ligase_C;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF02875 -A0A077ZFY8 UniProtKB Coiled coil 867 887 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils -A0A077ZFY8 UniProtKB Coiled coil 951 971 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils -##sequence-region Q13685 1 434 -Q13685 UniProtKB Chain 1 434 . . . ID=PRO_0000050832;Note=Angio-associated migratory cell protein -Q13685 UniProtKB Repeat 89 129 . . . Note=WD 1 -Q13685 UniProtKB Repeat 132 171 . . . Note=WD 2 -Q13685 UniProtKB Repeat 173 212 . . . Note=WD 3 -Q13685 UniProtKB Repeat 214 254 . . . Note=WD 4 -Q13685 UniProtKB Repeat 258 299 . . . Note=WD 5 -Q13685 UniProtKB Repeat 315 354 . . . Note=WD 6 -Q13685 UniProtKB Repeat 356 395 . . . Note=WD 7 -Q13685 UniProtKB Repeat 398 433 . . . Note=WD 8 -Q13685 UniProtKB Compositional bias 53 59 . . . Note=Poly-Glu -Q13685 UniProtKB Modified residue 20 20 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569 -Q13685 UniProtKB Natural variant 250 250 . . . ID=VAR_037061;Note=I->V;Dbxref=dbSNP:rs2305835 diff -r 468c71dac78a -r 95fb5712344f uniprot.py --- a/uniprot.py Wed May 22 21:18:15 2024 +0000 +++ b/uniprot.py Tue Aug 06 14:49:45 2024 +0000 @@ -4,6 +4,7 @@ import sys import time import zlib +from time import sleep from urllib.parse import ( parse_qs, urlencode, @@ -18,7 +19,8 @@ ) -POLLING_INTERVAL = 3 +BATCH_SIZE = 50000 # Limit at UniProt is 100k +POLLING_INTERVAL = 5 API_URL = "https://rest.uniprot.org" @@ -31,7 +33,6 @@ try: response.raise_for_status() except requests.HTTPError: - print(response.json()) raise @@ -59,7 +60,7 @@ check_response(request) j = request.json() if "jobStatus" in j: - if j["jobStatus"] == "RUNNING": + if j["jobStatus"] in ["NEW", "RUNNING"]: print(f"Retrying in {POLLING_INTERVAL}s") time.sleep(POLLING_INTERVAL) else: @@ -102,7 +103,7 @@ if file_format == "json": j = json.loads(decompressed.decode("utf-8")) return j - elif file_format == "tsv": + elif file_format in ["tsv", "gff"]: return [line for line in decompressed.decode("utf-8").split("\n") if line] elif file_format == "xlsx": return [decompressed] @@ -112,7 +113,7 @@ return decompressed.decode("utf-8") elif file_format == "json": return response.json() - elif file_format == "tsv": + elif file_format in ["tsv", "gff"]: return [line for line in response.text.split("\n") if line] elif file_format == "xlsx": return [response.content] @@ -141,7 +142,7 @@ print(f"Fetched: {n_fetched} / {total}") -def get_id_mapping_results_search(url): +def get_id_mapping_results_search(url, first): parsed = urlparse(url) query = parse_qs(parsed.query) file_format = query["format"][0] if "format" in query else "json" @@ -163,6 +164,8 @@ for i, batch in enumerate(get_batch(request, file_format, compressed), 1): results = combine_batches(results, batch, file_format) print_progress_batches(i, size, total) + if len(results) > 1 and file_format == "tsv" and not first: + results = results[1:] if file_format == "xml": return merge_xml_results(results) return results @@ -266,20 +269,27 @@ query = set() for line in args.inp: query.add(line.strip()) - query = sorted(query) + query = list(query) + results = [] + first = True # if False the header is removed + while len(query) > 0: + batch = query[:BATCH_SIZE] + query = query[BATCH_SIZE:] + print(f"processing {len(batch)} left {len(query)}") + if args.tool == "map": + job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch) + elif args.tool == "retrieve": + job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch) - if args.tool == "map": - job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query) - elif args.tool == "retrieve": - job_id = submit_id_mapping( - from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query - ) - - if check_id_mapping_results_ready(job_id): - link = get_id_mapping_results_link(job_id) - link = f"{link}?format={args.format}" - print(link) - results = get_id_mapping_results_search(link) + if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + link = f"{link}?format={args.format}" + print(link) + results.extend(get_id_mapping_results_search(link, first)) + first = False + print(f"got {len(results)} results so far") + if len(query): + sleep(5) if not isinstance(results, str): results = "\n".join(results) diff -r 468c71dac78a -r 95fb5712344f uniprot.xml --- a/uniprot.xml Wed May 22 21:18:15 2024 +0000 +++ b/uniprot.xml Tue Aug 06 14:49:45 2024 +0000 @@ -1,4 +1,4 @@ - + ID mapping and retrieval requests @@ -865,7 +865,12 @@ - + + + + + + @@ -873,7 +878,7 @@ - + @@ -881,7 +886,7 @@ - +