| Previous changeset: 9:468c71dac78a (2024-05-22) | Next changeset: 11:60f7e2a6b9c7 (2025-07-31) |
|
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7 |
|
modified:
test-data/test1_map.tab uniprot.py uniprot.xml |
|
removed:
test-data/test2_retrieve.gff |
| b |
| diff -r 468c71dac78a -r 95fb5712344f test-data/test1_map.tab --- a/test-data/test1_map.tab Wed May 22 21:18:15 2024 +0000 +++ b/test-data/test1_map.tab Tue Aug 06 14:49:45 2024 +0000 |
| b |
| @@ -2,8 +2,6 @@ A0A077Z587 TTRE_0000309301 A0A077ZFY8 TTRE_0000758701 A0A077ZHN8 TTRE_0000819801 -M5B8V9 CMN_01519 -M5BAG7 cydC O14639 ABLIM1 Q0P8A9 fdhC Q13685 AAMP |
| b |
| diff -r 468c71dac78a -r 95fb5712344f test-data/test2_retrieve.gff --- a/test-data/test2_retrieve.gff Wed May 22 21:18:15 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| b |
| b'@@ -1,107 +0,0 @@\n-##gff-version 3\n-##sequence-region M5BAG7 1 563\n-M5BAG7\tUniProtKB\tTransmembrane\t21\t43\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t49\t71\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t132\t153\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t159\t181\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t236\t259\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t274\t296\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tDomain\t20\t301\t.\t.\t.\tNote=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929\t\n-M5BAG7\tUniProtKB\tDomain\t345\t559\t.\t.\t.\tNote=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893\t\n-M5BAG7\tUniProtKB\tNucleotide binding\t379\t386\t.\t.\t.\tNote=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434\t\n-M5BAG7\tUniProtKB\tRegion\t317\t337\t.\t.\t.\tNote=Disordered;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:MobiDB-lite\t\n-##sequence-region A0A077ZHN8 1 
634\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t14\t36\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t56\t80\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t113\t132\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t290\t310\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tDomain\t312\t364\t.\t.\t.\tNote=HAMP;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50885\t\n-A0A077ZHN8\tUniProtKB\tDomain\t369\t598\t.\t.\t.\tNote=Methyl-accepting transducer;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50111\t\n-A0A077ZHN8\tUniProtKB\tCoiled coil\t170\t204\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-A0A077ZHN8\tUniProtKB\tCoiled coil\t569\t607\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-##sequence-region M5B8V9 1 582\n-M5B8V9\tUniProtKB\tTransmembrane\t20\t43\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t55\t77\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t134\t154\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t161\t180\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t236\t260\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tDomain\t20\t302\t.\t.\t.\tNote=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929\t\n-M5B8V9\tUniProtKB\tDomain\t340\t570\t.\t.\t.\tNote=ABC 
transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893\t\n-M5B8V9\tUniProtKB\tNucleotide binding\t372\t379\t.\t.\t.\tNote=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434\t\n-##sequence-region S0DS17 1 369\n-S0DS17\tUniProtKB\tChain\t1\t369\t.\t.\t.\tID=PRO_0000437163;Note=Cytochrome P450 monooxygenase apf8\t\n-S0DS17\tUniProtKB\tMetal binding\t303\t303\t.\t.\t.\tNote=Iron (heme axial ligand);Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:P04798\t\n-##sequence-region A0A077Z587 1 772\n-A0A077Z587\tUniProtKB\tTransmembrane\t593\t617\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077Z587\tUniProtKB\tTransmembrane\t637\t656\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077Z587\tUniProtKB\tTransmembrane\t668\t692\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077Z587\tUniProtKB\tTransmembrane\t'..b'CO:0000250|UniProtKB:Q8K4G5\t\n-O14639\tUniProtKB\tModified residue\t706\t706\t.\t.\t.\tNote=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569\t\n-O14639\tUniProtKB\tCross-link\t620\t620\t.\t.\t.\tNote=Glycyl lysine isopeptide (Lys-Gly) (interchain with G-Cter in SUMO2);Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:28112733;Dbxref=PMID:28112733\t\n-O14639\tUniProtKB\tAlternative sequence\t1\t316\t.\t.\t.\tID=VSP_012099;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005\t\n-O14639\tUniProtKB\tAlternative sequence\t1\t81\t.\t.\t.\tID=VSP_012100;Note=In isoform 2 and isoform 6. 
MPAFLGLKCLGKLCSSEKSKVTSSERTSARGSNRKRLIVEDRRVSGTSFTAHRRATITHLLYLCPKDYCPRGRVCNSVDPF->MLMTLEMTELTDPHHTMGDYK;Ontology_term=ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:7584044;Dbxref=PMID:14702039,PMID:7584044\t\n-O14639\tUniProtKB\tAlternative sequence\t347\t347\t.\t.\t.\tID=VSP_041185;Note=In isoform 5 and isoform 6. R->RLPNIRRSSSDFFYSKSLIRRTGRSPSLQ;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039\t\n-O14639\tUniProtKB\tAlternative sequence\t348\t373\t.\t.\t.\tID=VSP_012101;Note=In isoform 4. Missing;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:15489334;Dbxref=PMID:15489334\t\n-O14639\tUniProtKB\tAlternative sequence\t480\t514\t.\t.\t.\tID=VSP_012102;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005\t\n-O14639\tUniProtKB\tAlternative sequence\t531\t531\t.\t.\t.\tID=VSP_057209;Note=In isoform 6. 
H->HDA;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039\t\n-O14639\tUniProtKB\tNatural variant\t434\t434\t.\t.\t.\tID=VAR_050141;Note=P->T;Dbxref=dbSNP:rs11593544\t\n-O14639\tUniProtKB\tNatural variant\t637\t637\t.\t.\t.\tID=VAR_050142;Note=R->G;Dbxref=dbSNP:rs7091419\t\n-O14639\tUniProtKB\tSequence conflict\t499\t499\t.\t.\t.\tNote=R->L;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-O14639\tUniProtKB\tSequence conflict\t532\t532\t.\t.\t.\tNote=A->R;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-O14639\tUniProtKB\tSequence conflict\t563\t563\t.\t.\t.\tNote=K->E;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-O14639\tUniProtKB\tSequence conflict\t578\t578\t.\t.\t.\tNote=V->I;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-##sequence-region A0A077ZFY8 1 973\n-A0A077ZFY8\tUniProtKB\tDomain\t1\t89\t.\t.\t.\tNote=Mur_ligase;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF01225\t\n-A0A077ZFY8\tUniProtKB\tDomain\t96\t279\t.\t.\t.\tNote=Mur_ligase_M;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF08245\t\n-A0A077ZFY8\tUniProtKB\tDomain\t300\t349\t.\t.\t.\tNote=Mur_ligase_C;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF02875\t\n-A0A077ZFY8\tUniProtKB\tCoiled coil\t867\t887\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-A0A077ZFY8\tUniProtKB\tCoiled coil\t951\t971\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-##sequence-region Q13685 1 434\n-Q13685\tUniProtKB\tChain\t1\t434\t.\t.\t.\tID=PRO_0000050832;Note=Angio-associated migratory cell protein\t\n-Q13685\tUniProtKB\tRepeat\t89\t129\t.\t.\t.\tNote=WD 1\t\n-Q13685\tUniProtKB\tRepeat\t132\t171\t.\t.\t.\tNote=WD 2\t\n-Q13685\tUniProtKB\tRepeat\t173\t212\t.\t.\t.\tNote=WD 3\t\n-Q13685\tUniProtKB\tRepeat\t214\t254\t.\t.\t.\tNote=WD 4\t\n-Q13685\tUniProtKB\tRepeat\t258\t299\t.\t.\t.\tNote=WD 5\t\n-Q13685\tUniProtKB\tRepeat\t315\t354\t.\t.\t.\tNote=WD 6\t\n-Q13685\tUniProtKB\tRepeat\t356\t395\t.\t.\t.\tNote=WD 
7\t\n-Q13685\tUniProtKB\tRepeat\t398\t433\t.\t.\t.\tNote=WD 8\t\n-Q13685\tUniProtKB\tCompositional bias\t53\t59\t.\t.\t.\tNote=Poly-Glu\t\n-Q13685\tUniProtKB\tModified residue\t20\t20\t.\t.\t.\tNote=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569\t\n-Q13685\tUniProtKB\tNatural variant\t250\t250\t.\t.\t.\tID=VAR_037061;Note=I->V;Dbxref=dbSNP:rs2305835\t\n' |
| b |
| diff -r 468c71dac78a -r 95fb5712344f uniprot.py --- a/uniprot.py Wed May 22 21:18:15 2024 +0000 +++ b/uniprot.py Tue Aug 06 14:49:45 2024 +0000 |
| [ |
| @@ -4,6 +4,7 @@ import sys import time import zlib +from time import sleep from urllib.parse import ( parse_qs, urlencode, @@ -18,7 +19,8 @@ ) -POLLING_INTERVAL = 3 +BATCH_SIZE = 50000 # Limit at UniProt is 100k +POLLING_INTERVAL = 5 API_URL = "https://rest.uniprot.org" @@ -31,7 +33,6 @@ try: response.raise_for_status() except requests.HTTPError: - print(response.json()) raise @@ -59,7 +60,7 @@ check_response(request) j = request.json() if "jobStatus" in j: - if j["jobStatus"] == "RUNNING": + if j["jobStatus"] in ["NEW", "RUNNING"]: print(f"Retrying in {POLLING_INTERVAL}s") time.sleep(POLLING_INTERVAL) else: @@ -102,7 +103,7 @@ if file_format == "json": j = json.loads(decompressed.decode("utf-8")) return j - elif file_format == "tsv": + elif file_format in ["tsv", "gff"]: return [line for line in decompressed.decode("utf-8").split("\n") if line] elif file_format == "xlsx": return [decompressed] @@ -112,7 +113,7 @@ return decompressed.decode("utf-8") elif file_format == "json": return response.json() - elif file_format == "tsv": + elif file_format in ["tsv", "gff"]: return [line for line in response.text.split("\n") if line] elif file_format == "xlsx": return [response.content] @@ -141,7 +142,7 @@ print(f"Fetched: {n_fetched} / {total}") -def get_id_mapping_results_search(url): +def get_id_mapping_results_search(url, first): parsed = urlparse(url) query = parse_qs(parsed.query) file_format = query["format"][0] if "format" in query else "json" @@ -163,6 +164,8 @@ for i, batch in enumerate(get_batch(request, file_format, compressed), 1): results = combine_batches(results, batch, file_format) print_progress_batches(i, size, total) + if len(results) > 1 and file_format == "tsv" and not first: + results = results[1:] if file_format == "xml": return merge_xml_results(results) return results @@ -266,20 +269,27 @@ query = set() for line in args.inp: query.add(line.strip()) - query = sorted(query) + query = list(query) + results = [] + first = True # if False the header is 
removed + while len(query) > 0: + batch = query[:BATCH_SIZE] + query = query[BATCH_SIZE:] + print(f"processing {len(batch)} left {len(query)}") + if args.tool == "map": + job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch) + elif args.tool == "retrieve": + job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch) - if args.tool == "map": - job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query) - elif args.tool == "retrieve": - job_id = submit_id_mapping( - from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query - ) - - if check_id_mapping_results_ready(job_id): - link = get_id_mapping_results_link(job_id) - link = f"{link}?format={args.format}" - print(link) - results = get_id_mapping_results_search(link) + if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + link = f"{link}?format={args.format}" + print(link) + results.extend(get_id_mapping_results_search(link, first)) + first = False + print(f"got {len(results)} results so far") + if len(query): + sleep(5) if not isinstance(results, str): results = "\n".join(results) |
| b |
| diff -r 468c71dac78a -r 95fb5712344f uniprot.xml --- a/uniprot.xml Wed May 22 21:18:15 2024 +0000 +++ b/uniprot.xml Tue Aug 06 14:49:45 2024 +0000 |
| [ |
| @@ -1,4 +1,4 @@ -<tool id="uniprot" name="UniProt" version="0.5" profile="23.1"> +<tool id="uniprot" name="UniProt" version="0.6" profile="23.1"> <description>ID mapping and retrieval</description> <requirements> <requirement type="package" version="2.25.1">requests</requirement> @@ -865,7 +865,12 @@ <param name="id_column" value="c1"/> <param name="format" value="gff"/> <param name="tool_choice" value="retrieve"/> - <output name="outfile_retrieve_gff" file="test2_retrieve.gff" ftype="gff" compare="sim_size" /> + <output name="outfile_retrieve_gff" ftype="gff"> + <assert_contents> + <has_n_lines min="80"/> + <has_text text="UniProtKB"/> + </assert_contents> + </output> </test> <test expect_num_outputs="1"> <param name="infile" value="id_uniprot.tab" ftype="tabular"/> @@ -873,7 +878,7 @@ <param name="tool_choice" value="map"/> <param name="from" value="UniProtKB_AC-ID"/> <param name="to" value="Gene_Name"/> - <output name="outfile_map" file="test1_map.tab" ftype="tabular"/> + <output name="outfile_map" file="test1_map.tab" ftype="tabular" sort="true"/> </test> <test expect_num_outputs="1"> <param name="infile" value="id_map_refseq.txt" ftype="tabular"/> @@ -881,7 +886,7 @@ <param name="tool_choice" value="map"/> <param name="from" value="RefSeq_Nucleotide"/> <param name="to" value="UniProtKB"/> - <output name="outfile_map" file="test2_map.tab" ftype="tabular"/> + <output name="outfile_map" file="test2_map.tab" ftype="tabular" sort="true"/> </test> </tests> <help><![CDATA[ |