Repository 'uniprot_rest_interface'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/uniprot_rest_interface

Changeset 10:95fb5712344f (2024-08-06)
Previous changeset 9:468c71dac78a (2024-05-22) Next changeset 11:60f7e2a6b9c7 (2025-07-31)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7
modified:
test-data/test1_map.tab
uniprot.py
uniprot.xml
removed:
test-data/test2_retrieve.gff
b
diff -r 468c71dac78a -r 95fb5712344f test-data/test1_map.tab
--- a/test-data/test1_map.tab Wed May 22 21:18:15 2024 +0000
+++ b/test-data/test1_map.tab Tue Aug 06 14:49:45 2024 +0000
b
@@ -2,8 +2,6 @@
 A0A077Z587 TTRE_0000309301
 A0A077ZFY8 TTRE_0000758701
 A0A077ZHN8 TTRE_0000819801
-M5B8V9 CMN_01519
-M5BAG7 cydC
 O14639 ABLIM1
 Q0P8A9 fdhC
 Q13685 AAMP
b
diff -r 468c71dac78a -r 95fb5712344f test-data/test2_retrieve.gff
--- a/test-data/test2_retrieve.gff Wed May 22 21:18:15 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,107 +0,0 @@\n-##gff-version 3\n-##sequence-region M5BAG7 1 563\n-M5BAG7\tUniProtKB\tTransmembrane\t21\t43\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t49\t71\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t132\t153\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t159\t181\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t236\t259\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tTransmembrane\t274\t296\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5BAG7\tUniProtKB\tDomain\t20\t301\t.\t.\t.\tNote=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929\t\n-M5BAG7\tUniProtKB\tDomain\t345\t559\t.\t.\t.\tNote=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893\t\n-M5BAG7\tUniProtKB\tNucleotide binding\t379\t386\t.\t.\t.\tNote=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434\t\n-M5BAG7\tUniProtKB\tRegion\t317\t337\t.\t.\t.\tNote=Disordered;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:MobiDB-lite\t\n-##sequence-region A0A077ZHN8 1 634\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t14\t36\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t56\t80\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t113\t132\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tTransmembrane\t290\t310\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077ZHN8\tUniProtKB\tDomain\t312\t364\t.\t.\t.\tNote=HAMP;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50885\t\n-A0A077ZHN8\tUniProtKB\tDomain\t369\t598\t.\t.\t.\tNote=Methyl-accepting transducer;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50111\t\n-A0A077ZHN8\tUniProtKB\tCoiled coil\t170\t204\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-A0A077ZHN8\tUniProtKB\tCoiled coil\t569\t607\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-##sequence-region M5B8V9 1 582\n-M5B8V9\tUniProtKB\tTransmembrane\t20\t43\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t55\t77\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t134\t154\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t161\t180\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tTransmembrane\t236\t260\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-M5B8V9\tUniProtKB\tDomain\t20\t302\t.\t.\t.\tNote=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929\t\n-M5B8V9\tUniProtKB\tDomain\t340\t570\t.\t.\t.\tNote=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893\t\n-M5B8V9\tUniProtKB\tNucleotide binding\t372\t379\t.\t.\t.\tNote=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434\t\n-##sequence-region S0DS17 1 369\n-S0DS17\tUniProtKB\tChain\t1\t369\t.\t.\t.\tID=PRO_0000437163;Note=Cytochrome P450 monooxygenase apf8\t\n-S0DS17\tUniProtKB\tMetal binding\t303\t303\t.\t.\t.\tNote=Iron (heme axial ligand);Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:P04798\t\n-##sequence-region A0A077Z587 1 772\n-A0A077Z587\tUniProtKB\tTransmembrane\t593\t617\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077Z587\tUniProtKB\tTransmembrane\t637\t656\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077Z587\tUniProtKB\tTransmembrane\t668\t692\t.\t.\t.\tNote=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius\t\n-A0A077Z587\tUniProtKB\tTransmembrane\t'..b'CO:0000250|UniProtKB:Q8K4G5\t\n-O14639\tUniProtKB\tModified residue\t706\t706\t.\t.\t.\tNote=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569\t\n-O14639\tUniProtKB\tCross-link\t620\t620\t.\t.\t.\tNote=Glycyl lysine isopeptide (Lys-Gly) (interchain with G-Cter in SUMO2);Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:28112733;Dbxref=PMID:28112733\t\n-O14639\tUniProtKB\tAlternative sequence\t1\t316\t.\t.\t.\tID=VSP_012099;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005\t\n-O14639\tUniProtKB\tAlternative sequence\t1\t81\t.\t.\t.\tID=VSP_012100;Note=In isoform 2 and isoform 6. MPAFLGLKCLGKLCSSEKSKVTSSERTSARGSNRKRLIVEDRRVSGTSFTAHRRATITHLLYLCPKDYCPRGRVCNSVDPF->MLMTLEMTELTDPHHTMGDYK;Ontology_term=ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:7584044;Dbxref=PMID:14702039,PMID:7584044\t\n-O14639\tUniProtKB\tAlternative sequence\t347\t347\t.\t.\t.\tID=VSP_041185;Note=In isoform 5 and isoform 6. R->RLPNIRRSSSDFFYSKSLIRRTGRSPSLQ;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039\t\n-O14639\tUniProtKB\tAlternative sequence\t348\t373\t.\t.\t.\tID=VSP_012101;Note=In isoform 4. Missing;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:15489334;Dbxref=PMID:15489334\t\n-O14639\tUniProtKB\tAlternative sequence\t480\t514\t.\t.\t.\tID=VSP_012102;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005\t\n-O14639\tUniProtKB\tAlternative sequence\t531\t531\t.\t.\t.\tID=VSP_057209;Note=In isoform 6. H->HDA;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039\t\n-O14639\tUniProtKB\tNatural variant\t434\t434\t.\t.\t.\tID=VAR_050141;Note=P->T;Dbxref=dbSNP:rs11593544\t\n-O14639\tUniProtKB\tNatural variant\t637\t637\t.\t.\t.\tID=VAR_050142;Note=R->G;Dbxref=dbSNP:rs7091419\t\n-O14639\tUniProtKB\tSequence conflict\t499\t499\t.\t.\t.\tNote=R->L;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-O14639\tUniProtKB\tSequence conflict\t532\t532\t.\t.\t.\tNote=A->R;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-O14639\tUniProtKB\tSequence conflict\t563\t563\t.\t.\t.\tNote=K->E;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-O14639\tUniProtKB\tSequence conflict\t578\t578\t.\t.\t.\tNote=V->I;Ontology_term=ECO:0000305;evidence=ECO:0000305\t\n-##sequence-region A0A077ZFY8 1 973\n-A0A077ZFY8\tUniProtKB\tDomain\t1\t89\t.\t.\t.\tNote=Mur_ligase;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF01225\t\n-A0A077ZFY8\tUniProtKB\tDomain\t96\t279\t.\t.\t.\tNote=Mur_ligase_M;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF08245\t\n-A0A077ZFY8\tUniProtKB\tDomain\t300\t349\t.\t.\t.\tNote=Mur_ligase_C;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF02875\t\n-A0A077ZFY8\tUniProtKB\tCoiled coil\t867\t887\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-A0A077ZFY8\tUniProtKB\tCoiled coil\t951\t971\t.\t.\t.\tOntology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils\t\n-##sequence-region Q13685 1 434\n-Q13685\tUniProtKB\tChain\t1\t434\t.\t.\t.\tID=PRO_0000050832;Note=Angio-associated migratory cell protein\t\n-Q13685\tUniProtKB\tRepeat\t89\t129\t.\t.\t.\tNote=WD 1\t\n-Q13685\tUniProtKB\tRepeat\t132\t171\t.\t.\t.\tNote=WD 2\t\n-Q13685\tUniProtKB\tRepeat\t173\t212\t.\t.\t.\tNote=WD 3\t\n-Q13685\tUniProtKB\tRepeat\t214\t254\t.\t.\t.\tNote=WD 4\t\n-Q13685\tUniProtKB\tRepeat\t258\t299\t.\t.\t.\tNote=WD 5\t\n-Q13685\tUniProtKB\tRepeat\t315\t354\t.\t.\t.\tNote=WD 6\t\n-Q13685\tUniProtKB\tRepeat\t356\t395\t.\t.\t.\tNote=WD 7\t\n-Q13685\tUniProtKB\tRepeat\t398\t433\t.\t.\t.\tNote=WD 8\t\n-Q13685\tUniProtKB\tCompositional bias\t53\t59\t.\t.\t.\tNote=Poly-Glu\t\n-Q13685\tUniProtKB\tModified residue\t20\t20\t.\t.\t.\tNote=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569\t\n-Q13685\tUniProtKB\tNatural variant\t250\t250\t.\t.\t.\tID=VAR_037061;Note=I->V;Dbxref=dbSNP:rs2305835\t\n'
b
diff -r 468c71dac78a -r 95fb5712344f uniprot.py
--- a/uniprot.py Wed May 22 21:18:15 2024 +0000
+++ b/uniprot.py Tue Aug 06 14:49:45 2024 +0000
[
@@ -4,6 +4,7 @@
 import sys
 import time
 import zlib
+from time import sleep
 from urllib.parse import (
     parse_qs,
     urlencode,
@@ -18,7 +19,8 @@
 )
 
 
-POLLING_INTERVAL = 3
+BATCH_SIZE = 50000  # Limit at UniProt is 100k
+POLLING_INTERVAL = 5
 API_URL = "https://rest.uniprot.org"
 
 
@@ -31,7 +33,6 @@
     try:
         response.raise_for_status()
     except requests.HTTPError:
-        print(response.json())
         raise
 
 
@@ -59,7 +60,7 @@
         check_response(request)
         j = request.json()
         if "jobStatus" in j:
-            if j["jobStatus"] == "RUNNING":
+            if j["jobStatus"] in ["NEW", "RUNNING"]:
                 print(f"Retrying in {POLLING_INTERVAL}s")
                 time.sleep(POLLING_INTERVAL)
             else:
@@ -102,7 +103,7 @@
         if file_format == "json":
             j = json.loads(decompressed.decode("utf-8"))
             return j
-        elif file_format == "tsv":
+        elif file_format in ["tsv", "gff"]:
             return [line for line in decompressed.decode("utf-8").split("\n") if line]
         elif file_format == "xlsx":
             return [decompressed]
@@ -112,7 +113,7 @@
             return decompressed.decode("utf-8")
     elif file_format == "json":
         return response.json()
-    elif file_format == "tsv":
+    elif file_format in ["tsv", "gff"]:
         return [line for line in response.text.split("\n") if line]
     elif file_format == "xlsx":
         return [response.content]
@@ -141,7 +142,7 @@
     print(f"Fetched: {n_fetched} / {total}")
 
 
-def get_id_mapping_results_search(url):
+def get_id_mapping_results_search(url, first):
     parsed = urlparse(url)
     query = parse_qs(parsed.query)
     file_format = query["format"][0] if "format" in query else "json"
@@ -163,6 +164,8 @@
     for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
         results = combine_batches(results, batch, file_format)
         print_progress_batches(i, size, total)
+    if len(results) > 1 and file_format == "tsv" and not first:
+        results = results[1:]
     if file_format == "xml":
         return merge_xml_results(results)
     return results
@@ -266,20 +269,27 @@
     query = set()
     for line in args.inp:
         query.add(line.strip())
-    query = sorted(query)
+    query = list(query)
+    results = []
+    first = True  # if False the header is removed
+    while len(query) > 0:
+        batch = query[:BATCH_SIZE]
+        query = query[BATCH_SIZE:]
+        print(f"processing {len(batch)} left {len(query)}")
+        if args.tool == "map":
+            job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch)
+        elif args.tool == "retrieve":
+            job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch)
 
-    if args.tool == "map":
-        job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query)
-    elif args.tool == "retrieve":
-        job_id = submit_id_mapping(
-            from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query
-        )
-
-    if check_id_mapping_results_ready(job_id):
-        link = get_id_mapping_results_link(job_id)
-        link = f"{link}?format={args.format}"
-        print(link)
-        results = get_id_mapping_results_search(link)
+        if check_id_mapping_results_ready(job_id):
+            link = get_id_mapping_results_link(job_id)
+            link = f"{link}?format={args.format}"
+            print(link)
+            results.extend(get_id_mapping_results_search(link, first))
+            first = False
+        print(f"got {len(results)} results so far")
+        if len(query):
+            sleep(5)
 
     if not isinstance(results, str):
         results = "\n".join(results)
b
diff -r 468c71dac78a -r 95fb5712344f uniprot.xml
--- a/uniprot.xml Wed May 22 21:18:15 2024 +0000
+++ b/uniprot.xml Tue Aug 06 14:49:45 2024 +0000
[
@@ -1,4 +1,4 @@
-<tool id="uniprot" name="UniProt" version="0.5" profile="23.1">
+<tool id="uniprot" name="UniProt" version="0.6" profile="23.1">
     <description>ID mapping and retrieval</description>
     <requirements>
         <requirement type="package" version="2.25.1">requests</requirement>
@@ -865,7 +865,12 @@
             <param name="id_column" value="c1"/>
             <param name="format" value="gff"/>
             <param name="tool_choice" value="retrieve"/>
-            <output name="outfile_retrieve_gff" file="test2_retrieve.gff" ftype="gff" compare="sim_size" />
+            <output name="outfile_retrieve_gff" ftype="gff">
+                <assert_contents>
+                    <has_n_lines min="80"/>
+                    <has_text text="UniProtKB"/>
+                </assert_contents>
+            </output>
         </test>
         <test expect_num_outputs="1">
             <param name="infile" value="id_uniprot.tab" ftype="tabular"/>
@@ -873,7 +878,7 @@
             <param name="tool_choice" value="map"/>
             <param name="from" value="UniProtKB_AC-ID"/>
             <param name="to" value="Gene_Name"/>
-            <output name="outfile_map" file="test1_map.tab" ftype="tabular"/>
+            <output name="outfile_map" file="test1_map.tab" ftype="tabular" sort="true"/>
         </test>
         <test expect_num_outputs="1">
             <param name="infile" value="id_map_refseq.txt" ftype="tabular"/>
@@ -881,7 +886,7 @@
             <param name="tool_choice" value="map"/>
             <param name="from" value="RefSeq_Nucleotide"/>
             <param name="to" value="UniProtKB"/>
-            <output name="outfile_map" file="test2_map.tab" ftype="tabular"/>
+            <output name="outfile_map" file="test2_map.tab" ftype="tabular" sort="true"/>
         </test>
     </tests>
     <help><![CDATA[