diff impc_tool.py @ 0:0a9cf7f52b9c draft default tip

planemo upload commit 213f6eeb03f96bb13d0ace6e0c87e2562d37f728-dirty
author infr
date Wed, 22 Jun 2022 13:36:44 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/impc_tool.py	Wed Jun 22 13:36:44 2022 +0000
@@ -0,0 +1,636 @@
+import sys
+import requests
+import pandas as pd
+import urllib.request as url
+
+impc_api_url = "https://www.gentar.org/impc-dev-api/"
+impc_api_search_url = f"{impc_api_url}/genes"
+impc_api_gene_bundle_url = f"{impc_api_url}/geneBundles"
+
+
+def stop_err(msg):
+    sys.exit(msg)
+
+
+def main():
+    inp = str(sys.argv[1])
+    query = str(sys.argv[3])
+
+    try:
+        if query == '7':
+            full_gene_table()
+            sys.exit(0)
+
+        if str(sys.argv[5])=="txt":
+            s = str(sys.argv[6])
+            if s == "t":
+                sep = "\t"
+            elif s == "s":
+                sep = " "
+            elif s in ",;.":
+                sep = s
+            else:
+                sys.exit("Separator not valid, please change it.")
+            inp = pd.read_csv(inp, header=None, delimiter=sep)
+            if len(inp.columns)==1:
+                inp = str(inp[0].values[0]).replace("'","")
+            else:
+                inp = inp.to_string(header=False, index=False).replace(" ",",")
+
+        if query == '8':
+            genes_in_pipeline(inp)
+            sys.exit(0)
+        elif query == '10':  # it's here but not totally implemented
+            par_pip_ma(inp)
+            sys.exit(0)
+        elif query == '11':  # it's here but not totally implemented
+            par_gen(inp)
+            sys.exit(0)
+        elif query == '2' or query == "4":
+            final_list=pheno_mapping(inp)
+        else:
+            final_list=gene_mapping(inp)
+        inp= ",".join(final_list)
+
+
+        if query == '1':
+            get_pheno(inp)
+            sys.exit(0)
+        elif query == '2':
+            get_genes(inp)
+            sys.exit(0)
+        elif query == '3':
+            gene_set(inp)
+            sys.exit(0)
+        elif query == '4':
+            extr_img(inp)
+            sys.exit(0)
+        elif query == '5':
+            parameters(inp)
+            sys.exit(0)
+        elif query == '6':
+            sign_par(inp)
+            sys.exit(0)
+        elif query == '9':
+            sign_mp(inp)
+            sys.exit(0)
+        else:
+            stop_err("Error, non-implemented query selected: " + query)
+    except Exception as ex:
+        stop_err('Error running impc_tool.py:\n' + str(ex))
+
+
+# 1-Given a gene id, retrieve all the phenotypes related to it (id and name)
+def get_pheno(inp):
+    head = sys.argv[4]
+    mgi_accession_id = inp
+
+    gene_url = f"{impc_api_search_url}/{mgi_accession_id}"
+    gene_data = requests.get(gene_url).json()
+
+    p_list = []
+    id_list = []
+
+    if gene_data['significantMpTerms'] == None:
+        stop_err("No significant MP terms found for this gene")
+    else:
+        for x in gene_data['significantMpTerms']:
+            p_list.append(x['mpTermId'])
+            id_list.append(x['mpTermName'])
+
+    df = pd.DataFrame()
+    df['MP term name'] = p_list
+    df['MP term id'] = id_list
+
+    if head == 'True':
+        df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+
+# 3-Extract all genes having a particular phenotype or a set of phenotypes (e.g. relevant to a disease)
+def get_genes(inp):
+    head = sys.argv[4]
+    target_mp_terms = inp
+
+    ## All the data is paginated using the page and size parameters, by default the endpoint returns the first 20 hits
+    gene_by_phenotypes_query = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={target_mp_terms}&page=0&size=20"
+    genes_with_clinical_chemistry_phenotypes = requests.get(gene_by_phenotypes_query).json()
+    print(f"Genes with {target_mp_terms}: {genes_with_clinical_chemistry_phenotypes['page']['totalElements']}")
+    list_of_genes = pd.DataFrame(columns=['Gene accession id', 'Gene name', 'Gene bundle url'])
+    acc = []
+    name = []
+    url = []
+
+    for gene in genes_with_clinical_chemistry_phenotypes['_embedded']['genes']:
+        acc.append(gene['mgiAccessionId'])
+        name.append(gene['markerName'])
+        url.append(gene['_links']['geneBundle']['href'])
+
+    list_of_genes['Gene accession id'] = acc
+    list_of_genes['Gene name'] = name
+    list_of_genes['Gene bundle url'] = url
+
+    if head == 'True':
+        list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+# 4. Extract all phenotypes which are present in a particular gene set (e.g. genes together in a pathway)
+
+def gene_set(inp):
+    head = sys.argv[4]
+    target_genes = inp
+
+    genes_in_gene_list_query = f"{impc_api_search_url}/search/findAllByMgiAccessionIdIn?mgiAccessionIds={target_genes}"
+
+    genes_in_gene_list = requests.get(genes_in_gene_list_query).json()
+    list_of_mp_terms_vs_gene_index = {}
+
+    for gene in genes_in_gene_list['_embedded']['genes']:
+        mp_terms = gene['significantMpTerms']
+        gene_acc_id = gene["mgiAccessionId"]
+        if mp_terms is None:
+            continue
+        for mp_term_name in mp_terms:
+            if mp_term_name['mpTermId'] not in list_of_mp_terms_vs_gene_index:
+                list_of_mp_terms_vs_gene_index[mp_term_name['mpTermId']] = {"mp_term": mp_term_name['mpTermId'], "mp_name": mp_term_name['mpTermName'], "genes": []}
+            list_of_mp_terms_vs_gene_index[mp_term_name['mpTermId']]["genes"].append(gene_acc_id)
+    genes_by_mp_term = list(list_of_mp_terms_vs_gene_index.values())
+
+    df = pd.DataFrame()
+    terms = []
+    names = []
+    genes = []
+    for i in genes_by_mp_term:
+        terms.append(i['mp_term'])
+        names.append(i['mp_name'])
+        genes.append(",".join(i['genes']))
+
+    df['mp_term'] = terms
+    df['mp_name'] = names
+    df['genes'] = genes
+
+    if head == 'True':
+        df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+    # 7. Extract images with a particular phenotype or a set of phenotypes
+
+
+def extr_img(inp):
+    head = sys.argv[4]
+    target_mp_terms = inp  # ['MP:0002110', 'MP:0000559']
+
+    ## All the data is paginated using the page and size parameters, by default the endpoint returns the first 20 hits
+    gene_by_phenotypes_query = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={target_mp_terms}&page=0&size=20"
+    genes_with_morphology_mps = requests.get(gene_by_phenotypes_query).json()
+    list_of_gene_bundle_urls = [gene["_links"]["geneBundle"]['href'] for gene in
+                                genes_with_morphology_mps['_embedded']['genes']]
+
+    gene_bundles = []
+    for gene_bundle_url in list_of_gene_bundle_urls:
+        gene_bundle = requests.get(gene_bundle_url).json()
+        gene_bundles.append(gene_bundle)
+
+    images_with_morphology_mps = []
+
+    ## Doing just the first 20 and filtering out fields on the images
+    display_fields = ['geneSymbol', 'parameterName', 'biologicalSampleGroup', 'colonyId', 'zygosity', 'sex',
+                      'downloadUrl', 'externalSampleId', 'thumbnailUrl']
+
+
+    for gene_bundle in gene_bundles[:20]:
+        if len(gene_bundle) == 4:
+            continue
+        if gene_bundle["geneImages"] is not None:
+            images = gene_bundle["geneImages"]
+            for image in images:
+                display_image = {k: v for k, v in image.items() if k in display_fields}
+                images_with_morphology_mps.append(display_image)
+
+    images_table = []
+    print(f"Images related to phenotype {target_mp_terms}: {len(images_with_morphology_mps)}")
+    ## Displaying just the first 20 images
+    for i in images_with_morphology_mps[:20]:
+        row = [f"<img src='{i['thumbnailUrl']}' />"] + list(i.values())
+        images_table.append(row)
+
+    df = pd.DataFrame()
+    externalSampleId = []
+    geneSymbol = []
+    biologicalSampleGroup = []
+    sex = []
+    colonyId = []
+    zygosity = []
+    parameterName = []
+    downloadUrl = []
+    thumbnailUrl = []
+
+    for i in images_table:
+        externalSampleId.append(i[1])
+        geneSymbol.append(i[2])
+        biologicalSampleGroup.append(i[3])
+        sex.append(i[4])
+        colonyId.append(i[5])
+        zygosity.append(i[6])
+        parameterName.append(i[7])
+        downloadUrl.append(i[8])
+        thumbnailUrl.append(i[9])
+
+    df['externalSampleId'] = externalSampleId
+    df['geneSymbol'] = geneSymbol
+    df['biologicalSampleGroup'] = biologicalSampleGroup
+    df['sex'] = sex
+    df['colonyId'] = colonyId
+    df['zygosity'] = zygosity
+    df['parameterName'] = parameterName
+    df['downloadUrl'] = downloadUrl
+    df['thumbnailUrl'] = thumbnailUrl
+
+    if head == 'True':
+        df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+    # 11- Which parameters have been measured for a particular knockout EASY
+
+
+def parameters(inp):
+    head = sys.argv[4]
+    knockout = inp  # "MGI:104636"
+    gene_info = requests.get(impc_api_search_url + "/" + knockout).json()
+
+    if gene_info['phenotypingDataAvailable']:
+        geneBundle = requests.get(gene_info['_links']['geneBundle']['href']).json()
+        gen_imgs = geneBundle['geneImages']
+        par_list = []
+        l = {}
+        for i in gen_imgs:
+            l = {"Parameter Name": i['parameterName']}
+            if l not in par_list:
+                par_list.append(l)
+        df = pd.DataFrame()
+        l = []
+
+        for i in par_list:
+            l.append(i['Parameter Name'])
+
+        df['Parameter'] = l
+        if head == 'True':
+            df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+        else:
+            df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+    else:
+        stop_err("No parameters available for this knockout gene")
+
+
+# 12- Which parameters identified a significant finding for a particular knockout line (colony) EASY
+def sign_par(inp):
+    head = sys.argv[4]
+    knockout = inp  # "MGI:104636"
+
+    gene_info = requests.get(f"{impc_api_url}statisticalResults/search/findAllByMarkerAccessionIdIsAndSignificantTrue?mgiAccessionId=" + knockout).json()
+    gene_stats = gene_info['_embedded']['statisticalResults']
+
+    if len(gene_stats) == 0:
+        stop_err("No statistically relevant parameters found for this knockout gene")
+    else:
+        df = pd.DataFrame()
+        n = []
+        p = []
+        for g in gene_stats:
+            n.append(g['parameterName'])
+            p.append(g['pvalue'])
+
+        df['Parameter name'] = n
+        df['p-value'] = p
+        if head == 'True':
+            df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+        else:
+            df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+
+# 13- List of genes names and ID measured in a pipeline
+def genes_in_pipeline(inp):
+    head = sys.argv[4]
+    pip = inp
+
+    g_in_p_query = f"{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={pip}&page=0&size=1000"
+    genes_in_pip = requests.get(g_in_p_query).json()
+    pages = genes_in_pip['page']['totalPages']
+    max_elem = genes_in_pip['page']['totalElements']
+
+    print(f"Genes with {pip}: {genes_in_pip['page']['totalElements']}")
+    d ={ }
+    list_d = []
+    list_of_genes = pd.DataFrame(columns=['Gene accession id', 'Gene name'])
+    acc = []
+    name = []
+
+    if max_elem > 1000:
+        g_in_p_query = genes_in_pip['_embedded']['genes']
+        for i in range(1,pages):
+            gl = requests.get(f'{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={pip}&page={i}&size=1000').json()['_embedded']['genes']
+            g_in_p_query += gl
+    else:
+        g_in_p_query = genes_in_pip['_embedded']['genes']
+
+    for g in g_in_p_query:
+        d = {"Gene Accession ID": g['mgiAccessionId'], "Gene Name": g['markerName']}
+        list_d.append(d)
+
+    for i in list_d:
+        acc.append(i['Gene Accession ID'])
+        name.append(i['Gene Name'])
+
+    list_of_genes['Gene accession id'] = acc
+    list_of_genes['Gene name'] = name
+
+    if head == 'True':
+        list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+
+# 14- Extract all genes and corresponding phenotypes related to a particular organ system(eg: significatMPTerm)
+def sign_mp(inp):
+    head = sys.argv[4]
+    mp_term = inp  # ['MP:0005391']
+
+    gene_by_mpterm_query = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={mp_term}&size=1000"
+    genes_with_mpterm = requests.get(gene_by_mpterm_query).json()
+
+    pages = genes_with_mpterm['page']['totalPages']
+    genes_info = genes_with_mpterm['_embedded']['genes']
+
+    for pn in range(1,pages):
+        pq = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={mp_term}&page={pn}&size=1000"
+        g = requests.get(pq).json()['_embedded']['genes']
+        genes_info += g
+
+    list_d=[]
+    d={}
+    for g in genes_info:
+        names=[]
+        ids=[]
+        for s in g['significantMpTerms']:
+            names.append(s['mpTermName'])
+            ids.append(s['mpTermId'])
+        d={'Gene':g['mgiAccessionId'], 'mpTermId': ids, 'mpTermName':names}
+        list_d.append(d)
+
+
+    g = []
+    ids = []
+    names = []
+    for i in list_d:
+        g.append(i['Gene'])
+        ids.append(i['mpTermId'])
+        names.append(i['mpTermName'])
+
+    df = pd.DataFrame()
+    df['Gene Id']=g
+    df['Significant MP terms Ids']=ids
+    df['Significant MP terms Names']=names
+
+    if head == 'True':
+        df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+
+# 16- Full table of genes and all identified phenotypes
+
+def full_gene_table():
+    head = sys.argv[4]
+    gene_list = requests.get(impc_api_search_url + '?page=0&size=1000').json()
+    pages = gene_list['page']['totalPages']
+    genes_info = gene_list['_embedded']['genes']
+
+    for pn in range(1,pages):
+        gp = requests.get(impc_api_search_url + f'?page={pn}&size=1000').json()['_embedded']['genes']
+        genes_info += gp
+
+    d = {}
+    list_d=[]
+
+    for i in genes_info:
+        l = []
+        if i['significantMpTerms'] is None:
+            d={"Gene": i['mgiAccessionId'], "Identified phenotypes": "None"}
+        else:
+            d = {"Gene": i['mgiAccessionId'], "Identified phenotypes": [sub['mpTermId'] for sub in i['significantMpTerms']]}
+        list_d.append(d)
+
+    df = pd.DataFrame()
+    g = []
+    p = []
+    for i in list_d:
+        g.append(i['Gene'])
+        p.append(i['Identified phenotypes'])
+
+    df['MGI id'] = g
+    df['MP term list'] = p
+
+    for i in range(0, len(df)):
+        if df['MP term list'][i] != "None":
+            df['MP term list'][i] = str(df['MP term list'][i])[1:-1].replace("'", "")
+
+    if str(sys.argv[1]) == 'True':
+        if head == 'True':
+            df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+        else:
+            df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+    else:
+        df = df[df['MP term list'] != "None"]
+        df.reset_index(drop=True, inplace=True)
+        if head == 'True':
+            df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+        else:
+            df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+# Old method, chech which is faster
+#    max_elem = gene_list['page']['totalElements']
+#    d = {}
+#    list_d = []
+#    for i in range(0, pages):
+#        gl = requests.get(impc_api_search_url + '?page=' + str(i) + '&size=' + str(max_elem)).json()
+#        for g in gl['_embedded']['genes']:
+#            if g['significantMpTerms'] is None:
+#                d = {"Gene": g['mgiAccessionId'], "Identified phenotypes": "None"}
+#            else:
+#                d = {"Gene": g['mgiAccessionId'], "Identified phenotypes": [ sub['mpTermId'] for sub in g['significantMpTerms'] ]}
+#            list_d.append(d)
+
+
+
+
+# 18- Extract measurements and analysis for a parameter or pipeline
+
+def par_pip_ma(inp):
+    head = sys.argv[4]
+    id = inp
+
+    if id[0:4] == "IMPC":
+        par = True
+        ma_query = f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page=0&size=1000"
+    else:
+        ma_query = f"{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={id}&page=0&size=1000"
+        par = False
+
+    ma_in_pip = requests.get(ma_query).json()
+    pages = ma_in_pip['page']['totalPages']
+    max_elem = ma_in_pip['page']['totalElements']
+
+    print(f"Genes with {id}: {ma_in_pip['page']['totalElements']}")
+    d = {}
+    list_d = []
+    list_of_genes = pd.DataFrame(columns=['Measurements', 'Analysis'])
+    mes = []
+    an = []
+
+    if max_elem > 1000:
+
+        ma_in_pip = ma_in_pip['_embedded']['genes']
+        for pn in range(1, pages):
+            if par:
+                pip = requests.get(f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page={pn}&size=1000").json()['_embedded']['genes']
+            else:
+                pip = requests.get(f"{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={id}&page={pn}&size=1000").json()['_embedded']['genes']
+            ma_in_pip += pip
+
+    else:
+        ma_in_pip = ma_in_pip['_embedded']['genes']
+
+    for g in ma_in_pip:
+        d = {"Measurements": g[''], "Analysis": g['']}
+        list_d.append(d)
+
+    for i in list_d:
+        mes.append(i[''])
+        an.append(i[''])
+
+    list_of_genes['Analysis'] = an
+    list_of_genes['Measurements'] = mes
+
+    if head == 'True':
+        list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+
+# 19- Get all genes and measured values for a particular parameter
+def par_gen(inp):
+    head = sys.argv[4]
+    id = inp
+
+    pa_query = f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page=0&size=1000"
+
+    gm_par = requests.get(pa_query).json()
+    pages = gm_par['page']['totalPages']
+    max_elem = gm_par['page']['totalElements']
+
+    print(f"Genes with {id}: {gm_par['page']['totalElements']}")
+    d = {}
+    list_d = []
+    list_of_genes = pd.DataFrame(columns=['Genes', 'Measured Values'])
+    gen = []
+    mes = []
+
+    if max_elem > 1000:
+
+        gm_par = gm_par['_embedded']['genes']
+
+        for pn in range(1, pages):
+            pip = requests.get(f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page={pn}&size=1000").json()['_embedded']['genes']
+            gm_par += pip
+
+    else:
+        gm_par = gm_par['_embedded']['genes']
+
+
+    for g in gm_par:
+        d = {"Genes": g['mgiAccessionId'], "Measured Values": g['']}
+        list_d.append(d)
+
+    for i in list_d:
+        gen.append(i['Genes'])
+        mes.append(i['Measured Values'])
+
+    list_of_genes['Genes'] = gen
+    list_of_genes['Measured Values'] = mes
+
+    if head == 'True':
+        list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
+    else:
+        list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
+
+
+def gene_mapping(inp):
+    tmp = inp.split(",")
+    final_list = []
+    sym_list = []
+    for i in tmp:
+        if 'MGI:' in i:
+            final_list.append(i)
+        else:
+            sym_list.append(i)
+    del (i)
+    if len(sym_list) != 0:
+        sym_list = ",".join(sym_list)
+        biodbnet = f'https://biodbnet.abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.xml?method=db2db&format=row&input=genesymbol&inputValues={sym_list}&outputs=mgiid&taxonId=10090'
+        u = url.urlopen(biodbnet)
+        db = pd.read_xml(u, elems_only=True)
+        empty = True
+        discarded = []
+        for i in db.index:
+            if db['MGIID'][i] != '-':
+                empty = False
+                final_list.append(db['MGIID'][i][4:])
+                break
+            else:
+                discarded.append(db['MGIID'][i][4:])
+
+        if (len(db) == 0 and len(final_list) == 0) or (empty and len(final_list) == 0):
+            stop_err("Error: it was not possible to map the input.")
+        elif empty:
+            print("Warning: it was not possible to map any of the gene symbols entry. Only MGI entries will be used.")
+        elif len(discarded) != 0:
+            print("Warning: it was not possible to map these elements: " + ",".join(discarded) + "\n")
+    return(final_list)
+
+def pheno_mapping(inp):
+    tmp = inp.split(",")
+    final_list = []
+    sym_list = []
+    for i in tmp:
+        if 'MP:' in i:
+            final_list.append(i)
+        else:
+            sym_list.append(i)
+    del (i)
+    if len(sym_list) != 0:
+        url="https://raw.githubusercontent.com/AndreaFurlani/hp_mp_mapping_test/main/hp_mp_mapping.csv"
+        mapper = pd.read_csv(url,header=0,index_col=2)
+        empty = True
+        discarded = []
+        for i in sym_list:
+            try:
+                final_list.append(mapper.loc[i]['mpId'])
+                empty=False
+            except KeyError:
+                discarded.append(i)
+                continue
+        if empty and len(final_list)==0:
+            stop_err("Error: it was not possible to map the input.")
+        elif empty:
+            print("Warning: it was not possible to map any of the HP term entries. Only MP entries will be used.")
+        elif len(discarded) != 0:
+            print("Warning: it was not possible to map these elements: " + ",".join(discarded) + "\n")
+    return (final_list)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file