# HG changeset patch
# User iuc
# Date 1697035862 0
# Node ID d319dc5f3ea8aac7a77d898357aea4a332a74d08
planemo upload for repository https://github.com/INFRAFRONTIERDIB/tools-iuc/tree/query_impc/tools/query_impc commit 991881b5df5f5228ecf4445ee2cc1431b9602ea8
diff -r 000000000000 -r d319dc5f3ea8 impc_tool.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/impc_tool.py Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,759 @@
+import sys
+
+import mygene
+import pandas as pd
+import requests
+
+
+# Root of the IMPC bulk-data REST API; the two endpoints below derive from it.
+impc_api_url = "https://www.ebi.ac.uk/mi/impc/bulkdata-api"
+# Genes endpoint; also the prefix of the "/search/..." finder queries.
+impc_api_search_url = f"{impc_api_url}/genes"
+# Gene bundles endpoint.
+impc_api_gene_bundle_url = f"{impc_api_url}/geneBundles"
+
+
+def stop_err(msg):
+ sys.exit(msg)
+
+
+def main():
+ inp = str(sys.argv[1])
+ query = str(sys.argv[3])
+
+ try:
+ if query == "7":
+ g_out = str(sys.argv[5])
+ full_gene_table(g_out)
+ sys.exit(0)
+
+ if str(sys.argv[5]) == "txt":
+ s = str(sys.argv[6])
+ if s == "t":
+ sep = "\t"
+ elif s == "s":
+ sep = " "
+ elif s in ",;.":
+ sep = s
+ else:
+ sys.exit("Separator not valid, please change it.")
+ inp = pd.read_csv(inp, header=None, delimiter=sep)
+ if len(inp.columns) == 1:
+ inp = inp.to_csv(header=None,
+ index=False).strip("\n").split("\n")
+ inp = ",".join(inp)
+ else:
+ inp = inp.to_csv(header=None,
+ index=False).strip(sep).split(sep)
+ inp = ",".join(inp)
+
+ if query == "8":
+ if str(sys.argv[5]) == "txt":
+ g_out = str(sys.argv[7])
+ else:
+ g_out = str(sys.argv[6])
+ genes_in_pipeline(inp, g_out)
+ sys.exit(0)
+ elif query == "9":
+ if str(sys.argv[5]) == "txt":
+ g_out = str(sys.argv[7])
+ else:
+ g_out = str(sys.argv[6])
+ sign_mp(inp, g_out)
+ sys.exit(0)
+ elif query == "10":
+ par_pip_ma(inp)
+ sys.exit(0)
+ elif query == "11":
+ par_gen(inp)
+ sys.exit(0)
+ elif query == "2" or query == "4":
+ final_list = pheno_mapping(inp)
+ else:
+ final_list = gene_mapping(inp)
+ inp = ",".join(final_list)
+
+ if query == "1":
+ get_pheno(inp)
+ sys.exit(0)
+ elif query == "2":
+ if str(sys.argv[5]) == "txt":
+ g_out = str(sys.argv[7])
+ else:
+ g_out = str(sys.argv[6])
+ get_genes(inp, g_out)
+ sys.exit(0)
+ elif query == "3":
+ gene_set(inp)
+ sys.exit(0)
+ elif query == "4":
+ extr_img(inp)
+ sys.exit(0)
+ elif query == "5":
+ parameters(inp)
+ sys.exit(0)
+ elif query == "6":
+ sign_par(inp)
+ sys.exit(0)
+ else:
+ stop_err("Error, non-implemented query selected: " + query)
+ except Exception as ex:
+ stop_err("Error running impc_tool.py:\n" + str(ex))
+
+
+# 1-Given a gene id, retrieve all the phenotypes related to it (id and name)
+def get_pheno(inp):
+ head = sys.argv[4]
+ mgi_accession_id = inp
+
+ gene_url = f"{impc_api_search_url}/{mgi_accession_id}"
+ gene_data = requests.get(gene_url).json()
+
+ p_list = []
+ id_list = []
+
+ if gene_data["significantMpTerms"] is None:
+ stop_err("No significant MP terms found for this gene")
+ else:
+ for x in gene_data["significantMpTerms"]:
+ p_list.append(x["mpTermId"])
+ id_list.append(x["mpTermName"])
+
+ df = pd.DataFrame()
+ df["MP term name"] = p_list
+ df["MP term id"] = id_list
+
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 3-Extract all genes having a particular phenotype or a set of phenotypes
+# (e.g. relevant to a disease)
+def get_genes(inp, g_out):
+ head = sys.argv[4]
+ target_mp_terms = inp
+
+# All the data is paginated using the page and size parameters,
+# by default the endpoint returns the first 20 hits
+ gene_by_phenotypes_query = f"{impc_api_search_url}" \
+ f"/search/findAllBySignificantMpTermIdsContains" \
+ f"?mpTermIds={target_mp_terms}&page=0&size=20"
+ genes_with_clinical_chemistry_phen = \
+ requests.get(gene_by_phenotypes_query).json()
+ print(f"Genes with {target_mp_terms}: "
+ f"{genes_with_clinical_chemistry_phen['page']['totalElements']}")
+ acc = []
+ name = []
+ url = []
+
+ for gene in genes_with_clinical_chemistry_phen["_embedded"]["genes"]:
+ acc.append(gene["mgiAccessionId"])
+ name.append(gene["markerName"])
+ url.append(gene["_links"]["geneBundle"]["href"])
+
+ if g_out == "sym":
+ list_of_genes = pd.DataFrame(columns=["Gene symbol id", "Gene name",
+ "Gene bundle url"])
+ list_of_genes["Gene symbol id"] = mgi_sym_map(acc)
+ else:
+ list_of_genes = pd.DataFrame(columns=["Gene accession id",
+ "Gene name", "Gene bundle url"])
+ list_of_genes["Gene accession id"] = acc
+ list_of_genes["Gene name"] = name
+ list_of_genes["Gene bundle url"] = url
+
+ if head == "True":
+ list_of_genes.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ list_of_genes.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 4. Extract all phenotypes which are present in a particular gene set
+# (e.g. genes together in a pathway)
+def gene_set(inp):
+ head = sys.argv[4]
+ target_genes = inp
+
+ genes_in_gene_list_query = f"{impc_api_search_url}/search/" \
+ f"findAllByMgiAccessionIdIn?" \
+ f"mgiAccessionIds={target_genes}"
+
+ genes_in_gene_list = requests.get(genes_in_gene_list_query).json()
+ mp_terms_vs_gene_idx = {}
+
+ for gene in genes_in_gene_list["_embedded"]["genes"]:
+ mp_terms = gene["significantMpTerms"]
+ gene_acc_id = gene["mgiAccessionId"]
+ if mp_terms is None:
+ continue
+ for mp_term_name in mp_terms:
+ if mp_term_name["mpTermId"] not in mp_terms_vs_gene_idx:
+ mp_terms_vs_gene_idx[mp_term_name["mpTermId"]] = \
+ {"mp_term": mp_term_name["mpTermId"],
+ "mp_name": mp_term_name["mpTermName"], "genes": []}
+ mp_terms_vs_gene_idx[mp_term_name["mpTermId"]]["genes"].\
+ append(gene_acc_id)
+ genes_by_mp_term = list(mp_terms_vs_gene_idx.values())
+
+ df = pd.DataFrame()
+ terms = []
+ names = []
+ genes = []
+ for i in genes_by_mp_term:
+ terms.append(i["mp_term"])
+ names.append(i["mp_name"])
+ genes.append(",".join(i["genes"]))
+
+ df["mp_term"] = terms
+ df["mp_name"] = names
+ df["genes"] = genes
+
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 7. Extract images with a particular phenotype or a set of phenotypes
+def extr_img(inp):
+ head = sys.argv[4]
+ target_mp_terms = inp # ["MP:0002110", "MP:0000559"]
+
+# All the data is paginated using the page and size parameters,
+# by default the endpoint returns the first 20 hits
+ gene_by_phenotypes_query = f"{impc_api_search_url}/search/" \
+ f"findAllBySignificantMpTermIdsContains?" \
+ f"mpTermIds={target_mp_terms}&page=0&size=20"
+ genes_with_morph_mps = requests.get(gene_by_phenotypes_query).json()
+ list_of_gene_bundle_urls = [
+ gene["_links"]["geneBundle"]["href"] for gene in
+ genes_with_morph_mps["_embedded"]["genes"]
+ ]
+
+ gene_bundles = []
+ for gene_bundle_url in list_of_gene_bundle_urls:
+ gene_bundle = requests.get(gene_bundle_url).json()
+ gene_bundles.append(gene_bundle)
+
+ images_with_morphology_mps = []
+
+ # Doing just the first 20 and filtering out fields on the images
+ display_fields = ["geneSymbol", "parameterName", "biologicalSampleGroup",
+ "colonyId", "zygosity", "sex", "downloadUrl",
+ "externalSampleId", "thumbnailUrl"]
+
+ for gene_bundle in gene_bundles[:20]:
+ if len(gene_bundle) == 4:
+ continue
+ if gene_bundle["geneImages"] is not None:
+ images = gene_bundle["geneImages"]
+ for image in images:
+ display_image = {k: v for k, v in image.items()
+ if k in display_fields}
+ images_with_morphology_mps.append(display_image)
+
+ images_table = []
+ print(f"Images related to phenotype {target_mp_terms}: "
+ f"{len(images_with_morphology_mps)}")
+ # Displaying just the first 20 images
+ for i in images_with_morphology_mps[:20]:
+ row = [f""] + list(i.values())
+ images_table.append(row)
+
+ df = pd.DataFrame()
+ externalSampleId = []
+ geneSymbol = []
+ biologicalSampleGroup = []
+ sex = []
+ colonyId = []
+ zygosity = []
+ parameterName = []
+ downloadUrl = []
+ thumbnailUrl = []
+
+ for i in images_table:
+ externalSampleId.append(i[1])
+ geneSymbol.append(i[2])
+ biologicalSampleGroup.append(i[3])
+ sex.append(i[4])
+ colonyId.append(i[5])
+ zygosity.append(i[6])
+ parameterName.append(i[7])
+ downloadUrl.append(i[8])
+ thumbnailUrl.append(i[9])
+
+ df["externalSampleId"] = externalSampleId
+ df["geneSymbol"] = geneSymbol
+ df["biologicalSampleGroup"] = biologicalSampleGroup
+ df["sex"] = sex
+ df["colonyId"] = colonyId
+ df["zygosity"] = zygosity
+ df["parameterName"] = parameterName
+ df["downloadUrl"] = downloadUrl
+ df["thumbnailUrl"] = thumbnailUrl
+
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 11- Which parameters have been measured for a particular knockout
+def parameters(inp):
+ head = sys.argv[4]
+ knockout = inp # "MGI:104636"
+ gene_info = requests.get(impc_api_search_url + "/" + knockout).json()
+
+ if gene_info["phenotypingDataAvailable"]:
+ geneBundle = requests.get(gene_info["_links"]["geneBundle"]["href"])\
+ .json()
+ gen_imgs = geneBundle["geneImages"]
+ par_list = []
+ lis = {}
+ for i in gen_imgs:
+ lis = {"Parameter Name": i["parameterName"]}
+ if lis not in par_list:
+ par_list.append(lis)
+ df = pd.DataFrame()
+ li = []
+
+ for i in par_list:
+ li.append(i["Parameter Name"])
+
+ df["Parameter"] = li
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+ else:
+ stop_err("No parameters available for this knockout gene")
+
+
+# 12- Which parameters identified a significant finding for a particular
+# knockout line (colony)
+def sign_par(inp):
+ head = sys.argv[4]
+ knockout = inp # "MGI:104636"
+
+ gene_info = requests.get(f"{impc_api_url}statisticalResults/search/"
+ f"findAllByMarkerAccessionIdIsAndSignificantTrue?"
+ f"mgiAccessionId=" + knockout).json()
+ gene_stats = gene_info["_embedded"]["statisticalResults"]
+
+ if len(gene_stats) == 0:
+ stop_err("No statistically relevant parameters found "
+ "for this knockout gene")
+ else:
+ df = pd.DataFrame()
+ n = []
+ p = []
+ for g in gene_stats:
+ n.append(g["parameterName"])
+ p.append(g["pvalue"])
+
+ df["Parameter name"] = n
+ df["p-value"] = p
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 13- List of gene names and IDs measured in a pipeline
+def genes_in_pipeline(inp, g_out):
+ head = sys.argv[4]
+ pip = inp
+
+ g_in_p_query = f"{impc_api_search_url}/search/" \
+ f"findAllByTestedPipelineId?pipelineId={pip}&" \
+ f"page=0&size=1000"
+ genes_in_pip = requests.get(g_in_p_query).json()
+ pages = genes_in_pip["page"]["totalPages"]
+ max_elem = genes_in_pip["page"]["totalElements"]
+
+ print(f"Genes with {pip}: {genes_in_pip['page']['totalElements']}")
+ list_d = []
+ acc = []
+ name = []
+
+ if max_elem > 1000:
+ g_in_p_query = genes_in_pip["_embedded"]["genes"]
+ for i in range(1, pages):
+ gl = requests.get(f"{impc_api_search_url}/search/"
+ f"findAllByTestedPipelineId?pipelineId={pip}&"
+ f"page={i}&"
+ f"size=1000").json()["_embedded"]["genes"]
+ g_in_p_query += gl
+ else:
+ g_in_p_query = genes_in_pip["_embedded"]["genes"]
+
+ for g in g_in_p_query:
+ d = {"Gene Accession ID": g["mgiAccessionId"],
+ "Gene Name": g["markerName"]}
+ list_d.append(d)
+
+ for i in list_d:
+ acc.append(i["Gene Accession ID"])
+ name.append(i["Gene Name"])
+ if g_out == "sym":
+ list_of_genes = pd.DataFrame(columns=["Gene symbol", "Gene name"])
+ list_of_genes["Gene symbol"] = mgi_sym_map(acc)
+ else:
+ list_of_genes = pd.DataFrame(columns=["Gene accession id",
+ "Gene name"])
+ list_of_genes["Gene accession id"] = acc
+ list_of_genes["Gene name"] = name
+
+ if head == "True":
+ list_of_genes.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ list_of_genes.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 14- Extract all genes and corresponding phenotypes related to a
+# particular organ system (e.g. a significant MP term)
+def sign_mp(inp, g_out):
+ head = sys.argv[4]
+ mp_term = inp # ["MP:0005391"]
+
+ gene_by_mpterm_query = f"{impc_api_search_url}/search/" \
+ f"findAllBySignificantMpTermIdsContains?" \
+ f"mpTermIds={mp_term}&size=1000"
+ genes_with_mpterm = requests.get(gene_by_mpterm_query).json()
+
+ pages = genes_with_mpterm["page"]["totalPages"]
+ genes_info = genes_with_mpterm["_embedded"]["genes"]
+
+ for pn in range(1, pages):
+ pq = f"{impc_api_search_url}/search/" \
+ f"findAllBySignificantMpTermIdsContains?" \
+ f"mpTermIds={mp_term}&page={pn}&size=1000"
+ g = requests.get(pq).json()["_embedded"]["genes"]
+ genes_info += g
+
+ list_d = []
+ d = {}
+ for g in genes_info:
+ names = []
+ ids = []
+ for s in g["significantMpTerms"]:
+ names.append(s["mpTermName"])
+ ids.append(s["mpTermId"])
+ d = {"Gene": g["mgiAccessionId"], "mpTermId": ids, "mpTermName": names}
+ list_d.append(d)
+
+ g = []
+ ids = []
+ names = []
+ for i in list_d:
+ g.append(i["Gene"])
+ ids.append(i["mpTermId"])
+ names.append(i["mpTermName"])
+
+ df = pd.DataFrame()
+ if g_out == "sym":
+ df["Gene symbol"] = mgi_sym_map(g)
+ else:
+ df["Gene Id"] = g
+ df["Significant MP terms Ids"] = ids
+ df["Significant MP terms Names"] = names
+
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 16- Full table of genes and all identified phenotypes
+def full_gene_table(g_out):
+ head = sys.argv[4]
+ gene_list = requests.get(impc_api_search_url + "?page=0&size=1000").json()
+ pages = gene_list["page"]["totalPages"]
+ genes_info = gene_list["_embedded"]["genes"]
+
+ for pn in range(1, pages):
+ gp = requests.get(impc_api_search_url
+ + f"?page={pn}&"
+ f"size=1000").json()["_embedded"]["genes"]
+ genes_info += gp
+
+ d = {}
+ list_d = []
+
+ for i in genes_info:
+ if i["significantMpTerms"] is None:
+ d = {"Gene": i["mgiAccessionId"], "Identified phenotypes": "None"}
+ else:
+ d = {"Gene": i["mgiAccessionId"],
+ "Identified phenotypes": [
+ sub["mpTermId"] for sub in i["significantMpTerms"]
+ ]}
+ list_d.append(d)
+
+ df = pd.DataFrame()
+ g = []
+ p = []
+ for i in list_d:
+ g.append(i["Gene"])
+ p.append(i["Identified phenotypes"])
+
+ if g_out == "sym":
+ df["Gene symbol"] = mgi_sym_map(g)
+ else:
+ df["MGI id"] = g
+ df["MP term list"] = p
+
+ for i in range(0, len(df)):
+ if df["MP term list"][i] != "None":
+ df["MP term list"][i] = str(
+ df["MP term list"][i]
+ )[1:-1].replace("'", "")
+
+ if str(sys.argv[1]) == "True":
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+ else:
+ df = df[df["MP term list"] != "None"]
+ df.reset_index(drop=True, inplace=True)
+ if head == "True":
+ df.to_csv(sys.argv[2], header=True, index=False,
+ sep="\t", index_label=False)
+ else:
+ df.to_csv(sys.argv[2], header=False, index=False,
+ sep="\t", index_label=False)
+
+
+# 18- Extract measurements and analysis for a parameter or pipeline
+def par_pip_ma(inp):
+    """Write a "Measurements"/"Analysis" table for a parameter or pipeline.
+
+    ``inp`` is handled as a parameter id when it starts with "IMPC" and as
+    a pipeline id otherwise. The table goes to ``sys.argv[2]``, with a
+    header row when ``sys.argv[4]`` is "True".
+
+    NOTE(review): the function looks unfinished. The two output fields
+    are read with an empty key (``g[""]``, ``i[""]``), so a KeyError is
+    to be expected as soon as the query returns at least one gene. The
+    intended API field names cannot be told from this file.
+    """
+    head = sys.argv[4]
+    id = inp
+
+    # Choose the finder by the prefix of the id
+    if id[0:4] == "IMPC":
+        par = True
+        ma_query = f"{impc_api_search_url}/search/" \
+                   f"findAllByTestedParameterId?" \
+                   f"parameterId={id}&page=0&size=1000"
+    else:
+        ma_query = f"{impc_api_search_url}/search/" \
+                   f"findAllByTestedPipelineId?" \
+                   f"pipelineId={id}&page=0&size=1000"
+        par = False
+
+    ma_in_pip = requests.get(ma_query).json()
+    pages = ma_in_pip["page"]["totalPages"]
+    max_elem = ma_in_pip["page"]["totalElements"]
+
+    print(f"Genes with {id}: {ma_in_pip['page']['totalElements']}")
+    list_d = []
+    list_of_genes = pd.DataFrame(columns=["Measurements", "Analysis"])
+    mes = []
+    an = []
+
+    # More than one page of 1000 hits: append the genes of the other pages
+    if max_elem > 1000:
+
+        ma_in_pip = ma_in_pip["_embedded"]["genes"]
+        for pn in range(1, pages):
+            if par:
+                pip = requests.get(f"{impc_api_search_url}/search/"
+                                   f"findAllByTestedParameterId?"
+                                   f"parameterId={id}&page={pn}&"
+                                   f"size=1000").json()["_embedded"]["genes"]
+            else:
+                pip = requests.get(f"{impc_api_search_url}/search/"
+                                   f"findAllByTestedPipelineId?"
+                                   f"pipelineId={id}&page={pn}&"
+                                   f"size=1000").json()["_embedded"]["genes"]
+            ma_in_pip += pip
+
+    else:
+        ma_in_pip = ma_in_pip["_embedded"]["genes"]
+
+    for g in ma_in_pip:
+        # NOTE(review): placeholder keys - the gene fields holding the
+        # measurements and the analysis still have to be filled in
+        d = {"Measurements": g[""], "Analysis": g[""]}
+        list_d.append(d)
+
+    for i in list_d:
+        # NOTE(review): the dicts built above only have the keys
+        # "Measurements" and "Analysis", so these lookups cannot succeed
+        mes.append(i[""])
+        an.append(i[""])
+
+    list_of_genes["Analysis"] = an
+    list_of_genes["Measurements"] = mes
+
+    if head == "True":
+        list_of_genes.to_csv(sys.argv[2], header=True, index=False,
+                             sep="\t", index_label=False)
+    else:
+        list_of_genes.to_csv(sys.argv[2], header=False, index=False,
+                             sep="\t", index_label=False)
+
+
+# 19- Get all genes and measured values for a particular parameter
+def par_gen(inp, g_out):
+    """Write the genes tested for the parameter ``inp`` with their values.
+
+    With ``g_out == "sym"`` the gene is given as a symbol obtained from
+    ``mgi_sym_map``, otherwise as MGI accession id. The table goes to
+    ``sys.argv[2]``, with a header row when ``sys.argv[4]`` is "True".
+
+    NOTE(review): the function looks unfinished. The measured value is
+    read with an empty key (``g[""]``), so a KeyError is to be expected
+    as soon as the query returns at least one gene. The intended API
+    field name cannot be told from this file.
+    """
+    head = sys.argv[4]
+    id = inp
+
+    pa_query = f"{impc_api_search_url}/search/" \
+               f"findAllByTestedParameterId?parameterId={id}&page=0&size=1000"
+
+    gm_par = requests.get(pa_query).json()
+    pages = gm_par["page"]["totalPages"]
+    max_elem = gm_par["page"]["totalElements"]
+
+    print(f"Genes with {id}: {gm_par['page']['totalElements']}")
+    list_d = []
+    gen = []
+    mes = []
+
+    # More than one page of 1000 hits: append the genes of the other pages
+    if max_elem > 1000:
+
+        gm_par = gm_par["_embedded"]["genes"]
+
+        for pn in range(1, pages):
+            pip = requests.get(f"{impc_api_search_url}/search/"
+                               f"findAllByTestedParameterId?"
+                               f"parameterId={id}&page={pn}&"
+                               f"size=1000").json()["_embedded"]["genes"]
+            gm_par += pip
+
+    else:
+        gm_par = gm_par["_embedded"]["genes"]
+
+    for g in gm_par:
+        # NOTE(review): placeholder key - the gene field holding the
+        # measured values still has to be filled in
+        d = {"Genes": g["mgiAccessionId"], "Measured Values": g[""]}
+        list_d.append(d)
+
+    for i in list_d:
+        gen.append(i["Genes"])
+        mes.append(i["Measured Values"])
+
+    if g_out == "sym":
+        list_of_genes = pd.DataFrame(columns=["Gene symbol",
+                                              "Measured Values"])
+        list_of_genes["Gene symbol"] = mgi_sym_map(gen)
+    else:
+        list_of_genes = pd.DataFrame(columns=["Gene accession id",
+                                              "Measured Values"])
+        list_of_genes["Gene accession id"] = gen
+    list_of_genes["Measured Values"] = mes
+
+    if head == "True":
+        list_of_genes.to_csv(sys.argv[2], header=True, index=False,
+                             sep="\t", index_label=False)
+    else:
+        list_of_genes.to_csv(sys.argv[2], header=False, index=False,
+                             sep="\t", index_label=False)
+
+
+# Function to map gene symbol to MGI ids
+def gene_mapping(inp):
+ tmp = inp.split(",")
+ final_list = []
+ sym_list = []
+ for i in tmp:
+ if "MGI:" in i:
+ final_list.append(i)
+ else:
+ sym_list.append(i)
+ del i
+
+ # symbol for symbols, mgi for MGI :
+ # https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
+ if len(sym_list) != 0:
+ mg = mygene.MyGeneInfo()
+ ginfo = mg.querymany(sym_list, scopes="symbol", fields="symbol,MGI",
+ species="mouse")
+ empty = True
+ discarded = []
+ for i in ginfo:
+ try:
+ final_list.append(i["MGI"])
+ empty = False
+ except KeyError:
+ discarded.append(i["query"])
+ if empty and len(final_list) == 0:
+ stop_err("Error: it was not possible to map the input.")
+ elif empty:
+ print("Warning: it was not possible to map any of the symbol ids. "
+ "Only MGI ids will be used.")
+ elif len(discarded) != 0:
+ print("Warning: it was not possible to map these elements: "
+ "" + ",".join(discarded) + "\n")
+
+ return final_list
+
+
+# Function to map phenotype entries that are not MP ids (HP terms) to MP ids
+def pheno_mapping(inp):
+ tmp = inp.split(",")
+ final_list = []
+ sym_list = []
+ for i in tmp:
+ if "MP:" in i:
+ final_list.append(i)
+ else:
+ sym_list.append(i)
+ del i
+ if len(sym_list) != 0:
+ url = "https://raw.githubusercontent.com/AndreaFurlani/" \
+ "hp_mp_mapping_test/main/hp_mp_mapping.csv"
+ mapper = pd.read_csv(url, header=0, index_col=2)
+ empty = True
+ discarded = []
+ for i in sym_list:
+ try:
+ final_list.append(mapper.loc[i]["mpId"])
+ empty = False
+ except KeyError:
+ discarded.append(i)
+ continue
+ if empty and len(final_list) == 0:
+ stop_err("Error: it was not possible to map the input.")
+ elif empty:
+ print("Warning: it was not possible to map any of the "
+ "HP term entries. Only MP entries will be used.")
+ elif len(discarded) != 0:
+ print("Warning: it was not possible to "
+ "map these elements: " + ",".join(discarded) + "\n")
+ return final_list
+
+
+# Function to map MGI ids to Gene Symbols
+def mgi_sym_map(mgi_list):
+ sym_list = []
+ mg = mygene.MyGeneInfo()
+ ginfo = mg.querymany(mgi_list, scopes="MGI", fields="symbol,MGI",
+ species="mouse")
+ discarded = []
+ for i in ginfo:
+ try:
+ sym_list.append(i["symbol"])
+ except KeyError:
+ sym_list.append(i["query"])
+ discarded.append(i["query"])
+ if len(discarded) != 0:
+ print("It was not possible to map these genes: " + ",".join(discarded))
+ return sym_list
+
+
+# Run the tool only when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r d319dc5f3ea8 impc_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/impc_tool.xml Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,351 @@
+
+ query tool
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ requests
+ pandas
+ lxml
+ mygene
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ https://doi.org/10.1093/nar/gku1193
+ https://doi.org/10.12688/f1000research.25369.1
+ https://doi.org/10.1038/nature19356
+
+
\ No newline at end of file
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_output_1_1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output_1_1.tabular Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,10 @@
+MP term name MP term id
+MP:0002135 abnormal kidney morphology
+MP:0000194 increased circulating calcium level
+MP:0002574 increased vertical activity
+MP:0005633 increased circulating sodium level
+MP:0001303 abnormal lens morphology
+MP:0002965 increased circulating serum albumin level
+MP:0001304 cataract
+MP:0010052 increased grip strength
+MP:0001402 decreased locomotor activity
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_output_1_2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output_1_2.tabular Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,5 @@
+MP term name MP term id
+MP:0000194 increased circulating calcium level
+MP:0011110 preweaning lethality, incomplete penetrance
+MP:0001303 abnormal lens morphology
+MP:0010053 decreased grip strength
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_output_2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output_2.tabular Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,21 @@
+Gene accession id Gene name Gene bundle url
+MGI:1345144 sprouty RTK signaling antagonist 4 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1345144
+MGI:2670964 terminal nucleotidyltransferase 5A https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:2670964
+MGI:95490 fibrillin 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:95490
+MGI:95689 growth differentiation factor 6 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:95689
+MGI:1341886 ajuba LIM protein https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1341886
+MGI:1347352 hormonally upregulated Neu-associated kinase https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1347352
+MGI:109331 nucleoredoxin https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:109331
+MGI:1914061 dual oxidase maturation factor 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1914061
+MGI:1915958 RAB, member RAS oncogene family-like 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1915958
+MGI:1917363 ciliary microtubule associated protein 1B https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1917363
+MGI:1920858 MARVEL (membrane-associating) domain containing 3 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1920858
+MGI:106576 chondroitin polymerizing factor https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:106576
+MGI:107185 chaperonin containing Tcp1, subunit 5 (epsilon) https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:107185
+MGI:1931881 DnaJ heat shock protein family (Hsp40) member B12 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1931881
+MGI:109327 BCL2/adenovirus E1B interacting protein 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:109327
+MGI:1913955 deoxyribonuclease 1-like 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1913955
+MGI:107374 paired-like homeodomain transcription factor 1 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:107374
+MGI:1335088 proline-serine-threonine phosphatase-interacting protein 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1335088
+MGI:95688 growth differentiation factor 5 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:95688
+MGI:107474 CD38 antigen https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:107474
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_output_3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output_3.tabular Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,79 @@
+MP:0002764 short tibia MGI:99960,MGI:108071
+MP:0001785 edema MGI:99960
+MP:0002968 increased circulating alkaline phosphatase level MGI:99960
+MPATH:590 fibro-osseous lesion MGI:99960
+MP:0001399 hyperactivity MGI:99960,MGI:1354170
+MP:0011100 preweaning lethality, complete penetrance MGI:99960,MGI:1344380,MGI:1917473
+MP:0010052 increased grip strength MGI:99960,MGI:96709
+MPATH:134 hyperplasia MGI:99960
+MP:0000218 increased leukocyte cell number MGI:99960,MGI:96709
+MP:0005013 increased lymphocyte cell number MGI:99960
+MP:0001363 increased anxiety-related response MGI:1354170
+MP:0001258 decreased body length MGI:1354170,MGI:108071,MGI:1915775,MGI:2443026
+MP:0003795 abnormal bone structure MGI:1354170
+MP:0001417 decreased exploration in new environment MGI:1354170,MGI:96709
+MP:0002797 increased thigmotaxis MGI:1354170
+MP:0002757 decreased vertical activity MGI:1354170
+MP:0011960 abnormal eye anterior chamber depth MGI:1354170
+MP:0010124 decreased bone mineral content MGI:1354170
+MP:0001402 decreased locomotor activity MGI:1354170
+MP:0004924 abnormal behavior MGI:1354170,MGI:96709
+MP:0013279 increased fasting circulating glucose level MGI:99502,MGI:1860418,MGI:103225
+MP:0005333 decreased heart rate MGI:3616082
+MP:0001406 abnormal gait MGI:96709
+MP:0010053 decreased grip strength MGI:96709,MGI:1924093,MGI:1915775
+MP:0001523 impaired righting response MGI:96709
+MP:0005559 increased circulating glucose level MGI:96709
+MP:0000745 tremors MGI:96709
+MPATH:52 lipid depletion MGI:1913564
+MPATH:42 lipid deposition MGI:1913564
+MP:0005419 decreased circulating serum albumin level MGI:1860418
+MP:0000219 increased neutrophil cell number MGI:1860418
+MP:0005567 decreased circulating total protein level MGI:1860418,MGI:1915775
+MP:0008810 increased circulating iron level MGI:1914361
+MP:0002875 decreased erythrocyte cell number MGI:1914361
+MP:0000208 decreased hematocrit MGI:1914361
+MP:0002874 decreased hemoglobin content MGI:1914361
+MP:0005566 decreased blood urea nitrogen level MGI:103225,MGI:1915775
+MP:0005343 increased circulating aspartate transaminase level MGI:103225
+MP:0011954 shortened PQ interval MGI:103225
+MP:0005344 increased circulating bilirubin level MGI:103225,MGI:95479
+MP:0002644 decreased circulating triglyceride level MGI:103225
+MP:0001415 increased exploration in new environment MGI:103225
+MP:0010511 shortened PR interval MGI:103225
+MP:0002574 increased vertical activity MGI:1915291
+MP:0003917 increased kidney weight MGI:1915291
+MP:0013292 embryonic lethality prior to organogenesis MGI:1344380
+MP:0000221 decreased leukocyte cell number MGI:95479
+MP:0005016 decreased lymphocyte cell number MGI:95479
+MP:0012361 decreased large unstained cell number MGI:95479
+MP:0001146 abnormal testis morphology MGI:2443598
+MP:0002152 abnormal brain morphology MGI:2443598
+MPATH:127 atrophy MGI:2443598
+MPATH:639 hydrocephalus MGI:2443598
+MP:0001925 male infertility MGI:2443598
+MP:0002092 abnormal eye morphology MGI:2443598
+MP:0005238 increased brain size MGI:2443598
+MP:0001147 small testis MGI:2443598
+MP:0000598 abnormal liver morphology MGI:2441730
+MP:0002833 increased heart weight MGI:2441730
+MP:0011110 preweaning lethality, incomplete penetrance MGI:2441730,MGI:1915775,MGI:2443026
+MP:0004738 abnormal auditory brainstem response MGI:2441730
+MP:0000599 enlarged liver MGI:2441730
+MP:0009476 enlarged cecum MGI:2441730
+MP:0005565 increased blood urea nitrogen level MGI:2441730
+MP:0001284 absent vibrissae MGI:2441730
+MP:0004832 enlarged ovary MGI:2441730
+MP:0005084 abnormal gallbladder morphology MGI:1915775
+MP:0000274 enlarged heart MGI:1915775
+MP:0009142 decreased prepulse inhibition MGI:1915775
+MP:0000692 small spleen MGI:1915775
+MP:0030610 absent teeth MGI:1915775
+MP:0001325 abnormal retina morphology MGI:1915775
+MP:0000266 abnormal heart morphology MGI:1915775
+MPATH:64 developmental dysplasia MGI:1915775
+MP:0000494 abnormal cecum morphology MGI:1915775
+MP:0001120 abnormal uterus morphology MGI:1915775
+MP:0000689 abnormal spleen morphology MGI:1915775
+MP:0009709 hydrometra MGI:1915775
+MP:0002060 abnormal skin morphology MGI:1915775
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_output_9.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output_9.tabular Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,4 @@
+Gene symbol Significant MP terms Ids Significant MP terms Names
+Cacna1s ['MP:0001697', 'MP:0001785', 'MP:0003231', 'MP:0005388', 'MP:0001491', 'MP:0001575', 'MP:0003743', 'MP:0001914', 'MP:0011100', 'MP:0005560'] ['abnormal embryo size', 'edema', 'abnormal placenta vasculature', 'respiratory system phenotype', 'unresponsive to tactile stimuli', 'cyanosis', 'abnormal facial morphology', 'hemorrhage', 'preweaning lethality, complete penetrance', 'decreased circulating glucose level']
+Ndel1 ['MP:0001697', 'MP:0003984', 'MP:0002111', 'MP:0005388', 'MP:0011100'] ['abnormal embryo size', 'embryonic growth retardation', 'abnormal tail morphology', 'respiratory system phenotype', 'preweaning lethality, complete penetrance']
+Zfp536 ['MP:0003019', 'MP:0005564', 'MP:0005388', 'MP:0001575', 'MP:0001399', 'MP:0011100', 'MP:0005641'] ['increased circulating chloride level', 'increased hemoglobin content', 'respiratory system phenotype', 'cyanosis', 'hyperactivity', 'preweaning lethality, complete penetrance', 'increased mean corpuscular hemoglobin concentration']
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_query_1.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_query_1.txt Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,1 @@
+MGI:1923523
\ No newline at end of file
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_query_2.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_query_2.txt Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,1 @@
+MP:0002110 MP:0000559
\ No newline at end of file
diff -r 000000000000 -r d319dc5f3ea8 test-data/test_query_3.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_query_3.txt Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,1 @@
+MGI:1913564 MGI:1915291 MGI:1914361 MGI:1915775 MGI:1354170 MGI:103225 MGI:2441730 MGI:108071 MGI:2443598 MGI:106643 MGI:1917473 MGI:1338073 MGI:1924093 MGI:99960 MGI:99502 MGI:95479 MGI:1344380 MGI:1860418 MGI:1354721 MGI:3616082 MGI:96709 MGI:2443026
\ No newline at end of file