diff impc_tool.py @ 0:d319dc5f3ea8 draft default tip
planemo upload for repository https://github.com/INFRAFRONTIERDIB/tools-iuc/tree/query_impc/tools/query_impc commit 991881b5df5f5228ecf4445ee2cc1431b9602ea8
author:    iuc
date:      Wed, 11 Oct 2023 14:51:02 +0000
parents:   (none)
children:  (none)
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/impc_tool.py  Wed Oct 11 14:51:02 2023 +0000
@@ -0,0 +1,759 @@

import sys

import mygene
import pandas as pd
import requests


impc_api_url = "https://www.ebi.ac.uk/mi/impc/bulkdata-api"
impc_api_search_url = f"{impc_api_url}/genes"
impc_api_gene_bundle_url = f"{impc_api_url}/geneBundles"


def stop_err(msg):
    sys.exit(msg)


def main():
    inp = str(sys.argv[1])
    query = str(sys.argv[3])

    try:
        if query == "7":
            g_out = str(sys.argv[5])
            full_gene_table(g_out)
            sys.exit(0)

        if str(sys.argv[5]) == "txt":
            s = str(sys.argv[6])
            if s == "t":
                sep = "\t"
            elif s == "s":
                sep = " "
            elif s in ",;.":
                sep = s
            else:
                sys.exit("Separator not valid, please change it.")
            inp = pd.read_csv(inp, header=None, delimiter=sep)
            if len(inp.columns) == 1:
                inp = inp.to_csv(header=None,
                                 index=False).strip("\n").split("\n")
                inp = ",".join(inp)
            else:
                # to_csv always emits comma-separated rows, so flattening the
                # table only requires turning row breaks into commas as well
                inp = inp.to_csv(header=None,
                                 index=False).strip("\n").replace("\n", ",")

        if query == "8":
            if str(sys.argv[5]) == "txt":
                g_out = str(sys.argv[7])
            else:
                g_out = str(sys.argv[6])
            genes_in_pipeline(inp, g_out)
            sys.exit(0)
        elif query == "9":
            if str(sys.argv[5]) == "txt":
                g_out = str(sys.argv[7])
            else:
                g_out = str(sys.argv[6])
            sign_mp(inp, g_out)
            sys.exit(0)
        elif query == "10":
            par_pip_ma(inp)
            sys.exit(0)
        elif query == "11":
            par_gen(inp)
            sys.exit(0)
        elif query == "2" or query == "4":
            final_list = pheno_mapping(inp)
        else:
            final_list = gene_mapping(inp)
        inp = ",".join(final_list)

        if query == "1":
            get_pheno(inp)
            sys.exit(0)
        elif query == "2":
            if str(sys.argv[5]) == "txt":
                g_out = str(sys.argv[7])
            else:
                g_out = str(sys.argv[6])
            get_genes(inp, g_out)
            sys.exit(0)
        elif query == "3":
            gene_set(inp)
            sys.exit(0)
        elif query == "4":
            extr_img(inp)
            sys.exit(0)
        elif query == "5":
            parameters(inp)
            sys.exit(0)
        elif query == "6":
            sign_par(inp)
            sys.exit(0)
        else:
            stop_err("Error, non-implemented query selected: " + query)
    except Exception as ex:
        stop_err("Error running impc_tool.py:\n" + str(ex))


# 1-Given a gene id, retrieve all the phenotypes related to it (id and name)
def get_pheno(inp):
    head = sys.argv[4]
    mgi_accession_id = inp

    gene_url = f"{impc_api_search_url}/{mgi_accession_id}"
    gene_data = requests.get(gene_url).json()

    p_list = []
    id_list = []

    if gene_data["significantMpTerms"] is None:
        stop_err("No significant MP terms found for this gene")
    else:
        for x in gene_data["significantMpTerms"]:
            p_list.append(x["mpTermName"])
            id_list.append(x["mpTermId"])

    df = pd.DataFrame()
    df["MP term name"] = p_list
    df["MP term id"] = id_list

    if head == "True":
        df.to_csv(sys.argv[2], header=True, index=False,
                  sep="\t", index_label=False)
    else:
        df.to_csv(sys.argv[2], header=False, index=False,
                  sep="\t", index_label=False)
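

# Illustrative sketch, not called by the tool: the same single-gene lookup
# that get_pheno() above performs, reusing the module-level requests import
# and the impc_api_search_url constant. The MGI accession is a placeholder.
def _example_significant_terms(mgi_id="MGI:104636"):
    gene_data = requests.get(f"{impc_api_search_url}/{mgi_id}").json()
    # significantMpTerms is None when the gene has no significant hits
    return [(t["mpTermId"], t["mpTermName"])
            for t in gene_data.get("significantMpTerms") or []]
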


# 3-Extract all genes having a particular phenotype or a set of phenotypes
# (e.g. relevant to a disease)
def get_genes(inp, g_out):
    head = sys.argv[4]
    target_mp_terms = inp

    # All the data is paginated using the page and size parameters,
    # by default the endpoint returns the first 20 hits
    gene_by_phenotypes_query = f"{impc_api_search_url}" \
        f"/search/findAllBySignificantMpTermIdsContains" \
        f"?mpTermIds={target_mp_terms}&page=0&size=20"
    genes_with_clinical_chemistry_phen = \
        requests.get(gene_by_phenotypes_query).json()
    print(f"Genes with {target_mp_terms}: "
          f"{genes_with_clinical_chemistry_phen['page']['totalElements']}")
    acc = []
    name = []
    url = []

    for gene in genes_with_clinical_chemistry_phen["_embedded"]["genes"]:
        acc.append(gene["mgiAccessionId"])
        name.append(gene["markerName"])
        url.append(gene["_links"]["geneBundle"]["href"])

    if g_out == "sym":
        list_of_genes = pd.DataFrame(columns=["Gene symbol id", "Gene name",
                                              "Gene bundle url"])
        list_of_genes["Gene symbol id"] = mgi_sym_map(acc)
    else:
        list_of_genes = pd.DataFrame(columns=["Gene accession id",
                                              "Gene name", "Gene bundle url"])
        list_of_genes["Gene accession id"] = acc
    list_of_genes["Gene name"] = name
    list_of_genes["Gene bundle url"] = url

    if head == "True":
        list_of_genes.to_csv(sys.argv[2], header=True, index=False,
                             sep="\t", index_label=False)
    else:
        list_of_genes.to_csv(sys.argv[2], header=False, index=False,
                             sep="\t", index_label=False)


# 4. Extract all phenotypes which are present in a particular gene set
# (e.g. genes together in a pathway)
def gene_set(inp):
    head = sys.argv[4]
    target_genes = inp

    genes_in_gene_list_query = f"{impc_api_search_url}/search/" \
        f"findAllByMgiAccessionIdIn?" \
        f"mgiAccessionIds={target_genes}"

    genes_in_gene_list = requests.get(genes_in_gene_list_query).json()
    mp_terms_vs_gene_idx = {}

    for gene in genes_in_gene_list["_embedded"]["genes"]:
        mp_terms = gene["significantMpTerms"]
        gene_acc_id = gene["mgiAccessionId"]
        if mp_terms is None:
            continue
        for mp_term_name in mp_terms:
            if mp_term_name["mpTermId"] not in mp_terms_vs_gene_idx:
                mp_terms_vs_gene_idx[mp_term_name["mpTermId"]] = \
                    {"mp_term": mp_term_name["mpTermId"],
                     "mp_name": mp_term_name["mpTermName"], "genes": []}
            mp_terms_vs_gene_idx[mp_term_name["mpTermId"]]["genes"].\
                append(gene_acc_id)
    genes_by_mp_term = list(mp_terms_vs_gene_idx.values())

    df = pd.DataFrame()
    terms = []
    names = []
    genes = []
    for i in genes_by_mp_term:
        terms.append(i["mp_term"])
        names.append(i["mp_name"])
        genes.append(",".join(i["genes"]))

    df["mp_term"] = terms
    df["mp_name"] = names
    df["genes"] = genes

    if head == "True":
        df.to_csv(sys.argv[2], header=True, index=False,
                  sep="\t", index_label=False)
    else:
        df.to_csv(sys.argv[2], header=False, index=False,
                  sep="\t", index_label=False)
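

# Illustrative sketch, not called by the tool: the findAllByMgiAccessionIdIn
# endpoint used by gene_set() above takes a comma-separated list of MGI ids.
# The two accessions below are placeholder values.
def _example_batch_gene_lookup(mgi_ids=("MGI:104636", "MGI:1929293")):
    query = (f"{impc_api_search_url}/search/findAllByMgiAccessionIdIn"
             f"?mgiAccessionIds={','.join(mgi_ids)}")
    return requests.get(query).json()["_embedded"]["genes"]
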
\ + f"mpTermIds={target_mp_terms}&page=0&size=20" + genes_with_morph_mps = requests.get(gene_by_phenotypes_query).json() + list_of_gene_bundle_urls = [ + gene["_links"]["geneBundle"]["href"] for gene in + genes_with_morph_mps["_embedded"]["genes"] + ] + + gene_bundles = [] + for gene_bundle_url in list_of_gene_bundle_urls: + gene_bundle = requests.get(gene_bundle_url).json() + gene_bundles.append(gene_bundle) + + images_with_morphology_mps = [] + + # Doing just the first 20 and filtering out fields on the images + display_fields = ["geneSymbol", "parameterName", "biologicalSampleGroup", + "colonyId", "zygosity", "sex", "downloadUrl", + "externalSampleId", "thumbnailUrl"] + + for gene_bundle in gene_bundles[:20]: + if len(gene_bundle) == 4: + continue + if gene_bundle["geneImages"] is not None: + images = gene_bundle["geneImages"] + for image in images: + display_image = {k: v for k, v in image.items() + if k in display_fields} + images_with_morphology_mps.append(display_image) + + images_table = [] + print(f"Images related to phenotype {target_mp_terms}: " + f"{len(images_with_morphology_mps)}") + # Displaying just the first 20 images + for i in images_with_morphology_mps[:20]: + row = [f"<img src='{i['thumbnailUrl']}' />"] + list(i.values()) + images_table.append(row) + + df = pd.DataFrame() + externalSampleId = [] + geneSymbol = [] + biologicalSampleGroup = [] + sex = [] + colonyId = [] + zygosity = [] + parameterName = [] + downloadUrl = [] + thumbnailUrl = [] + + for i in images_table: + externalSampleId.append(i[1]) + geneSymbol.append(i[2]) + biologicalSampleGroup.append(i[3]) + sex.append(i[4]) + colonyId.append(i[5]) + zygosity.append(i[6]) + parameterName.append(i[7]) + downloadUrl.append(i[8]) + thumbnailUrl.append(i[9]) + + df["externalSampleId"] = externalSampleId + df["geneSymbol"] = geneSymbol + df["biologicalSampleGroup"] = biologicalSampleGroup + df["sex"] = sex + df["colonyId"] = colonyId + df["zygosity"] = zygosity + df["parameterName"] = parameterName + df["downloadUrl"] = downloadUrl + df["thumbnailUrl"] = thumbnailUrl + + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 11- Which parameters have been measured for a particular knockout +def parameters(inp): + head = sys.argv[4] + knockout = inp # "MGI:104636" + gene_info = requests.get(impc_api_search_url + "/" + knockout).json() + + if gene_info["phenotypingDataAvailable"]: + geneBundle = requests.get(gene_info["_links"]["geneBundle"]["href"])\ + .json() + gen_imgs = geneBundle["geneImages"] + par_list = [] + lis = {} + for i in gen_imgs: + lis = {"Parameter Name": i["parameterName"]} + if lis not in par_list: + par_list.append(lis) + df = pd.DataFrame() + li = [] + + for i in par_list: + li.append(i["Parameter Name"]) + + df["Parameter"] = li + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + else: + stop_err("No parameters available for this knockout gene") + + +# 12- Which parameters identified a significant finding for a particular +# knockout line (colony) +def sign_par(inp): + head = sys.argv[4] + knockout = inp # "MGI:104636" + + gene_info = requests.get(f"{impc_api_url}statisticalResults/search/" + f"findAllByMarkerAccessionIdIsAndSignificantTrue?" 
+ f"mgiAccessionId=" + knockout).json() + gene_stats = gene_info["_embedded"]["statisticalResults"] + + if len(gene_stats) == 0: + stop_err("No statistically relevant parameters found " + "for this knockout gene") + else: + df = pd.DataFrame() + n = [] + p = [] + for g in gene_stats: + n.append(g["parameterName"]) + p.append(g["pvalue"]) + + df["Parameter name"] = n + df["p-value"] = p + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 13- List of genes names and ID measured in a pipeline +def genes_in_pipeline(inp, g_out): + head = sys.argv[4] + pip = inp + + g_in_p_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedPipelineId?pipelineId={pip}&" \ + f"page=0&size=1000" + genes_in_pip = requests.get(g_in_p_query).json() + pages = genes_in_pip["page"]["totalPages"] + max_elem = genes_in_pip["page"]["totalElements"] + + print(f"Genes with {pip}: {genes_in_pip['page']['totalElements']}") + list_d = [] + acc = [] + name = [] + + if max_elem > 1000: + g_in_p_query = genes_in_pip["_embedded"]["genes"] + for i in range(1, pages): + gl = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedPipelineId?pipelineId={pip}&" + f"page={i}&" + f"size=1000").json()["_embedded"]["genes"] + g_in_p_query += gl + else: + g_in_p_query = genes_in_pip["_embedded"]["genes"] + + for g in g_in_p_query: + d = {"Gene Accession ID": g["mgiAccessionId"], + "Gene Name": g["markerName"]} + list_d.append(d) + + for i in list_d: + acc.append(i["Gene Accession ID"]) + name.append(i["Gene Name"]) + if g_out == "sym": + list_of_genes = pd.DataFrame(columns=["Gene symbol", "Gene name"]) + list_of_genes["Gene symbol"] = mgi_sym_map(acc) + else: + list_of_genes = pd.DataFrame(columns=["Gene accession id", + "Gene name"]) + list_of_genes["Gene accession id"] = acc + list_of_genes["Gene name"] = name + + if head == "True": + list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 14- Extract all genes and corresponding phenotypes related to a +# particular organ system (eg: significatMPTerm) +def sign_mp(inp, g_out): + head = sys.argv[4] + mp_term = inp # ["MP:0005391"] + + gene_by_mpterm_query = f"{impc_api_search_url}/search/" \ + f"findAllBySignificantMpTermIdsContains?" \ + f"mpTermIds={mp_term}&size=1000" + genes_with_mpterm = requests.get(gene_by_mpterm_query).json() + + pages = genes_with_mpterm["page"]["totalPages"] + genes_info = genes_with_mpterm["_embedded"]["genes"] + + for pn in range(1, pages): + pq = f"{impc_api_search_url}/search/" \ + f"findAllBySignificantMpTermIdsContains?" 
\ + f"mpTermIds={mp_term}&page={pn}&size=1000" + g = requests.get(pq).json()["_embedded"]["genes"] + genes_info += g + + list_d = [] + d = {} + for g in genes_info: + names = [] + ids = [] + for s in g["significantMpTerms"]: + names.append(s["mpTermName"]) + ids.append(s["mpTermId"]) + d = {"Gene": g["mgiAccessionId"], "mpTermId": ids, "mpTermName": names} + list_d.append(d) + + g = [] + ids = [] + names = [] + for i in list_d: + g.append(i["Gene"]) + ids.append(i["mpTermId"]) + names.append(i["mpTermName"]) + + df = pd.DataFrame() + if g_out == "sym": + df["Gene symbol"] = mgi_sym_map(g) + else: + df["Gene Id"] = g + df["Significant MP terms Ids"] = ids + df["Significant MP terms Names"] = names + + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 16- Full table of genes and all identified phenotypes +def full_gene_table(g_out): + head = sys.argv[4] + gene_list = requests.get(impc_api_search_url + "?page=0&size=1000").json() + pages = gene_list["page"]["totalPages"] + genes_info = gene_list["_embedded"]["genes"] + + for pn in range(1, pages): + gp = requests.get(impc_api_search_url + + f"?page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + genes_info += gp + + d = {} + list_d = [] + + for i in genes_info: + if i["significantMpTerms"] is None: + d = {"Gene": i["mgiAccessionId"], "Identified phenotypes": "None"} + else: + d = {"Gene": i["mgiAccessionId"], + "Identified phenotypes": [ + sub["mpTermId"] for sub in i["significantMpTerms"] + ]} + list_d.append(d) + + df = pd.DataFrame() + g = [] + p = [] + for i in list_d: + g.append(i["Gene"]) + p.append(i["Identified phenotypes"]) + + if g_out == "sym": + df["Gene symbol"] = mgi_sym_map(g) + else: + df["MGI id"] = g + df["MP term list"] = p + + for i in range(0, len(df)): + if df["MP term list"][i] != "None": + df["MP term list"][i] = str( + df["MP term list"][i] + )[1:-1].replace("'", "") + + if str(sys.argv[1]) == "True": + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + else: + df = df[df["MP term list"] != "None"] + df.reset_index(drop=True, inplace=True) + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 18- Extract measurements and analysis for a parameter or pipeline +def par_pip_ma(inp): + head = sys.argv[4] + id = inp + + if id[0:4] == "IMPC": + par = True + ma_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedParameterId?" \ + f"parameterId={id}&page=0&size=1000" + else: + ma_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedPipelineId?" \ + f"pipelineId={id}&page=0&size=1000" + par = False + + ma_in_pip = requests.get(ma_query).json() + pages = ma_in_pip["page"]["totalPages"] + max_elem = ma_in_pip["page"]["totalElements"] + + print(f"Genes with {id}: {ma_in_pip['page']['totalElements']}") + list_d = [] + list_of_genes = pd.DataFrame(columns=["Measurements", "Analysis"]) + mes = [] + an = [] + + if max_elem > 1000: + + ma_in_pip = ma_in_pip["_embedded"]["genes"] + for pn in range(1, pages): + if par: + pip = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedParameterId?" 
+ f"parameterId={id}&page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + else: + pip = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedPipelineId?" + f"pipelineId={id}&page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + ma_in_pip += pip + + else: + ma_in_pip = ma_in_pip["_embedded"]["genes"] + + for g in ma_in_pip: + d = {"Measurements": g[""], "Analysis": g[""]} + list_d.append(d) + + for i in list_d: + mes.append(i[""]) + an.append(i[""]) + + list_of_genes["Analysis"] = an + list_of_genes["Measurements"] = mes + + if head == "True": + list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 19- Get all genes and measured values for a particular parameter +def par_gen(inp, g_out): + head = sys.argv[4] + id = inp + + pa_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedParameterId?parameterId={id}&page=0&size=1000" + + gm_par = requests.get(pa_query).json() + pages = gm_par["page"]["totalPages"] + max_elem = gm_par["page"]["totalElements"] + + print(f"Genes with {id}: {gm_par['page']['totalElements']}") + list_d = [] + gen = [] + mes = [] + + if max_elem > 1000: + + gm_par = gm_par["_embedded"]["genes"] + + for pn in range(1, pages): + pip = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedParameterId?" + f"parameterId={id}&page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + gm_par += pip + + else: + gm_par = gm_par["_embedded"]["genes"] + + for g in gm_par: + d = {"Genes": g["mgiAccessionId"], "Measured Values": g[""]} + list_d.append(d) + + for i in list_d: + gen.append(i["Genes"]) + mes.append(i["Measured Values"]) + + if g_out == "sym": + list_of_genes = pd.DataFrame(columns=["Gene symbol", + "Measured Values"]) + list_of_genes["Gene symbol"] = mgi_sym_map(gen) + else: + list_of_genes = pd.DataFrame(columns=["Gene accession id", + "Measured Values"]) + list_of_genes["Gene accession id"] = gen + list_of_genes["Measured Values"] = mes + + if head == "True": + list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# Function to map gene symbol to MGI ids +def gene_mapping(inp): + tmp = inp.split(",") + final_list = [] + sym_list = [] + for i in tmp: + if "MGI:" in i: + final_list.append(i) + else: + sym_list.append(i) + del i + + # symbol for symbols, mgi for MGI : + # https://docs.mygene.info/en/latest/doc/query_service.html#available-fields + if len(sym_list) != 0: + mg = mygene.MyGeneInfo() + ginfo = mg.querymany(sym_list, scopes="symbol", fields="symbol,MGI", + species="mouse") + empty = True + discarded = [] + for i in ginfo: + try: + final_list.append(i["MGI"]) + empty = False + except KeyError: + discarded.append(i["query"]) + if empty and len(final_list) == 0: + stop_err("Error: it was not possible to map the input.") + elif empty: + print("Warning: it was not possible to map any of the symbol ids. 
" + "Only MGI ids will be used.") + elif len(discarded) != 0: + print("Warning: it was not possible to map these elements: " + "" + ",".join(discarded) + "\n") + + return final_list + + +# Function to map phenotypes ids to names +def pheno_mapping(inp): + tmp = inp.split(",") + final_list = [] + sym_list = [] + for i in tmp: + if "MP:" in i: + final_list.append(i) + else: + sym_list.append(i) + del i + if len(sym_list) != 0: + url = "https://raw.githubusercontent.com/AndreaFurlani/" \ + "hp_mp_mapping_test/main/hp_mp_mapping.csv" + mapper = pd.read_csv(url, header=0, index_col=2) + empty = True + discarded = [] + for i in sym_list: + try: + final_list.append(mapper.loc[i]["mpId"]) + empty = False + except KeyError: + discarded.append(i) + continue + if empty and len(final_list) == 0: + stop_err("Error: it was not possible to map the input.") + elif empty: + print("Warning: it was not possible to map any of the " + "HP term entries. Only MP entries will be used.") + elif len(discarded) != 0: + print("Warning: it was not possible to " + "map these elements: " + ",".join(discarded) + "\n") + return final_list + + +# Function to map MGI ids to Gene Symbols +def mgi_sym_map(mgi_list): + sym_list = [] + mg = mygene.MyGeneInfo() + ginfo = mg.querymany(mgi_list, scopes="MGI", fields="symbol,MGI", + species="mouse") + discarded = [] + for i in ginfo: + try: + sym_list.append(i["symbol"]) + except KeyError: + sym_list.append(i["query"]) + discarded.append(i["query"]) + if len(discarded) != 0: + print("It was not possible to map these genes: " + ",".join(discarded)) + return sym_list + + +if __name__ == "__main__": + main()