Mercurial > repos > astroteam > analyse_short_astro_text_astro_tool
diff pipeline_telescope.py @ 0:a35056104c2c draft default tip
planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author | astroteam |
---|---|
date | Fri, 13 Jun 2025 13:26:36 +0000 |
parents | |
children |
line wrap: on
line diff
# Source: b/pipeline_telescope.py (new file in changeset 0:a35056104c2c).
"""Rule-based lookup of ontology entities (telescopes, surveys, ...) in a text.

The ontology is loaded with rdflib; every entity of a set of ODA classes is
matched against the text through its ``rdfs:label``, ``skos:altLabel`` and
``skos:hiddenLabel`` values, and the hits are returned as a pandas DataFrame.
"""
import re

import pandas as pd
from rdflib import Graph, Namespace
from rdflib.namespace import RDF, RDFS, SKOS

from aux_functions import compute_sensitivity, compute_sensitivity_int, list_tel

ODA = Namespace("https://odahub.io/ontology#")
# Length of the rdfs:label predicate IRI. Not referenced anywhere in this
# module; kept because other modules may import it.
g_label_site = len("http://www.w3.org/2000/01/rdf-schema#label")

# Kinds of label that are searched, in the order they are tried per entity and
# in the order their hits are laid out in the output table.
_LABEL_KINDS = ("label", "altLabel", "hiddenLabel")

# Ontology classes that are searched, in the order their hits are concatenated
# into the output table. drop_duplicates() later keeps the first row per URI,
# so this order decides which hit survives for an entity found several times.
_CLASS_ORDER = (
    ODA.institution,
    ODA.spacetelescope,
    ODA.telescope,
    ODA.survey,
    ODA.observatory,
    ODA.radiotelescope,
    ODA.instrument,
    ODA.telescopetype,
    ODA.misctelescope,
)


def find_entity(g, class_, text_id_text, text_id_text_upper):
    """Find the entities of ``class_`` whose labels occur in the text.

    Args:
        g: graph exposing ``triples((s, p, o))`` (an rdflib ``Graph`` in this
            module).
        class_: class URI; every subject with ``rdf:type class_`` is examined.
        text_id_text: text searched for ``rdfs:label`` and ``skos:hiddenLabel``
            values. Those values are lower-cased before the search, and the
            caller in this module passes the lower-cased text here.
        text_id_text_upper: text searched for ``skos:altLabel`` values, which
            are used as-is (case-sensitive). Despite the parameter name, the
            caller in this module passes the text in its original case.

    Returns:
        A dict with the keys ``"label"``, ``"altLabel"`` and ``"hiddenLabel"``.
        Each maps to ``{"val": [...], "URI": [...], "Sensitivity": [...]}``,
        three parallel lists with one entry per hit. ``val`` is the ontology
        label for ``label``/``hiddenLabel`` hits and the matched substring for
        ``altLabel`` hits; ``Sensitivity`` is
        ``compute_sensitivity(list_tel(uri, g))``.
    """
    dict_ = {kind: {"val": [], "URI": [], "Sensitivity": []} for kind in _LABEL_KINDS}

    def _record(kind, val, uri):
        # Append one hit to the three parallel lists of the given label kind.
        bucket = dict_[kind]
        bucket["val"].append(val)
        bucket["URI"].append(uri)
        bucket["Sensitivity"].append(compute_sensitivity(list_tel(uri, g)))

    # NOTE(review): the indentation of this function was lost in the diff
    # rendering it was recovered from. The fallback chain below (label, else
    # altLabel, else hiddenLabel) is a reconstruction; in particular, whether
    # hiddenLabel is also tried after a successful label match could not be
    # determined. Confirm against the upstream file.
    #
    # NOTE(review): label text is interpolated into the pattern unescaped, so
    # regex metacharacters in a label are interpreted as regex syntax. Left
    # unchanged; confirm whether ontology labels are meant to be patterns
    # before adding re.escape().
    for u_telescope, _, _ in g.triples((None, RDF.type, class_)):
        # 1) rdfs:label, case-insensitive, optionally followed by "1" or "2".
        found_label = False
        for _, _, label_telescope in g.triples((u_telescope, RDFS.label, None)):
            pattern = r"\b(" + label_telescope.lower() + r")([1-2]{0,1})\b"
            if re.search(pattern, text_id_text):
                found_label = True
                _record("label", label_telescope, u_telescope)
        if found_label:
            continue

        # 2) skos:altLabel, case-sensitive; the matched text itself is stored.
        found_altlabel = False
        for _, _, altlabel_telescope in g.triples((u_telescope, SKOS.altLabel, None)):
            result = re.search(r"\b(" + altlabel_telescope + r")\b", text_id_text_upper)
            if result:
                found_altlabel = True
                _record("altLabel", result.group(0), u_telescope)
        if found_altlabel:
            continue

        # 3) skos:hiddenLabel, case-insensitive.
        for _, _, hiddenlabel_telescope in g.triples((u_telescope, SKOS.hiddenLabel, None)):
            if re.search(r"\b" + hiddenlabel_telescope.lower() + r"\b", text_id_text):
                _record("hiddenLabel", hiddenlabel_telescope, u_telescope)

    return dict_


def rule_based_telescope_detector(text_id, text_id_text, telescope_ontology):
    """Build a table of the ontology entities mentioned in ``text_id_text``.

    Args:
        text_id: identifier copied into the ``TEXT_ID`` column of every row.
        text_id_text: the text to analyse.
        telescope_ontology: source handed to ``Graph.parse(..., format="n3")``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``TEXT_ID``, ``Telescope``,
        ``LabelType``, ``URI``, ``Sensitivity`` and ``Total Sensitivity``,
        holding one row per distinct URI (the first hit per URI is kept).
        ``Total Sensitivity`` is ``compute_sensitivity_int`` applied to the
        sensitivities of all hits, computed before duplicates are dropped.
    """
    g = Graph()
    g.parse(telescope_ontology, format="n3")

    text_id_text_lower = text_id_text.lower()

    # The searches are independent of each other, so they are simply run in
    # the order in which their results are concatenated below.
    hits_per_class = [
        find_entity(g, class_, text_id_text_lower, text_id_text)
        for class_ in _CLASS_ORDER
    ]

    tel_sur_obs = []
    type_key = []
    uri_list = []
    sens_list = []

    # Layout: all "label" hits (class by class), then all "altLabel" hits,
    # then all "hiddenLabel" hits.
    for key in _LABEL_KINDS:
        for hits in hits_per_class:
            found = hits[key]
            tel_sur_obs += found["val"]
            uri_list += found["URI"]
            sens_list += found["Sensitivity"]
            type_key += [key] * len(found["val"])

    n_rows = len(tel_sur_obs)
    dict_data = {
        "TEXT_ID": [text_id] * n_rows,
        "Telescope": tel_sur_obs,
        "LabelType": type_key,
        "URI": uri_list,
        "Sensitivity": sens_list,
        "Total Sensitivity": [compute_sensitivity_int(sens_list)] * n_rows,
    }

    df_data = pd.DataFrame(dict_data)
    # The same entity can be hit through several classes or labels.
    df_data.drop_duplicates(subset=['URI'], inplace=True)

    return df_data