comparison pipeline_source_classes.py @ 0:a35056104c2c draft default tip

planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author astroteam
date Fri, 13 Jun 2025 13:26:36 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a35056104c2c
1 import pandas as pd
2 import re
3
4
def rule_based_class_detector(simbad_node_file, text_id_text):
    """Find literal mentions of the node-file class descriptions in a text.

    Every string in the ``Description`` column of *simbad_node_file* is
    lower-cased and searched for in *text_id_text* as a whole phrase, i.e.
    not directly preceded or followed by a word character.

    Args:
        simbad_node_file: Path or file-like object accepted by
            ``pandas.read_csv``; it must contain a ``Description`` column.
        text_id_text: Text to scan. Only the descriptions are lower-cased
            here, so pass lower-cased text for a case-insensitive search.

    Returns:
        list[str]: One entry per occurrence found (duplicates are kept),
        grouped by description in the order of the CSV rows.
    """
    df = pd.read_csv(simbad_node_file)

    classes = []
    for description in df["Description"].values:
        # Blank CSV cells are read as NaN (a float): nothing to search for.
        if not isinstance(description, str) or not description.strip():
            continue

        # Descriptions are plain phrases, not regular expressions, so any
        # metacharacter in them ("(", "+", "?", ...) must be escaped.
        phrase = re.escape(description.lower())

        # Look-arounds instead of \b: they behave the same when the phrase
        # starts/ends with a word character, and still work when it
        # starts/ends with punctuation such as "(".
        matcher = re.compile(rf"(?<!\w){phrase}(?!\w)")
        classes.extend(m.group(0) for m in matcher.finditer(text_id_text))

    return classes
17
18
def source_class(df_in, simbad_node_file):
    """Translate the object types of a source table into class descriptions.

    Each ``OTYPE`` cell of *df_in* is split on ``"|"``. Every distinct object
    type is looked up in the ``Id`` column of *simbad_node_file* and, when
    found, the ``Description`` of the first matching row is reported. Object
    types containing ``"?"`` are additionally reported verbatim.

    Args:
        df_in: DataFrame with an ``OTYPE`` column of ``"|"``-separated
            object-type strings. Missing cells (``None`` / NaN) are skipped.
        simbad_node_file: Path or file-like object accepted by
            ``pandas.read_csv``; it must contain ``Id`` and ``Description``
            columns. It is only read when *df_in* has at least one row.

    Returns:
        list: Class labels in the order the object types first appear in
        *df_in*; empty when *df_in* has no rows.
    """
    out_class_list = []
    if len(df_in) == 0:
        return out_class_list

    df_dict = pd.read_csv(simbad_node_file)

    # Id -> Description, keeping the first row for a repeated Id.
    descriptions = {}
    for otype_id, description in zip(df_dict["Id"].values, df_dict["Description"].values):
        if isinstance(otype_id, str):
            descriptions.setdefault(otype_id, description)

    # Distinct object types in first-seen order; a dict is used instead of a
    # set so the result does not depend on string hash randomisation.
    unique_otypes = {}
    for otypes in df_in["OTYPE"].values:
        # pandas represents a missing cell as NaN (a float), not only None.
        if not isinstance(otypes, str):
            continue
        for otype in otypes.split("|"):
            unique_otypes.setdefault(otype, None)

    for otype in unique_otypes:
        if "?" in otype:
            out_class_list.append(otype)
        if otype in descriptions:
            out_class_list.append(descriptions[otype])

    return out_class_list
40
41
def detect_source_classes(text_id, text_id_text, df_sources, simbad_node_file):
    """Collect the source classes associated with one text.

    Combines the classes found by ``rule_based_class_detector`` in the
    lower-cased *text_id_text* with those derived by ``source_class`` from
    *df_sources*, and removes duplicates.

    Args:
        text_id: Identifier written to the ``TEXT_ID`` column of every row.
        text_id_text: Text to scan for class mentions.
        df_sources: Source table passed on to ``source_class``.
        simbad_node_file: Node file passed on to both helpers. Both helpers
            read it, so presumably a path rather than a one-shot stream;
            verify against the caller.

    Returns:
        pandas.DataFrame: One row per distinct class with the columns
        ``TEXT_ID`` and ``Source Classes``; a DataFrame with no rows and no
        columns when nothing was found.
    """
    classes_from_text = rule_based_class_detector(simbad_node_file, text_id_text.lower())
    classes_from_sources = source_class(df_sources, simbad_node_file)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the row
    # order is reproducible between runs (a set's order is not).
    out_classes = list(dict.fromkeys(classes_from_text + classes_from_sources))
    if not out_classes:
        return pd.DataFrame()

    dict_data = {"TEXT_ID": [text_id] * len(out_classes), "Source Classes": out_classes}
    return pd.DataFrame(dict_data)