Mercurial > repos > astroteam > analyse_short_astro_text_astro_tool
comparison pipeline_source_classes.py @ 0:a35056104c2c draft default tip
planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author | astroteam |
---|---|
date | Fri, 13 Jun 2025 13:26:36 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a35056104c2c |
---|---|
1 import pandas as pd | |
2 import re | |
3 | |
4 | |
def rule_based_class_detector(simbad_node_file, text_id_text):
    """Find class descriptions that occur literally in a text.

    Parameters
    ----------
    simbad_node_file
        Path or file-like object accepted by ``pandas.read_csv``; the CSV
        must have a ``Description`` column.
    text_id_text : str
        Text to search.  Descriptions are lower-cased before matching but
        the text is used as given, so callers should pass lower-cased text.

    Returns
    -------
    list of str
        One entry per occurrence found (duplicates are kept), grouped in
        the order the descriptions appear in the CSV.
    """
    df = pd.read_csv(simbad_node_file)

    classes = []
    # Empty CSV cells are read as NaN (a float), which has no ``.lower()``;
    # drop them instead of crashing.
    for description in df["Description"].dropna():
        # Escape the description so characters such as "(", ")", "+" or "?"
        # are matched literally instead of being interpreted as regex syntax.
        literal = re.escape(str(description).lower())
        # The look-arounds behave like ``\b`` when the description starts and
        # ends with a word character, and still work when it does not
        # (e.g. a description that starts with an opening parenthesis).
        pattern = re.compile(rf"(?<!\w){literal}(?!\w)")
        classes.extend(m.group(0) for m in pattern.finditer(text_id_text))

    return classes
17 | |
18 | |
def source_class(df_in, simbad_node_file):
    """Translate the object-type codes of a source table into descriptions.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Table with an ``OTYPE`` column whose cells hold one or more codes
        separated by ``|``.  An empty table yields an empty result.
    simbad_node_file
        Path or file-like object accepted by ``pandas.read_csv``; the CSV
        must have ``Id`` and ``Description`` columns.

    Returns
    -------
    list
        For every distinct code, in first-seen order: the code itself when
        it contains ``?``, followed by the first ``Description`` whose
        ``Id`` equals the code (when there is one).
    """
    if len(df_in) == 0:
        return []

    df_dict = pd.read_csv(simbad_node_file)

    # Build the Id -> Description lookup once; ``setdefault`` keeps the first
    # description listed for an Id, matching a "first matching row" lookup.
    description_by_id = {}
    for node_id, description in zip(df_dict["Id"], df_dict["Description"]):
        description_by_id.setdefault(node_id, description)

    # A dict is used as an ordered set so the result order is reproducible
    # between runs (iterating a set of strings is not).
    distinct_otypes = {}
    for otypes in df_in["OTYPE"].values:
        # Missing cells may be None or NaN; only real strings can be split.
        if isinstance(otypes, str):
            for otype in otypes.split("|"):
                distinct_otypes[otype] = None

    out_class_list = []
    for otype in distinct_otypes:
        if "?" in otype:
            out_class_list.append(otype)
        if otype in description_by_id:
            out_class_list.append(description_by_id[otype])

    return out_class_list
40 | |
41 | |
def detect_source_classes(text_id, text_id_text, df_sources, simbad_node_file):
    """Collect the source classes found for one text into a DataFrame.

    Combines the classes matched in the lower-cased text by
    ``rule_based_class_detector`` with those derived from ``df_sources`` by
    ``source_class``.

    Parameters
    ----------
    text_id
        Identifier repeated in the ``TEXT_ID`` column of every output row.
    text_id_text : str
        Text to scan; it is lower-cased before matching.
    df_sources : pandas.DataFrame
        Source table handed to ``source_class``.
    simbad_node_file
        CSV location handed to both helper functions.

    Returns
    -------
    pandas.DataFrame
        Columns ``TEXT_ID`` and ``Source Classes`` with one row per distinct
        class, or a completely empty DataFrame when no class was found.
    """
    classes_1 = rule_based_class_detector(simbad_node_file, text_id_text.lower())
    classes_2 = source_class(df_sources, simbad_node_file)
    classes = classes_1 + classes_2

    if not classes:
        return pd.DataFrame()

    # De-duplicate while keeping first-seen order; ``list(set(...))`` would
    # make the row order vary from one interpreter run to the next.
    out_classes = list(dict.fromkeys(classes))

    return pd.DataFrame(
        {
            "TEXT_ID": [text_id] * len(out_classes),
            "Source Classes": out_classes,
        }
    )