annotate pipeline_astrobert.py @ 0:a35056104c2c draft default tip

planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author astroteam
date Fri, 13 Jun 2025 13:26:36 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
1 import re
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
2 import pandas as pd
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
3 import numpy as np
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
4 import tempfile
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
5 from transformers import AutoModelForTokenClassification, AutoTokenizer
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
6 from transformers import TokenClassificationPipeline
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
7
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
8
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
def split_text_in_phrases(text_id, text_):
    """Split a text into phrases at every ". " followed by a capital letter.

    The full stop stays attached to the phrase it ends and the capital
    letter stays attached to the phrase it starts, so joining the returned
    phrases with a single space is expected to give back ``text_``.  If it
    does not, ``text_id`` is printed as a diagnostic.

    Parameters
    ----------
    text_id :
        Identifier of the text; only used for the diagnostic print.
    text_ : str
        Text to split.

    Returns
    -------
    list of str
        The phrases, in their order of appearance.
    """
    # With a capturing group, re.split alternates text and separators:
    # [text, sep, text, sep, ..., text], each separator being ". X".
    pieces = re.split(r"(\. [A-Z])", text_)

    sentences = [pieces[0]]
    for separator, following in zip(pieces[1::2], pieces[2::2]):
        # The "." closes the current phrase, the capital opens the next one.
        sentences[-1] += separator[0]
        sentences.append(separator[-1] + following)

    if " ".join(sentences) != text_:
        print(text_id)
    return sentences
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
25
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
26
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
def apply_astroBERT(text_id, body_text_0):
    """Run the astroBERT NER-DEAL model on a text and tabulate selected entities.

    The text is whitespace-normalised, a few symbols are replaced by ASCII
    characters, and the result is cut into phrases with
    ``split_text_in_phrases``.  Each phrase is passed separately through a
    ``TokenClassificationPipeline``; detections whose ``entity_group`` is
    one of the retained labels are recorded together with the phrase they
    were found in.

    The model and tokenizer are downloaded into a temporary cache directory
    that is removed before returning.  Any exception raised while loading or
    running the model is printed and swallowed, so the returned frame then
    holds whatever had been collected up to that point (possibly nothing).

    Parameters
    ----------
    text_id :
        Identifier stored in the ``TEXT_ID`` column of every row.
    body_text_0 : str
        Raw text to analyse.

    Returns
    -------
    pandas.DataFrame
        Columns ``TEXT_ID``, ``word``, ``start``, ``end``, ``score``,
        ``entity_group`` and ``Phrase``; ``word``, ``start``, ``end`` and
        ``score`` are copied as returned by the pipeline for that phrase.
    """
    kept_labels = (
        "Instrument", "Telescope", "Wavelength", "CelestialObject", "CelestialRegion",
        "EntityOfFutureInterest", "Mission", "Observatory", "Survey",
    )
    column_names = ("TEXT_ID", "word", "start", "end", "score", "entity_group", "Phrase")
    dict_out = {name: [] for name in column_names}

    # The context manager deletes the download cache on every exit path.
    with tempfile.TemporaryDirectory() as cache_dir:
        try:
            # The NER-DEAL fine-tuned weights live on their own branch of the
            # model repository, hence the explicit revision.
            model_repo = 'adsabs/astroBERT'
            model_branch = 'NER-DEAL'

            ner_model = AutoModelForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=model_repo,
                revision=model_branch,
                cache_dir=cache_dir
            )

            ner_tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_repo,
                add_special_tokens=True,
                do_lower_case=False,
                model_max_length=512,
                cache_dir=cache_dir
            )

            # Hugging Face pipeline wrapping the model and its tokenizer.
            ner = TokenClassificationPipeline(
                model=ner_model,
                tokenizer=ner_tokenizer,
                task='astroBERT NER_DEAL',
                aggregation_strategy='average',
                ignore_labels=['O']
            )

            # Collapse all whitespace runs, then swap symbols for ASCII.
            text = " ".join(body_text_0.split())
            for symbol, plain in (("°", "o"), ("º", "o"), ("−", "-"), ('°', "o")):
                text = text.replace(symbol, plain)

            for phrase_ in split_text_in_phrases(text_id, text):
                for detection in ner(phrase_):
                    label = detection["entity_group"]
                    if label not in kept_labels:
                        continue

                    dict_out["TEXT_ID"].append(text_id)
                    dict_out["Phrase"].append(phrase_)
                    for key in ("word", "score", "start", "end"):
                        dict_out[key].append(detection[key])
                    dict_out["entity_group"].append(label)
        except Exception as e:
            # Best effort: report the problem and return what was gathered.
            print(f"An error occurred in apply_astroBERT: {e}")

    return pd.DataFrame(dict_out)
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
84
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
85
a35056104c2c planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
astroteam
parents:
diff changeset
def get_astroBERT_cleaned_result(text_id, body_text_0):
    """Run ``apply_astroBERT`` and merge neighbouring detections of one entity.

    For every entity type and every distinct phrase in which that type was
    detected:

    * a single detection is copied through unchanged (the ``word`` reported
      by ``apply_astroBERT`` is kept);
    * several detections are ordered by ``start`` and consecutive ones are
      merged whenever the next ``start`` is at most one character past the
      current end.  A merged row gets ``word = phrase[start:end]`` and a
      score equal to the mean score of the detections it contains.

    Parameters
    ----------
    text_id :
        Identifier stored in the ``TEXT_ID`` column of every row.
    body_text_0 : str
        Raw text, forwarded to ``apply_astroBERT``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TEXT_ID``, ``word``, ``start``, ``end``, ``Score``,
        ``Phrase`` and ``entity_group``.  Note the capitalised ``Score``,
        unlike the ``score`` column of the raw frame.
    """
    list_entities = ["Instrument", "Telescope", "Wavelength", "CelestialObject", "CelestialRegion", "EntityOfFutureInterest", "Mission", "Observatory", "Survey"]

    df_raw = apply_astroBERT(text_id, body_text_0)
    dict_out = {"TEXT_ID": [], "word": [], "start": [], "end": [], "Score": [], "Phrase": [], "entity_group": []}

    def add_row(phrase, word, start, end, score, entity):
        # Single place that appends one output row, keeping all lists aligned.
        dict_out["TEXT_ID"].append(text_id)
        dict_out["Phrase"].append(phrase)
        dict_out["word"].append(word)
        dict_out["start"].append(start)
        dict_out["end"].append(end)
        dict_out["Score"].append(score)
        dict_out["entity_group"].append(entity)

    for entity_to_study in list_entities:
        df_tmp0 = df_raw[df_raw["entity_group"] == entity_to_study]

        for phrase_ in np.unique(df_tmp0["Phrase"]):
            df_tmp1 = df_tmp0[df_tmp0["Phrase"] == phrase_]

            if len(df_tmp1) == 1:
                add_row(
                    df_tmp1.Phrase.values[0],
                    df_tmp1.word.values[0],
                    df_tmp1.start.values[0],
                    df_tmp1.end.values[0],
                    df_tmp1.score.values[0],
                    entity_to_study,
                )
                continue

            # sort_values returns a new frame; the result must be kept or the
            # merge below runs on unsorted rows (e.g. when the same phrase
            # text occurs more than once).  A stable sort keeps tied rows in
            # their original order.
            df_tmp1 = df_tmp1.sort_values(by=['start'], kind='stable')

            spans = zip(df_tmp1.start.values, df_tmp1.end.values, df_tmp1.score.values)
            s_o, e_o, sc_s = next(spans)
            word_size = 1

            for s_, e_, sc_ in spans:
                if s_ <= e_o + 1:
                    # Touching or overlapping: grow the current span.  max()
                    # ensures a merged span never shrinks.
                    e_o = max(e_o, e_)
                    sc_s += sc_
                    word_size += 1
                else:
                    add_row(phrase_, phrase_[s_o: e_o], s_o, e_o, sc_s / word_size, entity_to_study)
                    s_o, e_o, sc_s, word_size = s_, e_, sc_, 1

            # Flush the span that was still open when the rows ran out.
            add_row(phrase_, phrase_[s_o: e_o], s_o, e_o, sc_s / word_size, entity_to_study)

    return pd.DataFrame(dict_out)