Mercurial > repos > astroteam > analyse_short_astro_text_astro_tool
diff fetch_atel.py @ 0:a35056104c2c draft default tip
planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author: astroteam
date: Fri, 13 Jun 2025 13:26:36 +0000
parents: (none)
children: (none)
line wrap: on
line diff
import os
import re

import requests
from bs4 import BeautifulSoup

# Default ATel number used when the script is run directly.
atel_number = 16672


def fetch_atel(atel_number):
    """
    Fetch the ATel page for the given ATel number and return the ATel text.

    The ATel body is assumed to be the first paragraph after the last
    paragraph that contains the string "Tweet".

    Fetched pages are cached on disk as 'atel_<number>.html' (mainly for
    testing), so repeated calls do not hit the server again.

    Parameters
    ----------
    atel_number : int
        The ATel number to fetch.

    Returns
    -------
    str or None
        The ASCII-cleaned ATel text, or None when the page could not be
        retrieved or the expected paragraph is missing.
    """
    # URL of the ATel page
    url = 'https://www.astronomerstelegram.org/?read={}'.format(atel_number)

    # Fake the User-Agent header to avoid being blocked by the server
    # for not having one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }

    # On-disk cache file, e.g. 'atel_16672.html' for ATel #16672.
    fname = 'atel_{}.html'.format(atel_number)

    if not os.path.isfile(fname):
        # Not cached yet: fetch the page and save it for next time.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print("Page fetched successfully.")
            with open(fname, 'w', encoding='utf-8') as f:
                f.write(response.text)
            response_text = response.text
        else:
            # Bug fix: the original message f-string was broken across a
            # physical line (a syntax error); reassembled on one line.
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None
    else:
        # Cached copy already on disk. (The original had an additional,
        # unreachable trailing 'else' branch here; removed.)
        print("Page already fetched.")
        with open(fname, 'r', encoding='utf-8') as f:
            response_text = f.read()

    soup = BeautifulSoup(response_text, 'html.parser')

    # Find the index of the LAST paragraph mentioning "Tweet" (the loop
    # deliberately does not break, matching the original behavior).
    tds = soup.body.find_all("p")
    twitter_index = -1
    for i, td in enumerate(tds):
        if 'Tweet' in td.get_text(strip=True):
            twitter_index = i

    # Robustness fix: guard against the marker paragraph being the last
    # one (or the page having no paragraphs), which previously raised
    # IndexError; return None like the other failure paths instead.
    if twitter_index + 1 >= len(tds):
        print("Page not found.")
        return None

    para = tds[twitter_index + 1]

    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', para.text)  # remove non-ASCII
    print(cleaned_text)

    return cleaned_text


if __name__ == "__main__":
    fetch_atel(atel_number)