Mercurial > repos > astroteam > analyse_short_astro_text_astro_tool
comparison fetch_atel.py @ 0:a35056104c2c draft default tip
planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author: astroteam
date: Fri, 13 Jun 2025 13:26:36 +0000
parents: (none)
children: (none)
comparison: new file (changeset -1:000000000000 → 0:a35056104c2c)
1 import requests | |
2 from bs4 import BeautifulSoup | |
3 import os | |
4 import re | |
# Default ATel number fetched when this module is run as a script.
atel_number = 16672
6 | |
7 | |
def fetch_atel(atel_number):
    """Fetch the ATel page for the given ATel number and return the ATel text.

    The ATel body is assumed to be the first ``<p>`` element after the
    (last) paragraph that contains the string "Tweet".

    Parameters
    ----------
    atel_number : int
        The ATel number to fetch.

    Returns
    -------
    str or None
        The ATel text with non-ASCII characters stripped, or ``None`` if
        the page could not be retrieved or the "Tweet" marker paragraph
        was not found.
    """
    # URL of the ATel page
    url = 'https://www.astronomerstelegram.org/?read={}'.format(atel_number)

    # Fake a browser User-Agent header so the server does not block the
    # request for lacking one.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/122.0.0.0 Safari/537.36')
    }

    # Local cache (mainly for testing): if 'atel_<number>.html' already
    # exists, read it; otherwise fetch the page and save it to that file.
    fname = 'atel_{}.html'.format(atel_number)

    if os.path.isfile(fname):
        print("Page already fetched.")
        with open(fname, 'r', encoding='utf-8') as f:
            response_text = f.read()
    else:
        # timeout added so a stalled server cannot hang the call forever
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None
        print("Page fetched successfully.")
        with open(fname, 'w', encoding='utf-8') as f:
            f.write(response.text)
        response_text = response.text

    soup = BeautifulSoup(response_text, 'html.parser')

    # Locate the LAST paragraph containing "Tweet"; the ATel text is the
    # paragraph immediately after it.
    paragraphs = soup.body.find_all("p")
    twitter_index = -1
    for i, paragraph in enumerate(paragraphs):
        if 'Tweet' in paragraph.get_text(strip=True):
            twitter_index = i

    # Bug fix: previously, a missing "Tweet" marker left twitter_index at
    # -1, silently returning paragraphs[0]; a marker in the final <p>
    # would raise IndexError. Both cases now fail explicitly.
    if twitter_index == -1 or twitter_index + 1 >= len(paragraphs):
        print("Page not found.")
        return None

    para = paragraphs[twitter_index + 1]

    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', para.text)  # remove non-ASCII
    print(cleaned_text)

    return cleaned_text
71 | |
72 | |
# Script entry point: fetch and print the default ATel.
if __name__ == "__main__":
    fetch_atel(atel_number)