Mercurial > repos > astroteam > analyse_short_astro_text_astro_tool
diff fetch_atel.py @ 0:a35056104c2c draft default tip
planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author: astroteam
date: Fri, 13 Jun 2025 13:26:36 +0000
parents: (none)
children: (none)
line wrap: on
line diff
import os
import re

import requests
from bs4 import BeautifulSoup

# Default ATel number used when the script is run directly.
atel_number = 16672


def fetch_atel(atel_number):
    """
    Fetch the ATel page for the given ATel number and return the ATel text.

    The ATel body is assumed to be the first paragraph after the last
    paragraph that contains the string "Tweet".

    Fetched pages are cached on disk as 'atel_<number>.html' (mainly for
    testing), so repeated calls do not hit the server again.

    Parameters
    ----------
    atel_number : int
        The ATel number to fetch.

    Returns
    -------
    str or None
        The ASCII-cleaned ATel text, or None when the page could not be
        retrieved or the expected paragraph is missing.
    """
    # URL of the ATel page
    url = 'https://www.astronomerstelegram.org/?read={}'.format(atel_number)

    # Fake the User-Agent header to avoid being blocked by the server
    # for not having one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }

    # On-disk cache file, e.g. 'atel_16672.html' for ATel #16672.
    fname = 'atel_{}.html'.format(atel_number)

    if not os.path.isfile(fname):
        # Not cached yet: fetch the page and save it for next time.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print("Page fetched successfully.")
            with open(fname, 'w', encoding='utf-8') as f:
                f.write(response.text)
            response_text = response.text
        else:
            # Bug fix: the original message f-string was broken across a
            # physical line (a syntax error); reassembled on one line.
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None
    else:
        # Cached copy already on disk. (The original had an additional,
        # unreachable trailing 'else' branch here; removed.)
        print("Page already fetched.")
        with open(fname, 'r', encoding='utf-8') as f:
            response_text = f.read()

    soup = BeautifulSoup(response_text, 'html.parser')

    # Find the index of the LAST paragraph mentioning "Tweet" (the loop
    # deliberately does not break, matching the original behavior).
    tds = soup.body.find_all("p")
    twitter_index = -1
    for i, td in enumerate(tds):
        if 'Tweet' in td.get_text(strip=True):
            twitter_index = i

    # Robustness fix: guard against the marker paragraph being the last
    # one (or the page having no paragraphs), which previously raised
    # IndexError; return None like the other failure paths instead.
    if twitter_index + 1 >= len(tds):
        print("Page not found.")
        return None

    para = tds[twitter_index + 1]

    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', para.text)  # remove non-ASCII
    print(cleaned_text)

    return cleaned_text


if __name__ == "__main__":
    fetch_atel(atel_number)