comparison fetch_atel.py @ 0:a35056104c2c draft default tip

planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author astroteam
date Fri, 13 Jun 2025 13:26:36 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a35056104c2c
1 import requests
2 from bs4 import BeautifulSoup
3 import os
4 import re
# Default ATel number fetched when the script is run directly (see __main__ guard).
atel_number = 16672
7
def fetch_atel(atel_number):
    """
    Fetch the ATel page for the given ATel number and return the ATel text.

    The ATel body is assumed to be the first paragraph after the last
    paragraph that contains the string "Tweet".

    Parameters
    ----------
    atel_number : int
        The ATel number to fetch.

    Returns
    -------
    str or None
        The ASCII-only ATel text, or None if the page could not be
        retrieved or the expected paragraph could not be located.
    """

    # URL of the ATel page
    url = 'https://www.astronomerstelegram.org/?read={}'.format(atel_number)

    # Fake the User-Agent header to avoid being blocked by the server
    # for not having one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }

    # Cache the fetched page on disk (mainly for testing purposes):
    # if 'atel_<number>.html' already exists, read it instead of re-fetching.
    fname = 'atel_{}.html'.format(atel_number)

    if os.path.isfile(fname):
        print("Page already fetched.")
        with open(fname, 'r', encoding='utf-8') as f:
            response_text = f.read()
    else:
        # Send a GET request to the URL
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None
        print("Page fetched successfully.")
        with open(fname, 'w', encoding='utf-8') as f:
            f.write(response.text)
        response_text = response.text

    soup = BeautifulSoup(response_text, 'html.parser')

    # Find the index of the LAST paragraph mentioning "Tweet";
    # the ATel body is the paragraph immediately after it.
    tds = soup.body.find_all("p")
    twitter_index = -1
    for i, td in enumerate(tds):
        if 'Tweet' in td.get_text(strip=True):
            twitter_index = i

    # Bug fix: previously twitter_index == -1 silently selected tds[0]
    # (tds[-1 + 1]), and twitter_index + 1 could raise IndexError when
    # the "Tweet" marker was the final paragraph. Fail explicitly instead.
    if twitter_index == -1 or twitter_index + 1 >= len(tds):
        print("Page not found.")
        return None

    para = tds[twitter_index + 1]

    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', para.text)  # remove non-ASCII
    print(cleaned_text)

    return cleaned_text
72
if __name__ == "__main__":
    # Script entry point: fetch and print the default example ATel.
    fetch_atel(atel_number)