Mercurial > repos > astroteam > analyse_short_astro_text_astro_tool
comparison fetch_atel.py @ 0:a35056104c2c draft default tip
planemo upload for repository https://github.com/esg-epfl-apc/tools-astro/tree/main/tools commit da42ae0d18f550dec7f6d7e29d297e7cf1909df2
author: astroteam
date: Fri, 13 Jun 2025 13:26:36 +0000
parents: (none)
children: (none)
comparison: new file (changeset -1:000000000000 → 0:a35056104c2c)
1 import requests | |
2 from bs4 import BeautifulSoup | |
3 import os | |
4 import re | |
# Default ATel number fetched when this module is run as a script.
atel_number = 16672
6 | |
7 | |
def fetch_atel(atel_number):
    """Fetch the ATel page for the given ATel number and return the ATel text.

    The ATel body is assumed to be the first ``<p>`` element after the
    (last) paragraph that contains the string "Tweet".

    Parameters
    ----------
    atel_number : int
        The ATel number to fetch.

    Returns
    -------
    str or None
        The ATel text with non-ASCII characters stripped, or ``None`` if
        the page could not be retrieved or the "Tweet" marker paragraph
        was not found.
    """
    # URL of the ATel page
    url = 'https://www.astronomerstelegram.org/?read={}'.format(atel_number)

    # Fake a browser User-Agent header so the server does not block the
    # request for lacking one.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/122.0.0.0 Safari/537.36')
    }

    # Local cache (mainly for testing): if 'atel_<number>.html' already
    # exists, read it; otherwise fetch the page and save it to that file.
    fname = 'atel_{}.html'.format(atel_number)

    if os.path.isfile(fname):
        print("Page already fetched.")
        with open(fname, 'r', encoding='utf-8') as f:
            response_text = f.read()
    else:
        # timeout added so a stalled server cannot hang the call forever
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None
        print("Page fetched successfully.")
        with open(fname, 'w', encoding='utf-8') as f:
            f.write(response.text)
        response_text = response.text

    soup = BeautifulSoup(response_text, 'html.parser')

    # Locate the LAST paragraph containing "Tweet"; the ATel text is the
    # paragraph immediately after it.
    paragraphs = soup.body.find_all("p")
    twitter_index = -1
    for i, paragraph in enumerate(paragraphs):
        if 'Tweet' in paragraph.get_text(strip=True):
            twitter_index = i

    # Bug fix: previously, a missing "Tweet" marker left twitter_index at
    # -1, silently returning paragraphs[0]; a marker in the final <p>
    # would raise IndexError. Both cases now fail explicitly.
    if twitter_index == -1 or twitter_index + 1 >= len(paragraphs):
        print("Page not found.")
        return None

    para = paragraphs[twitter_index + 1]

    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', para.text)  # remove non-ASCII
    print(cleaned_text)

    return cleaned_text
71 | |
72 | |
# Script entry point: fetch and print the default ATel.
if __name__ == "__main__":
    fetch_atel(atel_number)