Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/chardet/metadata/languages.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 """ | |
4 Metadata about languages used by our model training code for our | |
5 SingleByteCharSetProbers. Could be used for other things in the future. | |
6 | |
7 This code is based on the language metadata from the uchardet project. | |
8 """ | |
9 from __future__ import absolute_import, print_function | |
10 | |
11 from string import ascii_letters | |
12 | |
13 | |
14 # TODO: Add Ukrainian (KOI8-U) | |
15 | |
class Language(object):
    """Metadata about a language useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code
                    otherwise, or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in
                     trained models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet, stored
                    de-duplicated and sorted by code point. If `use_ascii`
                    is `True`, you only need to add those not in the ASCII
                    set; the ASCII letters are appended automatically.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're
                            crawling Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
                 alphabet=None, wiki_start_pages=None):
        """Store the metadata and normalize the alphabet.

        :raises ValueError: if `use_ascii` is false and no `alphabet` is
                            given, because the alphabet would be empty.
        """
        super(Language, self).__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # ASCII letters are implied; the caller lists only the extras.
            if alphabet:
                alphabet += ascii_letters
            else:
                alphabet = ascii_letters
        elif not alphabet:
            raise ValueError('Must supply alphabet if use_ascii is False')
        # `alphabet` is guaranteed non-empty here (either ASCII letters were
        # added or the ValueError above fired), so no None fallback is needed.
        # Going through a set drops duplicates; sorting makes the stored
        # string independent of the order the caller wrote the letters in.
        self.alphabet = ''.join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        return '{}({})'.format(self.__class__.__name__,
                               ', '.join('{}={!r}'.format(k, v)
                                         for k, v in self.__dict__.items()
                                         if not k.startswith('_')))
58 | |
59 | |
# Registry of supported languages, keyed by the English language name (the
# same string passed as ``name`` to each ``Language``).
LANGUAGES = {
    'Arabic': Language(
        name='Arabic',
        iso_code='ar',
        use_ascii=False,
        # We only support encodings that use isolated forms, because the
        # current recommendation is that the rendering system handles
        # presentation forms. This means we purposefully skip IBM864.
        # NOTE(review): 'CP864' below is the same code page as IBM864, which
        # contradicts the sentence above -- confirm which one is intended.
        charsets=['ISO-8859-6', 'WINDOWS-1256', 'CP720', 'CP864'],
        alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
        wiki_start_pages=[u'الصفحة_الرئيسية']),
    'Belarusian': Language(
        name='Belarusian',
        iso_code='be',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251', 'IBM866', 'MacCyrillic'],
        alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
                  u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
        wiki_start_pages=[u'Галоўная_старонка']),
    'Bulgarian': Language(
        name='Bulgarian',
        iso_code='bg',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251', 'IBM855'],
        alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
                  u'абвгдежзийклмнопрстуфхцчшщъьюя'),
        wiki_start_pages=[u'Начална_страница']),
    'Czech': Language(
        name='Czech',
        # The ISO 639-1 language code for Czech is 'cs'; 'cz' is the
        # country code, not a language code.
        iso_code='cs',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
        wiki_start_pages=[u'Hlavní_strana']),
    'Danish': Language(
        name='Danish',
        iso_code='da',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'],
        alphabet=u'æøåÆØÅ',
        wiki_start_pages=[u'Forside']),
    'German': Language(
        name='German',
        iso_code='de',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        alphabet=u'äöüßÄÖÜ',
        wiki_start_pages=[u'Wikipedia:Hauptseite']),
    'Greek': Language(
        name='Greek',
        iso_code='el',
        use_ascii=False,
        charsets=['ISO-8859-7', 'WINDOWS-1253'],
        alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
                  u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
        wiki_start_pages=[u'Πύλη:Κύρια']),
    'English': Language(
        name='English',
        iso_code='en',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Main_Page']),
    'Esperanto': Language(
        name='Esperanto',
        iso_code='eo',
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=['ISO-8859-3'],
        alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
                  u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
        wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
    'Spanish': Language(
        name='Spanish',
        iso_code='es',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'],
        alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
        wiki_start_pages=[u'Wikipedia:Portada']),
    'Estonian': Language(
        name='Estonian',
        iso_code='et',
        use_ascii=False,
        charsets=['ISO-8859-4', 'ISO-8859-13', 'WINDOWS-1257'],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for loanwords
        alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
                  u'abdeghijklmnoprstuvõäöü'),
        wiki_start_pages=[u'Esileht']),
    'Finnish': Language(
        name='Finnish',
        iso_code='fi',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'],
        alphabet=u'ÅÄÖŠŽåäöšž',
        wiki_start_pages=[u'Wikipedia:Etusivu']),
    'French': Language(
        name='French',
        iso_code='fr',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'],
        alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
        wiki_start_pages=[u'Wikipédia:Accueil_principal',
                          u'Bœuf (animal)']),
    'Hebrew': Language(
        name='Hebrew',
        iso_code='he',
        use_ascii=False,
        charsets=['ISO-8859-8', 'WINDOWS-1255'],
        alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
        wiki_start_pages=[u'עמוד_ראשי']),
    'Croatian': Language(
        name='Croatian',
        iso_code='hr',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
                  u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stranica']),
    'Hungarian': Language(
        name='Hungarian',
        iso_code='hu',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
                  u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
        wiki_start_pages=[u'Kezdőlap']),
    'Italian': Language(
        name='Italian',
        iso_code='it',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'],
        alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
        wiki_start_pages=[u'Pagina_principale']),
    'Lithuanian': Language(
        name='Lithuanian',
        iso_code='lt',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257', 'ISO-8859-4'],
        # Q, W, and X not used at all
        alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
                  u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
        wiki_start_pages=[u'Pagrindinis_puslapis']),
    'Latvian': Language(
        name='Latvian',
        iso_code='lv',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257', 'ISO-8859-4'],
        # Q, W, X, Y are only for loanwords
        alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
                  u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
        wiki_start_pages=[u'Sākumlapa']),
    'Macedonian': Language(
        name='Macedonian',
        iso_code='mk',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251', 'MacCyrillic', 'IBM855'],
        alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
                  u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
        wiki_start_pages=[u'Главна_страница']),
    'Dutch': Language(
        name='Dutch',
        iso_code='nl',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Hoofdpagina']),
    'Polish': Language(
        name='Polish',
        iso_code='pl',
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
                  u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
        wiki_start_pages=[u'Wikipedia:Strona_główna']),
    'Portuguese': Language(
        name='Portuguese',
        iso_code='pt',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'],
        alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
        wiki_start_pages=[u'Wikipédia:Página_principal']),
    'Romanian': Language(
        name='Romanian',
        iso_code='ro',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'ăâîșțĂÂÎȘȚ',
        wiki_start_pages=[u'Pagina_principală']),
    'Russian': Language(
        name='Russian',
        iso_code='ru',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251', 'KOI8-R', 'MacCyrillic',
                  'IBM866', 'IBM855'],
        alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
                  u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
        wiki_start_pages=[u'Заглавная_страница']),
    'Slovak': Language(
        name='Slovak',
        iso_code='sk',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
        wiki_start_pages=[u'Hlavná_stránka']),
    'Slovene': Language(
        name='Slovene',
        iso_code='sl',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčdefghijklmnoprsštuvzž'
                  u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stran']),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): use_ascii is left at its default (True) here, so ASCII
    # letters are merged into this Cyrillic alphabet, unlike every other
    # Cyrillic entry in this table -- confirm whether that is intended.
    'Serbian': Language(
        name='Serbian',
        iso_code='sr',
        alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
                  u'абвгдђежзијклљмнњопрстћуфхцчџш'),
        charsets=['ISO-8859-5', 'WINDOWS-1251', 'MacCyrillic', 'IBM855'],
        wiki_start_pages=[u'Главна_страна']),
    'Thai': Language(
        name='Thai',
        iso_code='th',
        use_ascii=False,
        charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
        alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
        wiki_start_pages=[u'หน้าหลัก']),
    'Turkish': Language(
        name='Turkish',
        iso_code='tr',
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=['ISO-8859-3', 'ISO-8859-9', 'WINDOWS-1254'],
        alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
                  u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
        wiki_start_pages=[u'Ana_Sayfa']),
    'Vietnamese': Language(
        name='Vietnamese',
        iso_code='vi',
        use_ascii=False,
        # Windows-1258 is the only common 8-bit Vietnamese encoding
        # supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode, dozens of 8-bit
        # Vietnamese code pages are available.[1] The most common are
        # VISCII (TCVN 5712:1993), VPS, and Windows-1258.[3] Where ASCII
        # is required, such as when ensuring readability in plain text
        # e-mail, Vietnamese letters are often encoded according to
        # Vietnamese Quoted-Readable (VIQR) or VSCII Mnemonic
        # (VSCII-MNEM),[4] though usage of either variable-width scheme
        # has declined dramatically following the adoption of Unicode on
        # the World Wide Web.
        charsets=['WINDOWS-1258'],
        alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
                  u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
        wiki_start_pages=[u'Chữ_Quốc_ngữ']),
}