Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
languages.py
Go to the documentation of this file.
"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers. Could be used for other things in the future.

This code is based on the language metadata from the uchardet project.
"""
7
8from string import ascii_letters
9from typing import List, Optional
10
11# TODO: Add Ukrainian (KOI8-U)
12
13
class Language:
    """Metadata about a language useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored de-duplicated and sorted by code point.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name: Optional[str] = None,
        iso_code: Optional[str] = None,
        use_ascii: bool = True,
        charsets: Optional[List[str]] = None,
        alphabet: Optional[str] = None,
        wiki_start_pages: Optional[List[str]] = None,
    ) -> None:
        """Store the metadata and normalise the alphabet.

        :raises ValueError: if ``use_ascii`` is false and ``alphabet`` is
                            empty or ``None``.
        """
        super().__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # ASCII letters are appended to whatever extra letters were
            # supplied (or used on their own when none were given).
            alphabet = (alphabet or "") + ascii_letters
        elif not alphabet:
            raise ValueError("Must supply alphabet if use_ascii is False")
        # At this point ``alphabet`` is always a non-empty string, so no
        # ``None`` fallback is needed. Duplicates are dropped and the
        # characters are ordered by code point.
        self.alphabet = "".join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self) -> str:
        """Return ``ClassName(attr=value, ...)`` for every public attribute."""
        param_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{self.__class__.__name__}({param_str})"
64
65
# Per-language training metadata, keyed by the English language name.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" below appears to be the same code page as
        # IBM864 (Python registers IBM864 as an alias of cp864), which
        # contradicts the sentence above -- confirm whether it belongs here.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # ISO 639-1 assigns "cs" to Czech; "cz" is the country code.
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="äöüßẞÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): use_ascii is left at its default (True) here, unlike the
    # other Cyrillic-script entries, so ASCII letters are merged into this
    # alphabet -- confirm that this is intended.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}
None __init__(self, Optional[str] name=None, Optional[str] iso_code=None, bool use_ascii=True, Optional[List[str]] charsets=None, Optional[str] alphabet=None, Optional[List[str]] wiki_start_pages=None)
Definition languages.py:43
for i