LocaleToScriptMappingDefault.cpp source code [webkit/Source/WebCore/platform/text/LocaleToScriptMappingDefault.cpp]

1	/*
2	* Copyright (C) 2011 Google Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are
6	* met:
7	*
8	* * Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* * Redistributions in binary form must reproduce the above
11	* copyright notice, this list of conditions and the following disclaimer
12	* in the documentation and/or other materials provided with the
13	* distribution.
14	* * Neither the name of Google Inc. nor the names of its
15	* contributors may be used to endorse or promote products derived from
16	* this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29	*/
30
31	#include "config.h"
32	#include "LocaleToScriptMapping.h"
33
34	#include <wtf/HashMap.h>
35	#include <wtf/NeverDestroyed.h>
36	#include <wtf/text/StringHash.h>
37
38	namespace WebCore {
39
// One row of scriptNameCodeList: a script name paired with the UScriptCode it resolves to.
struct ScriptNameCode {
    ASCIILiteral name; // Script name used as the lookup key (four-letter, lowercase in the table below).
    UScriptCode code; // Script reported for that name.
};
44
// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
// using the same font setting.
//
// Names are written in lowercase here; scriptNameToCode() loads them into a map keyed with
// ASCIICaseInsensitiveHash, so callers may pass any ASCII casing. The same folding is applied to
// "kana", "hrkt" and "jpan" (all USCRIPT_KATAKANA_OR_HIRAGANA) and to "kore" (USCRIPT_HANGUL).
static const ScriptNameCode scriptNameCodeList[] = {
    { "zyyy"_s, USCRIPT_COMMON },
    { "qaai"_s, USCRIPT_INHERITED },
    { "arab"_s, USCRIPT_ARABIC },
    { "armn"_s, USCRIPT_ARMENIAN },
    { "beng"_s, USCRIPT_BENGALI },
    { "bopo"_s, USCRIPT_BOPOMOFO },
    { "cher"_s, USCRIPT_CHEROKEE },
    { "copt"_s, USCRIPT_COPTIC },
    { "cyrl"_s, USCRIPT_CYRILLIC },
    { "dsrt"_s, USCRIPT_DESERET },
    { "deva"_s, USCRIPT_DEVANAGARI },
    { "ethi"_s, USCRIPT_ETHIOPIC },
    { "geor"_s, USCRIPT_GEORGIAN },
    { "goth"_s, USCRIPT_GOTHIC },
    { "grek"_s, USCRIPT_GREEK },
    { "gujr"_s, USCRIPT_GUJARATI },
    { "guru"_s, USCRIPT_GURMUKHI },
    { "hani"_s, USCRIPT_HAN },
    { "hang"_s, USCRIPT_HANGUL },
    { "hebr"_s, USCRIPT_HEBREW },
    { "hira"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "knda"_s, USCRIPT_KANNADA },
    { "kana"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "khmr"_s, USCRIPT_KHMER },
    { "laoo"_s, USCRIPT_LAO },
    { "latn"_s, USCRIPT_LATIN },
    { "mlym"_s, USCRIPT_MALAYALAM },
    { "mong"_s, USCRIPT_MONGOLIAN },
    { "mymr"_s, USCRIPT_MYANMAR },
    { "ogam"_s, USCRIPT_OGHAM },
    { "ital"_s, USCRIPT_OLD_ITALIC },
    { "orya"_s, USCRIPT_ORIYA },
    { "runr"_s, USCRIPT_RUNIC },
    { "sinh"_s, USCRIPT_SINHALA },
    { "syrc"_s, USCRIPT_SYRIAC },
    { "taml"_s, USCRIPT_TAMIL },
    { "telu"_s, USCRIPT_TELUGU },
    { "thaa"_s, USCRIPT_THAANA },
    { "thai"_s, USCRIPT_THAI },
    { "tibt"_s, USCRIPT_TIBETAN },
    { "cans"_s, USCRIPT_CANADIAN_ABORIGINAL },
    { "yiii"_s, USCRIPT_YI },
    { "tglg"_s, USCRIPT_TAGALOG },
    { "hano"_s, USCRIPT_HANUNOO },
    { "buhd"_s, USCRIPT_BUHID },
    { "tagb"_s, USCRIPT_TAGBANWA },
    { "brai"_s, USCRIPT_BRAILLE },
    { "cprt"_s, USCRIPT_CYPRIOT },
    { "limb"_s, USCRIPT_LIMBU },
    { "linb"_s, USCRIPT_LINEAR_B },
    { "osma"_s, USCRIPT_OSMANYA },
    { "shaw"_s, USCRIPT_SHAVIAN },
    { "tale"_s, USCRIPT_TAI_LE },
    { "ugar"_s, USCRIPT_UGARITIC },
    { "hrkt"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "bugi"_s, USCRIPT_BUGINESE },
    { "glag"_s, USCRIPT_GLAGOLITIC },
    { "khar"_s, USCRIPT_KHAROSHTHI },
    { "sylo"_s, USCRIPT_SYLOTI_NAGRI },
    { "talu"_s, USCRIPT_NEW_TAI_LUE },
    { "tfng"_s, USCRIPT_TIFINAGH },
    { "xpeo"_s, USCRIPT_OLD_PERSIAN },
    { "bali"_s, USCRIPT_BALINESE },
    { "batk"_s, USCRIPT_BATAK },
    { "blis"_s, USCRIPT_BLISSYMBOLS },
    { "brah"_s, USCRIPT_BRAHMI },
    { "cham"_s, USCRIPT_CHAM },
    { "cirt"_s, USCRIPT_CIRTH },
    { "cyrs"_s, USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
    { "egyd"_s, USCRIPT_DEMOTIC_EGYPTIAN },
    { "egyh"_s, USCRIPT_HIERATIC_EGYPTIAN },
    { "egyp"_s, USCRIPT_EGYPTIAN_HIEROGLYPHS },
    { "geok"_s, USCRIPT_KHUTSURI },
    { "hans"_s, USCRIPT_SIMPLIFIED_HAN },
    { "hant"_s, USCRIPT_TRADITIONAL_HAN },
    { "hmng"_s, USCRIPT_PAHAWH_HMONG },
    { "hung"_s, USCRIPT_OLD_HUNGARIAN },
    { "inds"_s, USCRIPT_HARAPPAN_INDUS },
    { "java"_s, USCRIPT_JAVANESE },
    { "kali"_s, USCRIPT_KAYAH_LI },
    { "latf"_s, USCRIPT_LATIN_FRAKTUR },
    { "latg"_s, USCRIPT_LATIN_GAELIC },
    { "lepc"_s, USCRIPT_LEPCHA },
    { "lina"_s, USCRIPT_LINEAR_A },
    { "mand"_s, USCRIPT_MANDAEAN },
    { "maya"_s, USCRIPT_MAYAN_HIEROGLYPHS },
    { "mero"_s, USCRIPT_MEROITIC },
    { "nkoo"_s, USCRIPT_NKO },
    { "orkh"_s, USCRIPT_ORKHON },
    { "perm"_s, USCRIPT_OLD_PERMIC },
    { "phag"_s, USCRIPT_PHAGS_PA },
    { "phnx"_s, USCRIPT_PHOENICIAN },
    { "plrd"_s, USCRIPT_PHONETIC_POLLARD },
    { "roro"_s, USCRIPT_RONGORONGO },
    { "sara"_s, USCRIPT_SARATI },
    { "syre"_s, USCRIPT_ESTRANGELO_SYRIAC },
    { "syrj"_s, USCRIPT_WESTERN_SYRIAC },
    { "syrn"_s, USCRIPT_EASTERN_SYRIAC },
    { "teng"_s, USCRIPT_TENGWAR },
    { "vaii"_s, USCRIPT_VAI },
    { "visp"_s, USCRIPT_VISIBLE_SPEECH },
    { "xsux"_s, USCRIPT_CUNEIFORM },
    { "jpan"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "kore"_s, USCRIPT_HANGUL },
    { "zxxx"_s, USCRIPT_UNWRITTEN_LANGUAGES },
    { "zzzz"_s, USCRIPT_UNKNOWN }
};
157
// Key traits for the script-name map built in scriptNameToCode(). Identical to HashTraits<String> except
// that the minimum table size is derived from the number of entries in scriptNameCodeList.
struct ScriptNameCodeMapHashTraits : public HashTraits<String> {
    // NOTE(review): presumably chosen so the table can take every entry without growing while it is
    // populated - confirm against WTF::HashTableCapacityForSize.
    static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(scriptNameCodeList)>::value;
};
161
162	UScriptCode scriptNameToCode(const String& scriptName)
163	{
164	static const auto scriptNameCodeMap = makeNeverDestroyed([] {
165	HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, ScriptNameCodeMapHashTraits> map;
166	for (auto& nameAndCode : scriptNameCodeList)
167	map.add(nameAndCode.name, nameAndCode.code);
168	return map;
169	}());
170
171	auto it = scriptNameCodeMap.get().find(scriptName);
172	if (it != scriptNameCodeMap.get().end())
173	return it ->value;
174	return USCRIPT_INVALID_CODE;
175	}
176
// One row of localeScriptList: a locale identifier paired with the script used for it.
struct LocaleScript {
    ASCIILiteral locale; // Locale key; the table below writes these in lowercase with '_' between subtags.
    UScriptCode script; // Script returned by localeToScriptCodeForFontSelection() for that locale.
};
181
// Locale-to-script table consulted by localeToScriptCodeForFontSelection(). Keys use '_' as the subtag
// separator because the lookup rewrites '-' to '_' before searching, and they are matched without regard
// to ASCII case. Most keys are a bare language subtag; "zh_hk" and "zh_tw" are the only entries that
// include a second subtag and override the plain "zh" entry.
static const LocaleScript localeScriptList[] = {
    { "aa"_s, USCRIPT_LATIN },
    { "ab"_s, USCRIPT_CYRILLIC },
    { "ady"_s, USCRIPT_CYRILLIC },
    { "af"_s, USCRIPT_LATIN },
    { "ak"_s, USCRIPT_LATIN },
    { "am"_s, USCRIPT_ETHIOPIC },
    { "ar"_s, USCRIPT_ARABIC },
    { "as"_s, USCRIPT_BENGALI },
    { "ast"_s, USCRIPT_LATIN },
    { "av"_s, USCRIPT_CYRILLIC },
    { "ay"_s, USCRIPT_LATIN },
    { "az"_s, USCRIPT_LATIN },
    { "ba"_s, USCRIPT_CYRILLIC },
    { "be"_s, USCRIPT_CYRILLIC },
    { "bg"_s, USCRIPT_CYRILLIC },
    { "bi"_s, USCRIPT_LATIN },
    { "bn"_s, USCRIPT_BENGALI },
    { "bo"_s, USCRIPT_TIBETAN },
    { "bs"_s, USCRIPT_LATIN },
    { "ca"_s, USCRIPT_LATIN },
    { "ce"_s, USCRIPT_CYRILLIC },
    { "ceb"_s, USCRIPT_LATIN },
    { "ch"_s, USCRIPT_LATIN },
    { "chk"_s, USCRIPT_LATIN },
    { "cs"_s, USCRIPT_LATIN },
    { "cy"_s, USCRIPT_LATIN },
    { "da"_s, USCRIPT_LATIN },
    { "de"_s, USCRIPT_LATIN },
    { "dv"_s, USCRIPT_THAANA },
    { "dz"_s, USCRIPT_TIBETAN },
    { "ee"_s, USCRIPT_LATIN },
    { "efi"_s, USCRIPT_LATIN },
    { "el"_s, USCRIPT_GREEK },
    { "en"_s, USCRIPT_LATIN },
    { "es"_s, USCRIPT_LATIN },
    { "et"_s, USCRIPT_LATIN },
    { "eu"_s, USCRIPT_LATIN },
    { "fa"_s, USCRIPT_ARABIC },
    { "fi"_s, USCRIPT_LATIN },
    { "fil"_s, USCRIPT_LATIN },
    { "fj"_s, USCRIPT_LATIN },
    { "fo"_s, USCRIPT_LATIN },
    { "fr"_s, USCRIPT_LATIN },
    { "fur"_s, USCRIPT_LATIN },
    { "fy"_s, USCRIPT_LATIN },
    { "ga"_s, USCRIPT_LATIN },
    { "gaa"_s, USCRIPT_LATIN },
    { "gd"_s, USCRIPT_LATIN },
    { "gil"_s, USCRIPT_LATIN },
    { "gl"_s, USCRIPT_LATIN },
    { "gn"_s, USCRIPT_LATIN },
    { "gsw"_s, USCRIPT_LATIN },
    { "gu"_s, USCRIPT_GUJARATI },
    { "ha"_s, USCRIPT_LATIN },
    { "haw"_s, USCRIPT_LATIN },
    { "he"_s, USCRIPT_HEBREW },
    { "hi"_s, USCRIPT_DEVANAGARI },
    { "hil"_s, USCRIPT_LATIN },
    { "ho"_s, USCRIPT_LATIN },
    { "hr"_s, USCRIPT_LATIN },
    { "ht"_s, USCRIPT_LATIN },
    { "hu"_s, USCRIPT_LATIN },
    { "hy"_s, USCRIPT_ARMENIAN },
    { "id"_s, USCRIPT_LATIN },
    { "ig"_s, USCRIPT_LATIN },
    { "ii"_s, USCRIPT_YI },
    { "ilo"_s, USCRIPT_LATIN },
    { "inh"_s, USCRIPT_CYRILLIC },
    { "is"_s, USCRIPT_LATIN },
    { "it"_s, USCRIPT_LATIN },
    { "iu"_s, USCRIPT_CANADIAN_ABORIGINAL },
    { "ja"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "jv"_s, USCRIPT_LATIN },
    { "ka"_s, USCRIPT_GEORGIAN },
    { "kaj"_s, USCRIPT_LATIN },
    { "kam"_s, USCRIPT_LATIN },
    { "kbd"_s, USCRIPT_CYRILLIC },
    { "kha"_s, USCRIPT_LATIN },
    { "kk"_s, USCRIPT_CYRILLIC },
    { "kl"_s, USCRIPT_LATIN },
    { "km"_s, USCRIPT_KHMER },
    { "kn"_s, USCRIPT_KANNADA },
    { "ko"_s, USCRIPT_HANGUL },
    { "kok"_s, USCRIPT_DEVANAGARI },
    { "kos"_s, USCRIPT_LATIN },
    { "kpe"_s, USCRIPT_LATIN },
    { "krc"_s, USCRIPT_CYRILLIC },
    { "ks"_s, USCRIPT_ARABIC },
    { "ku"_s, USCRIPT_ARABIC },
    { "kum"_s, USCRIPT_CYRILLIC },
    { "ky"_s, USCRIPT_CYRILLIC },
    { "la"_s, USCRIPT_LATIN },
    { "lah"_s, USCRIPT_ARABIC },
    { "lb"_s, USCRIPT_LATIN },
    { "lez"_s, USCRIPT_CYRILLIC },
    { "ln"_s, USCRIPT_LATIN },
    { "lo"_s, USCRIPT_LAO },
    { "lt"_s, USCRIPT_LATIN },
    { "lv"_s, USCRIPT_LATIN },
    { "mai"_s, USCRIPT_DEVANAGARI },
    { "mdf"_s, USCRIPT_CYRILLIC },
    { "mg"_s, USCRIPT_LATIN },
    { "mh"_s, USCRIPT_LATIN },
    { "mi"_s, USCRIPT_LATIN },
    { "mk"_s, USCRIPT_CYRILLIC },
    { "ml"_s, USCRIPT_MALAYALAM },
    { "mn"_s, USCRIPT_CYRILLIC },
    { "mr"_s, USCRIPT_DEVANAGARI },
    { "ms"_s, USCRIPT_LATIN },
    { "mt"_s, USCRIPT_LATIN },
    { "my"_s, USCRIPT_MYANMAR },
    { "myv"_s, USCRIPT_CYRILLIC },
    { "na"_s, USCRIPT_LATIN },
    { "nb"_s, USCRIPT_LATIN },
    { "ne"_s, USCRIPT_DEVANAGARI },
    { "niu"_s, USCRIPT_LATIN },
    { "nl"_s, USCRIPT_LATIN },
    { "nn"_s, USCRIPT_LATIN },
    { "nr"_s, USCRIPT_LATIN },
    { "nso"_s, USCRIPT_LATIN },
    { "ny"_s, USCRIPT_LATIN },
    { "oc"_s, USCRIPT_LATIN },
    { "om"_s, USCRIPT_LATIN },
    { "or"_s, USCRIPT_ORIYA },
    { "os"_s, USCRIPT_CYRILLIC },
    { "pa"_s, USCRIPT_GURMUKHI },
    { "pag"_s, USCRIPT_LATIN },
    { "pap"_s, USCRIPT_LATIN },
    { "pau"_s, USCRIPT_LATIN },
    { "pl"_s, USCRIPT_LATIN },
    { "pon"_s, USCRIPT_LATIN },
    { "ps"_s, USCRIPT_ARABIC },
    { "pt"_s, USCRIPT_LATIN },
    { "qu"_s, USCRIPT_LATIN },
    { "rm"_s, USCRIPT_LATIN },
    { "rn"_s, USCRIPT_LATIN },
    { "ro"_s, USCRIPT_LATIN },
    { "ru"_s, USCRIPT_CYRILLIC },
    { "rw"_s, USCRIPT_LATIN },
    { "sa"_s, USCRIPT_DEVANAGARI },
    { "sah"_s, USCRIPT_CYRILLIC },
    { "sat"_s, USCRIPT_LATIN },
    { "sd"_s, USCRIPT_ARABIC },
    { "se"_s, USCRIPT_LATIN },
    { "sg"_s, USCRIPT_LATIN },
    { "si"_s, USCRIPT_SINHALA },
    { "sid"_s, USCRIPT_LATIN },
    { "sk"_s, USCRIPT_LATIN },
    { "sl"_s, USCRIPT_LATIN },
    { "sm"_s, USCRIPT_LATIN },
    { "so"_s, USCRIPT_LATIN },
    { "sq"_s, USCRIPT_LATIN },
    { "sr"_s, USCRIPT_CYRILLIC },
    { "ss"_s, USCRIPT_LATIN },
    { "st"_s, USCRIPT_LATIN },
    { "su"_s, USCRIPT_LATIN },
    { "sv"_s, USCRIPT_LATIN },
    { "sw"_s, USCRIPT_LATIN },
    { "ta"_s, USCRIPT_TAMIL },
    { "te"_s, USCRIPT_TELUGU },
    { "tet"_s, USCRIPT_LATIN },
    { "tg"_s, USCRIPT_CYRILLIC },
    { "th"_s, USCRIPT_THAI },
    { "ti"_s, USCRIPT_ETHIOPIC },
    { "tig"_s, USCRIPT_ETHIOPIC },
    { "tk"_s, USCRIPT_LATIN },
    { "tkl"_s, USCRIPT_LATIN },
    { "tl"_s, USCRIPT_LATIN },
    { "tn"_s, USCRIPT_LATIN },
    { "to"_s, USCRIPT_LATIN },
    { "tpi"_s, USCRIPT_LATIN },
    { "tr"_s, USCRIPT_LATIN },
    { "trv"_s, USCRIPT_LATIN },
    { "ts"_s, USCRIPT_LATIN },
    { "tt"_s, USCRIPT_CYRILLIC },
    { "tvl"_s, USCRIPT_LATIN },
    { "tw"_s, USCRIPT_LATIN },
    { "ty"_s, USCRIPT_LATIN },
    { "tyv"_s, USCRIPT_CYRILLIC },
    { "udm"_s, USCRIPT_CYRILLIC },
    { "ug"_s, USCRIPT_ARABIC },
    { "uk"_s, USCRIPT_CYRILLIC },
    { "und"_s, USCRIPT_LATIN },
    { "ur"_s, USCRIPT_ARABIC },
    { "uz"_s, USCRIPT_CYRILLIC },
    { "ve"_s, USCRIPT_LATIN },
    { "vi"_s, USCRIPT_LATIN },
    { "wal"_s, USCRIPT_ETHIOPIC },
    { "war"_s, USCRIPT_LATIN },
    { "wo"_s, USCRIPT_LATIN },
    { "xh"_s, USCRIPT_LATIN },
    { "yap"_s, USCRIPT_LATIN },
    { "yo"_s, USCRIPT_LATIN },
    { "za"_s, USCRIPT_LATIN },
    { "zh"_s, USCRIPT_HAN },
    { "zh_hk"_s, USCRIPT_TRADITIONAL_HAN },
    { "zh_tw"_s, USCRIPT_TRADITIONAL_HAN },
    { "zu"_s, USCRIPT_LATIN }
};
382
// Key traits for the locale map built in localeToScriptCodeForFontSelection(). Identical to
// HashTraits<String> except that the minimum table size is derived from the length of localeScriptList.
struct LocaleScriptMapHashTraits : public HashTraits<String> {
    // NOTE(review): presumably chosen so the table can take every entry without growing while it is
    // populated - confirm against WTF::HashTableCapacityForSize.
    static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(localeScriptList)>::value;
};
386
387	UScriptCode localeToScriptCodeForFontSelection(const String& locale)
388	{
389	static const auto localeScriptMap = makeNeverDestroyed([] {
390	HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, LocaleScriptMapHashTraits> map;
391	for (auto& localeAndScript : localeScriptList)
392	map.add(localeAndScript.locale, localeAndScript.script);
393	return map;
394	}());
395
396	String canonicalLocale = locale;
397	canonicalLocale.replace(`'-'`, `'_'`);
398	while (!canonicalLocale.isEmpty()) {
399	auto it = localeScriptMap.get().find(canonicalLocale);
400	if (it != localeScriptMap.get().end())
401	return it ->value;
402	auto underscorePosition = canonicalLocale.reverseFind(`'_'`);
403	if (underscorePosition == notFound)
404	break;
405	UScriptCode code = scriptNameToCode(canonicalLocale.substring(underscorePosition + `1`));
406	if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
407	return code;
408	canonicalLocale = canonicalLocale.substring(`0`, underscorePosition);
409	}
410	return USCRIPT_COMMON;
411	}
412
413	} // namespace WebCore
414

Browse the source code of webkit/Source/WebCore/platform/text/LocaleToScriptMappingDefault.cpp