1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "LocaleToScriptMapping.h"
33
34#include <wtf/HashMap.h>
35#include <wtf/NeverDestroyed.h>
36#include <wtf/text/StringHash.h>
37
38namespace WebCore {
39
40struct ScriptNameCode {
41 ASCIILiteral name;
42 UScriptCode code;
43};
44
45// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
46// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
47// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
48// using the same font setting.
49static const ScriptNameCode scriptNameCodeList[] = {
50 { "zyyy"_s, USCRIPT_COMMON },
51 { "qaai"_s, USCRIPT_INHERITED },
52 { "arab"_s, USCRIPT_ARABIC },
53 { "armn"_s, USCRIPT_ARMENIAN },
54 { "beng"_s, USCRIPT_BENGALI },
55 { "bopo"_s, USCRIPT_BOPOMOFO },
56 { "cher"_s, USCRIPT_CHEROKEE },
57 { "copt"_s, USCRIPT_COPTIC },
58 { "cyrl"_s, USCRIPT_CYRILLIC },
59 { "dsrt"_s, USCRIPT_DESERET },
60 { "deva"_s, USCRIPT_DEVANAGARI },
61 { "ethi"_s, USCRIPT_ETHIOPIC },
62 { "geor"_s, USCRIPT_GEORGIAN },
63 { "goth"_s, USCRIPT_GOTHIC },
64 { "grek"_s, USCRIPT_GREEK },
65 { "gujr"_s, USCRIPT_GUJARATI },
66 { "guru"_s, USCRIPT_GURMUKHI },
67 { "hani"_s, USCRIPT_HAN },
68 { "hang"_s, USCRIPT_HANGUL },
69 { "hebr"_s, USCRIPT_HEBREW },
70 { "hira"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
71 { "knda"_s, USCRIPT_KANNADA },
72 { "kana"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
73 { "khmr"_s, USCRIPT_KHMER },
74 { "laoo"_s, USCRIPT_LAO },
75 { "latn"_s, USCRIPT_LATIN },
76 { "mlym"_s, USCRIPT_MALAYALAM },
77 { "mong"_s, USCRIPT_MONGOLIAN },
78 { "mymr"_s, USCRIPT_MYANMAR },
79 { "ogam"_s, USCRIPT_OGHAM },
80 { "ital"_s, USCRIPT_OLD_ITALIC },
81 { "orya"_s, USCRIPT_ORIYA },
82 { "runr"_s, USCRIPT_RUNIC },
83 { "sinh"_s, USCRIPT_SINHALA },
84 { "syrc"_s, USCRIPT_SYRIAC },
85 { "taml"_s, USCRIPT_TAMIL },
86 { "telu"_s, USCRIPT_TELUGU },
87 { "thaa"_s, USCRIPT_THAANA },
88 { "thai"_s, USCRIPT_THAI },
89 { "tibt"_s, USCRIPT_TIBETAN },
90 { "cans"_s, USCRIPT_CANADIAN_ABORIGINAL },
91 { "yiii"_s, USCRIPT_YI },
92 { "tglg"_s, USCRIPT_TAGALOG },
93 { "hano"_s, USCRIPT_HANUNOO },
94 { "buhd"_s, USCRIPT_BUHID },
95 { "tagb"_s, USCRIPT_TAGBANWA },
96 { "brai"_s, USCRIPT_BRAILLE },
97 { "cprt"_s, USCRIPT_CYPRIOT },
98 { "limb"_s, USCRIPT_LIMBU },
99 { "linb"_s, USCRIPT_LINEAR_B },
100 { "osma"_s, USCRIPT_OSMANYA },
101 { "shaw"_s, USCRIPT_SHAVIAN },
102 { "tale"_s, USCRIPT_TAI_LE },
103 { "ugar"_s, USCRIPT_UGARITIC },
104 { "hrkt"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
105 { "bugi"_s, USCRIPT_BUGINESE },
106 { "glag"_s, USCRIPT_GLAGOLITIC },
107 { "khar"_s, USCRIPT_KHAROSHTHI },
108 { "sylo"_s, USCRIPT_SYLOTI_NAGRI },
109 { "talu"_s, USCRIPT_NEW_TAI_LUE },
110 { "tfng"_s, USCRIPT_TIFINAGH },
111 { "xpeo"_s, USCRIPT_OLD_PERSIAN },
112 { "bali"_s, USCRIPT_BALINESE },
113 { "batk"_s, USCRIPT_BATAK },
114 { "blis"_s, USCRIPT_BLISSYMBOLS },
115 { "brah"_s, USCRIPT_BRAHMI },
116 { "cham"_s, USCRIPT_CHAM },
117 { "cirt"_s, USCRIPT_CIRTH },
118 { "cyrs"_s, USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
119 { "egyd"_s, USCRIPT_DEMOTIC_EGYPTIAN },
120 { "egyh"_s, USCRIPT_HIERATIC_EGYPTIAN },
121 { "egyp"_s, USCRIPT_EGYPTIAN_HIEROGLYPHS },
122 { "geok"_s, USCRIPT_KHUTSURI },
123 { "hans"_s, USCRIPT_SIMPLIFIED_HAN },
124 { "hant"_s, USCRIPT_TRADITIONAL_HAN },
125 { "hmng"_s, USCRIPT_PAHAWH_HMONG },
126 { "hung"_s, USCRIPT_OLD_HUNGARIAN },
127 { "inds"_s, USCRIPT_HARAPPAN_INDUS },
128 { "java"_s, USCRIPT_JAVANESE },
129 { "kali"_s, USCRIPT_KAYAH_LI },
130 { "latf"_s, USCRIPT_LATIN_FRAKTUR },
131 { "latg"_s, USCRIPT_LATIN_GAELIC },
132 { "lepc"_s, USCRIPT_LEPCHA },
133 { "lina"_s, USCRIPT_LINEAR_A },
134 { "mand"_s, USCRIPT_MANDAEAN },
135 { "maya"_s, USCRIPT_MAYAN_HIEROGLYPHS },
136 { "mero"_s, USCRIPT_MEROITIC },
137 { "nkoo"_s, USCRIPT_NKO },
138 { "orkh"_s, USCRIPT_ORKHON },
139 { "perm"_s, USCRIPT_OLD_PERMIC },
140 { "phag"_s, USCRIPT_PHAGS_PA },
141 { "phnx"_s, USCRIPT_PHOENICIAN },
142 { "plrd"_s, USCRIPT_PHONETIC_POLLARD },
143 { "roro"_s, USCRIPT_RONGORONGO },
144 { "sara"_s, USCRIPT_SARATI },
145 { "syre"_s, USCRIPT_ESTRANGELO_SYRIAC },
146 { "syrj"_s, USCRIPT_WESTERN_SYRIAC },
147 { "syrn"_s, USCRIPT_EASTERN_SYRIAC },
148 { "teng"_s, USCRIPT_TENGWAR },
149 { "vaii"_s, USCRIPT_VAI },
150 { "visp"_s, USCRIPT_VISIBLE_SPEECH },
151 { "xsux"_s, USCRIPT_CUNEIFORM },
152 { "jpan"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
153 { "kore"_s, USCRIPT_HANGUL },
154 { "zxxx"_s, USCRIPT_UNWRITTEN_LANGUAGES },
155 { "zzzz"_s, USCRIPT_UNKNOWN }
156};
157
158struct ScriptNameCodeMapHashTraits : public HashTraits<String> {
159 static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(scriptNameCodeList)>::value;
160};
161
162UScriptCode scriptNameToCode(const String& scriptName)
163{
164 static const auto scriptNameCodeMap = makeNeverDestroyed([] {
165 HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, ScriptNameCodeMapHashTraits> map;
166 for (auto& nameAndCode : scriptNameCodeList)
167 map.add(nameAndCode.name, nameAndCode.code);
168 return map;
169 }());
170
171 auto it = scriptNameCodeMap.get().find(scriptName);
172 if (it != scriptNameCodeMap.get().end())
173 return it->value;
174 return USCRIPT_INVALID_CODE;
175}
176
177struct LocaleScript {
178 ASCIILiteral locale;
179 UScriptCode script;
180};
181
182static const LocaleScript localeScriptList[] = {
183 { "aa"_s, USCRIPT_LATIN },
184 { "ab"_s, USCRIPT_CYRILLIC },
185 { "ady"_s, USCRIPT_CYRILLIC },
186 { "af"_s, USCRIPT_LATIN },
187 { "ak"_s, USCRIPT_LATIN },
188 { "am"_s, USCRIPT_ETHIOPIC },
189 { "ar"_s, USCRIPT_ARABIC },
190 { "as"_s, USCRIPT_BENGALI },
191 { "ast"_s, USCRIPT_LATIN },
192 { "av"_s, USCRIPT_CYRILLIC },
193 { "ay"_s, USCRIPT_LATIN },
194 { "az"_s, USCRIPT_LATIN },
195 { "ba"_s, USCRIPT_CYRILLIC },
196 { "be"_s, USCRIPT_CYRILLIC },
197 { "bg"_s, USCRIPT_CYRILLIC },
198 { "bi"_s, USCRIPT_LATIN },
199 { "bn"_s, USCRIPT_BENGALI },
200 { "bo"_s, USCRIPT_TIBETAN },
201 { "bs"_s, USCRIPT_LATIN },
202 { "ca"_s, USCRIPT_LATIN },
203 { "ce"_s, USCRIPT_CYRILLIC },
204 { "ceb"_s, USCRIPT_LATIN },
205 { "ch"_s, USCRIPT_LATIN },
206 { "chk"_s, USCRIPT_LATIN },
207 { "cs"_s, USCRIPT_LATIN },
208 { "cy"_s, USCRIPT_LATIN },
209 { "da"_s, USCRIPT_LATIN },
210 { "de"_s, USCRIPT_LATIN },
211 { "dv"_s, USCRIPT_THAANA },
212 { "dz"_s, USCRIPT_TIBETAN },
213 { "ee"_s, USCRIPT_LATIN },
214 { "efi"_s, USCRIPT_LATIN },
215 { "el"_s, USCRIPT_GREEK },
216 { "en"_s, USCRIPT_LATIN },
217 { "es"_s, USCRIPT_LATIN },
218 { "et"_s, USCRIPT_LATIN },
219 { "eu"_s, USCRIPT_LATIN },
220 { "fa"_s, USCRIPT_ARABIC },
221 { "fi"_s, USCRIPT_LATIN },
222 { "fil"_s, USCRIPT_LATIN },
223 { "fj"_s, USCRIPT_LATIN },
224 { "fo"_s, USCRIPT_LATIN },
225 { "fr"_s, USCRIPT_LATIN },
226 { "fur"_s, USCRIPT_LATIN },
227 { "fy"_s, USCRIPT_LATIN },
228 { "ga"_s, USCRIPT_LATIN },
229 { "gaa"_s, USCRIPT_LATIN },
230 { "gd"_s, USCRIPT_LATIN },
231 { "gil"_s, USCRIPT_LATIN },
232 { "gl"_s, USCRIPT_LATIN },
233 { "gn"_s, USCRIPT_LATIN },
234 { "gsw"_s, USCRIPT_LATIN },
235 { "gu"_s, USCRIPT_GUJARATI },
236 { "ha"_s, USCRIPT_LATIN },
237 { "haw"_s, USCRIPT_LATIN },
238 { "he"_s, USCRIPT_HEBREW },
239 { "hi"_s, USCRIPT_DEVANAGARI },
240 { "hil"_s, USCRIPT_LATIN },
241 { "ho"_s, USCRIPT_LATIN },
242 { "hr"_s, USCRIPT_LATIN },
243 { "ht"_s, USCRIPT_LATIN },
244 { "hu"_s, USCRIPT_LATIN },
245 { "hy"_s, USCRIPT_ARMENIAN },
246 { "id"_s, USCRIPT_LATIN },
247 { "ig"_s, USCRIPT_LATIN },
248 { "ii"_s, USCRIPT_YI },
249 { "ilo"_s, USCRIPT_LATIN },
250 { "inh"_s, USCRIPT_CYRILLIC },
251 { "is"_s, USCRIPT_LATIN },
252 { "it"_s, USCRIPT_LATIN },
253 { "iu"_s, USCRIPT_CANADIAN_ABORIGINAL },
254 { "ja"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
255 { "jv"_s, USCRIPT_LATIN },
256 { "ka"_s, USCRIPT_GEORGIAN },
257 { "kaj"_s, USCRIPT_LATIN },
258 { "kam"_s, USCRIPT_LATIN },
259 { "kbd"_s, USCRIPT_CYRILLIC },
260 { "kha"_s, USCRIPT_LATIN },
261 { "kk"_s, USCRIPT_CYRILLIC },
262 { "kl"_s, USCRIPT_LATIN },
263 { "km"_s, USCRIPT_KHMER },
264 { "kn"_s, USCRIPT_KANNADA },
265 { "ko"_s, USCRIPT_HANGUL },
266 { "kok"_s, USCRIPT_DEVANAGARI },
267 { "kos"_s, USCRIPT_LATIN },
268 { "kpe"_s, USCRIPT_LATIN },
269 { "krc"_s, USCRIPT_CYRILLIC },
270 { "ks"_s, USCRIPT_ARABIC },
271 { "ku"_s, USCRIPT_ARABIC },
272 { "kum"_s, USCRIPT_CYRILLIC },
273 { "ky"_s, USCRIPT_CYRILLIC },
274 { "la"_s, USCRIPT_LATIN },
275 { "lah"_s, USCRIPT_ARABIC },
276 { "lb"_s, USCRIPT_LATIN },
277 { "lez"_s, USCRIPT_CYRILLIC },
278 { "ln"_s, USCRIPT_LATIN },
279 { "lo"_s, USCRIPT_LAO },
280 { "lt"_s, USCRIPT_LATIN },
281 { "lv"_s, USCRIPT_LATIN },
282 { "mai"_s, USCRIPT_DEVANAGARI },
283 { "mdf"_s, USCRIPT_CYRILLIC },
284 { "mg"_s, USCRIPT_LATIN },
285 { "mh"_s, USCRIPT_LATIN },
286 { "mi"_s, USCRIPT_LATIN },
287 { "mk"_s, USCRIPT_CYRILLIC },
288 { "ml"_s, USCRIPT_MALAYALAM },
289 { "mn"_s, USCRIPT_CYRILLIC },
290 { "mr"_s, USCRIPT_DEVANAGARI },
291 { "ms"_s, USCRIPT_LATIN },
292 { "mt"_s, USCRIPT_LATIN },
293 { "my"_s, USCRIPT_MYANMAR },
294 { "myv"_s, USCRIPT_CYRILLIC },
295 { "na"_s, USCRIPT_LATIN },
296 { "nb"_s, USCRIPT_LATIN },
297 { "ne"_s, USCRIPT_DEVANAGARI },
298 { "niu"_s, USCRIPT_LATIN },
299 { "nl"_s, USCRIPT_LATIN },
300 { "nn"_s, USCRIPT_LATIN },
301 { "nr"_s, USCRIPT_LATIN },
302 { "nso"_s, USCRIPT_LATIN },
303 { "ny"_s, USCRIPT_LATIN },
304 { "oc"_s, USCRIPT_LATIN },
305 { "om"_s, USCRIPT_LATIN },
306 { "or"_s, USCRIPT_ORIYA },
307 { "os"_s, USCRIPT_CYRILLIC },
308 { "pa"_s, USCRIPT_GURMUKHI },
309 { "pag"_s, USCRIPT_LATIN },
310 { "pap"_s, USCRIPT_LATIN },
311 { "pau"_s, USCRIPT_LATIN },
312 { "pl"_s, USCRIPT_LATIN },
313 { "pon"_s, USCRIPT_LATIN },
314 { "ps"_s, USCRIPT_ARABIC },
315 { "pt"_s, USCRIPT_LATIN },
316 { "qu"_s, USCRIPT_LATIN },
317 { "rm"_s, USCRIPT_LATIN },
318 { "rn"_s, USCRIPT_LATIN },
319 { "ro"_s, USCRIPT_LATIN },
320 { "ru"_s, USCRIPT_CYRILLIC },
321 { "rw"_s, USCRIPT_LATIN },
322 { "sa"_s, USCRIPT_DEVANAGARI },
323 { "sah"_s, USCRIPT_CYRILLIC },
324 { "sat"_s, USCRIPT_LATIN },
325 { "sd"_s, USCRIPT_ARABIC },
326 { "se"_s, USCRIPT_LATIN },
327 { "sg"_s, USCRIPT_LATIN },
328 { "si"_s, USCRIPT_SINHALA },
329 { "sid"_s, USCRIPT_LATIN },
330 { "sk"_s, USCRIPT_LATIN },
331 { "sl"_s, USCRIPT_LATIN },
332 { "sm"_s, USCRIPT_LATIN },
333 { "so"_s, USCRIPT_LATIN },
334 { "sq"_s, USCRIPT_LATIN },
335 { "sr"_s, USCRIPT_CYRILLIC },
336 { "ss"_s, USCRIPT_LATIN },
337 { "st"_s, USCRIPT_LATIN },
338 { "su"_s, USCRIPT_LATIN },
339 { "sv"_s, USCRIPT_LATIN },
340 { "sw"_s, USCRIPT_LATIN },
341 { "ta"_s, USCRIPT_TAMIL },
342 { "te"_s, USCRIPT_TELUGU },
343 { "tet"_s, USCRIPT_LATIN },
344 { "tg"_s, USCRIPT_CYRILLIC },
345 { "th"_s, USCRIPT_THAI },
346 { "ti"_s, USCRIPT_ETHIOPIC },
347 { "tig"_s, USCRIPT_ETHIOPIC },
348 { "tk"_s, USCRIPT_LATIN },
349 { "tkl"_s, USCRIPT_LATIN },
350 { "tl"_s, USCRIPT_LATIN },
351 { "tn"_s, USCRIPT_LATIN },
352 { "to"_s, USCRIPT_LATIN },
353 { "tpi"_s, USCRIPT_LATIN },
354 { "tr"_s, USCRIPT_LATIN },
355 { "trv"_s, USCRIPT_LATIN },
356 { "ts"_s, USCRIPT_LATIN },
357 { "tt"_s, USCRIPT_CYRILLIC },
358 { "tvl"_s, USCRIPT_LATIN },
359 { "tw"_s, USCRIPT_LATIN },
360 { "ty"_s, USCRIPT_LATIN },
361 { "tyv"_s, USCRIPT_CYRILLIC },
362 { "udm"_s, USCRIPT_CYRILLIC },
363 { "ug"_s, USCRIPT_ARABIC },
364 { "uk"_s, USCRIPT_CYRILLIC },
365 { "und"_s, USCRIPT_LATIN },
366 { "ur"_s, USCRIPT_ARABIC },
367 { "uz"_s, USCRIPT_CYRILLIC },
368 { "ve"_s, USCRIPT_LATIN },
369 { "vi"_s, USCRIPT_LATIN },
370 { "wal"_s, USCRIPT_ETHIOPIC },
371 { "war"_s, USCRIPT_LATIN },
372 { "wo"_s, USCRIPT_LATIN },
373 { "xh"_s, USCRIPT_LATIN },
374 { "yap"_s, USCRIPT_LATIN },
375 { "yo"_s, USCRIPT_LATIN },
376 { "za"_s, USCRIPT_LATIN },
377 { "zh"_s, USCRIPT_HAN },
378 { "zh_hk"_s, USCRIPT_TRADITIONAL_HAN },
379 { "zh_tw"_s, USCRIPT_TRADITIONAL_HAN },
380 { "zu"_s, USCRIPT_LATIN }
381};
382
383struct LocaleScriptMapHashTraits : public HashTraits<String> {
384 static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(localeScriptList)>::value;
385};
386
387UScriptCode localeToScriptCodeForFontSelection(const String& locale)
388{
389 static const auto localeScriptMap = makeNeverDestroyed([] {
390 HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, LocaleScriptMapHashTraits> map;
391 for (auto& localeAndScript : localeScriptList)
392 map.add(localeAndScript.locale, localeAndScript.script);
393 return map;
394 }());
395
396 String canonicalLocale = locale;
397 canonicalLocale.replace('-', '_');
398 while (!canonicalLocale.isEmpty()) {
399 auto it = localeScriptMap.get().find(canonicalLocale);
400 if (it != localeScriptMap.get().end())
401 return it->value;
402 auto underscorePosition = canonicalLocale.reverseFind('_');
403 if (underscorePosition == notFound)
404 break;
405 UScriptCode code = scriptNameToCode(canonicalLocale.substring(underscorePosition + 1));
406 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
407 return code;
408 canonicalLocale = canonicalLocale.substring(0, underscorePosition);
409 }
410 return USCRIPT_COMMON;
411}
412
413} // namespace WebCore
414