1 | /* |
2 | * Copyright (C) 2006-2017 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2007-2009 Torch Mobile, Inc. |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #include "config.h" |
28 | #include "TextEncodingRegistry.h" |
29 | |
30 | #include "TextCodecICU.h" |
31 | #include "TextCodecLatin1.h" |
32 | #include "TextCodecReplacement.h" |
33 | #include "TextCodecUTF16.h" |
34 | #include "TextCodecUTF8.h" |
35 | #include "TextCodecUserDefined.h" |
36 | #include "TextEncoding.h" |
37 | #include <mutex> |
38 | #include <wtf/ASCIICType.h> |
39 | #include <wtf/CheckedArithmetic.h> |
40 | #include <wtf/HashMap.h> |
41 | #include <wtf/HashSet.h> |
42 | #include <wtf/Lock.h> |
43 | #include <wtf/MainThread.h> |
44 | #include <wtf/StdLibExtras.h> |
45 | #include <wtf/text/CString.h> |
46 | |
47 | namespace WebCore { |
48 | |
// Upper bound, in characters (excluding the terminating NUL), on the length of
// an encoding name handled by this file: registered aliases are asserted to fit
// within it, and longer lookups from character buffers are rejected.
const size_t maxEncodingNameLength = 63;
50 | |
51 | // Hash for all-ASCII strings that does case folding. |
52 | struct TextEncodingNameHash { |
53 | static bool equal(const char* s1, const char* s2) |
54 | { |
55 | char c1; |
56 | char c2; |
57 | do { |
58 | c1 = *s1++; |
59 | c2 = *s2++; |
60 | if (toASCIILower(c1) != toASCIILower(c2)) |
61 | return false; |
62 | } while (c1 && c2); |
63 | return !c1 && !c2; |
64 | } |
65 | |
66 | // This algorithm is the one-at-a-time hash from: |
67 | // http://burtleburtle.net/bob/hash/hashfaq.html |
68 | // http://burtleburtle.net/bob/hash/doobs.html |
69 | static unsigned hash(const char* s) |
70 | { |
71 | unsigned h = WTF::stringHashingStartValue; |
72 | for (;;) { |
73 | char c = *s++; |
74 | if (!c) { |
75 | h += (h << 3); |
76 | h ^= (h >> 11); |
77 | h += (h << 15); |
78 | return h; |
79 | } |
80 | h += toASCIILower(c); |
81 | h += (h << 10); |
82 | h ^= (h >> 6); |
83 | } |
84 | } |
85 | |
86 | static const bool safeToCompareToEmptyOrDeleted = false; |
87 | }; |
88 | |
// Maps an encoding alias to the atomic name it was registered under; lookups
// use the case-folding TextEncodingNameHash.
using TextEncodingNameMap = HashMap<const char*, const char*, TextEncodingNameHash>;
// Maps an atomic name to the function that creates a codec for it. No custom
// hash is supplied here, unlike TextEncodingNameMap.
using TextCodecMap = HashMap<const char*, NewTextCodecFunction>;

// Held by newTextCodec() and atomicCanonicalTextEncodingName(const char*)
// while they access the maps below.
static Lock encodingRegistryMutex;

static TextEncodingNameMap* textEncodingNameMap;
static TextCodecMap* textCodecMap;
// Set to true once extendTextCodecMaps() has run.
static bool didExtendTextCodecMaps;
// Quirks sets of atomic names; both are allocated by buildQuirksSets().
static HashSet<const char*>* japaneseEncodings;
static HashSet<const char*>* nonBackslashEncodings;

// Encodings whose aliases and codecs are removed by pruneBlacklistedCodecs().
static const char* const textEncodingNameBlacklist[] = { "UTF-7" , "BOCU-1" , "SCSU" };
101 | |
// Returns true for aliases that must not be registered even though a codec
// back-end reports them.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    // Every such alias contains a comma, so a single search for one is enough.
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
115 | |
116 | static void addToTextEncodingNameMap(const char* alias, const char* name) |
117 | { |
118 | ASSERT(strlen(alias) <= maxEncodingNameLength); |
119 | if (isUndesiredAlias(alias)) |
120 | return; |
121 | const char* atomicName = textEncodingNameMap->get(name); |
122 | ASSERT(strcmp(alias, name) == 0 || atomicName); |
123 | if (!atomicName) |
124 | atomicName = name; |
125 | |
126 | ASSERT_WITH_MESSAGE(!textEncodingNameMap->get(alias), "Duplicate text encoding name %s for %s (previously registered as %s)" , alias, atomicName, textEncodingNameMap->get(alias)); |
127 | |
128 | textEncodingNameMap->add(alias, atomicName); |
129 | } |
130 | |
131 | static void addToTextCodecMap(const char* name, NewTextCodecFunction&& function) |
132 | { |
133 | const char* atomicName = textEncodingNameMap->get(name); |
134 | ASSERT(atomicName); |
135 | textCodecMap->add(atomicName, WTFMove(function)); |
136 | } |
137 | |
138 | static void pruneBlacklistedCodecs() |
139 | { |
140 | for (auto& nameFromBlacklist : textEncodingNameBlacklist) { |
141 | auto* atomicName = textEncodingNameMap->get(nameFromBlacklist); |
142 | if (!atomicName) |
143 | continue; |
144 | |
145 | Vector<const char*> names; |
146 | for (auto& entry : *textEncodingNameMap) { |
147 | if (entry.value == atomicName) |
148 | names.append(entry.key); |
149 | } |
150 | |
151 | for (auto* name : names) |
152 | textEncodingNameMap->remove(name); |
153 | |
154 | textCodecMap->remove(atomicName); |
155 | } |
156 | } |
157 | |
// Allocates the two registry maps and registers the Latin-1, UTF-8, UTF-16 and
// user-defined codecs in them. Both map pointers are asserted to be null on
// entry, so this is expected to run at most once.
// NOTE(review): the unnamed lock_guard parameter is not used by the body; it
// presumably exists so that callers must hold encodingRegistryMutex - confirm.
static void buildBaseTextCodecMaps(const std::lock_guard<Lock>&)
{
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec, names are registered before the codec itself because
    // addToTextCodecMap() looks the name up in textEncodingNameMap.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
178 | |
179 | static void addEncodingName(HashSet<const char*>* set, const char* name) |
180 | { |
181 | // We must not use atomicCanonicalTextEncodingName() because this function is called in it. |
182 | const char* atomicName = textEncodingNameMap->get(name); |
183 | if (atomicName) |
184 | set->add(atomicName); |
185 | } |
186 | |
187 | static void buildQuirksSets() |
188 | { |
189 | // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() |
190 | // and initializing the sets for them in TextEncodingRegistry.cpp look strange. |
191 | |
192 | ASSERT(!japaneseEncodings); |
193 | ASSERT(!nonBackslashEncodings); |
194 | |
195 | japaneseEncodings = new HashSet<const char*>; |
196 | addEncodingName(japaneseEncodings, "EUC-JP" ); |
197 | addEncodingName(japaneseEncodings, "ISO-2022-JP" ); |
198 | addEncodingName(japaneseEncodings, "ISO-2022-JP-1" ); |
199 | addEncodingName(japaneseEncodings, "ISO-2022-JP-2" ); |
200 | addEncodingName(japaneseEncodings, "ISO-2022-JP-3" ); |
201 | addEncodingName(japaneseEncodings, "JIS_C6226-1978" ); |
202 | addEncodingName(japaneseEncodings, "JIS_X0201" ); |
203 | addEncodingName(japaneseEncodings, "JIS_X0208-1983" ); |
204 | addEncodingName(japaneseEncodings, "JIS_X0208-1990" ); |
205 | addEncodingName(japaneseEncodings, "JIS_X0212-1990" ); |
206 | addEncodingName(japaneseEncodings, "Shift_JIS" ); |
207 | addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000" ); |
208 | addEncodingName(japaneseEncodings, "cp932" ); |
209 | addEncodingName(japaneseEncodings, "x-mac-japanese" ); |
210 | |
211 | nonBackslashEncodings = new HashSet<const char*>; |
212 | // The text encodings below treat backslash as a currency symbol for IE compatibility. |
213 | // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. |
214 | addEncodingName(nonBackslashEncodings, "x-mac-japanese" ); |
215 | addEncodingName(nonBackslashEncodings, "ISO-2022-JP" ); |
216 | addEncodingName(nonBackslashEncodings, "EUC-JP" ); |
217 | // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. |
218 | addEncodingName(nonBackslashEncodings, "Shift_JIS" ); |
219 | addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000" ); |
220 | } |
221 | |
222 | bool isJapaneseEncoding(const char* canonicalEncodingName) |
223 | { |
224 | return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); |
225 | } |
226 | |
227 | bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) |
228 | { |
229 | return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); |
230 | } |
231 | |
// Adds the replacement and ICU codecs to the registry maps, then removes the
// blacklisted encodings and builds the quirks sets.
static void extendTextCodecMaps()
{
    // As with the base codecs, names are registered before the codecs that use them.
    TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecReplacement::registerCodecs(addToTextCodecMap);

    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    // Pruning runs before the quirks sets are built, so addEncodingName()
    // no longer finds names that were removed here.
    pruneBlacklistedCodecs();
    buildQuirksSets();
}
243 | |
244 | std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding) |
245 | { |
246 | std::lock_guard<Lock> lock(encodingRegistryMutex); |
247 | |
248 | ASSERT(textCodecMap); |
249 | auto result = textCodecMap->find(encoding.name()); |
250 | ASSERT(result != textCodecMap->end()); |
251 | return result->value(); |
252 | } |
253 | |
254 | const char* atomicCanonicalTextEncodingName(const char* name) |
255 | { |
256 | if (!name || !name[0]) |
257 | return nullptr; |
258 | |
259 | std::lock_guard<Lock> lock(encodingRegistryMutex); |
260 | |
261 | if (!textEncodingNameMap) |
262 | buildBaseTextCodecMaps(lock); |
263 | |
264 | if (const char* atomicName = textEncodingNameMap->get(name)) |
265 | return atomicName; |
266 | if (didExtendTextCodecMaps) |
267 | return nullptr; |
268 | |
269 | extendTextCodecMaps(); |
270 | didExtendTextCodecMaps = true; |
271 | return textEncodingNameMap->get(name); |
272 | } |
273 | |
274 | template<typename CharacterType> static const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) |
275 | { |
276 | char buffer[maxEncodingNameLength + 1]; |
277 | size_t j = 0; |
278 | for (size_t i = 0; i < length; ++i) { |
279 | if (j == maxEncodingNameLength) |
280 | return nullptr; |
281 | buffer[j++] = characters[i]; |
282 | } |
283 | buffer[j] = 0; |
284 | return atomicCanonicalTextEncodingName(buffer); |
285 | } |
286 | |
287 | const char* atomicCanonicalTextEncodingName(const String& alias) |
288 | { |
289 | if (alias.isEmpty() || !alias.isAllASCII()) |
290 | return nullptr; |
291 | |
292 | if (alias.is8Bit()) |
293 | return atomicCanonicalTextEncodingName(alias.characters8(), alias.length()); |
294 | |
295 | return atomicCanonicalTextEncodingName(alias.characters16(), alias.length()); |
296 | } |
297 | |
298 | bool noExtendedTextEncodingNameUsed() |
299 | { |
300 | // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. |
301 | return !didExtendTextCodecMaps; |
302 | } |
303 | |
304 | String defaultTextEncodingNameForSystemLanguage() |
305 | { |
306 | #if PLATFORM(COCOA) |
307 | String systemEncodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding()); |
308 | |
309 | // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949. |
310 | // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949. |
311 | // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>. |
312 | // On some OS versions, the result is CP949 (uppercase). |
313 | if (equalLettersIgnoringASCIICase(systemEncodingName, "cp949" )) |
314 | systemEncodingName = "ks_c_5601-1987"_s ; |
315 | |
316 | // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874. |
317 | // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to |
318 | // "dos-874" instead. |
319 | if (equalLettersIgnoringASCIICase(systemEncodingName, "cp874" )) |
320 | systemEncodingName = "dos-874"_s ; |
321 | |
322 | return systemEncodingName; |
323 | #else |
324 | return "ISO-8859-1"_s ; |
325 | #endif |
326 | } |
327 | |
328 | } // namespace WebCore |
329 | |