1/*
2 * Copyright (C) 2006-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "TextEncodingRegistry.h"
29
30#include "TextCodecICU.h"
31#include "TextCodecLatin1.h"
32#include "TextCodecReplacement.h"
33#include "TextCodecUTF16.h"
34#include "TextCodecUTF8.h"
35#include "TextCodecUserDefined.h"
36#include "TextEncoding.h"
37#include <mutex>
38#include <wtf/ASCIICType.h>
39#include <wtf/CheckedArithmetic.h>
40#include <wtf/HashMap.h>
41#include <wtf/HashSet.h>
42#include <wtf/Lock.h>
43#include <wtf/MainThread.h>
44#include <wtf/StdLibExtras.h>
45#include <wtf/text/CString.h>
46
47namespace WebCore {
48
// Upper bound on the length of an encoding name or alias, not counting the terminating null character.
constexpr size_t maxEncodingNameLength = 63;
50
51// Hash for all-ASCII strings that does case folding.
52struct TextEncodingNameHash {
53 static bool equal(const char* s1, const char* s2)
54 {
55 char c1;
56 char c2;
57 do {
58 c1 = *s1++;
59 c2 = *s2++;
60 if (toASCIILower(c1) != toASCIILower(c2))
61 return false;
62 } while (c1 && c2);
63 return !c1 && !c2;
64 }
65
66 // This algorithm is the one-at-a-time hash from:
67 // http://burtleburtle.net/bob/hash/hashfaq.html
68 // http://burtleburtle.net/bob/hash/doobs.html
69 static unsigned hash(const char* s)
70 {
71 unsigned h = WTF::stringHashingStartValue;
72 for (;;) {
73 char c = *s++;
74 if (!c) {
75 h += (h << 3);
76 h ^= (h >> 11);
77 h += (h << 15);
78 return h;
79 }
80 h += toASCIILower(c);
81 h += (h << 10);
82 h ^= (h >> 6);
83 }
84 }
85
86 static const bool safeToCompareToEmptyOrDeleted = false;
87};
88
89using TextEncodingNameMap = HashMap<const char*, const char*, TextEncodingNameHash>;
90using TextCodecMap = HashMap<const char*, NewTextCodecFunction>;
91
92static Lock encodingRegistryMutex;
93
94static TextEncodingNameMap* textEncodingNameMap;
95static TextCodecMap* textCodecMap;
96static bool didExtendTextCodecMaps;
97static HashSet<const char*>* japaneseEncodings;
98static HashSet<const char*>* nonBackslashEncodings;
99
100static const char* const textEncodingNameBlacklist[] = { "UTF-7", "BOCU-1", "SCSU" };
101
// Returns true for aliases that must not be added to the encoding name map.
static bool isUndesiredAlias(const char* alias)
{
    // Any alias containing a comma is rejected; this filters out the names with version numbers
    // that some back-ends support (such as "ISO_2022,locale=ja,version=0" in ICU).
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it
    // caused a compatibility problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
115
116static void addToTextEncodingNameMap(const char* alias, const char* name)
117{
118 ASSERT(strlen(alias) <= maxEncodingNameLength);
119 if (isUndesiredAlias(alias))
120 return;
121 const char* atomicName = textEncodingNameMap->get(name);
122 ASSERT(strcmp(alias, name) == 0 || atomicName);
123 if (!atomicName)
124 atomicName = name;
125
126 ASSERT_WITH_MESSAGE(!textEncodingNameMap->get(alias), "Duplicate text encoding name %s for %s (previously registered as %s)", alias, atomicName, textEncodingNameMap->get(alias));
127
128 textEncodingNameMap->add(alias, atomicName);
129}
130
131static void addToTextCodecMap(const char* name, NewTextCodecFunction&& function)
132{
133 const char* atomicName = textEncodingNameMap->get(name);
134 ASSERT(atomicName);
135 textCodecMap->add(atomicName, WTFMove(function));
136}
137
138static void pruneBlacklistedCodecs()
139{
140 for (auto& nameFromBlacklist : textEncodingNameBlacklist) {
141 auto* atomicName = textEncodingNameMap->get(nameFromBlacklist);
142 if (!atomicName)
143 continue;
144
145 Vector<const char*> names;
146 for (auto& entry : *textEncodingNameMap) {
147 if (entry.value == atomicName)
148 names.append(entry.key);
149 }
150
151 for (auto* name : names)
152 textEncodingNameMap->remove(name);
153
154 textCodecMap->remove(atomicName);
155 }
156}
157
158static void buildBaseTextCodecMaps(const std::lock_guard<Lock>&)
159{
160 ASSERT(!textCodecMap);
161 ASSERT(!textEncodingNameMap);
162
163 textCodecMap = new TextCodecMap;
164 textEncodingNameMap = new TextEncodingNameMap;
165
166 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
167 TextCodecLatin1::registerCodecs(addToTextCodecMap);
168
169 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
170 TextCodecUTF8::registerCodecs(addToTextCodecMap);
171
172 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
173 TextCodecUTF16::registerCodecs(addToTextCodecMap);
174
175 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
176 TextCodecUserDefined::registerCodecs(addToTextCodecMap);
177}
178
179static void addEncodingName(HashSet<const char*>* set, const char* name)
180{
181 // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
182 const char* atomicName = textEncodingNameMap->get(name);
183 if (atomicName)
184 set->add(atomicName);
185}
186
187static void buildQuirksSets()
188{
189 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
190 // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
191
192 ASSERT(!japaneseEncodings);
193 ASSERT(!nonBackslashEncodings);
194
195 japaneseEncodings = new HashSet<const char*>;
196 addEncodingName(japaneseEncodings, "EUC-JP");
197 addEncodingName(japaneseEncodings, "ISO-2022-JP");
198 addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
199 addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
200 addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
201 addEncodingName(japaneseEncodings, "JIS_C6226-1978");
202 addEncodingName(japaneseEncodings, "JIS_X0201");
203 addEncodingName(japaneseEncodings, "JIS_X0208-1983");
204 addEncodingName(japaneseEncodings, "JIS_X0208-1990");
205 addEncodingName(japaneseEncodings, "JIS_X0212-1990");
206 addEncodingName(japaneseEncodings, "Shift_JIS");
207 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
208 addEncodingName(japaneseEncodings, "cp932");
209 addEncodingName(japaneseEncodings, "x-mac-japanese");
210
211 nonBackslashEncodings = new HashSet<const char*>;
212 // The text encodings below treat backslash as a currency symbol for IE compatibility.
213 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
214 addEncodingName(nonBackslashEncodings, "x-mac-japanese");
215 addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
216 addEncodingName(nonBackslashEncodings, "EUC-JP");
217 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
218 addEncodingName(nonBackslashEncodings, "Shift_JIS");
219 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
220}
221
222bool isJapaneseEncoding(const char* canonicalEncodingName)
223{
224 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
225}
226
227bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
228{
229 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
230}
231
232static void extendTextCodecMaps()
233{
234 TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
235 TextCodecReplacement::registerCodecs(addToTextCodecMap);
236
237 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
238 TextCodecICU::registerCodecs(addToTextCodecMap);
239
240 pruneBlacklistedCodecs();
241 buildQuirksSets();
242}
243
244std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding)
245{
246 std::lock_guard<Lock> lock(encodingRegistryMutex);
247
248 ASSERT(textCodecMap);
249 auto result = textCodecMap->find(encoding.name());
250 ASSERT(result != textCodecMap->end());
251 return result->value();
252}
253
254const char* atomicCanonicalTextEncodingName(const char* name)
255{
256 if (!name || !name[0])
257 return nullptr;
258
259 std::lock_guard<Lock> lock(encodingRegistryMutex);
260
261 if (!textEncodingNameMap)
262 buildBaseTextCodecMaps(lock);
263
264 if (const char* atomicName = textEncodingNameMap->get(name))
265 return atomicName;
266 if (didExtendTextCodecMaps)
267 return nullptr;
268
269 extendTextCodecMaps();
270 didExtendTextCodecMaps = true;
271 return textEncodingNameMap->get(name);
272}
273
274template<typename CharacterType> static const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
275{
276 char buffer[maxEncodingNameLength + 1];
277 size_t j = 0;
278 for (size_t i = 0; i < length; ++i) {
279 if (j == maxEncodingNameLength)
280 return nullptr;
281 buffer[j++] = characters[i];
282 }
283 buffer[j] = 0;
284 return atomicCanonicalTextEncodingName(buffer);
285}
286
287const char* atomicCanonicalTextEncodingName(const String& alias)
288{
289 if (alias.isEmpty() || !alias.isAllASCII())
290 return nullptr;
291
292 if (alias.is8Bit())
293 return atomicCanonicalTextEncodingName(alias.characters8(), alias.length());
294
295 return atomicCanonicalTextEncodingName(alias.characters16(), alias.length());
296}
297
298bool noExtendedTextEncodingNameUsed()
299{
300 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
301 return !didExtendTextCodecMaps;
302}
303
304String defaultTextEncodingNameForSystemLanguage()
305{
306#if PLATFORM(COCOA)
307 String systemEncodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding());
308
309 // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949.
310 // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949.
311 // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>.
312 // On some OS versions, the result is CP949 (uppercase).
313 if (equalLettersIgnoringASCIICase(systemEncodingName, "cp949"))
314 systemEncodingName = "ks_c_5601-1987"_s;
315
316 // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874.
317 // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to
318 // "dos-874" instead.
319 if (equalLettersIgnoringASCIICase(systemEncodingName, "cp874"))
320 systemEncodingName = "dos-874"_s;
321
322 return systemEncodingName;
323#else
324 return "ISO-8859-1"_s;
325#endif
326}
327
328} // namespace WebCore
329