| 1 | /* |
| 2 | * Copyright (C) 2006-2017 Apple Inc. All rights reserved. |
| 3 | * Copyright (C) 2007-2009 Torch Mobile, Inc. |
| 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions |
| 7 | * are met: |
| 8 | * 1. Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer in the |
| 12 | * documentation and/or other materials provided with the distribution. |
| 13 | * |
| 14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| 15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| 18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| 22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 25 | */ |
| 26 | |
| 27 | #include "config.h" |
| 28 | #include "TextEncodingRegistry.h" |
| 29 | |
| 30 | #include "TextCodecICU.h" |
| 31 | #include "TextCodecLatin1.h" |
| 32 | #include "TextCodecReplacement.h" |
| 33 | #include "TextCodecUTF16.h" |
| 34 | #include "TextCodecUTF8.h" |
| 35 | #include "TextCodecUserDefined.h" |
| 36 | #include "TextEncoding.h" |
| 37 | #include <mutex> |
| 38 | #include <wtf/ASCIICType.h> |
| 39 | #include <wtf/CheckedArithmetic.h> |
| 40 | #include <wtf/HashMap.h> |
| 41 | #include <wtf/HashSet.h> |
| 42 | #include <wtf/Lock.h> |
| 43 | #include <wtf/MainThread.h> |
| 44 | #include <wtf/StdLibExtras.h> |
| 45 | #include <wtf/text/CString.h> |
| 46 | |
| 47 | namespace WebCore { |
| 48 | |
// Longest encoding name or alias, in characters and excluding the terminating NUL, that the registry handles.
// It bounds the aliases accepted by addToTextEncodingNameMap() and sizes the stack buffer used when
// converting character arrays into C strings for lookup.
constexpr size_t maxEncodingNameLength = 63;
| 50 | |
| 51 | // Hash for all-ASCII strings that does case folding. |
| 52 | struct TextEncodingNameHash { |
| 53 | static bool equal(const char* s1, const char* s2) |
| 54 | { |
| 55 | char c1; |
| 56 | char c2; |
| 57 | do { |
| 58 | c1 = *s1++; |
| 59 | c2 = *s2++; |
| 60 | if (toASCIILower(c1) != toASCIILower(c2)) |
| 61 | return false; |
| 62 | } while (c1 && c2); |
| 63 | return !c1 && !c2; |
| 64 | } |
| 65 | |
| 66 | // This algorithm is the one-at-a-time hash from: |
| 67 | // http://burtleburtle.net/bob/hash/hashfaq.html |
| 68 | // http://burtleburtle.net/bob/hash/doobs.html |
| 69 | static unsigned hash(const char* s) |
| 70 | { |
| 71 | unsigned h = WTF::stringHashingStartValue; |
| 72 | for (;;) { |
| 73 | char c = *s++; |
| 74 | if (!c) { |
| 75 | h += (h << 3); |
| 76 | h ^= (h >> 11); |
| 77 | h += (h << 15); |
| 78 | return h; |
| 79 | } |
| 80 | h += toASCIILower(c); |
| 81 | h += (h << 10); |
| 82 | h ^= (h >> 6); |
| 83 | } |
| 84 | } |
| 85 | |
| 86 | static const bool safeToCompareToEmptyOrDeleted = false; |
| 87 | }; |
| 88 | |
| 89 | using TextEncodingNameMap = HashMap<const char*, const char*, TextEncodingNameHash>; |
| 90 | using TextCodecMap = HashMap<const char*, NewTextCodecFunction>; |
| 91 | |
| 92 | static Lock encodingRegistryMutex; |
| 93 | |
| 94 | static TextEncodingNameMap* textEncodingNameMap; |
| 95 | static TextCodecMap* textCodecMap; |
| 96 | static bool didExtendTextCodecMaps; |
| 97 | static HashSet<const char*>* japaneseEncodings; |
| 98 | static HashSet<const char*>* nonBackslashEncodings; |
| 99 | |
| 100 | static const char* const textEncodingNameBlacklist[] = { "UTF-7" , "BOCU-1" , "SCSU" }; |
| 101 | |
// Returns true for aliases that must not be registered even though a back-end offers them.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends
    // (such as "ISO_2022,locale=ja,version=0" in ICU); they are recognizable by the comma.
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it
    // caused a compatibility problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
| 115 | |
| 116 | static void addToTextEncodingNameMap(const char* alias, const char* name) |
| 117 | { |
| 118 | ASSERT(strlen(alias) <= maxEncodingNameLength); |
| 119 | if (isUndesiredAlias(alias)) |
| 120 | return; |
| 121 | const char* atomicName = textEncodingNameMap->get(name); |
| 122 | ASSERT(strcmp(alias, name) == 0 || atomicName); |
| 123 | if (!atomicName) |
| 124 | atomicName = name; |
| 125 | |
| 126 | ASSERT_WITH_MESSAGE(!textEncodingNameMap->get(alias), "Duplicate text encoding name %s for %s (previously registered as %s)" , alias, atomicName, textEncodingNameMap->get(alias)); |
| 127 | |
| 128 | textEncodingNameMap->add(alias, atomicName); |
| 129 | } |
| 130 | |
| 131 | static void addToTextCodecMap(const char* name, NewTextCodecFunction&& function) |
| 132 | { |
| 133 | const char* atomicName = textEncodingNameMap->get(name); |
| 134 | ASSERT(atomicName); |
| 135 | textCodecMap->add(atomicName, WTFMove(function)); |
| 136 | } |
| 137 | |
| 138 | static void pruneBlacklistedCodecs() |
| 139 | { |
| 140 | for (auto& nameFromBlacklist : textEncodingNameBlacklist) { |
| 141 | auto* atomicName = textEncodingNameMap->get(nameFromBlacklist); |
| 142 | if (!atomicName) |
| 143 | continue; |
| 144 | |
| 145 | Vector<const char*> names; |
| 146 | for (auto& entry : *textEncodingNameMap) { |
| 147 | if (entry.value == atomicName) |
| 148 | names.append(entry.key); |
| 149 | } |
| 150 | |
| 151 | for (auto* name : names) |
| 152 | textEncodingNameMap->remove(name); |
| 153 | |
| 154 | textCodecMap->remove(atomicName); |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | static void buildBaseTextCodecMaps(const std::lock_guard<Lock>&) |
| 159 | { |
| 160 | ASSERT(!textCodecMap); |
| 161 | ASSERT(!textEncodingNameMap); |
| 162 | |
| 163 | textCodecMap = new TextCodecMap; |
| 164 | textEncodingNameMap = new TextEncodingNameMap; |
| 165 | |
| 166 | TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); |
| 167 | TextCodecLatin1::registerCodecs(addToTextCodecMap); |
| 168 | |
| 169 | TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); |
| 170 | TextCodecUTF8::registerCodecs(addToTextCodecMap); |
| 171 | |
| 172 | TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); |
| 173 | TextCodecUTF16::registerCodecs(addToTextCodecMap); |
| 174 | |
| 175 | TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); |
| 176 | TextCodecUserDefined::registerCodecs(addToTextCodecMap); |
| 177 | } |
| 178 | |
| 179 | static void addEncodingName(HashSet<const char*>* set, const char* name) |
| 180 | { |
| 181 | // We must not use atomicCanonicalTextEncodingName() because this function is called in it. |
| 182 | const char* atomicName = textEncodingNameMap->get(name); |
| 183 | if (atomicName) |
| 184 | set->add(atomicName); |
| 185 | } |
| 186 | |
| 187 | static void buildQuirksSets() |
| 188 | { |
| 189 | // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() |
| 190 | // and initializing the sets for them in TextEncodingRegistry.cpp look strange. |
| 191 | |
| 192 | ASSERT(!japaneseEncodings); |
| 193 | ASSERT(!nonBackslashEncodings); |
| 194 | |
| 195 | japaneseEncodings = new HashSet<const char*>; |
| 196 | addEncodingName(japaneseEncodings, "EUC-JP" ); |
| 197 | addEncodingName(japaneseEncodings, "ISO-2022-JP" ); |
| 198 | addEncodingName(japaneseEncodings, "ISO-2022-JP-1" ); |
| 199 | addEncodingName(japaneseEncodings, "ISO-2022-JP-2" ); |
| 200 | addEncodingName(japaneseEncodings, "ISO-2022-JP-3" ); |
| 201 | addEncodingName(japaneseEncodings, "JIS_C6226-1978" ); |
| 202 | addEncodingName(japaneseEncodings, "JIS_X0201" ); |
| 203 | addEncodingName(japaneseEncodings, "JIS_X0208-1983" ); |
| 204 | addEncodingName(japaneseEncodings, "JIS_X0208-1990" ); |
| 205 | addEncodingName(japaneseEncodings, "JIS_X0212-1990" ); |
| 206 | addEncodingName(japaneseEncodings, "Shift_JIS" ); |
| 207 | addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000" ); |
| 208 | addEncodingName(japaneseEncodings, "cp932" ); |
| 209 | addEncodingName(japaneseEncodings, "x-mac-japanese" ); |
| 210 | |
| 211 | nonBackslashEncodings = new HashSet<const char*>; |
| 212 | // The text encodings below treat backslash as a currency symbol for IE compatibility. |
| 213 | // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. |
| 214 | addEncodingName(nonBackslashEncodings, "x-mac-japanese" ); |
| 215 | addEncodingName(nonBackslashEncodings, "ISO-2022-JP" ); |
| 216 | addEncodingName(nonBackslashEncodings, "EUC-JP" ); |
| 217 | // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. |
| 218 | addEncodingName(nonBackslashEncodings, "Shift_JIS" ); |
| 219 | addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000" ); |
| 220 | } |
| 221 | |
| 222 | bool isJapaneseEncoding(const char* canonicalEncodingName) |
| 223 | { |
| 224 | return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); |
| 225 | } |
| 226 | |
| 227 | bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) |
| 228 | { |
| 229 | return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); |
| 230 | } |
| 231 | |
| 232 | static void extendTextCodecMaps() |
| 233 | { |
| 234 | TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap); |
| 235 | TextCodecReplacement::registerCodecs(addToTextCodecMap); |
| 236 | |
| 237 | TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); |
| 238 | TextCodecICU::registerCodecs(addToTextCodecMap); |
| 239 | |
| 240 | pruneBlacklistedCodecs(); |
| 241 | buildQuirksSets(); |
| 242 | } |
| 243 | |
| 244 | std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding) |
| 245 | { |
| 246 | std::lock_guard<Lock> lock(encodingRegistryMutex); |
| 247 | |
| 248 | ASSERT(textCodecMap); |
| 249 | auto result = textCodecMap->find(encoding.name()); |
| 250 | ASSERT(result != textCodecMap->end()); |
| 251 | return result->value(); |
| 252 | } |
| 253 | |
| 254 | const char* atomicCanonicalTextEncodingName(const char* name) |
| 255 | { |
| 256 | if (!name || !name[0]) |
| 257 | return nullptr; |
| 258 | |
| 259 | std::lock_guard<Lock> lock(encodingRegistryMutex); |
| 260 | |
| 261 | if (!textEncodingNameMap) |
| 262 | buildBaseTextCodecMaps(lock); |
| 263 | |
| 264 | if (const char* atomicName = textEncodingNameMap->get(name)) |
| 265 | return atomicName; |
| 266 | if (didExtendTextCodecMaps) |
| 267 | return nullptr; |
| 268 | |
| 269 | extendTextCodecMaps(); |
| 270 | didExtendTextCodecMaps = true; |
| 271 | return textEncodingNameMap->get(name); |
| 272 | } |
| 273 | |
| 274 | template<typename CharacterType> static const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) |
| 275 | { |
| 276 | char buffer[maxEncodingNameLength + 1]; |
| 277 | size_t j = 0; |
| 278 | for (size_t i = 0; i < length; ++i) { |
| 279 | if (j == maxEncodingNameLength) |
| 280 | return nullptr; |
| 281 | buffer[j++] = characters[i]; |
| 282 | } |
| 283 | buffer[j] = 0; |
| 284 | return atomicCanonicalTextEncodingName(buffer); |
| 285 | } |
| 286 | |
| 287 | const char* atomicCanonicalTextEncodingName(const String& alias) |
| 288 | { |
| 289 | if (alias.isEmpty() || !alias.isAllASCII()) |
| 290 | return nullptr; |
| 291 | |
| 292 | if (alias.is8Bit()) |
| 293 | return atomicCanonicalTextEncodingName(alias.characters8(), alias.length()); |
| 294 | |
| 295 | return atomicCanonicalTextEncodingName(alias.characters16(), alias.length()); |
| 296 | } |
| 297 | |
| 298 | bool noExtendedTextEncodingNameUsed() |
| 299 | { |
| 300 | // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. |
| 301 | return !didExtendTextCodecMaps; |
| 302 | } |
| 303 | |
| 304 | String defaultTextEncodingNameForSystemLanguage() |
| 305 | { |
| 306 | #if PLATFORM(COCOA) |
| 307 | String systemEncodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding()); |
| 308 | |
| 309 | // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949. |
| 310 | // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949. |
| 311 | // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>. |
| 312 | // On some OS versions, the result is CP949 (uppercase). |
| 313 | if (equalLettersIgnoringASCIICase(systemEncodingName, "cp949" )) |
| 314 | systemEncodingName = "ks_c_5601-1987"_s ; |
| 315 | |
| 316 | // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874. |
| 317 | // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to |
| 318 | // "dos-874" instead. |
| 319 | if (equalLettersIgnoringASCIICase(systemEncodingName, "cp874" )) |
| 320 | systemEncodingName = "dos-874"_s ; |
| 321 | |
| 322 | return systemEncodingName; |
| 323 | #else |
| 324 | return "ISO-8859-1"_s ; |
| 325 | #endif |
| 326 | } |
| 327 | |
| 328 | } // namespace WebCore |
| 329 | |