| 1 | /* | 
| 2 |  * Copyright (C) 2006-2017 Apple Inc. All rights reserved. | 
| 3 |  * Copyright (C) 2007-2009 Torch Mobile, Inc. | 
| 4 |  * | 
| 5 |  * Redistribution and use in source and binary forms, with or without | 
| 6 |  * modification, are permitted provided that the following conditions | 
| 7 |  * are met: | 
| 8 |  * 1. Redistributions of source code must retain the above copyright | 
| 9 |  *    notice, this list of conditions and the following disclaimer. | 
| 10 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 11 |  *    notice, this list of conditions and the following disclaimer in the | 
| 12 |  *    documentation and/or other materials provided with the distribution. | 
| 13 |  * | 
| 14 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 15 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 16 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 17 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 18 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 19 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 20 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 21 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 22 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 24 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 25 |  */ | 
| 26 |  | 
| 27 | #include "config.h" | 
| 28 | #include "TextEncodingRegistry.h" | 
| 29 |  | 
| 30 | #include "TextCodecICU.h" | 
| 31 | #include "TextCodecLatin1.h" | 
| 32 | #include "TextCodecReplacement.h" | 
| 33 | #include "TextCodecUTF16.h" | 
| 34 | #include "TextCodecUTF8.h" | 
| 35 | #include "TextCodecUserDefined.h" | 
| 36 | #include "TextEncoding.h" | 
| 37 | #include <mutex> | 
| 38 | #include <wtf/ASCIICType.h> | 
| 39 | #include <wtf/CheckedArithmetic.h> | 
| 40 | #include <wtf/HashMap.h> | 
| 41 | #include <wtf/HashSet.h> | 
| 42 | #include <wtf/Lock.h> | 
| 43 | #include <wtf/MainThread.h> | 
| 44 | #include <wtf/StdLibExtras.h> | 
| 45 | #include <wtf/text/CString.h> | 
| 46 |  | 
| 47 | namespace WebCore { | 
| 48 |  | 
// Upper bound on the length of an encoding name or alias, not counting the terminating NUL.
// It is asserted when aliases are registered and sizes the stack buffer (maxEncodingNameLength + 1)
// used when a String alias is converted to a C string for lookup.
const size_t maxEncodingNameLength = 63;
| 50 |  | 
| 51 | // Hash for all-ASCII strings that does case folding. | 
| 52 | struct TextEncodingNameHash { | 
| 53 |     static bool equal(const char* s1, const char* s2) | 
| 54 |     { | 
| 55 |         char c1; | 
| 56 |         char c2; | 
| 57 |         do { | 
| 58 |             c1 = *s1++; | 
| 59 |             c2 = *s2++; | 
| 60 |             if (toASCIILower(c1) != toASCIILower(c2)) | 
| 61 |                 return false; | 
| 62 |         } while (c1 && c2); | 
| 63 |         return !c1 && !c2; | 
| 64 |     } | 
| 65 |  | 
| 66 |     // This algorithm is the one-at-a-time hash from: | 
| 67 |     // http://burtleburtle.net/bob/hash/hashfaq.html | 
| 68 |     // http://burtleburtle.net/bob/hash/doobs.html | 
| 69 |     static unsigned hash(const char* s) | 
| 70 |     { | 
| 71 |         unsigned h = WTF::stringHashingStartValue; | 
| 72 |         for (;;) { | 
| 73 |             char c = *s++; | 
| 74 |             if (!c) { | 
| 75 |                 h += (h << 3); | 
| 76 |                 h ^= (h >> 11); | 
| 77 |                 h += (h << 15); | 
| 78 |                 return h; | 
| 79 |             } | 
| 80 |             h += toASCIILower(c); | 
| 81 |             h += (h << 10);  | 
| 82 |             h ^= (h >> 6);  | 
| 83 |         } | 
| 84 |     } | 
| 85 |  | 
| 86 |     static const bool safeToCompareToEmptyOrDeleted = false; | 
| 87 | }; | 
| 88 |  | 
// Maps every registered alias (hashed and compared with ASCII case folding through
// TextEncodingNameHash) to the canonical name pointer of its encoding.
using TextEncodingNameMap = HashMap<const char*, const char*, TextEncodingNameHash>;
// Maps a canonical name pointer taken from TextEncodingNameMap to the function that creates its codec.
using TextCodecMap = HashMap<const char*, NewTextCodecFunction>;

// Held by newTextCodec() and atomicCanonicalTextEncodingName() while they use the registry below.
static Lock encodingRegistryMutex;

// Both maps are allocated together by buildBaseTextCodecMaps().
static TextEncodingNameMap* textEncodingNameMap;
static TextCodecMap* textCodecMap;
// Becomes true once extendTextCodecMaps() has run.
static bool didExtendTextCodecMaps;
// Sets of canonical name pointers, allocated and filled by buildQuirksSets().
static HashSet<const char*>* japaneseEncodings;
static HashSet<const char*>* nonBackslashEncodings;

// Encodings whose aliases and codecs are removed again by pruneBlacklistedCodecs().
static const char* const textEncodingNameBlacklist[] = { "UTF-7" , "BOCU-1" , "SCSU"  };
| 101 |  | 
// Returns true for aliases that must not be registered even though a back-end reports them.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
| 115 |  | 
| 116 | static void addToTextEncodingNameMap(const char* alias, const char* name) | 
| 117 | { | 
| 118 |     ASSERT(strlen(alias) <= maxEncodingNameLength); | 
| 119 |     if (isUndesiredAlias(alias)) | 
| 120 |         return; | 
| 121 |     const char* atomicName = textEncodingNameMap->get(name); | 
| 122 |     ASSERT(strcmp(alias, name) == 0 || atomicName); | 
| 123 |     if (!atomicName) | 
| 124 |         atomicName = name; | 
| 125 |  | 
| 126 |     ASSERT_WITH_MESSAGE(!textEncodingNameMap->get(alias), "Duplicate text encoding name %s for %s (previously registered as %s)" , alias, atomicName, textEncodingNameMap->get(alias)); | 
| 127 |  | 
| 128 |     textEncodingNameMap->add(alias, atomicName); | 
| 129 | } | 
| 130 |  | 
| 131 | static void addToTextCodecMap(const char* name, NewTextCodecFunction&& function) | 
| 132 | { | 
| 133 |     const char* atomicName = textEncodingNameMap->get(name); | 
| 134 |     ASSERT(atomicName); | 
| 135 |     textCodecMap->add(atomicName, WTFMove(function)); | 
| 136 | } | 
| 137 |  | 
| 138 | static void pruneBlacklistedCodecs() | 
| 139 | { | 
| 140 |     for (auto& nameFromBlacklist : textEncodingNameBlacklist) { | 
| 141 |         auto* atomicName = textEncodingNameMap->get(nameFromBlacklist); | 
| 142 |         if (!atomicName) | 
| 143 |             continue; | 
| 144 |  | 
| 145 |         Vector<const char*> names; | 
| 146 |         for (auto& entry : *textEncodingNameMap) { | 
| 147 |             if (entry.value == atomicName) | 
| 148 |                 names.append(entry.key); | 
| 149 |         } | 
| 150 |  | 
| 151 |         for (auto* name : names) | 
| 152 |             textEncodingNameMap->remove(name); | 
| 153 |  | 
| 154 |         textCodecMap->remove(atomicName); | 
| 155 |     } | 
| 156 | } | 
| 157 |  | 
// Allocates textCodecMap and textEncodingNameMap and registers the Latin-1, UTF-8, UTF-16 and
// user-defined codecs. Must only run while both maps are still null.
// The lock_guard parameter is not used in the body; presumably it exists so that callers
// demonstrate they hold encodingRegistryMutex.
static void buildBaseTextCodecMaps(const std::lock_guard<Lock>&)
{
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec the names are registered before the factory, because addToTextCodecMap()
    // looks the name up in textEncodingNameMap.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
| 178 |  | 
| 179 | static void addEncodingName(HashSet<const char*>* set, const char* name) | 
| 180 | { | 
| 181 |     // We must not use atomicCanonicalTextEncodingName() because this function is called in it. | 
| 182 |     const char* atomicName = textEncodingNameMap->get(name); | 
| 183 |     if (atomicName) | 
| 184 |         set->add(atomicName); | 
| 185 | } | 
| 186 |  | 
| 187 | static void buildQuirksSets() | 
| 188 | { | 
| 189 |     // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() | 
| 190 |     // and initializing the sets for them in TextEncodingRegistry.cpp look strange. | 
| 191 |  | 
| 192 |     ASSERT(!japaneseEncodings); | 
| 193 |     ASSERT(!nonBackslashEncodings); | 
| 194 |  | 
| 195 |     japaneseEncodings = new HashSet<const char*>; | 
| 196 |     addEncodingName(japaneseEncodings, "EUC-JP" ); | 
| 197 |     addEncodingName(japaneseEncodings, "ISO-2022-JP" ); | 
| 198 |     addEncodingName(japaneseEncodings, "ISO-2022-JP-1" ); | 
| 199 |     addEncodingName(japaneseEncodings, "ISO-2022-JP-2" ); | 
| 200 |     addEncodingName(japaneseEncodings, "ISO-2022-JP-3" ); | 
| 201 |     addEncodingName(japaneseEncodings, "JIS_C6226-1978" ); | 
| 202 |     addEncodingName(japaneseEncodings, "JIS_X0201" ); | 
| 203 |     addEncodingName(japaneseEncodings, "JIS_X0208-1983" ); | 
| 204 |     addEncodingName(japaneseEncodings, "JIS_X0208-1990" ); | 
| 205 |     addEncodingName(japaneseEncodings, "JIS_X0212-1990" ); | 
| 206 |     addEncodingName(japaneseEncodings, "Shift_JIS" ); | 
| 207 |     addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000" ); | 
| 208 |     addEncodingName(japaneseEncodings, "cp932" ); | 
| 209 |     addEncodingName(japaneseEncodings, "x-mac-japanese" ); | 
| 210 |  | 
| 211 |     nonBackslashEncodings = new HashSet<const char*>; | 
| 212 |     // The text encodings below treat backslash as a currency symbol for IE compatibility. | 
| 213 |     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. | 
| 214 |     addEncodingName(nonBackslashEncodings, "x-mac-japanese" ); | 
| 215 |     addEncodingName(nonBackslashEncodings, "ISO-2022-JP" ); | 
| 216 |     addEncodingName(nonBackslashEncodings, "EUC-JP" ); | 
| 217 |     // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. | 
| 218 |     addEncodingName(nonBackslashEncodings, "Shift_JIS" ); | 
| 219 |     addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000" ); | 
| 220 | } | 
| 221 |  | 
| 222 | bool isJapaneseEncoding(const char* canonicalEncodingName) | 
| 223 | { | 
| 224 |     return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); | 
| 225 | } | 
| 226 |  | 
| 227 | bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) | 
| 228 | { | 
| 229 |     return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); | 
| 230 | } | 
| 231 |  | 
// Adds the replacement and ICU codecs to the registry, removes the blacklisted encodings and
// then builds the quirk sets. Called by atomicCanonicalTextEncodingName() when a name is not
// found and the maps have not been extended yet.
static void extendTextCodecMaps()
{
    // Names are registered before factories because addToTextCodecMap() resolves the name
    // through textEncodingNameMap.
    TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecReplacement::registerCodecs(addToTextCodecMap);

    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    // Pruning runs before the quirk sets are built; buildQuirksSets() only picks up names that
    // are still present in textEncodingNameMap.
    pruneBlacklistedCodecs();
    buildQuirksSets();
}
| 243 |  | 
| 244 | std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding) | 
| 245 | { | 
| 246 |     std::lock_guard<Lock> lock(encodingRegistryMutex); | 
| 247 |  | 
| 248 |     ASSERT(textCodecMap); | 
| 249 |     auto result = textCodecMap->find(encoding.name()); | 
| 250 |     ASSERT(result != textCodecMap->end()); | 
| 251 |     return result->value(); | 
| 252 | } | 
| 253 |  | 
| 254 | const char* atomicCanonicalTextEncodingName(const char* name) | 
| 255 | { | 
| 256 |     if (!name || !name[0]) | 
| 257 |         return nullptr; | 
| 258 |  | 
| 259 |     std::lock_guard<Lock> lock(encodingRegistryMutex); | 
| 260 |  | 
| 261 |     if (!textEncodingNameMap) | 
| 262 |         buildBaseTextCodecMaps(lock); | 
| 263 |  | 
| 264 |     if (const char* atomicName = textEncodingNameMap->get(name)) | 
| 265 |         return atomicName; | 
| 266 |     if (didExtendTextCodecMaps) | 
| 267 |         return nullptr; | 
| 268 |  | 
| 269 |     extendTextCodecMaps(); | 
| 270 |     didExtendTextCodecMaps = true; | 
| 271 |     return textEncodingNameMap->get(name); | 
| 272 | } | 
| 273 |  | 
| 274 | template<typename CharacterType> static const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) | 
| 275 | { | 
| 276 |     char buffer[maxEncodingNameLength + 1]; | 
| 277 |     size_t j = 0; | 
| 278 |     for (size_t i = 0; i < length; ++i) { | 
| 279 |         if (j == maxEncodingNameLength) | 
| 280 |             return nullptr; | 
| 281 |         buffer[j++] = characters[i]; | 
| 282 |     } | 
| 283 |     buffer[j] = 0; | 
| 284 |     return atomicCanonicalTextEncodingName(buffer); | 
| 285 | } | 
| 286 |  | 
| 287 | const char* atomicCanonicalTextEncodingName(const String& alias) | 
| 288 | { | 
| 289 |     if (alias.isEmpty() || !alias.isAllASCII()) | 
| 290 |         return nullptr; | 
| 291 |  | 
| 292 |     if (alias.is8Bit()) | 
| 293 |         return atomicCanonicalTextEncodingName(alias.characters8(), alias.length()); | 
| 294 |  | 
| 295 |     return atomicCanonicalTextEncodingName(alias.characters16(), alias.length()); | 
| 296 | } | 
| 297 |  | 
// Returns true while extendTextCodecMaps() has not run, i.e. while didExtendTextCodecMaps is
// still false. The flag is read here without taking encodingRegistryMutex.
bool noExtendedTextEncodingNameUsed()
{
    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
    return !didExtendTextCodecMaps;
}
| 303 |  | 
| 304 | String defaultTextEncodingNameForSystemLanguage() | 
| 305 | { | 
| 306 | #if PLATFORM(COCOA) | 
| 307 |     String systemEncodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding()); | 
| 308 |  | 
| 309 |     // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949. | 
| 310 |     // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949. | 
| 311 |     // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>. | 
| 312 |     // On some OS versions, the result is CP949 (uppercase). | 
| 313 |     if (equalLettersIgnoringASCIICase(systemEncodingName, "cp949" )) | 
| 314 |         systemEncodingName = "ks_c_5601-1987"_s ; | 
| 315 |  | 
| 316 |     // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874. | 
| 317 |     // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to | 
| 318 |     // "dos-874" instead. | 
| 319 |     if (equalLettersIgnoringASCIICase(systemEncodingName, "cp874" )) | 
| 320 |         systemEncodingName = "dos-874"_s ; | 
| 321 |  | 
| 322 |     return systemEncodingName; | 
| 323 | #else | 
| 324 |     return "ISO-8859-1"_s ; | 
| 325 | #endif | 
| 326 | } | 
| 327 |  | 
| 328 | } // namespace WebCore | 
| 329 |  |