| 1 | /* | 
| 2 |  * Copyright (C) 2004-2019 Apple Inc. All rights reserved. | 
| 3 |  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> | 
| 4 |  * Copyright (C) 2007-2009 Torch Mobile, Inc. | 
| 5 |  * | 
| 6 |  * Redistribution and use in source and binary forms, with or without | 
| 7 |  * modification, are permitted provided that the following conditions | 
| 8 |  * are met: | 
| 9 |  * 1. Redistributions of source code must retain the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer. | 
| 11 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 12 |  *    notice, this list of conditions and the following disclaimer in the | 
| 13 |  *    documentation and/or other materials provided with the distribution. | 
| 14 |  * | 
| 15 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 26 |  */ | 
| 27 |  | 
| 28 | #include "config.h" | 
| 29 | #include "TextEncoding.h" | 
| 30 |  | 
| 31 | #include "DecodeEscapeSequences.h" | 
| 32 | #include "TextCodec.h" | 
| 33 | #include "TextEncodingRegistry.h" | 
| 34 | #include <wtf/NeverDestroyed.h> | 
| 35 | #include <wtf/StdLibExtras.h> | 
| 36 | #include <wtf/text/StringView.h> | 
| 37 |  | 
| 38 | namespace WebCore { | 
| 39 |  | 
| 40 | static const TextEncoding& UTF7Encoding() | 
| 41 | { | 
| 42 |     static NeverDestroyed<TextEncoding> globalUTF7Encoding("UTF-7" ); | 
| 43 |     return globalUTF7Encoding; | 
| 44 | } | 
| 45 |  | 
| 46 | TextEncoding::TextEncoding(const char* name) | 
| 47 |     : m_name(atomicCanonicalTextEncodingName(name)) | 
| 48 |     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) | 
| 49 | { | 
| 50 |     // Aliases are valid, but not "replacement" itself. | 
| 51 |     if (equalLettersIgnoringASCIICase(name, "replacement" )) | 
| 52 |         m_name = nullptr; | 
| 53 | } | 
| 54 |  | 
| 55 | TextEncoding::TextEncoding(const String& name) | 
| 56 |     : m_name(atomicCanonicalTextEncodingName(name)) | 
| 57 |     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) | 
| 58 | { | 
| 59 |     // Aliases are valid, but not "replacement" itself. | 
| 60 |     if (equalLettersIgnoringASCIICase(name, "replacement" )) | 
| 61 |         m_name = nullptr; | 
| 62 | } | 
| 63 |  | 
| 64 | String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const | 
| 65 | { | 
| 66 |     if (!m_name) | 
| 67 |         return String(); | 
| 68 |  | 
| 69 |     return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); | 
| 70 | } | 
| 71 |  | 
| 72 | Vector<uint8_t> TextEncoding::encode(StringView string, UnencodableHandling handling) const | 
| 73 | { | 
| 74 |     if (!m_name || string.isEmpty()) | 
| 75 |         return { }; | 
| 76 |  | 
| 77 |     // FIXME: What's the right place to do normalization? | 
| 78 |     // It's a little strange to do it inside the encode function. | 
| 79 |     // Perhaps normalization should be an explicit step done before calling encode. | 
| 80 |     auto normalizedString = normalizedNFC(string); | 
| 81 |     return newTextCodec(*this)->encode(normalizedString.view, handling); | 
| 82 | } | 
| 83 |  | 
| 84 | const char* TextEncoding::domName() const | 
| 85 | { | 
| 86 |     if (noExtendedTextEncodingNameUsed()) | 
| 87 |         return m_name; | 
| 88 |  | 
| 89 |     // We treat EUC-KR as windows-949 (its superset), but need to expose  | 
| 90 |     // the name 'EUC-KR' because the name 'windows-949' is not recognized by | 
| 91 |     // most Korean web servers even though they do use the encoding | 
| 92 |     // 'windows-949' with the name 'EUC-KR'.  | 
| 93 |     // FIXME: This is not thread-safe. At the moment, this function is | 
| 94 |     // only accessed in a single thread, but eventually has to be made | 
| 95 |     // thread-safe along with usesVisualOrdering(). | 
| 96 |     static const char* const a = atomicCanonicalTextEncodingName("windows-949" ); | 
| 97 |     if (m_name == a) | 
| 98 |         return "EUC-KR" ; | 
| 99 |     return m_name; | 
| 100 | } | 
| 101 |  | 
| 102 | bool TextEncoding::usesVisualOrdering() const | 
| 103 | { | 
| 104 |     if (noExtendedTextEncodingNameUsed()) | 
| 105 |         return false; | 
| 106 |  | 
| 107 |     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8" ); | 
| 108 |     return m_name == a; | 
| 109 | } | 
| 110 |  | 
| 111 | bool TextEncoding::isJapanese() const | 
| 112 | { | 
| 113 |     return isJapaneseEncoding(m_name); | 
| 114 | } | 
| 115 |  | 
| 116 | UChar TextEncoding::backslashAsCurrencySymbol() const | 
| 117 | { | 
| 118 |     return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\'; | 
| 119 | } | 
| 120 |  | 
| 121 | bool TextEncoding::isNonByteBasedEncoding() const | 
| 122 | { | 
| 123 |     return *this == UTF16LittleEndianEncoding() || *this == UTF16BigEndianEncoding(); | 
| 124 | } | 
| 125 |  | 
| 126 | bool TextEncoding::isUTF7Encoding() const | 
| 127 | { | 
| 128 |     if (noExtendedTextEncodingNameUsed()) | 
| 129 |         return false; | 
| 130 |  | 
| 131 |     return *this == UTF7Encoding(); | 
| 132 | } | 
| 133 |  | 
| 134 | const TextEncoding& TextEncoding::closestByteBasedEquivalent() const | 
| 135 | { | 
| 136 |     if (isNonByteBasedEncoding()) | 
| 137 |         return UTF8Encoding(); | 
| 138 |     return *this;  | 
| 139 | } | 
| 140 |  | 
| 141 | // HTML5 specifies that UTF-8 be used in form submission when a form is  | 
| 142 | // is a part of a document in UTF-16 probably because UTF-16 is not a  | 
| 143 | // byte-based encoding and can contain 0x00. By extension, the same | 
| 144 | // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, | 
| 145 | // but it's fraught with problems and we'd rather steer clear of it. | 
| 146 | const TextEncoding& TextEncoding::encodingForFormSubmissionOrURLParsing() const | 
| 147 | { | 
| 148 |     if (isNonByteBasedEncoding() || isUTF7Encoding()) | 
| 149 |         return UTF8Encoding(); | 
| 150 |     return *this; | 
| 151 | } | 
| 152 |  | 
| 153 | const TextEncoding& ASCIIEncoding() | 
| 154 | { | 
| 155 |     static NeverDestroyed<TextEncoding> globalASCIIEncoding("ASCII" ); | 
| 156 |     return globalASCIIEncoding; | 
| 157 | } | 
| 158 |  | 
| 159 | const TextEncoding& Latin1Encoding() | 
| 160 | { | 
| 161 |     static NeverDestroyed<TextEncoding> globalLatin1Encoding("latin1" ); | 
| 162 |     return globalLatin1Encoding; | 
| 163 | } | 
| 164 |  | 
| 165 | const TextEncoding& UTF16BigEndianEncoding() | 
| 166 | { | 
| 167 |     static NeverDestroyed<TextEncoding> globalUTF16BigEndianEncoding("UTF-16BE" ); | 
| 168 |     return globalUTF16BigEndianEncoding; | 
| 169 | } | 
| 170 |  | 
| 171 | const TextEncoding& UTF16LittleEndianEncoding() | 
| 172 | { | 
| 173 |     static NeverDestroyed<TextEncoding> globalUTF16LittleEndianEncoding("UTF-16LE" ); | 
| 174 |     return globalUTF16LittleEndianEncoding; | 
| 175 | } | 
| 176 |  | 
| 177 | const TextEncoding& UTF8Encoding() | 
| 178 | { | 
| 179 |     static NeverDestroyed<TextEncoding> globalUTF8Encoding("UTF-8" ); | 
| 180 |     ASSERT(globalUTF8Encoding.get().isValid()); | 
| 181 |     return globalUTF8Encoding; | 
| 182 | } | 
| 183 |  | 
| 184 | const TextEncoding& WindowsLatin1Encoding() | 
| 185 | { | 
| 186 |     static NeverDestroyed<TextEncoding> globalWindowsLatin1Encoding("WinLatin-1" ); | 
| 187 |     return globalWindowsLatin1Encoding; | 
| 188 | } | 
| 189 |  | 
| 190 | String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding) | 
| 191 | { | 
| 192 |     if (string.isEmpty()) | 
| 193 |         return string; | 
| 194 |     return decodeEscapeSequences<URLEscapeSequence>(string, encoding); | 
| 195 | } | 
| 196 |  | 
| 197 | } // namespace WebCore | 
| 198 |  |