1/*
2 * Copyright (C) 2004-2019 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "TextEncoding.h"
30
31#include "DecodeEscapeSequences.h"
32#include "TextCodec.h"
33#include "TextEncodingRegistry.h"
34#include <wtf/NeverDestroyed.h>
35#include <wtf/StdLibExtras.h>
36#include <wtf/text/StringView.h>
37
38namespace WebCore {
39
40static const TextEncoding& UTF7Encoding()
41{
42 static NeverDestroyed<TextEncoding> globalUTF7Encoding("UTF-7");
43 return globalUTF7Encoding;
44}
45
46TextEncoding::TextEncoding(const char* name)
47 : m_name(atomicCanonicalTextEncodingName(name))
48 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
49{
50 // Aliases are valid, but not "replacement" itself.
51 if (equalLettersIgnoringASCIICase(name, "replacement"))
52 m_name = nullptr;
53}
54
55TextEncoding::TextEncoding(const String& name)
56 : m_name(atomicCanonicalTextEncodingName(name))
57 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
58{
59 // Aliases are valid, but not "replacement" itself.
60 if (equalLettersIgnoringASCIICase(name, "replacement"))
61 m_name = nullptr;
62}
63
64String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
65{
66 if (!m_name)
67 return String();
68
69 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
70}
71
72Vector<uint8_t> TextEncoding::encode(StringView string, UnencodableHandling handling) const
73{
74 if (!m_name || string.isEmpty())
75 return { };
76
77 // FIXME: What's the right place to do normalization?
78 // It's a little strange to do it inside the encode function.
79 // Perhaps normalization should be an explicit step done before calling encode.
80 auto normalizedString = normalizedNFC(string);
81 return newTextCodec(*this)->encode(normalizedString.view, handling);
82}
83
84const char* TextEncoding::domName() const
85{
86 if (noExtendedTextEncodingNameUsed())
87 return m_name;
88
89 // We treat EUC-KR as windows-949 (its superset), but need to expose
90 // the name 'EUC-KR' because the name 'windows-949' is not recognized by
91 // most Korean web servers even though they do use the encoding
92 // 'windows-949' with the name 'EUC-KR'.
93 // FIXME: This is not thread-safe. At the moment, this function is
94 // only accessed in a single thread, but eventually has to be made
95 // thread-safe along with usesVisualOrdering().
96 static const char* const a = atomicCanonicalTextEncodingName("windows-949");
97 if (m_name == a)
98 return "EUC-KR";
99 return m_name;
100}
101
102bool TextEncoding::usesVisualOrdering() const
103{
104 if (noExtendedTextEncodingNameUsed())
105 return false;
106
107 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
108 return m_name == a;
109}
110
111bool TextEncoding::isJapanese() const
112{
113 return isJapaneseEncoding(m_name);
114}
115
116UChar TextEncoding::backslashAsCurrencySymbol() const
117{
118 return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
119}
120
121bool TextEncoding::isNonByteBasedEncoding() const
122{
123 return *this == UTF16LittleEndianEncoding() || *this == UTF16BigEndianEncoding();
124}
125
126bool TextEncoding::isUTF7Encoding() const
127{
128 if (noExtendedTextEncodingNameUsed())
129 return false;
130
131 return *this == UTF7Encoding();
132}
133
134const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
135{
136 if (isNonByteBasedEncoding())
137 return UTF8Encoding();
138 return *this;
139}
140
// HTML5 specifies that UTF-8 be used in form submission when a form is
// a part of a document in UTF-16, probably because UTF-16 is not a
143// byte-based encoding and can contain 0x00. By extension, the same
144// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
145// but it's fraught with problems and we'd rather steer clear of it.
146const TextEncoding& TextEncoding::encodingForFormSubmissionOrURLParsing() const
147{
148 if (isNonByteBasedEncoding() || isUTF7Encoding())
149 return UTF8Encoding();
150 return *this;
151}
152
153const TextEncoding& ASCIIEncoding()
154{
155 static NeverDestroyed<TextEncoding> globalASCIIEncoding("ASCII");
156 return globalASCIIEncoding;
157}
158
159const TextEncoding& Latin1Encoding()
160{
161 static NeverDestroyed<TextEncoding> globalLatin1Encoding("latin1");
162 return globalLatin1Encoding;
163}
164
165const TextEncoding& UTF16BigEndianEncoding()
166{
167 static NeverDestroyed<TextEncoding> globalUTF16BigEndianEncoding("UTF-16BE");
168 return globalUTF16BigEndianEncoding;
169}
170
171const TextEncoding& UTF16LittleEndianEncoding()
172{
173 static NeverDestroyed<TextEncoding> globalUTF16LittleEndianEncoding("UTF-16LE");
174 return globalUTF16LittleEndianEncoding;
175}
176
177const TextEncoding& UTF8Encoding()
178{
179 static NeverDestroyed<TextEncoding> globalUTF8Encoding("UTF-8");
180 ASSERT(globalUTF8Encoding.get().isValid());
181 return globalUTF8Encoding;
182}
183
184const TextEncoding& WindowsLatin1Encoding()
185{
186 static NeverDestroyed<TextEncoding> globalWindowsLatin1Encoding("WinLatin-1");
187 return globalWindowsLatin1Encoding;
188}
189
190String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
191{
192 if (string.isEmpty())
193 return string;
194 return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
195}
196
197} // namespace WebCore
198