TextEncoding.cpp source code [webkit/Source/WebCore/platform/text/TextEncoding.cpp]

1	/*
2	* Copyright (C) 2004-2019 Apple Inc. All rights reserved.
3	* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4	* Copyright (C) 2007-2009 Torch Mobile, Inc.
5	*
6	* Redistribution and use in source and binary forms, with or without
7	* modification, are permitted provided that the following conditions
8	* are met:
9	* 1. Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* 2. Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	*
15	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26	*/
27
28	#include "config.h"
29	#include "TextEncoding.h"
30
31	#include "DecodeEscapeSequences.h"
32	#include "TextCodec.h"
33	#include "TextEncodingRegistry.h"
34	#include <wtf/NeverDestroyed.h>
35	#include <wtf/StdLibExtras.h>
36	#include <wtf/text/StringView.h>
37
38	namespace WebCore {
39
40	static const TextEncoding& UTF7Encoding()
41	{
42	static NeverDestroyed<TextEncoding> globalUTF7Encoding("UTF-7");
43	return globalUTF7Encoding;
44	}
45
46	TextEncoding::TextEncoding(const char* name)
47	: m_name(atomicCanonicalTextEncodingName(name))
48	, m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
49	{
50	// Aliases are valid, but not "replacement" itself.
51	if (equalLettersIgnoringASCIICase(name, "replacement"))
52	m_name = nullptr;
53	}
54
55	TextEncoding::TextEncoding(const String& name)
56	: m_name(atomicCanonicalTextEncodingName(name))
57	, m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
58	{
59	// Aliases are valid, but not "replacement" itself.
60	if (equalLettersIgnoringASCIICase(name, "replacement"))
61	m_name = nullptr;
62	}
63
64	String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
65	{
66	if (!m_name)
67	return String ();
68
69	return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
70	}
71
72	Vector<uint8_t> TextEncoding::encode(StringView string, UnencodableHandling handling) const
73	{
74	if (!m_name \|\| string.isEmpty())
75	return { };
76
77	// FIXME: What's the right place to do normalization?
78	// It's a little strange to do it inside the encode function.
79	// Perhaps normalization should be an explicit step done before calling encode.
80	auto normalizedString = normalizedNFC(string);
81	return newTextCodec(*this)->encode(normalizedString.view, handling);
82	}
83
84	const char* TextEncoding::domName() const
85	{
86	if (noExtendedTextEncodingNameUsed())
87	return m_name;
88
89	// We treat EUC-KR as windows-949 (its superset), but need to expose
90	// the name 'EUC-KR' because the name 'windows-949' is not recognized by
91	// most Korean web servers even though they do use the encoding
92	// 'windows-949' with the name 'EUC-KR'.
93	// FIXME: This is not thread-safe. At the moment, this function is
94	// only accessed in a single thread, but eventually has to be made
95	// thread-safe along with usesVisualOrdering().
96	static const char* const a = atomicCanonicalTextEncodingName("windows-949");
97	if (m_name == a)
98	return "EUC-KR";
99	return m_name;
100	}
101
102	bool TextEncoding::usesVisualOrdering() const
103	{
104	if (noExtendedTextEncodingNameUsed())
105	return false;
106
107	static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
108	return m_name == a;
109	}
110
111	bool TextEncoding::isJapanese() const
112	{
113	return isJapaneseEncoding(m_name);
114	}
115
116	UChar TextEncoding::backslashAsCurrencySymbol() const
117	{
118	return shouldShowBackslashAsCurrencySymbolIn(m_name) ? `0x00A5` : `'\\'`;
119	}
120
121	bool TextEncoding::isNonByteBasedEncoding() const
122	{
123	return *this == UTF16LittleEndianEncoding() \|\| *this == UTF16BigEndianEncoding();
124	}
125
126	bool TextEncoding::isUTF7Encoding() const
127	{
128	if (noExtendedTextEncodingNameUsed())
129	return false;
130
131	return *this == UTF7Encoding();
132	}
133
134	const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
135	{
136	if (isNonByteBasedEncoding())
137	return UTF8Encoding();
138	return *this;
139	}
140
141	// HTML5 specifies that UTF-8 be used in form submission when a form is
142	// is a part of a document in UTF-16 probably because UTF-16 is not a
143	// byte-based encoding and can contain 0x00. By extension, the same
144	// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
145	// but it's fraught with problems and we'd rather steer clear of it.
146	const TextEncoding& TextEncoding::encodingForFormSubmissionOrURLParsing() const
147	{
148	if (isNonByteBasedEncoding() \|\| isUTF7Encoding())
149	return UTF8Encoding();
150	return *this;
151	}
152
153	const TextEncoding& ASCIIEncoding()
154	{
155	static NeverDestroyed<TextEncoding> globalASCIIEncoding("ASCII");
156	return globalASCIIEncoding;
157	}
158
159	const TextEncoding& Latin1Encoding()
160	{
161	static NeverDestroyed<TextEncoding> globalLatin1Encoding("latin1");
162	return globalLatin1Encoding;
163	}
164
165	const TextEncoding& UTF16BigEndianEncoding()
166	{
167	static NeverDestroyed<TextEncoding> globalUTF16BigEndianEncoding("UTF-16BE");
168	return globalUTF16BigEndianEncoding;
169	}
170
171	const TextEncoding& UTF16LittleEndianEncoding()
172	{
173	static NeverDestroyed<TextEncoding> globalUTF16LittleEndianEncoding("UTF-16LE");
174	return globalUTF16LittleEndianEncoding;
175	}
176
177	const TextEncoding& UTF8Encoding()
178	{
179	static NeverDestroyed<TextEncoding> globalUTF8Encoding("UTF-8");
180	ASSERT(globalUTF8Encoding.get().isValid());
181	return globalUTF8Encoding;
182	}
183
184	const TextEncoding& WindowsLatin1Encoding()
185	{
186	static NeverDestroyed<TextEncoding> globalWindowsLatin1Encoding("WinLatin-1");
187	return globalWindowsLatin1Encoding;
188	}
189
190	String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
191	{
192	if (string.isEmpty())
193	return string;
194	return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
195	}
196
197	} // namespace WebCore
198

Browse the source code of webkit/Source/WebCore/platform/text/TextEncoding.cpp