TextCodecUTF8.cpp source code [webkit/Source/WebCore/platform/text/TextCodecUTF8.cpp]

1	/*
2	* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include "TextCodecUTF8.h"
28
29	#include "TextCodecASCIIFastPath.h"
30	#include <wtf/text/CString.h>
31	#include <wtf/text/StringBuffer.h>
32	#include <wtf/text/WTFString.h>
33	#include <wtf/unicode/CharacterNames.h>
34
35	namespace WebCore {
36
37	using namespace WTF::Unicode;
38
// Sentinel code point value returned by decodeNonASCIISequence() when the
// bytes it was given do not form a valid UTF-8 sequence. It is negative so it
// can never collide with a real code point.
const int nonCharacter = -1;
40
41	void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
42	{
43	// From https://encoding.spec.whatwg.org.
44	registrar("UTF-8", "UTF-8");
45	registrar("utf8", "UTF-8");
46	registrar("unicode-1-1-utf-8", "UTF-8");
47
48	// Additional aliases that originally were present in the encoding
49	// table in WebKit on Macintosh, and subsequently added by
50	// TextCodecICU. Perhaps we can prove some are not used on the web
51	// and remove them.
52	registrar("unicode11utf8", "UTF-8");
53	registrar("unicode20utf8", "UTF-8");
54	registrar("x-unicode20utf8", "UTF-8");
55	}
56
57	void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
58	{
59	registrar("UTF-8", [] {
60	return std::make_unique<TextCodecUTF8>();
61	});
62	}
63
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
// starts with firstByte, or 0 if firstByte cannot start a multi-byte
// sequence. ASCII bytes (0x00-0x7F) also map to 0; callers are expected to
// handle ASCII before consulting this table.
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    static const uint8_t lengths[256] = {
        // 0x00-0x7F: ASCII.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0x80-0xBF: continuation bytes, never valid as a first byte.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0xC0-0xC1: rejected (0); 0xC2-0xDF: two-byte lead bytes.
        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0-0xEF: three-byte lead bytes.
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0-0xF4: four-byte lead bytes; 0xF5-0xFF: rejected (0).
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    return lengths[firstByte];
}
86
87	static inline int decodeNonASCIISequence(const uint8_t* sequence, int& length)
88	{
89	ASSERT(!isASCII(sequence[`0`]));
90	if (length == `2`) {
91	ASSERT(sequence[`0`] >= `0xC2`);
92	ASSERT(sequence[`0`] <= `0xDF`);
93	if (sequence[`1`] < `0x80` \|\| sequence[`1`] > `0xBF`) {
94	length = `1`;
95	return nonCharacter;
96	}
97	return ((sequence[`0`] << `6`) + sequence[`1`]) - `0x00003080`;
98	}
99	if (length == `3`) {
100	ASSERT(sequence[`0`] >= `0xE0`);
101	ASSERT(sequence[`0`] <= `0xEF`);
102	switch (sequence[`0`]) {
103	case `0xE0`:
104	if (sequence[`1`] < `0xA0` \|\| sequence[`1`] > `0xBF`) {
105	length = `1`;
106	return nonCharacter;
107	}
108	break;
109	case `0xED`:
110	if (sequence[`1`] < `0x80` \|\| sequence[`1`] > `0x9F`) {
111	length = `1`;
112	return nonCharacter;
113	}
114	break;
115	default:
116	if (sequence[`1`] < `0x80` \|\| sequence[`1`] > `0xBF`) {
117	length = `1`;
118	return nonCharacter;
119	}
120	}
121	if (sequence[`2`] < `0x80` \|\| sequence[`2`] > `0xBF`) {
122	length = `2`;
123	return nonCharacter;
124	}
125	return ((sequence[`0`] << `12`) + (sequence[`1`] << `6`) + sequence[`2`]) - `0x000E2080`;
126	}
127	ASSERT(length == `4`);
128	ASSERT(sequence[`0`] >= `0xF0`);
129	ASSERT(sequence[`0`] <= `0xF4`);
130	switch (sequence[`0`]) {
131	case `0xF0`:
132	if (sequence[`1`] < `0x90` \|\| sequence[`1`] > `0xBF`) {
133	length = `1`;
134	return nonCharacter;
135	}
136	break;
137	case `0xF4`:
138	if (sequence[`1`] < `0x80` \|\| sequence[`1`] > `0x8F`) {
139	length = `1`;
140	return nonCharacter;
141	}
142	break;
143	default:
144	if (sequence[`1`] < `0x80` \|\| sequence[`1`] > `0xBF`) {
145	length = `1`;
146	return nonCharacter;
147	}
148	}
149	if (sequence[`2`] < `0x80` \|\| sequence[`2`] > `0xBF`) {
150	length = `2`;
151	return nonCharacter;
152	}
153	if (sequence[`3`] < `0x80` \|\| sequence[`3`] > `0xBF`) {
154	length = `3`;
155	return nonCharacter;
156	}
157	return ((sequence[`0`] << `18`) + (sequence[`1`] << `12`) + (sequence[`2`] << `6`) + sequence[`3`]) - `0x03C82080`;
158	}
159
160	static inline UChar* appendCharacter(UChar* destination, int character)
161	{
162	ASSERT(character != nonCharacter);
163	ASSERT(!U_IS_SURROGATE(character));
164	if (U_IS_BMP(character))
165	*destination++ = character;
166	else {
167	*destination++ = U16_LEAD(character);
168	*destination++ = U16_TRAIL(character);
169	}
170	return destination;
171	}
172
173	void TextCodecUTF8::consumePartialSequenceByte()
174	{
175	--m_partialSequenceSize;
176	memmove(m_partialSequence, m_partialSequence + `1`, m_partialSequenceSize);
177	}
178
179	bool TextCodecUTF8::handlePartialSequence(LChar& destination, const* uint8_t& source, const* uint8_t* end, bool flush)
180	{
181	ASSERT(m_partialSequenceSize);
182	do {
183	if (isASCII(m_partialSequence[`0`])) {
184	*destination++ = m_partialSequence[`0`];
185	consumePartialSequenceByte();
186	continue;
187	}
188	int count = nonASCIISequenceLength(m_partialSequence[`0`]);
189	if (!count)
190	return true;
191
192	if (count > m_partialSequenceSize) {
193	if (count - m_partialSequenceSize > end - source) {
194	if (!flush) {
195	// The new data is not enough to complete the sequence, so
196	// add it to the existing partial sequence.
197	memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
198	m_partialSequenceSize += end - source;
199	return false;
200	}
201	// An incomplete partial sequence at the end is an error, but it will create
202	// a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
203	// the error.
204	return true;
205	}
206	memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
207	source += count - m_partialSequenceSize;
208	m_partialSequenceSize = count;
209	}
210	int character = decodeNonASCIISequence(m_partialSequence, count);
211	if (character == nonCharacter \|\| character > `0xFF`)
212	return true;
213
214	m_partialSequenceSize -= count;
215	*destination++ = character;
216	} while (m_partialSequenceSize);
217
218	return false;
219	}
220
221	void TextCodecUTF8::handlePartialSequence(UChar& destination, const* uint8_t& source, const* uint8_t* end, bool flush, bool stopOnError, bool& sawError)
222	{
223	ASSERT(m_partialSequenceSize);
224	do {
225	if (isASCII(m_partialSequence[`0`])) {
226	*destination++ = m_partialSequence[`0`];
227	consumePartialSequenceByte();
228	continue;
229	}
230	int count = nonASCIISequenceLength(m_partialSequence[`0`]);
231	if (!count) {
232	sawError = true;
233	if (stopOnError)
234	return;
235	*destination++ = replacementCharacter;
236	consumePartialSequenceByte();
237	continue;
238	}
239	if (count > m_partialSequenceSize) {
240	if (count - m_partialSequenceSize > end - source) {
241	if (!flush) {
242	// The new data is not enough to complete the sequence, so
243	// add it to the existing partial sequence.
244	memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
245	m_partialSequenceSize += end - source;
246	return;
247	}
248	// An incomplete partial sequence at the end is an error.
249	sawError = true;
250	if (stopOnError)
251	return;
252	*destination++ = replacementCharacter;
253	m_partialSequenceSize = `0`;
254	source = end;
255	continue;
256	}
257	memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
258	source += count - m_partialSequenceSize;
259	m_partialSequenceSize = count;
260	}
261	int character = decodeNonASCIISequence(m_partialSequence, count);
262	if (character == nonCharacter) {
263	sawError = true;
264	if (stopOnError)
265	return;
266	*destination++ = replacementCharacter;
267	m_partialSequenceSize -= count;
268	memmove(m_partialSequence, m_partialSequence + count, m_partialSequenceSize);
269	continue;
270	}
271
272	m_partialSequenceSize -= count;
273	destination = appendCharacter(destination, character);
274	} while (m_partialSequenceSize);
275	}
276
277	String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
278	{
279	// Each input byte might turn into a character.
280	// That includes all bytes in the partial-sequence buffer because
281	// each byte in an invalid sequence will turn into a replacement character.
282	StringBuffer<LChar> buffer(m_partialSequenceSize + length);
283
284	const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
285	const uint8_t* end = source + length;
286	const uint8_t* alignedEnd = WTF::alignToMachineWord(end);
287	LChar* destination = buffer.characters();
288
289	do {
290	if (m_partialSequenceSize) {
291	// Explicitly copy destination and source pointers to avoid taking pointers to the
292	// local variables, which may harm code generation by disabling some optimizations
293	// in some compilers.
294	LChar* destinationForHandlePartialSequence = destination;
295	const uint8_t* sourceForHandlePartialSequence = source;
296	if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush)) {
297	source = sourceForHandlePartialSequence;
298	goto upConvertTo16Bit;
299	}
300	destination = destinationForHandlePartialSequence;
301	source = sourceForHandlePartialSequence;
302	if (m_partialSequenceSize)
303	break;
304	}
305
306	while (source < end) {
307	if (isASCII(*source)) {
308	// Fast path for ASCII. Most UTF-8 text will be ASCII.
309	if (WTF::isAlignedToMachineWord(source)) {
310	while (source < alignedEnd) {
311	auto chunk = reinterpret_cast_ptr<const* WTF::MachineWord*>(source);
312	if (!WTF::isAllASCII<LChar>(chunk))
313	break;
314	copyASCIIMachineWord(destination, source);
315	source += sizeof(WTF::MachineWord);
316	destination += sizeof(WTF::MachineWord);
317	}
318	if (source == end)
319	break;
320	if (!isASCII(*source))
321	continue;
322	}
323	destination++ = source++;
324	continue;
325	}
326	int count = nonASCIISequenceLength(*source);
327	int character;
328	if (!count)
329	character = nonCharacter;
330	else {
331	if (count > end - source) {
332	ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
333	ASSERT(!m_partialSequenceSize);
334	m_partialSequenceSize = end - source;
335	memcpy(m_partialSequence, source, m_partialSequenceSize);
336	source = end;
337	break;
338	}
339	character = decodeNonASCIISequence(source, count);
340	}
341	if (character == nonCharacter) {
342	sawError = true;
343	if (stopOnError)
344	break;
345
346	goto upConvertTo16Bit;
347	}
348	if (character > `0xFF`)
349	goto upConvertTo16Bit;
350
351	source += count;
352	*destination++ = character;
353	}
354	} while (flush && m_partialSequenceSize);
355
356	buffer.shrink(destination - buffer.characters());
357
358	return String::adopt(WTFMove(buffer));
359
360	upConvertTo16Bit:
361	StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
362
363	UChar* destination16 = buffer16.characters();
364
365	// Copy the already converted characters
366	for (LChar* converted8 = buffer.characters(); converted8 < destination;)
367	destination16++ = converted8++;
368
369	do {
370	if (m_partialSequenceSize) {
371	// Explicitly copy destination and source pointers to avoid taking pointers to the
372	// local variables, which may harm code generation by disabling some optimizations
373	// in some compilers.
374	UChar* destinationForHandlePartialSequence = destination16;
375	const uint8_t* sourceForHandlePartialSequence = source;
376	handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
377	destination16 = destinationForHandlePartialSequence;
378	source = sourceForHandlePartialSequence;
379	if (m_partialSequenceSize)
380	break;
381	}
382
383	while (source < end) {
384	if (isASCII(*source)) {
385	// Fast path for ASCII. Most UTF-8 text will be ASCII.
386	if (WTF::isAlignedToMachineWord(source)) {
387	while (source < alignedEnd) {
388	auto chunk = reinterpret_cast_ptr<const* WTF::MachineWord*>(source);
389	if (!WTF::isAllASCII<LChar>(chunk))
390	break;
391	copyASCIIMachineWord(destination16, source);
392	source += sizeof(WTF::MachineWord);
393	destination16 += sizeof(WTF::MachineWord);
394	}
395	if (source == end)
396	break;
397	if (!isASCII(*source))
398	continue;
399	}
400	destination16++ = source++;
401	continue;
402	}
403	int count = nonASCIISequenceLength(*source);
404	int character;
405	if (!count)
406	character = nonCharacter;
407	else {
408	if (count > end - source) {
409	ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
410	ASSERT(!m_partialSequenceSize);
411	m_partialSequenceSize = end - source;
412	memcpy(m_partialSequence, source, m_partialSequenceSize);
413	source = end;
414	break;
415	}
416	character = decodeNonASCIISequence(source, count);
417	}
418	if (character == nonCharacter) {
419	sawError = true;
420	if (stopOnError)
421	break;
422	*destination16++ = replacementCharacter;
423	source += count ? count : `1`;
424	continue;
425	}
426	source += count;
427	destination16 = appendCharacter(destination16, character);
428	}
429	} while (flush && m_partialSequenceSize);
430
431	buffer16.shrink(destination16 - buffer16.characters());
432
433	return String::adopt(WTFMove(buffer16));
434	}
435
436	Vector<uint8_t> TextCodecUTF8::encode(StringView string, UnencodableHandling)
437	{
438	// The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
439	// BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
440	// Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
441	Vector<uint8_t> bytes(WTF::checkedProduct<size_t>(string.length(), `3`).unsafeGet());
442	size_t bytesWritten = `0`;
443	for (auto character : string.codePoints())
444	U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
445	bytes.shrink(bytesWritten);
446	return bytes;
447	}
448
449	} // namespace WebCore
450

Browse the source code of webkit/Source/WebCore/platform/text/TextCodecUTF8.cpp