1/*
2 * Copyright (C) 2004-2017 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "TextCodecUTF8.h"
28
29#include "TextCodecASCIIFastPath.h"
30#include <wtf/text/CString.h>
31#include <wtf/text/StringBuffer.h>
32#include <wtf/text/WTFString.h>
33#include <wtf/unicode/CharacterNames.h>
34
35namespace WebCore {
36
37using namespace WTF::Unicode;
38
// Sentinel used in this file to mean "these bytes did not decode to a code point".
// It is negative, so it can never be confused with a decoded character value.
constexpr int nonCharacter = -1;
40
41void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
42{
43 // From https://encoding.spec.whatwg.org.
44 registrar("UTF-8", "UTF-8");
45 registrar("utf8", "UTF-8");
46 registrar("unicode-1-1-utf-8", "UTF-8");
47
48 // Additional aliases that originally were present in the encoding
49 // table in WebKit on Macintosh, and subsequently added by
50 // TextCodecICU. Perhaps we can prove some are not used on the web
51 // and remove them.
52 registrar("unicode11utf8", "UTF-8");
53 registrar("unicode20utf8", "UTF-8");
54 registrar("x-unicode20utf8", "UTF-8");
55}
56
57void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
58{
59 registrar("UTF-8", [] {
60 return std::make_unique<TextCodecUTF8>();
61 });
62}
63
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence introduced
// by firstByte, or 0 if firstByte cannot start a multi-byte sequence.
//
// 0 is returned for ASCII bytes (0x00-0x7F), continuation bytes (0x80-0xBF),
// the lead bytes 0xC0 and 0xC1, and everything above 0xF4.
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    if (firstByte < 0xC2 || firstByte > 0xF4)
        return 0;
    if (firstByte <= 0xDF)
        return 2;
    if (firstByte <= 0xEF)
        return 3;
    return 4;
}
86
87static inline int decodeNonASCIISequence(const uint8_t* sequence, int& length)
88{
89 ASSERT(!isASCII(sequence[0]));
90 if (length == 2) {
91 ASSERT(sequence[0] >= 0xC2);
92 ASSERT(sequence[0] <= 0xDF);
93 if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
94 length = 1;
95 return nonCharacter;
96 }
97 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
98 }
99 if (length == 3) {
100 ASSERT(sequence[0] >= 0xE0);
101 ASSERT(sequence[0] <= 0xEF);
102 switch (sequence[0]) {
103 case 0xE0:
104 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) {
105 length = 1;
106 return nonCharacter;
107 }
108 break;
109 case 0xED:
110 if (sequence[1] < 0x80 || sequence[1] > 0x9F) {
111 length = 1;
112 return nonCharacter;
113 }
114 break;
115 default:
116 if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
117 length = 1;
118 return nonCharacter;
119 }
120 }
121 if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
122 length = 2;
123 return nonCharacter;
124 }
125 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
126 }
127 ASSERT(length == 4);
128 ASSERT(sequence[0] >= 0xF0);
129 ASSERT(sequence[0] <= 0xF4);
130 switch (sequence[0]) {
131 case 0xF0:
132 if (sequence[1] < 0x90 || sequence[1] > 0xBF) {
133 length = 1;
134 return nonCharacter;
135 }
136 break;
137 case 0xF4:
138 if (sequence[1] < 0x80 || sequence[1] > 0x8F) {
139 length = 1;
140 return nonCharacter;
141 }
142 break;
143 default:
144 if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
145 length = 1;
146 return nonCharacter;
147 }
148 }
149 if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
150 length = 2;
151 return nonCharacter;
152 }
153 if (sequence[3] < 0x80 || sequence[3] > 0xBF) {
154 length = 3;
155 return nonCharacter;
156 }
157 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
158}
159
160static inline UChar* appendCharacter(UChar* destination, int character)
161{
162 ASSERT(character != nonCharacter);
163 ASSERT(!U_IS_SURROGATE(character));
164 if (U_IS_BMP(character))
165 *destination++ = character;
166 else {
167 *destination++ = U16_LEAD(character);
168 *destination++ = U16_TRAIL(character);
169 }
170 return destination;
171}
172
173void TextCodecUTF8::consumePartialSequenceByte()
174{
175 --m_partialSequenceSize;
176 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
177}
178
179bool TextCodecUTF8::handlePartialSequence(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush)
180{
181 ASSERT(m_partialSequenceSize);
182 do {
183 if (isASCII(m_partialSequence[0])) {
184 *destination++ = m_partialSequence[0];
185 consumePartialSequenceByte();
186 continue;
187 }
188 int count = nonASCIISequenceLength(m_partialSequence[0]);
189 if (!count)
190 return true;
191
192 if (count > m_partialSequenceSize) {
193 if (count - m_partialSequenceSize > end - source) {
194 if (!flush) {
195 // The new data is not enough to complete the sequence, so
196 // add it to the existing partial sequence.
197 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
198 m_partialSequenceSize += end - source;
199 return false;
200 }
201 // An incomplete partial sequence at the end is an error, but it will create
202 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
203 // the error.
204 return true;
205 }
206 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
207 source += count - m_partialSequenceSize;
208 m_partialSequenceSize = count;
209 }
210 int character = decodeNonASCIISequence(m_partialSequence, count);
211 if (character == nonCharacter || character > 0xFF)
212 return true;
213
214 m_partialSequenceSize -= count;
215 *destination++ = character;
216 } while (m_partialSequenceSize);
217
218 return false;
219}
220
221void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
222{
223 ASSERT(m_partialSequenceSize);
224 do {
225 if (isASCII(m_partialSequence[0])) {
226 *destination++ = m_partialSequence[0];
227 consumePartialSequenceByte();
228 continue;
229 }
230 int count = nonASCIISequenceLength(m_partialSequence[0]);
231 if (!count) {
232 sawError = true;
233 if (stopOnError)
234 return;
235 *destination++ = replacementCharacter;
236 consumePartialSequenceByte();
237 continue;
238 }
239 if (count > m_partialSequenceSize) {
240 if (count - m_partialSequenceSize > end - source) {
241 if (!flush) {
242 // The new data is not enough to complete the sequence, so
243 // add it to the existing partial sequence.
244 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
245 m_partialSequenceSize += end - source;
246 return;
247 }
248 // An incomplete partial sequence at the end is an error.
249 sawError = true;
250 if (stopOnError)
251 return;
252 *destination++ = replacementCharacter;
253 m_partialSequenceSize = 0;
254 source = end;
255 continue;
256 }
257 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
258 source += count - m_partialSequenceSize;
259 m_partialSequenceSize = count;
260 }
261 int character = decodeNonASCIISequence(m_partialSequence, count);
262 if (character == nonCharacter) {
263 sawError = true;
264 if (stopOnError)
265 return;
266 *destination++ = replacementCharacter;
267 m_partialSequenceSize -= count;
268 memmove(m_partialSequence, m_partialSequence + count, m_partialSequenceSize);
269 continue;
270 }
271
272 m_partialSequenceSize -= count;
273 destination = appendCharacter(destination, character);
274 } while (m_partialSequenceSize);
275}
276
277String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
278{
279 // Each input byte might turn into a character.
280 // That includes all bytes in the partial-sequence buffer because
281 // each byte in an invalid sequence will turn into a replacement character.
282 StringBuffer<LChar> buffer(m_partialSequenceSize + length);
283
284 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
285 const uint8_t* end = source + length;
286 const uint8_t* alignedEnd = WTF::alignToMachineWord(end);
287 LChar* destination = buffer.characters();
288
289 do {
290 if (m_partialSequenceSize) {
291 // Explicitly copy destination and source pointers to avoid taking pointers to the
292 // local variables, which may harm code generation by disabling some optimizations
293 // in some compilers.
294 LChar* destinationForHandlePartialSequence = destination;
295 const uint8_t* sourceForHandlePartialSequence = source;
296 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush)) {
297 source = sourceForHandlePartialSequence;
298 goto upConvertTo16Bit;
299 }
300 destination = destinationForHandlePartialSequence;
301 source = sourceForHandlePartialSequence;
302 if (m_partialSequenceSize)
303 break;
304 }
305
306 while (source < end) {
307 if (isASCII(*source)) {
308 // Fast path for ASCII. Most UTF-8 text will be ASCII.
309 if (WTF::isAlignedToMachineWord(source)) {
310 while (source < alignedEnd) {
311 auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
312 if (!WTF::isAllASCII<LChar>(chunk))
313 break;
314 copyASCIIMachineWord(destination, source);
315 source += sizeof(WTF::MachineWord);
316 destination += sizeof(WTF::MachineWord);
317 }
318 if (source == end)
319 break;
320 if (!isASCII(*source))
321 continue;
322 }
323 *destination++ = *source++;
324 continue;
325 }
326 int count = nonASCIISequenceLength(*source);
327 int character;
328 if (!count)
329 character = nonCharacter;
330 else {
331 if (count > end - source) {
332 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
333 ASSERT(!m_partialSequenceSize);
334 m_partialSequenceSize = end - source;
335 memcpy(m_partialSequence, source, m_partialSequenceSize);
336 source = end;
337 break;
338 }
339 character = decodeNonASCIISequence(source, count);
340 }
341 if (character == nonCharacter) {
342 sawError = true;
343 if (stopOnError)
344 break;
345
346 goto upConvertTo16Bit;
347 }
348 if (character > 0xFF)
349 goto upConvertTo16Bit;
350
351 source += count;
352 *destination++ = character;
353 }
354 } while (flush && m_partialSequenceSize);
355
356 buffer.shrink(destination - buffer.characters());
357
358 return String::adopt(WTFMove(buffer));
359
360upConvertTo16Bit:
361 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
362
363 UChar* destination16 = buffer16.characters();
364
365 // Copy the already converted characters
366 for (LChar* converted8 = buffer.characters(); converted8 < destination;)
367 *destination16++ = *converted8++;
368
369 do {
370 if (m_partialSequenceSize) {
371 // Explicitly copy destination and source pointers to avoid taking pointers to the
372 // local variables, which may harm code generation by disabling some optimizations
373 // in some compilers.
374 UChar* destinationForHandlePartialSequence = destination16;
375 const uint8_t* sourceForHandlePartialSequence = source;
376 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
377 destination16 = destinationForHandlePartialSequence;
378 source = sourceForHandlePartialSequence;
379 if (m_partialSequenceSize)
380 break;
381 }
382
383 while (source < end) {
384 if (isASCII(*source)) {
385 // Fast path for ASCII. Most UTF-8 text will be ASCII.
386 if (WTF::isAlignedToMachineWord(source)) {
387 while (source < alignedEnd) {
388 auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
389 if (!WTF::isAllASCII<LChar>(chunk))
390 break;
391 copyASCIIMachineWord(destination16, source);
392 source += sizeof(WTF::MachineWord);
393 destination16 += sizeof(WTF::MachineWord);
394 }
395 if (source == end)
396 break;
397 if (!isASCII(*source))
398 continue;
399 }
400 *destination16++ = *source++;
401 continue;
402 }
403 int count = nonASCIISequenceLength(*source);
404 int character;
405 if (!count)
406 character = nonCharacter;
407 else {
408 if (count > end - source) {
409 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
410 ASSERT(!m_partialSequenceSize);
411 m_partialSequenceSize = end - source;
412 memcpy(m_partialSequence, source, m_partialSequenceSize);
413 source = end;
414 break;
415 }
416 character = decodeNonASCIISequence(source, count);
417 }
418 if (character == nonCharacter) {
419 sawError = true;
420 if (stopOnError)
421 break;
422 *destination16++ = replacementCharacter;
423 source += count ? count : 1;
424 continue;
425 }
426 source += count;
427 destination16 = appendCharacter(destination16, character);
428 }
429 } while (flush && m_partialSequenceSize);
430
431 buffer16.shrink(destination16 - buffer16.characters());
432
433 return String::adopt(WTFMove(buffer16));
434}
435
436Vector<uint8_t> TextCodecUTF8::encode(StringView string, UnencodableHandling)
437{
438 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
439 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
440 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
441 Vector<uint8_t> bytes(WTF::checkedProduct<size_t>(string.length(), 3).unsafeGet());
442 size_t bytesWritten = 0;
443 for (auto character : string.codePoints())
444 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
445 bytes.shrink(bytesWritten);
446 return bytes;
447}
448
449} // namespace WebCore
450