1/*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003-2017 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20*/
21
22
23#include "config.h"
24#include "TextResourceDecoder.h"
25
26#include "HTMLMetaCharsetParser.h"
27#include "HTMLNames.h"
28#include "MIMETypeRegistry.h"
29#include "TextCodec.h"
30#include "TextEncoding.h"
31#include "TextEncodingDetector.h"
32#include "TextEncodingRegistry.h"
33#include <wtf/ASCIICType.h>
34
35
36namespace WebCore {
37
38using namespace HTMLNames;
39
// Returns true when the two bytes at |p| are exactly b0, b1.
// The second byte is only inspected if the first one matched.
static inline bool bytesEqual(const char* p, char b0, char b1)
{
    if (p[0] != b0)
        return false;
    return p[1] == b1;
}
44
// Returns true when the five bytes at |p| are exactly b0..b4.
// Bytes are compared in order and the scan stops at the first mismatch.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    const char expected[] = { b0, b1, b2, b3, b4 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
49
// Returns true when the six bytes at |p| are exactly b0..b5.
// Bytes are compared in order and the scan stops at the first mismatch.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
54
// Returns true when the eight bytes at |p| are exactly b0..b7.
// Bytes are compared in order and the scan stops at the first mismatch.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
59
// Returns true when the ten bytes at |p| are exactly b0..b9.
// Bytes are compared in order and the scan stops at the first mismatch.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
64
65// You might think we should put these find functions elsewhere, perhaps with the
66// similar functions that operate on UChar, but arguably only the decoder has
67// a reason to process strings of char rather than UChar.
68
// Searches the first |subjectLength| bytes of |subject| for the NUL-terminated
// string |target|. Returns the offset of the first occurrence, or -1 when
// |target| does not occur (including when it is longer than the subject).
// An empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (subjectLength < targetLength)
        return -1;

    // Last offset at which the whole target still fits inside the subject.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < targetLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == targetLength)
            return start;
    }
    return -1;
}
87
88static TextEncoding findTextEncoding(const char* encodingName, int length)
89{
90 Vector<char, 64> buffer(length + 1);
91 memcpy(buffer.data(), encodingName, length);
92 buffer[length] = '\0';
93 return buffer.data();
94}
95
// Groups the Japanese encoding guesser (judge) with the byte classification
// table it relies on. All members are static; the class is never instantiated
// in this file.
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Guesses the encoding of the |length| bytes starting at |str|.
    static enum Type judge(const char* str, int length);

    // The ASCII escape character, which introduces ISO-2022-JP sequences.
    static const int ESC = 0x1b;

    // Per-byte flags: bit 0 (value 1) is tested by ISkanji, bit 1 (value 2) by ISkana.
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is below 0x100 and its sjisMap entry has bit 0 set.
    static int ISkanji(int code)
    {
        return code < 0x100 ? (sjisMap[code & 0xff] & 1) : 0;
    }

    // Non-zero when |code| is below 0x100 and its sjisMap entry has bit 1 set.
    static int ISkana(int code)
    {
        return code < 0x100 ? (sjisMap[code & 0xff] & 2) : 0;
    }
};
115
116const unsigned char KanjiCode::sjisMap[256] = {
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
128 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
129 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
130 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
131 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
132 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
133};
134
135/*
136 * EUC-JP is
137 * [0xa1 - 0xfe][0xa1 - 0xfe]
138 * 0x8e[0xa1 - 0xfe](SS2)
139 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
140 *
141 * Shift_Jis is
142 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
143 *
144 * Shift_Jis Hankaku Kana is
145 * [0xa1 - 0xdf]
146 */
147
148/*
149 * KanjiCode::judge() is based on judge_jcode() from jvim
150 * http://hp.vector.co.jp/authors/VA003457/vim/
151 *
152 * Special Thanks to Kenichi Tsuchida
153 */
154
155enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
156{
157 enum Type code;
158 int i;
159 int bfr = false; /* Kana Moji */
160 int bfk = 0; /* EUC Kana */
161 int sjis = 0;
162 int euc = 0;
163
164 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
165
166 code = ASCII;
167
168 i = 0;
169 while (i < size) {
170 if (ptr[i] == ESC && (size - i >= 3)) {
171 if (bytesEqual(str + i + 1, '$', 'B')
172 || bytesEqual(str + i + 1, '(', 'B')
173 || bytesEqual(str + i + 1, '$', '@')
174 || bytesEqual(str + i + 1, '(', 'J')) {
175 code = JIS;
176 goto breakBreak;
177 }
178 if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')', 'I')) {
179 code = JIS;
180 i += 3;
181 } else {
182 i++;
183 }
184 bfr = false;
185 bfk = 0;
186 } else {
187 if (ptr[i] < 0x20) {
188 bfr = false;
189 bfk = 0;
190 /* ?? check kudokuten ?? && ?? hiragana ?? */
191 if ((i >= 2) && (ptr[i - 2] == 0x81)
192 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
193 code = SJIS;
194 sjis += 100; /* kudokuten */
195 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
196 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
197 code = EUC;
198 euc += 100; /* kudokuten */
199 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
200 sjis += 40; /* hiragana */
201 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
202 euc += 40; /* hiragana */
203 }
204 } else {
205 /* ?? check hiragana or katana ?? */
206 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
207 sjis++; /* hiragana */
208 } else if ((size - i > 1) && (ptr[i] == 0x83)
209 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
210 sjis++; /* katakana */
211 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
212 euc++; /* hiragana */
213 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
214 euc++; /* katakana */
215 }
216 if (bfr) {
217 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
218 code = SJIS;
219 goto breakBreak;
220 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
221 code = SJIS;
222 goto breakBreak;
223 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
224 code = EUC;
225 goto breakBreak;
226 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
227 code = EUC;
228 goto breakBreak;
229 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
230 code = SJIS;
231 goto breakBreak;
232 } else if (ptr[i] <= 0x7f) {
233 code = SJIS;
234 goto breakBreak;
235 } else {
236 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
237 euc++; /* sjis hankaku kana kigo */
238 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
239 ; /* sjis hankaku kana */
240 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
241 euc++;
242 } else if (0x8e == ptr[i]) {
243 euc++;
244 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
245 sjis++;
246 }
247 bfr = false;
248 bfk = 0;
249 }
250 } else if (0x8e == ptr[i]) {
251 if (size - i <= 1) {
252 ;
253 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
254 /* EUC KANA or SJIS KANJI */
255 if (bfk == 1) {
256 euc += 100;
257 }
258 bfk++;
259 i++;
260 } else {
261 /* SJIS only */
262 code = SJIS;
263 goto breakBreak;
264 }
265 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
266 /* SJIS only */
267 code = SJIS;
268 if ((size - i >= 1)
269 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
270 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
271 goto breakBreak;
272 }
273 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
274 /* EUC only */
275 code = EUC;
276 if ((size - i >= 1)
277 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
278 goto breakBreak;
279 }
280 } else if (ptr[i] <= 0x7f) {
281 ;
282 } else {
283 bfr = true;
284 bfk = 0;
285 }
286 }
287 i++;
288 }
289 }
290 if (code == ASCII) {
291 if (sjis > euc) {
292 code = SJIS;
293 } else if (sjis < euc) {
294 code = EUC;
295 }
296 }
297breakBreak:
298 return (code);
299}
300
301TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
302{
303 if (equalLettersIgnoringASCIICase(mimeType, "text/css"))
304 return CSS;
305 if (equalLettersIgnoringASCIICase(mimeType, "text/html"))
306 return HTML;
307 if (MIMETypeRegistry::isXMLMIMEType(mimeType))
308 return XML;
309 return PlainText;
310}
311
312const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
313{
314 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
315 // for text/xml. This matches Firefox.
316 if (contentType == XML)
317 return UTF8Encoding();
318 if (!specifiedDefaultEncoding.isValid())
319 return Latin1Encoding();
320 return specifiedDefaultEncoding;
321}
322
323inline TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
324 : m_contentType(determineContentType(mimeType))
325 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
326 , m_usesEncodingDetector(usesEncodingDetector)
327{
328}
329
330Ref<TextResourceDecoder> TextResourceDecoder::create(const String& mimeType, const TextEncoding& defaultEncoding, bool usesEncodingDetector)
331{
332 return adoptRef(*new TextResourceDecoder(mimeType, defaultEncoding, usesEncodingDetector));
333}
334
335TextResourceDecoder::~TextResourceDecoder() = default;
336
337void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
338{
339 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
340 if (!encoding.isValid())
341 return;
342
343 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
344 // treat x-user-defined as windows-1252 (bug 18270)
345 if (source == EncodingFromMetaTag && equalLettersIgnoringASCIICase(encoding.name(), "x-user-defined"))
346 m_encoding = "windows-1252";
347 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
348 m_encoding = encoding.closestByteBasedEquivalent();
349 else
350 m_encoding = encoding;
351
352 m_codec = nullptr;
353 m_source = source;
354}
355
356bool TextResourceDecoder::hasEqualEncodingForCharset(const String& charset) const
357{
358 return defaultEncoding(m_contentType, charset) == m_encoding;
359}
360
361// Returns the position of the encoding string.
362static int findXMLEncoding(const char* str, int len, int& encodingLength)
363{
364 int pos = find(str, len, "encoding");
365 if (pos == -1)
366 return -1;
367 pos += 8;
368
369 // Skip spaces and stray control characters.
370 while (pos < len && str[pos] <= ' ')
371 ++pos;
372
373 // Skip equals sign.
374 if (pos >= len || str[pos] != '=')
375 return -1;
376 ++pos;
377
378 // Skip spaces and stray control characters.
379 while (pos < len && str[pos] <= ' ')
380 ++pos;
381
382 // Skip quotation mark.
383 if (pos >= len)
384 return - 1;
385 char quoteMark = str[pos];
386 if (quoteMark != '"' && quoteMark != '\'')
387 return -1;
388 ++pos;
389
390 // Find the trailing quotation mark.
391 int end = pos;
392 while (end < len && str[end] != quoteMark)
393 ++end;
394 if (end >= len)
395 return -1;
396
397 encodingLength = end - pos;
398 return pos;
399}
400
401size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
402{
403 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
404 // We let it override even a user-chosen encoding.
405 const size_t maximumBOMLength = 3;
406
407 ASSERT(!m_checkedForBOM);
408
409 size_t lengthOfBOM = 0;
410
411 size_t bufferLength = m_buffer.size();
412
413 size_t buf1Len = bufferLength;
414 size_t buf2Len = len;
415 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
416 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
417 unsigned char c1 = buf1Len ? (static_cast<void>(--buf1Len), *buf1++) : buf2Len ? (static_cast<void>(--buf2Len), *buf2++) : 0;
418 unsigned char c2 = buf1Len ? (static_cast<void>(--buf1Len), *buf1++) : buf2Len ? (static_cast<void>(--buf2Len), *buf2++) : 0;
419 unsigned char c3 = buf1Len ? (static_cast<void>(--buf1Len), *buf1++) : buf2Len ? (static_cast<void>(--buf2Len), *buf2++) : 0;
420
421 // Check for the BOM.
422 if (c1 == 0xFF && c2 == 0xFE) {
423 ASSERT(UTF16LittleEndianEncoding().isValid());
424 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
425 lengthOfBOM = 2;
426 } else if (c1 == 0xFE && c2 == 0xFF) {
427 ASSERT(UTF16BigEndianEncoding().isValid());
428 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
429 lengthOfBOM = 2;
430 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
431 ASSERT(UTF8Encoding().isValid());
432 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
433 lengthOfBOM = 3;
434 }
435
436 if (lengthOfBOM || bufferLength + len >= maximumBOMLength)
437 m_checkedForBOM = true;
438
439 ASSERT(lengthOfBOM <= maximumBOMLength);
440 return lengthOfBOM;
441}
442
443bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
444{
445 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
446 m_checkedForCSSCharset = true;
447 return true;
448 }
449
450 size_t oldSize = m_buffer.size();
451 m_buffer.grow(oldSize + len);
452 memcpy(m_buffer.data() + oldSize, data, len);
453
454 movedDataToBuffer = true;
455
456 if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13
457 return false;
458
459 const char* dataStart = m_buffer.data();
460 const char* dataEnd = dataStart + m_buffer.size();
461
462 if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {
463 dataStart += 10;
464 const char* pos = dataStart;
465
466 while (pos < dataEnd && *pos != '"')
467 ++pos;
468 if (pos == dataEnd)
469 return false;
470
471 int encodingNameLength = pos - dataStart;
472
473 ++pos;
474 if (pos == dataEnd)
475 return false;
476
477 if (*pos == ';')
478 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
479 }
480
481 m_checkedForCSSCharset = true;
482 return true;
483}
484
485bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
486{
487 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
488 m_checkedForHeadCharset = true;
489 return true;
490 }
491
492 // This is not completely efficient, since the function might go
493 // through the HTML head several times.
494
495 size_t oldSize = m_buffer.size();
496 m_buffer.grow(oldSize + len);
497 memcpy(m_buffer.data() + oldSize, data, len);
498
499 movedDataToBuffer = true;
500
501 // Continue with checking for an HTML meta tag if we were already doing so.
502 if (m_charsetParser)
503 return checkForMetaCharset(data, len);
504
505 const char* ptr = m_buffer.data();
506 const char* pEnd = ptr + m_buffer.size();
507
508 // Is there enough data available to check for XML declaration?
509 if (m_buffer.size() < 8)
510 return false;
511
512 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
513 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
514 if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {
515 const char* xmlDeclarationEnd = ptr;
516 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
517 ++xmlDeclarationEnd;
518 if (xmlDeclarationEnd == pEnd)
519 return false;
520 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
521 int len = 0;
522 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
523 if (pos != -1)
524 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
525 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
526 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0)) {
527 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
528 return true;
529 } else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x')) {
530 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
531 return true;
532 }
533
534 // The HTTP-EQUIV meta has no effect on XHTML.
535 if (m_contentType == XML)
536 return true;
537
538 m_charsetParser = std::make_unique<HTMLMetaCharsetParser>();
539 return checkForMetaCharset(data, len);
540}
541
542bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
543{
544 if (!m_charsetParser->checkForMetaCharset(data, length))
545 return false;
546
547 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
548 m_charsetParser = nullptr;
549 m_checkedForHeadCharset = true;
550 return true;
551}
552
553void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
554{
555 switch (KanjiCode::judge(data, len)) {
556 case KanjiCode::JIS:
557 setEncoding("ISO-2022-JP", AutoDetectedEncoding);
558 break;
559 case KanjiCode::EUC:
560 setEncoding("EUC-JP", AutoDetectedEncoding);
561 break;
562 case KanjiCode::SJIS:
563 setEncoding("Shift_JIS", AutoDetectedEncoding);
564 break;
565 case KanjiCode::ASCII:
566 case KanjiCode::UTF16:
567 case KanjiCode::UTF8:
568 break;
569 }
570}
571
572// We use the encoding detector in two cases:
573// 1. Encoding detector is turned ON and no other encoding source is
574// available (that is, it's DefaultEncoding).
575// 2. Encoding detector is turned ON and the encoding is set to
576// the encoding of the parent frame, which is also auto-detected.
577// Note that condition #2 is NOT satisfied unless parent-child frame
578// relationship is compliant to the same-origin policy. If they're from
579// different domains, |m_source| would not be set to EncodingFromParentFrame
580// in the first place.
581bool TextResourceDecoder::shouldAutoDetect() const
582{
583 return m_usesEncodingDetector
584 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_parentFrameAutoDetectedEncoding));
585}
586
587String TextResourceDecoder::decode(const char* data, size_t length)
588{
589 size_t lengthOfBOM = 0;
590 if (!m_checkedForBOM)
591 lengthOfBOM = checkForBOM(data, length);
592
593 bool movedDataToBuffer = false;
594
595 if (m_contentType == CSS && !m_checkedForCSSCharset)
596 if (!checkForCSSCharset(data, length, movedDataToBuffer))
597 return emptyString();
598
599 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
600 if (!checkForHeadCharset(data, length, movedDataToBuffer))
601 return emptyString();
602
603 // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
604 if (shouldAutoDetect()) {
605 if (m_encoding.isJapanese())
606 detectJapaneseEncoding(data, length); // FIXME: We should use detectTextEncoding() for all languages.
607 else {
608 TextEncoding detectedEncoding;
609 if (detectTextEncoding(data, length, m_parentFrameAutoDetectedEncoding, &detectedEncoding))
610 setEncoding(detectedEncoding, AutoDetectedEncoding);
611 }
612 }
613
614 ASSERT(m_encoding.isValid());
615
616 if (!m_codec)
617 m_codec = newTextCodec(m_encoding);
618
619 if (m_buffer.isEmpty())
620 return m_codec->decode(data + lengthOfBOM, length - lengthOfBOM, false, m_contentType == XML, m_sawError);
621
622 if (!movedDataToBuffer) {
623 size_t oldSize = m_buffer.size();
624 m_buffer.grow(oldSize + length);
625 memcpy(m_buffer.data() + oldSize, data, length);
626 }
627
628 String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
629 m_buffer.clear();
630 return result;
631}
632
633String TextResourceDecoder::flush()
634{
635 // If we can not identify the encoding even after a document is completely
636 // loaded, we need to detect the encoding if other conditions for
637 // autodetection is satisfied.
638 if (m_buffer.size() && shouldAutoDetect()
639 && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
640 TextEncoding detectedEncoding;
641 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_parentFrameAutoDetectedEncoding, &detectedEncoding))
642 setEncoding(detectedEncoding, AutoDetectedEncoding);
643 }
644
645 if (!m_codec)
646 m_codec = newTextCodec(m_encoding);
647
648 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
649 m_buffer.clear();
650 m_codec = nullptr;
651 m_checkedForBOM = false; // Skip BOM again when re-decoding.
652 return result;
653}
654
655String TextResourceDecoder::decodeAndFlush(const char* data, size_t length)
656{
657 String decoded = decode(data, length);
658 return decoded + flush();
659}
660
661const TextEncoding* TextResourceDecoder::encodingForURLParsing()
662{
663 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
664 // we do when submitting a form. A form with GET method
665 // has its contents added to a URL as query params and it makes sense
666 // to be consistent.
667 auto& encoding = m_encoding.encodingForFormSubmissionOrURLParsing();
668 if (encoding == UTF8Encoding())
669 return nullptr;
670 return &encoding;
671}
672
673}
674