TextResourceDecoder.cpp source code [webkit/Source/WebCore/loader/TextResourceDecoder.cpp]

1	/*
2	Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3	Copyright (C) 2003-2017 Apple Inc. All rights reserved.
4	Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6	This library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Library General Public
8	License as published by the Free Software Foundation; either
9	version 2 of the License, or (at your option) any later version.
10
11	This library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Library General Public License for more details.
15
16	You should have received a copy of the GNU Library General Public License
17	along with this library; see the file COPYING.LIB. If not, write to
18	the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19	Boston, MA 02110-1301, USA.
20	*/
21
22
23	#include "config.h"
24	#include "TextResourceDecoder.h"
25
26	#include "HTMLMetaCharsetParser.h"
27	#include "HTMLNames.h"
28	#include "MIMETypeRegistry.h"
29	#include "TextCodec.h"
30	#include "TextEncoding.h"
31	#include "TextEncodingDetector.h"
32	#include "TextEncodingRegistry.h"
33	#include <wtf/ASCIICType.h>
34
35
36	namespace WebCore {
37
38	using namespace HTMLNames;
39
// Fixed-length byte comparisons used to sniff markup and escape-sequence signatures.
// Each overload reports whether the bytes starting at p equal the given sequence,
// stopping at the first mismatch. No length is passed in, so the caller is responsible
// for making sure enough bytes are readable at p.

static inline bool bytesEqual(const char* p, char b0, char b1)
{
    const char expected[] = { b0, b1 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    const char expected[] = { b0, b1, b2, b3, b4 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}

static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 };
    for (unsigned index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
64
65	// You might think we should put these find functions elsewhere, perhaps with the
66	// similar functions that operate on UChar, but arguably only the decoder has
67	// a reason to process strings of char rather than UChar.
68
// Returns the offset of the first occurrence of the NUL-terminated string |target| within
// the first |subjectLength| bytes of |subject|, or -1 if it does not occur. |subject| does
// not need to be NUL-terminated. An empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (targetLength > subjectLength)
        return -1;

    // Last offset at which the whole target still fits inside the subject.
    const size_t lastStart = subjectLength - targetLength;
    size_t start = 0;
    while (start <= lastStart) {
        size_t matched = 0;
        while (matched < targetLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == targetLength)
            return start;
        ++start;
    }
    return -1;
}
87
88	static TextEncoding findTextEncoding(const char* encodingName, int length)
89	{
90	Vector<char, `64`> buffer(length + `1`);
91	memcpy(buffer.data(), encodingName, length);
92	buffer [length] = `'\0'`;
93	return buffer.data();
94	}
95
// Helper for guessing which Japanese encoding a run of bytes uses. judge() does the
// classification; sjisMap is a per-byte flag table consulted by ISkanji() and ISkana().
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
    static enum Type judge(const char* str, int length);
    static const int ESC = 0x1b;
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is below 0x100 and its sjisMap entry has bit 0 set.
    static int ISkanji(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 1 : 0;
    }

    // Non-zero when |code| is below 0x100 and its sjisMap entry has bit 1 set.
    static int ISkana(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 2 : 0;
    }
};
115
116	const unsigned char KanjiCode::sjisMap[`256`] = {
117	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
118	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
119	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
120	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
121	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
122	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
123	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
124	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
125	`0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
126	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
127	`0`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
128	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
129	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
130	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
131	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
132	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`
133	};
134
135	/*
136	* EUC-JP is
137	* [0xa1 - 0xfe][0xa1 - 0xfe]
138	* 0x8e[0xa1 - 0xfe](SS2)
139	* 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
140	*
141	* Shift_Jis is
142	* [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
143	*
144	* Shift_Jis Hankaku Kana is
145	* [0xa1 - 0xdf]
146	*/
147
148	/*
149	* KanjiCode::judge() is based on judge_jcode() from jvim
150	* http://hp.vector.co.jp/authors/VA003457/vim/
151	*
152	* Special Thanks to Kenichi Tsuchida
153	*/
154
155	enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
156	{
157	enum Type code;
158	int i;
159	int bfr = false; / Kana Moji /
160	int bfk = `0`; / EUC Kana /
161	int sjis = `0`;
162	int euc = `0`;
163
164	const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
165
166	code = ASCII;
167
168	i = `0`;
169	while (i < size) {
170	if (ptr[i] == ESC && (size - i >= `3`)) {
171	if (bytesEqual(str + i + `1`, `'$'`, `'B'`)
172	\|\| bytesEqual(str + i + `1`, `'('`, `'B'`)
173	\|\| bytesEqual(str + i + `1`, `'$'`, `'@'`)
174	\|\| bytesEqual(str + i + `1`, `'('`, `'J'`)) {
175	code = JIS;
176	goto breakBreak;
177	}
178	if (bytesEqual(str + i + `1`, `'('`, `'I'`) \|\| bytesEqual(str + i + `1`, `')'`, `'I'`)) {
179	code = JIS;
180	i += `3`;
181	} else {
182	i++;
183	}
184	bfr = false;
185	bfk = `0`;
186	} else {
187	if (ptr[i] < `0x20`) {
188	bfr = false;
189	bfk = `0`;
190	/ ?? check kudokuten ?? && ?? hiragana ?? /
191	if ((i >= `2`) && (ptr[i - `2`] == `0x81`)
192	&& (`0x41` <= ptr[i - `1`] && ptr[i - `1`] <= `0x49`)) {
193	code = SJIS;
194	sjis += `100`; / kudokuten /
195	} else if ((i >= `2`) && (ptr[i - `2`] == `0xa1`)
196	&& (`0xa2` <= ptr[i - `1`] && ptr[i - `1`] <= `0xaa`)) {
197	code = EUC;
198	euc += `100`; / kudokuten /
199	} else if ((i >= `2`) && (ptr[i - `2`] == `0x82`) && (`0xa0` <= ptr[i - `1`])) {
200	sjis += `40`; / hiragana /
201	} else if ((i >= `2`) && (ptr[i - `2`] == `0xa4`) && (`0xa0` <= ptr[i - `1`])) {
202	euc += `40`; / hiragana /
203	}
204	} else {
205	/ ?? check hiragana or katana ?? /
206	if ((size - i > `1`) && (ptr[i] == `0x82`) && (`0xa0` <= ptr[i + `1`])) {
207	sjis++; / hiragana /
208	} else if ((size - i > `1`) && (ptr[i] == `0x83`)
209	&& (`0x40` <= ptr[i + `1`] && ptr[i + `1`] <= `0x9f`)) {
210	sjis++; / katakana /
211	} else if ((size - i > `1`) && (ptr[i] == `0xa4`) && (`0xa0` <= ptr[i + `1`])) {
212	euc++; / hiragana /
213	} else if ((size - i > `1`) && (ptr[i] == `0xa5`) && (`0xa0` <= ptr[i + `1`])) {
214	euc++; / katakana /
215	}
216	if (bfr) {
217	if ((i >= `1`) && (`0x40` <= ptr[i] && ptr[i] <= `0xa0`) && ISkanji(ptr[i - `1`])) {
218	code = SJIS;
219	goto breakBreak;
220	} else if ((i >= `1`) && (`0x81` <= ptr[i - `1`] && ptr[i - `1`] <= `0x9f`) && ((`0x40` <= ptr[i] && ptr[i] < `0x7e`) \|\| (`0x7e` < ptr[i] && ptr[i] <= `0xfc`))) {
221	code = SJIS;
222	goto breakBreak;
223	} else if ((i >= `1`) && (`0xfd` <= ptr[i] && ptr[i] <= `0xfe`) && (`0xa1` <= ptr[i - `1`] && ptr[i - `1`] <= `0xfe`)) {
224	code = EUC;
225	goto breakBreak;
226	} else if ((i >= `1`) && (`0xfd` <= ptr[i - `1`] && ptr[i - `1`] <= `0xfe`) && (`0xa1` <= ptr[i] && ptr[i] <= `0xfe`)) {
227	code = EUC;
228	goto breakBreak;
229	} else if ((i >= `1`) && (ptr[i] < `0xa0` \|\| `0xdf` < ptr[i]) && (`0x8e` == ptr[i - `1`])) {
230	code = SJIS;
231	goto breakBreak;
232	} else if (ptr[i] <= `0x7f`) {
233	code = SJIS;
234	goto breakBreak;
235	} else {
236	if (`0xa1` <= ptr[i] && ptr[i] <= `0xa6`) {
237	euc++; / sjis hankaku kana kigo /
238	} else if (`0xa1` <= ptr[i] && ptr[i] <= `0xdf`) {
239	; / sjis hankaku kana /
240	} else if (`0xa1` <= ptr[i] && ptr[i] <= `0xfe`) {
241	euc++;
242	} else if (`0x8e` == ptr[i]) {
243	euc++;
244	} else if (`0x20` <= ptr[i] && ptr[i] <= `0x7f`) {
245	sjis++;
246	}
247	bfr = false;
248	bfk = `0`;
249	}
250	} else if (`0x8e` == ptr[i]) {
251	if (size - i <= `1`) {
252	;
253	} else if (`0xa1` <= ptr[i + `1`] && ptr[i + `1`] <= `0xdf`) {
254	/ EUC KANA or SJIS KANJI /
255	if (bfk == `1`) {
256	euc += `100`;
257	}
258	bfk++;
259	i++;
260	} else {
261	/ SJIS only /
262	code = SJIS;
263	goto breakBreak;
264	}
265	} else if (`0x81` <= ptr[i] && ptr[i] <= `0x9f`) {
266	/ SJIS only /
267	code = SJIS;
268	if ((size - i >= `1`)
269	&& ((`0x40` <= ptr[i + `1`] && ptr[i + `1`] <= `0x7e`)
270	\|\| (`0x80` <= ptr[i + `1`] && ptr[i + `1`] <= `0xfc`))) {
271	goto breakBreak;
272	}
273	} else if (`0xfd` <= ptr[i] && ptr[i] <= `0xfe`) {
274	/ EUC only /
275	code = EUC;
276	if ((size - i >= `1`)
277	&& (`0xa1` <= ptr[i + `1`] && ptr[i + `1`] <= `0xfe`)) {
278	goto breakBreak;
279	}
280	} else if (ptr[i] <= `0x7f`) {
281	;
282	} else {
283	bfr = true;
284	bfk = `0`;
285	}
286	}
287	i++;
288	}
289	}
290	if (code == ASCII) {
291	if (sjis > euc) {
292	code = SJIS;
293	} else if (sjis < euc) {
294	code = EUC;
295	}
296	}
297	breakBreak:
298	return (code);
299	}
300
301	TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
302	{
303	if (equalLettersIgnoringASCIICase(mimeType, "text/css"))
304	return CSS;
305	if (equalLettersIgnoringASCIICase(mimeType, "text/html"))
306	return HTML;
307	if (MIMETypeRegistry::isXMLMIMEType(mimeType))
308	return XML;
309	return PlainText;
310	}
311
312	const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
313	{
314	// Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
315	// for text/xml. This matches Firefox.
316	if (contentType == XML)
317	return UTF8Encoding();
318	if (!specifiedDefaultEncoding.isValid())
319	return Latin1Encoding();
320	return specifiedDefaultEncoding;
321	}
322
323	inline TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
324	: m_contentType(determineContentType(mimeType))
325	, m_encoding (defaultEncoding(m_contentType, specifiedDefaultEncoding))
326	, m_usesEncodingDetector(usesEncodingDetector)
327	{
328	}
329
330	Ref<TextResourceDecoder> TextResourceDecoder::create(const String& mimeType, const TextEncoding& defaultEncoding, bool usesEncodingDetector)
331	{
332	return adoptRef(*new TextResourceDecoder (mimeType, defaultEncoding, usesEncodingDetector));
333	}
334
335	TextResourceDecoder::~TextResourceDecoder() = default;
336
337	void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
338	{
339	// In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
340	if (!encoding.isValid())
341	return;
342
343	// When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
344	// treat x-user-defined as windows-1252 (bug 18270)
345	if (source == EncodingFromMetaTag && equalLettersIgnoringASCIICase(encoding.name(), "x-user-defined"))
346	m_encoding = "windows-1252";
347	else if (source == EncodingFromMetaTag \|\| source == EncodingFromXMLHeader \|\| source == EncodingFromCSSCharset)
348	m_encoding = encoding.closestByteBasedEquivalent();
349	else
350	m_encoding = encoding;
351
352	m_codec = nullptr;
353	m_source = source;
354	}
355
356	bool TextResourceDecoder::hasEqualEncodingForCharset(const String& charset) const
357	{
358	return defaultEncoding(m_contentType, charset) == m_encoding;
359	}
360
361	// Returns the position of the encoding string.
362	static int findXMLEncoding(const char* str, int len, int& encodingLength)
363	{
364	int pos = find(str, len, "encoding");
365	if (pos == -`1`)
366	return -`1`;
367	pos += `8`;
368
369	// Skip spaces and stray control characters.
370	while (pos < len && str[pos] <= `' '`)
371	++pos;
372
373	// Skip equals sign.
374	if (pos >= len \|\| str[pos] != `'='`)
375	return -`1`;
376	++pos;
377
378	// Skip spaces and stray control characters.
379	while (pos < len && str[pos] <= `' '`)
380	++pos;
381
382	// Skip quotation mark.
383	if (pos >= len)
384	return - `1`;
385	char quoteMark = str[pos];
386	if (quoteMark != `'"'` && quoteMark != `'\''`)
387	return -`1`;
388	++pos;
389
390	// Find the trailing quotation mark.
391	int end = pos;
392	while (end < len && str[end] != quoteMark)
393	++end;
394	if (end >= len)
395	return -`1`;
396
397	encodingLength = end - pos;
398	return pos;
399	}
400
401	size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
402	{
403	// Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
404	// We let it override even a user-chosen encoding.
405	const size_t maximumBOMLength = `3`;
406
407	ASSERT(!m_checkedForBOM);
408
409	size_t lengthOfBOM = `0`;
410
411	size_t bufferLength = m_buffer.size();
412
413	size_t buf1Len = bufferLength;
414	size_t buf2Len = len;
415	const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
416	const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
417	unsigned char c1 = buf1Len ? (static_cast<void>(--buf1Len), buf1++) : buf2Len ? (static_cast<void>(--buf2Len), buf2++) : `0`;
418	unsigned char c2 = buf1Len ? (static_cast<void>(--buf1Len), buf1++) : buf2Len ? (static_cast<void>(--buf2Len), buf2++) : `0`;
419	unsigned char c3 = buf1Len ? (static_cast<void>(--buf1Len), buf1++) : buf2Len ? (static_cast<void>(--buf2Len), buf2++) : `0`;
420
421	// Check for the BOM.
422	if (c1 == `0xFF` && c2 == `0xFE`) {
423	ASSERT(UTF16LittleEndianEncoding().isValid());
424	setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
425	lengthOfBOM = `2`;
426	} else if (c1 == `0xFE` && c2 == `0xFF`) {
427	ASSERT(UTF16BigEndianEncoding().isValid());
428	setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
429	lengthOfBOM = `2`;
430	} else if (c1 == `0xEF` && c2 == `0xBB` && c3 == `0xBF`) {
431	ASSERT(UTF8Encoding().isValid());
432	setEncoding(UTF8Encoding(), AutoDetectedEncoding);
433	lengthOfBOM = `3`;
434	}
435
436	if (lengthOfBOM \|\| bufferLength + len >= maximumBOMLength)
437	m_checkedForBOM = true;
438
439	ASSERT(lengthOfBOM <= maximumBOMLength);
440	return lengthOfBOM;
441	}
442
443	bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
444	{
445	if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
446	m_checkedForCSSCharset = true;
447	return true;
448	}
449
450	size_t oldSize = m_buffer.size();
451	m_buffer.grow(oldSize + len);
452	memcpy(m_buffer.data() + oldSize, data, len);
453
454	movedDataToBuffer = true;
455
456	if (m_buffer.size() <= `13`) // strlen('@charset "x";') == 13
457	return false;
458
459	const char* dataStart = m_buffer.data();
460	const char* dataEnd = dataStart + m_buffer.size();
461
462	if (bytesEqual(dataStart, `'@'`, `'c'`, `'h'`, `'a'`, `'r'`, `'s'`, `'e'`, `'t'`, `' '`, `'"'`)) {
463	dataStart += `10`;
464	const char* pos = dataStart;
465
466	while (pos < dataEnd && *pos != `'"'`)
467	++pos;
468	if (pos == dataEnd)
469	return false;
470
471	int encodingNameLength = pos - dataStart;
472
473	++pos;
474	if (pos == dataEnd)
475	return false;
476
477	if (*pos == `';'`)
478	setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
479	}
480
481	m_checkedForCSSCharset = true;
482	return true;
483	}
484
485	bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
486	{
487	if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
488	m_checkedForHeadCharset = true;
489	return true;
490	}
491
492	// This is not completely efficient, since the function might go
493	// through the HTML head several times.
494
495	size_t oldSize = m_buffer.size();
496	m_buffer.grow(oldSize + len);
497	memcpy(m_buffer.data() + oldSize, data, len);
498
499	movedDataToBuffer = true;
500
501	// Continue with checking for an HTML meta tag if we were already doing so.
502	if (m_charsetParser)
503	return checkForMetaCharset(data, len);
504
505	const char* ptr = m_buffer.data();
506	const char* pEnd = ptr + m_buffer.size();
507
508	// Is there enough data available to check for XML declaration?
509	if (m_buffer.size() < `8`)
510	return false;
511
512	// Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
513	// It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
514	if (bytesEqual(ptr, `'<'`, `'?'`, `'x'`, `'m'`, `'l'`)) {
515	const char* xmlDeclarationEnd = ptr;
516	while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != `'>'`)
517	++xmlDeclarationEnd;
518	if (xmlDeclarationEnd == pEnd)
519	return false;
520	// No need for +1, because we have an extra "?" to lose at the end of XML declaration.
521	int len = `0`;
522	int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
523	if (pos != -`1`)
524	setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
525	// continue looking for a charset - it may be specified in an HTTP-Equiv meta
526	} else if (bytesEqual(ptr, `'<'`, `0`, `'?'`, `0`, `'x'`, `0`)) {
527	setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
528	return true;
529	} else if (bytesEqual(ptr, `0`, `'<'`, `0`, `'?'`, `0`, `'x'`)) {
530	setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
531	return true;
532	}
533
534	// The HTTP-EQUIV meta has no effect on XHTML.
535	if (m_contentType == XML)
536	return true;
537
538	m_charsetParser = std::make_unique<HTMLMetaCharsetParser>();
539	return checkForMetaCharset(data, len);
540	}
541
542	bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
543	{
544	if (!m_charsetParser ->checkForMetaCharset(data, length))
545	return false;
546
547	setEncoding(m_charsetParser ->encoding(), EncodingFromMetaTag);
548	m_charsetParser = nullptr;
549	m_checkedForHeadCharset = true;
550	return true;
551	}
552
553	void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
554	{
555	switch (KanjiCode::judge(data, len)) {
556	case KanjiCode::JIS:
557	setEncoding("ISO-2022-JP", AutoDetectedEncoding);
558	break;
559	case KanjiCode::EUC:
560	setEncoding("EUC-JP", AutoDetectedEncoding);
561	break;
562	case KanjiCode::SJIS:
563	setEncoding("Shift_JIS", AutoDetectedEncoding);
564	break;
565	case KanjiCode::ASCII:
566	case KanjiCode::UTF16:
567	case KanjiCode::UTF8:
568	break;
569	}
570	}
571
572	// We use the encoding detector in two cases:
573	// 1. Encoding detector is turned ON and no other encoding source is
574	// available (that is, it's DefaultEncoding).
575	// 2. Encoding detector is turned ON and the encoding is set to
576	// the encoding of the parent frame, which is also auto-detected.
577	// Note that condition #2 is NOT satisfied unless parent-child frame
578	// relationship is compliant to the same-origin policy. If they're from
579	// different domains, \|m_source\| would not be set to EncodingFromParentFrame
580	// in the first place.
581	bool TextResourceDecoder::shouldAutoDetect() const
582	{
583	return m_usesEncodingDetector
584	&& (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_parentFrameAutoDetectedEncoding));
585	}
586
587	String TextResourceDecoder::decode(const char* data, size_t length)
588	{
589	size_t lengthOfBOM = `0`;
590	if (!m_checkedForBOM)
591	lengthOfBOM = checkForBOM(data, length);
592
593	bool movedDataToBuffer = false;
594
595	if (m_contentType == CSS && !m_checkedForCSSCharset)
596	if (!checkForCSSCharset(data, length, movedDataToBuffer))
597	return emptyString();
598
599	if ((m_contentType == HTML \|\| m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
600	if (!checkForHeadCharset(data, length, movedDataToBuffer))
601	return emptyString();
602
603	// FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
604	if (shouldAutoDetect()) {
605	if (m_encoding.isJapanese())
606	detectJapaneseEncoding(data, length); // FIXME: We should use detectTextEncoding() for all languages.
607	else {
608	TextEncoding detectedEncoding;
609	if (detectTextEncoding(data, length, m_parentFrameAutoDetectedEncoding, &detectedEncoding))
610	setEncoding(detectedEncoding, AutoDetectedEncoding);
611	}
612	}
613
614	ASSERT(m_encoding.isValid());
615
616	if (!m_codec)
617	m_codec = newTextCodec(m_encoding);
618
619	if (m_buffer.isEmpty())
620	return m_codec ->decode(data + lengthOfBOM, length - lengthOfBOM, false, m_contentType == XML, m_sawError);
621
622	if (!movedDataToBuffer) {
623	size_t oldSize = m_buffer.size();
624	m_buffer.grow(oldSize + length);
625	memcpy(m_buffer.data() + oldSize, data, length);
626	}
627
628	String result = m_codec ->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
629	m_buffer.clear();
630	return result;
631	}
632
633	String TextResourceDecoder::flush()
634	{
635	// If we can not identify the encoding even after a document is completely
636	// loaded, we need to detect the encoding if other conditions for
637	// autodetection is satisfied.
638	if (m_buffer.size() && shouldAutoDetect()
639	&& ((!m_checkedForHeadCharset && (m_contentType == HTML \|\| m_contentType == XML)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
640	TextEncoding detectedEncoding;
641	if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_parentFrameAutoDetectedEncoding, &detectedEncoding))
642	setEncoding(detectedEncoding, AutoDetectedEncoding);
643	}
644
645	if (!m_codec)
646	m_codec = newTextCodec(m_encoding);
647
648	String result = m_codec ->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
649	m_buffer.clear();
650	m_codec = nullptr;
651	m_checkedForBOM = false; // Skip BOM again when re-decoding.
652	return result;
653	}
654
655	String TextResourceDecoder::decodeAndFlush(const char* data, size_t length)
656	{
657	String decoded = decode(data, length);
658	return decoded + flush();
659	}
660
661	const TextEncoding* TextResourceDecoder::encodingForURLParsing()
662	{
663	// For UTF-{7,16,32}, we want to use UTF-8 for the query part as
664	// we do when submitting a form. A form with GET method
665	// has its contents added to a URL as query params and it makes sense
666	// to be consistent.
667	auto& encoding = m_encoding.encodingForFormSubmissionOrURLParsing();
668	if (encoding == UTF8Encoding())
669	return nullptr;
670	return &encoding;
671	}
672
673	}
674

Browse the source code of webkit/Source/WebCore/loader/TextResourceDecoder.cpp