HTMLMetaCharsetParser.cpp source code [webkit/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp]

1	/*
2	* Copyright (C) 2010 Google Inc. All Rights Reserved.
3	* Copyright (C) 2015-2017 Apple Inc. All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include "HTMLMetaCharsetParser.h"
29
30	#include "HTMLNames.h"
31	#include "HTMLParserIdioms.h"
32	#include "TextCodec.h"
33	#include "TextEncodingRegistry.h"
34
35	namespace WebCore {
36
37	using namespace HTMLNames;
38
39	HTMLMetaCharsetParser::HTMLMetaCharsetParser()
40	: m_codec(newTextCodec(Latin1Encoding()))
41	{
42	}
43
44	static StringView extractCharset(const String& value)
45	{
46	unsigned length = value.length();
47	for (size_t pos = `0`; pos < length; ) {
48	pos = value.findIgnoringASCIICase("charset", pos);
49	if (pos == notFound)
50	break;
51
52	static const size_t charsetLength = sizeof("charset") - `1`;
53	pos += charsetLength;
54
55	// Skip whitespace.
56	while (pos < length && value [pos] <= `' '`)
57	++pos;
58
59	if (value [pos] != `'='`)
60	continue;
61
62	++pos;
63
64	while (pos < length && value [pos] <= `' '`)
65	++pos;
66
67	UChar quoteMark = `0`;
68	if (pos < length && (value [pos] == `'"'` \|\| value [pos] == `'\''`))
69	quoteMark = value [pos++];
70
71	if (pos == length)
72	break;
73
74	unsigned end = pos;
75	while (end < length && ((quoteMark && value [end] != quoteMark) \|\| (!quoteMark && value [end] > `' '` && value [end] != `'"'` && value [end] != `'\''` && value [end] != `';'`)))
76	++end;
77
78	if (quoteMark && (end == length))
79	break; // Close quote not found.
80
81	return StringView (value).substring(pos, end - pos);
82	}
83	return StringView ();
84	}
85
86	bool HTMLMetaCharsetParser::processMeta(HTMLToken& token)
87	{
88	AttributeList attributes;
89	for (auto& attribute : token.attributes()) {
90	String attributeName = StringImpl::create8BitIfPossible(attribute.name);
91	String attributeValue = StringImpl::create8BitIfPossible(attribute.value);
92	attributes.append(std::make_pair(attributeName, attributeValue));
93	}
94
95	m_encoding = encodingFromMetaAttributes(attributes);
96	return m_encoding.isValid();
97	}
98
99	TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeList& attributes)
100	{
101	bool gotPragma = false;
102	enum { None, Charset, Pragma } mode = None;
103	StringView charset;
104
105	for (auto& attribute : attributes) {
106	const String& attributeName = attribute.first;
107	const String& attributeValue = attribute.second;
108
109	if (attributeName == http_equivAttr) {
110	if (equalLettersIgnoringASCIICase(attributeValue, "content-type"))
111	gotPragma = true;
112	} else if (charset.isEmpty()) {
113	if (attributeName == charsetAttr) {
114	charset = attributeValue;
115	mode = Charset;
116	} else if (attributeName == contentAttr) {
117	charset = extractCharset(attributeValue);
118	if (charset.length())
119	mode = Pragma;
120	}
121	}
122	}
123
124	if (mode == Charset \|\| (mode == Pragma && gotPragma))
125	return TextEncoding (stripLeadingAndTrailingHTMLSpaces(charset.toStringWithoutCopying()));
126
127	return TextEncoding ();
128	}
129
130	bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
131	{
132	if (m_doneChecking)
133	return true;
134
135	ASSERT(!m_encoding.isValid());
136
137	// We still don't have an encoding, and are in the head.
138	// The following tags are allowed in <head>:
139	// SCRIPT\|STYLE\|META\|LINK\|OBJECT\|TITLE\|BASE
140	//
141	// We stop scanning when a tag that is not permitted in <head>
142	// is seen, rather when </head> is seen, because that more closely
143	// matches behavior in other browsers; more details in
144	// <http://bugs.webkit.org/show_bug.cgi?id=3590>.
145	//
146	// Additionally, we ignore things that looks like tags in <title>, <script>
147	// and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>,
148	// <http://bugs.webkit.org/show_bug.cgi?id=12165> and
149	// <http://bugs.webkit.org/show_bug.cgi?id=12389>.
150	//
151	// Since many sites have charset declarations after <body> or other tags
152	// that are disallowed in <head>, we don't bail out until we've checked at
153	// least bytesToCheckUnconditionally bytes of input.
154
155	constexpr int bytesToCheckUnconditionally = `1024`;
156
157	bool ignoredSawErrorFlag;
158	m_input.append(m_codec ->decode(data, length, false, false, ignoredSawErrorFlag));
159
160	while (auto token = m_tokenizer.nextToken(m_input)) {
161	bool isEnd = token ->type() == HTMLToken::EndTag;
162	if (isEnd \|\| token ->type() == HTMLToken::StartTag) {
163	AtomicString tagName(token ->name());
164	if (!isEnd) {
165	m_tokenizer.updateStateFor(tagName);
166	if (tagName == metaTag && processMeta(*token)) {
167	m_doneChecking = true;
168	return true;
169	}
170	}
171
172	if (tagName != scriptTag && tagName != noscriptTag
173	&& tagName != styleTag && tagName != linkTag
174	&& tagName != metaTag && tagName != objectTag
175	&& tagName != titleTag && tagName != baseTag
176	&& (isEnd \|\| tagName != htmlTag)
177	&& (isEnd \|\| tagName != headTag)) {
178	m_inHeadSection = false;
179	}
180	}
181
182	if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) {
183	m_doneChecking = true;
184	return true;
185	}
186	}
187
188	return false;
189	}
190
191	}
192

Browse the source code of webkit/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp