HTMLTokenizer.h source code [webkit/Source/WebCore/html/parser/HTMLTokenizer.h]

1	/*
2	* Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved.
3	* Copyright (C) 2010 Google, Inc. All Rights Reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#pragma once
28
29	#include "HTMLParserOptions.h"
30	#include "HTMLToken.h"
31	#include "InputStreamPreprocessor.h"
32
33	namespace WebCore {
34
35	class SegmentedString;
36
37	class HTMLTokenizer {
38	public:
39	explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions ());
40
41	// If we can't parse a whole token, this returns null.
42	class TokenPtr;
43	TokenPtr nextToken(SegmentedString&);
44
45	// Used by HTMLSourceTracker.
46	void setTokenAttributeBaseOffset(unsigned);
47
48	// Returns a copy of any characters buffered internally by the tokenizer.
49	// The tokenizer buffers characters when searching for the </script> token that terminates a script element.
50	String bufferedCharacters() const;
51	size_t numberOfBufferedCharacters() const;
52
53	// Updates the tokenizer's state according to the given tag name. This is an approximation of how the tree
54	// builder would update the tokenizer's state. This method is useful for approximating HTML tokenization.
55	// To get exactly the correct tokenization, you need the real tree builder.
56	//
57	// The main failures in the approximation are as follows:
58	//
59	// The first set of character tokens emitted for a <pre> element might contain an extra leading newline.*
60	// The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode.*
61	// CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens.*
62	//
63	// This approximation is also the algorithm called for when parsing an HTML fragment.
64	// https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
65	void updateStateFor(const AtomicString& tagName);
66
67	void setForceNullCharacterReplacement(bool);
68
69	bool shouldAllowCDATA() const;
70	void setShouldAllowCDATA(bool);
71
72	bool isInDataState() const;
73
74	void setDataState();
75	void setPLAINTEXTState();
76	void setRAWTEXTState();
77	void setRCDATAState();
78	void setScriptDataState();
79
80	bool neverSkipNullCharacters() const;
81
82	private:
83	enum State {
84	DataState,
85	CharacterReferenceInDataState,
86	RCDATAState,
87	CharacterReferenceInRCDATAState,
88	RAWTEXTState,
89	ScriptDataState,
90	PLAINTEXTState,
91	TagOpenState,
92	EndTagOpenState,
93	TagNameState,
94	RCDATALessThanSignState,
95	RCDATAEndTagOpenState,
96	RCDATAEndTagNameState,
97	RAWTEXTLessThanSignState,
98	RAWTEXTEndTagOpenState,
99	RAWTEXTEndTagNameState,
100	ScriptDataLessThanSignState,
101	ScriptDataEndTagOpenState,
102	ScriptDataEndTagNameState,
103	ScriptDataEscapeStartState,
104	ScriptDataEscapeStartDashState,
105	ScriptDataEscapedState,
106	ScriptDataEscapedDashState,
107	ScriptDataEscapedDashDashState,
108	ScriptDataEscapedLessThanSignState,
109	ScriptDataEscapedEndTagOpenState,
110	ScriptDataEscapedEndTagNameState,
111	ScriptDataDoubleEscapeStartState,
112	ScriptDataDoubleEscapedState,
113	ScriptDataDoubleEscapedDashState,
114	ScriptDataDoubleEscapedDashDashState,
115	ScriptDataDoubleEscapedLessThanSignState,
116	ScriptDataDoubleEscapeEndState,
117	BeforeAttributeNameState,
118	AttributeNameState,
119	AfterAttributeNameState,
120	BeforeAttributeValueState,
121	AttributeValueDoubleQuotedState,
122	AttributeValueSingleQuotedState,
123	AttributeValueUnquotedState,
124	CharacterReferenceInAttributeValueState,
125	AfterAttributeValueQuotedState,
126	SelfClosingStartTagState,
127	BogusCommentState,
128	ContinueBogusCommentState, // Not in the HTML spec, used internally to track whether we started the bogus comment token.
129	MarkupDeclarationOpenState,
130	CommentStartState,
131	CommentStartDashState,
132	CommentState,
133	CommentEndDashState,
134	CommentEndState,
135	CommentEndBangState,
136	DOCTYPEState,
137	BeforeDOCTYPENameState,
138	DOCTYPENameState,
139	AfterDOCTYPENameState,
140	AfterDOCTYPEPublicKeywordState,
141	BeforeDOCTYPEPublicIdentifierState,
142	DOCTYPEPublicIdentifierDoubleQuotedState,
143	DOCTYPEPublicIdentifierSingleQuotedState,
144	AfterDOCTYPEPublicIdentifierState,
145	BetweenDOCTYPEPublicAndSystemIdentifiersState,
146	AfterDOCTYPESystemKeywordState,
147	BeforeDOCTYPESystemIdentifierState,
148	DOCTYPESystemIdentifierDoubleQuotedState,
149	DOCTYPESystemIdentifierSingleQuotedState,
150	AfterDOCTYPESystemIdentifierState,
151	BogusDOCTYPEState,
152	CDATASectionState,
153	// These CDATA states are not in the HTML5 spec, but we use them internally.
154	CDATASectionRightSquareBracketState,
155	CDATASectionDoubleRightSquareBracketState,
156	};
157
158	bool processToken(SegmentedString&);
159	bool processEntity(SegmentedString&);
160
161	void parseError();
162
163	void bufferASCIICharacter(UChar);
164	void bufferCharacter(UChar);
165
166	bool emitAndResumeInDataState(SegmentedString&);
167	bool emitAndReconsumeInDataState();
168	bool emitEndOfFile(SegmentedString&);
169
170	// Return true if we wil emit a character token before dealing with the buffered end tag.
171	void flushBufferedEndTag();
172	bool commitToPartialEndTag(SegmentedString&, UChar, State);
173	bool commitToCompleteEndTag(SegmentedString&);
174
175	void appendToTemporaryBuffer(UChar);
176	bool temporaryBufferIs(const char*);
177
178	// Sometimes we speculatively consume input characters and we don't know whether they represent
179	// end tags or RCDATA, etc. These functions help manage these state.
180	bool inEndTagBufferingState() const;
181	void appendToPossibleEndTag(UChar);
182	void saveEndTagNameIfNeeded();
183	bool isAppropriateEndTag() const;
184
185	bool haveBufferedCharacterToken() const;
186
187	static bool isNullCharacterSkippingState(State);
188
189	State m_state { DataState };
190	bool m_forceNullCharacterReplacement { false };
191	bool m_shouldAllowCDATA { false };
192
193	mutable HTMLToken m_token;
194
195	// https://html.spec.whatwg.org/#additional-allowed-character
196	UChar m_additionalAllowedCharacter { `0` };
197
198	// https://html.spec.whatwg.org/#preprocessing-the-input-stream
199	InputStreamPreprocessor<HTMLTokenizer> m_preprocessor;
200
201	Vector<UChar, `32`> m_appropriateEndTagName;
202
203	// https://html.spec.whatwg.org/#temporary-buffer
204	Vector<LChar, `32`> m_temporaryBuffer;
205
206	// We occasionally want to emit both a character token and an end tag
207	// token (e.g., when lexing script). We buffer the name of the end tag
208	// token here so we remember it next time we re-enter the tokenizer.
209	Vector<LChar, `32`> m_bufferedEndTagName;
210
211	const HTMLParserOptions m_options;
212	};
213
214	class HTMLTokenizer::TokenPtr {
215	public:
216	TokenPtr();
217	~TokenPtr();
218
219	TokenPtr(TokenPtr&&);
220	TokenPtr& operator=(TokenPtr&&) = delete;
221
222	void clear();
223
224	operator bool() const;
225
226	HTMLToken& operator() const*;
227	HTMLToken* operator->() const;
228
229	private:
230	friend class HTMLTokenizer;
231	explicit TokenPtr(HTMLToken*);
232
233	HTMLToken* m_token { nullptr };
234	};
235
236	inline HTMLTokenizer::TokenPtr::TokenPtr()
237	{
238	}
239
240	inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
241	: m_token(token)
242	{
243	}
244
245	inline HTMLTokenizer::TokenPtr::~TokenPtr()
246	{
247	if (m_token)
248	m_token->clear();
249	}
250
251	inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
252	: m_token(other.m_token)
253	{
254	other.m_token = nullptr;
255	}
256
257	inline void HTMLTokenizer::TokenPtr::clear()
258	{
259	if (m_token) {
260	m_token->clear();
261	m_token = nullptr;
262	}
263	}
264
265	inline HTMLTokenizer::TokenPtr::operator bool() const
266	{
267	return m_token;
268	}
269
270	inline HTMLToken& HTMLTokenizer::TokenPtr::operator() const*
271	{
272	ASSERT(m_token);
273	return *m_token;
274	}
275
276	inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
277	{
278	ASSERT(m_token);
279	return m_token;
280	}
281
282	inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
283	{
284	return TokenPtr (processToken(source) ? &m_token : nullptr);
285	}
286
287	inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
288	{
289	m_token.setAttributeBaseOffset(offset);
290	}
291
292	inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
293	{
294	// Notice that we add 2 to the length of the m_temporaryBuffer to
295	// account for the "</" characters, which are effecitvely buffered in
296	// the tokenizer's state machine.
297	return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + `2` : `0`;
298	}
299
300	inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value)
301	{
302	m_forceNullCharacterReplacement = value;
303	}
304
305	inline bool HTMLTokenizer::shouldAllowCDATA() const
306	{
307	return m_shouldAllowCDATA;
308	}
309
310	inline void HTMLTokenizer::setShouldAllowCDATA(bool value)
311	{
312	m_shouldAllowCDATA = value;
313	}
314
315	inline bool HTMLTokenizer::isInDataState() const
316	{
317	return m_state == DataState;
318	}
319
320	inline void HTMLTokenizer::setDataState()
321	{
322	m_state = DataState;
323	}
324
325	inline void HTMLTokenizer::setPLAINTEXTState()
326	{
327	m_state = PLAINTEXTState;
328	}
329
330	inline void HTMLTokenizer::setRAWTEXTState()
331	{
332	m_state = RAWTEXTState;
333	}
334
335	inline void HTMLTokenizer::setRCDATAState()
336	{
337	m_state = RCDATAState;
338	}
339
340	inline void HTMLTokenizer::setScriptDataState()
341	{
342	m_state = ScriptDataState;
343	}
344
345	inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
346	{
347	return state == DataState \|\| state == RCDATAState \|\| state == RAWTEXTState;
348	}
349
350	inline bool HTMLTokenizer::neverSkipNullCharacters() const
351	{
352	return m_forceNullCharacterReplacement;
353	}
354
355	} // namespace WebCore
356

Browse the source code of webkit/Source/WebCore/html/parser/HTMLTokenizer.h