1/*
2 * Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#pragma once
28
29#include "HTMLParserOptions.h"
30#include "HTMLToken.h"
31#include "InputStreamPreprocessor.h"
32
33namespace WebCore {
34
35class SegmentedString;
36
37class HTMLTokenizer {
38public:
39 explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions());
40
41 // If we can't parse a whole token, this returns null.
42 class TokenPtr;
43 TokenPtr nextToken(SegmentedString&);
44
45 // Used by HTMLSourceTracker.
46 void setTokenAttributeBaseOffset(unsigned);
47
48 // Returns a copy of any characters buffered internally by the tokenizer.
49 // The tokenizer buffers characters when searching for the </script> token that terminates a script element.
50 String bufferedCharacters() const;
51 size_t numberOfBufferedCharacters() const;
52
53 // Updates the tokenizer's state according to the given tag name. This is an approximation of how the tree
54 // builder would update the tokenizer's state. This method is useful for approximating HTML tokenization.
55 // To get exactly the correct tokenization, you need the real tree builder.
56 //
57 // The main failures in the approximation are as follows:
58 //
59 // * The first set of character tokens emitted for a <pre> element might contain an extra leading newline.
60 // * The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode.
61 // * CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens.
62 //
63 // This approximation is also the algorithm called for when parsing an HTML fragment.
64 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
65 void updateStateFor(const AtomicString& tagName);
66
67 void setForceNullCharacterReplacement(bool);
68
69 bool shouldAllowCDATA() const;
70 void setShouldAllowCDATA(bool);
71
72 bool isInDataState() const;
73
74 void setDataState();
75 void setPLAINTEXTState();
76 void setRAWTEXTState();
77 void setRCDATAState();
78 void setScriptDataState();
79
80 bool neverSkipNullCharacters() const;
81
82private:
83 enum State {
84 DataState,
85 CharacterReferenceInDataState,
86 RCDATAState,
87 CharacterReferenceInRCDATAState,
88 RAWTEXTState,
89 ScriptDataState,
90 PLAINTEXTState,
91 TagOpenState,
92 EndTagOpenState,
93 TagNameState,
94 RCDATALessThanSignState,
95 RCDATAEndTagOpenState,
96 RCDATAEndTagNameState,
97 RAWTEXTLessThanSignState,
98 RAWTEXTEndTagOpenState,
99 RAWTEXTEndTagNameState,
100 ScriptDataLessThanSignState,
101 ScriptDataEndTagOpenState,
102 ScriptDataEndTagNameState,
103 ScriptDataEscapeStartState,
104 ScriptDataEscapeStartDashState,
105 ScriptDataEscapedState,
106 ScriptDataEscapedDashState,
107 ScriptDataEscapedDashDashState,
108 ScriptDataEscapedLessThanSignState,
109 ScriptDataEscapedEndTagOpenState,
110 ScriptDataEscapedEndTagNameState,
111 ScriptDataDoubleEscapeStartState,
112 ScriptDataDoubleEscapedState,
113 ScriptDataDoubleEscapedDashState,
114 ScriptDataDoubleEscapedDashDashState,
115 ScriptDataDoubleEscapedLessThanSignState,
116 ScriptDataDoubleEscapeEndState,
117 BeforeAttributeNameState,
118 AttributeNameState,
119 AfterAttributeNameState,
120 BeforeAttributeValueState,
121 AttributeValueDoubleQuotedState,
122 AttributeValueSingleQuotedState,
123 AttributeValueUnquotedState,
124 CharacterReferenceInAttributeValueState,
125 AfterAttributeValueQuotedState,
126 SelfClosingStartTagState,
127 BogusCommentState,
128 ContinueBogusCommentState, // Not in the HTML spec, used internally to track whether we started the bogus comment token.
129 MarkupDeclarationOpenState,
130 CommentStartState,
131 CommentStartDashState,
132 CommentState,
133 CommentEndDashState,
134 CommentEndState,
135 CommentEndBangState,
136 DOCTYPEState,
137 BeforeDOCTYPENameState,
138 DOCTYPENameState,
139 AfterDOCTYPENameState,
140 AfterDOCTYPEPublicKeywordState,
141 BeforeDOCTYPEPublicIdentifierState,
142 DOCTYPEPublicIdentifierDoubleQuotedState,
143 DOCTYPEPublicIdentifierSingleQuotedState,
144 AfterDOCTYPEPublicIdentifierState,
145 BetweenDOCTYPEPublicAndSystemIdentifiersState,
146 AfterDOCTYPESystemKeywordState,
147 BeforeDOCTYPESystemIdentifierState,
148 DOCTYPESystemIdentifierDoubleQuotedState,
149 DOCTYPESystemIdentifierSingleQuotedState,
150 AfterDOCTYPESystemIdentifierState,
151 BogusDOCTYPEState,
152 CDATASectionState,
153 // These CDATA states are not in the HTML5 spec, but we use them internally.
154 CDATASectionRightSquareBracketState,
155 CDATASectionDoubleRightSquareBracketState,
156 };
157
158 bool processToken(SegmentedString&);
159 bool processEntity(SegmentedString&);
160
161 void parseError();
162
163 void bufferASCIICharacter(UChar);
164 void bufferCharacter(UChar);
165
166 bool emitAndResumeInDataState(SegmentedString&);
167 bool emitAndReconsumeInDataState();
168 bool emitEndOfFile(SegmentedString&);
169
170 // Return true if we wil emit a character token before dealing with the buffered end tag.
171 void flushBufferedEndTag();
172 bool commitToPartialEndTag(SegmentedString&, UChar, State);
173 bool commitToCompleteEndTag(SegmentedString&);
174
175 void appendToTemporaryBuffer(UChar);
176 bool temporaryBufferIs(const char*);
177
178 // Sometimes we speculatively consume input characters and we don't know whether they represent
179 // end tags or RCDATA, etc. These functions help manage these state.
180 bool inEndTagBufferingState() const;
181 void appendToPossibleEndTag(UChar);
182 void saveEndTagNameIfNeeded();
183 bool isAppropriateEndTag() const;
184
185 bool haveBufferedCharacterToken() const;
186
187 static bool isNullCharacterSkippingState(State);
188
189 State m_state { DataState };
190 bool m_forceNullCharacterReplacement { false };
191 bool m_shouldAllowCDATA { false };
192
193 mutable HTMLToken m_token;
194
195 // https://html.spec.whatwg.org/#additional-allowed-character
196 UChar m_additionalAllowedCharacter { 0 };
197
198 // https://html.spec.whatwg.org/#preprocessing-the-input-stream
199 InputStreamPreprocessor<HTMLTokenizer> m_preprocessor;
200
201 Vector<UChar, 32> m_appropriateEndTagName;
202
203 // https://html.spec.whatwg.org/#temporary-buffer
204 Vector<LChar, 32> m_temporaryBuffer;
205
206 // We occasionally want to emit both a character token and an end tag
207 // token (e.g., when lexing script). We buffer the name of the end tag
208 // token here so we remember it next time we re-enter the tokenizer.
209 Vector<LChar, 32> m_bufferedEndTagName;
210
211 const HTMLParserOptions m_options;
212};
213
214class HTMLTokenizer::TokenPtr {
215public:
216 TokenPtr();
217 ~TokenPtr();
218
219 TokenPtr(TokenPtr&&);
220 TokenPtr& operator=(TokenPtr&&) = delete;
221
222 void clear();
223
224 operator bool() const;
225
226 HTMLToken& operator*() const;
227 HTMLToken* operator->() const;
228
229private:
230 friend class HTMLTokenizer;
231 explicit TokenPtr(HTMLToken*);
232
233 HTMLToken* m_token { nullptr };
234};
235
236inline HTMLTokenizer::TokenPtr::TokenPtr()
237{
238}
239
240inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
241 : m_token(token)
242{
243}
244
245inline HTMLTokenizer::TokenPtr::~TokenPtr()
246{
247 if (m_token)
248 m_token->clear();
249}
250
251inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
252 : m_token(other.m_token)
253{
254 other.m_token = nullptr;
255}
256
257inline void HTMLTokenizer::TokenPtr::clear()
258{
259 if (m_token) {
260 m_token->clear();
261 m_token = nullptr;
262 }
263}
264
265inline HTMLTokenizer::TokenPtr::operator bool() const
266{
267 return m_token;
268}
269
270inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const
271{
272 ASSERT(m_token);
273 return *m_token;
274}
275
276inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
277{
278 ASSERT(m_token);
279 return m_token;
280}
281
282inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
283{
284 return TokenPtr(processToken(source) ? &m_token : nullptr);
285}
286
287inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
288{
289 m_token.setAttributeBaseOffset(offset);
290}
291
292inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
293{
294 // Notice that we add 2 to the length of the m_temporaryBuffer to
295 // account for the "</" characters, which are effecitvely buffered in
296 // the tokenizer's state machine.
297 return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
298}
299
300inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value)
301{
302 m_forceNullCharacterReplacement = value;
303}
304
305inline bool HTMLTokenizer::shouldAllowCDATA() const
306{
307 return m_shouldAllowCDATA;
308}
309
310inline void HTMLTokenizer::setShouldAllowCDATA(bool value)
311{
312 m_shouldAllowCDATA = value;
313}
314
315inline bool HTMLTokenizer::isInDataState() const
316{
317 return m_state == DataState;
318}
319
320inline void HTMLTokenizer::setDataState()
321{
322 m_state = DataState;
323}
324
325inline void HTMLTokenizer::setPLAINTEXTState()
326{
327 m_state = PLAINTEXTState;
328}
329
330inline void HTMLTokenizer::setRAWTEXTState()
331{
332 m_state = RAWTEXTState;
333}
334
335inline void HTMLTokenizer::setRCDATAState()
336{
337 m_state = RCDATAState;
338}
339
340inline void HTMLTokenizer::setScriptDataState()
341{
342 m_state = ScriptDataState;
343}
344
345inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
346{
347 return state == DataState || state == RCDATAState || state == RAWTEXTState;
348}
349
350inline bool HTMLTokenizer::neverSkipNullCharacters() const
351{
352 return m_forceNullCharacterReplacement;
353}
354
355} // namespace WebCore
356