1 | /* |
2 | * Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved. |
3 | * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #pragma once |
28 | |
29 | #include "HTMLParserOptions.h" |
30 | #include "HTMLToken.h" |
31 | #include "InputStreamPreprocessor.h" |
32 | |
33 | namespace WebCore { |
34 | |
35 | class SegmentedString; |
36 | |
37 | class HTMLTokenizer { |
38 | public: |
39 | explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions()); |
40 | |
41 | // If we can't parse a whole token, this returns null. |
42 | class TokenPtr; |
43 | TokenPtr nextToken(SegmentedString&); |
44 | |
45 | // Used by HTMLSourceTracker. |
46 | void setTokenAttributeBaseOffset(unsigned); |
47 | |
48 | // Returns a copy of any characters buffered internally by the tokenizer. |
49 | // The tokenizer buffers characters when searching for the </script> token that terminates a script element. |
50 | String bufferedCharacters() const; |
51 | size_t numberOfBufferedCharacters() const; |
52 | |
53 | // Updates the tokenizer's state according to the given tag name. This is an approximation of how the tree |
54 | // builder would update the tokenizer's state. This method is useful for approximating HTML tokenization. |
55 | // To get exactly the correct tokenization, you need the real tree builder. |
56 | // |
57 | // The main failures in the approximation are as follows: |
58 | // |
59 | // * The first set of character tokens emitted for a <pre> element might contain an extra leading newline. |
60 | // * The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode. |
61 | // * CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens. |
62 | // |
63 | // This approximation is also the algorithm called for when parsing an HTML fragment. |
64 | // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments |
65 | void updateStateFor(const AtomicString& tagName); |
66 | |
67 | void setForceNullCharacterReplacement(bool); |
68 | |
69 | bool shouldAllowCDATA() const; |
70 | void setShouldAllowCDATA(bool); |
71 | |
72 | bool isInDataState() const; |
73 | |
74 | void setDataState(); |
75 | void setPLAINTEXTState(); |
76 | void setRAWTEXTState(); |
77 | void setRCDATAState(); |
78 | void setScriptDataState(); |
79 | |
80 | bool neverSkipNullCharacters() const; |
81 | |
82 | private: |
83 | enum State { |
84 | DataState, |
85 | CharacterReferenceInDataState, |
86 | RCDATAState, |
87 | CharacterReferenceInRCDATAState, |
88 | RAWTEXTState, |
89 | ScriptDataState, |
90 | PLAINTEXTState, |
91 | TagOpenState, |
92 | EndTagOpenState, |
93 | TagNameState, |
94 | RCDATALessThanSignState, |
95 | RCDATAEndTagOpenState, |
96 | RCDATAEndTagNameState, |
97 | RAWTEXTLessThanSignState, |
98 | RAWTEXTEndTagOpenState, |
99 | RAWTEXTEndTagNameState, |
100 | ScriptDataLessThanSignState, |
101 | ScriptDataEndTagOpenState, |
102 | ScriptDataEndTagNameState, |
103 | ScriptDataEscapeStartState, |
104 | ScriptDataEscapeStartDashState, |
105 | ScriptDataEscapedState, |
106 | ScriptDataEscapedDashState, |
107 | ScriptDataEscapedDashDashState, |
108 | ScriptDataEscapedLessThanSignState, |
109 | ScriptDataEscapedEndTagOpenState, |
110 | ScriptDataEscapedEndTagNameState, |
111 | ScriptDataDoubleEscapeStartState, |
112 | ScriptDataDoubleEscapedState, |
113 | ScriptDataDoubleEscapedDashState, |
114 | ScriptDataDoubleEscapedDashDashState, |
115 | ScriptDataDoubleEscapedLessThanSignState, |
116 | ScriptDataDoubleEscapeEndState, |
117 | BeforeAttributeNameState, |
118 | AttributeNameState, |
119 | AfterAttributeNameState, |
120 | BeforeAttributeValueState, |
121 | AttributeValueDoubleQuotedState, |
122 | AttributeValueSingleQuotedState, |
123 | AttributeValueUnquotedState, |
124 | CharacterReferenceInAttributeValueState, |
125 | AfterAttributeValueQuotedState, |
126 | SelfClosingStartTagState, |
127 | , |
128 | , // Not in the HTML spec, used internally to track whether we started the bogus comment token. |
129 | MarkupDeclarationOpenState, |
130 | , |
131 | , |
132 | , |
133 | , |
134 | , |
135 | , |
136 | DOCTYPEState, |
137 | BeforeDOCTYPENameState, |
138 | DOCTYPENameState, |
139 | AfterDOCTYPENameState, |
140 | AfterDOCTYPEPublicKeywordState, |
141 | BeforeDOCTYPEPublicIdentifierState, |
142 | DOCTYPEPublicIdentifierDoubleQuotedState, |
143 | DOCTYPEPublicIdentifierSingleQuotedState, |
144 | AfterDOCTYPEPublicIdentifierState, |
145 | BetweenDOCTYPEPublicAndSystemIdentifiersState, |
146 | AfterDOCTYPESystemKeywordState, |
147 | BeforeDOCTYPESystemIdentifierState, |
148 | DOCTYPESystemIdentifierDoubleQuotedState, |
149 | DOCTYPESystemIdentifierSingleQuotedState, |
150 | AfterDOCTYPESystemIdentifierState, |
151 | BogusDOCTYPEState, |
152 | CDATASectionState, |
153 | // These CDATA states are not in the HTML5 spec, but we use them internally. |
154 | CDATASectionRightSquareBracketState, |
155 | CDATASectionDoubleRightSquareBracketState, |
156 | }; |
157 | |
158 | bool processToken(SegmentedString&); |
159 | bool processEntity(SegmentedString&); |
160 | |
161 | void parseError(); |
162 | |
163 | void bufferASCIICharacter(UChar); |
164 | void bufferCharacter(UChar); |
165 | |
166 | bool emitAndResumeInDataState(SegmentedString&); |
167 | bool emitAndReconsumeInDataState(); |
168 | bool emitEndOfFile(SegmentedString&); |
169 | |
170 | // Return true if we wil emit a character token before dealing with the buffered end tag. |
171 | void flushBufferedEndTag(); |
172 | bool commitToPartialEndTag(SegmentedString&, UChar, State); |
173 | bool commitToCompleteEndTag(SegmentedString&); |
174 | |
175 | void appendToTemporaryBuffer(UChar); |
176 | bool temporaryBufferIs(const char*); |
177 | |
178 | // Sometimes we speculatively consume input characters and we don't know whether they represent |
179 | // end tags or RCDATA, etc. These functions help manage these state. |
180 | bool inEndTagBufferingState() const; |
181 | void appendToPossibleEndTag(UChar); |
182 | void saveEndTagNameIfNeeded(); |
183 | bool isAppropriateEndTag() const; |
184 | |
185 | bool haveBufferedCharacterToken() const; |
186 | |
187 | static bool isNullCharacterSkippingState(State); |
188 | |
189 | State m_state { DataState }; |
190 | bool m_forceNullCharacterReplacement { false }; |
191 | bool m_shouldAllowCDATA { false }; |
192 | |
193 | mutable HTMLToken m_token; |
194 | |
195 | // https://html.spec.whatwg.org/#additional-allowed-character |
196 | UChar m_additionalAllowedCharacter { 0 }; |
197 | |
198 | // https://html.spec.whatwg.org/#preprocessing-the-input-stream |
199 | InputStreamPreprocessor<HTMLTokenizer> m_preprocessor; |
200 | |
201 | Vector<UChar, 32> m_appropriateEndTagName; |
202 | |
203 | // https://html.spec.whatwg.org/#temporary-buffer |
204 | Vector<LChar, 32> m_temporaryBuffer; |
205 | |
206 | // We occasionally want to emit both a character token and an end tag |
207 | // token (e.g., when lexing script). We buffer the name of the end tag |
208 | // token here so we remember it next time we re-enter the tokenizer. |
209 | Vector<LChar, 32> m_bufferedEndTagName; |
210 | |
211 | const HTMLParserOptions m_options; |
212 | }; |
213 | |
214 | class HTMLTokenizer::TokenPtr { |
215 | public: |
216 | TokenPtr(); |
217 | ~TokenPtr(); |
218 | |
219 | TokenPtr(TokenPtr&&); |
220 | TokenPtr& operator=(TokenPtr&&) = delete; |
221 | |
222 | void clear(); |
223 | |
224 | operator bool() const; |
225 | |
226 | HTMLToken& operator*() const; |
227 | HTMLToken* operator->() const; |
228 | |
229 | private: |
230 | friend class HTMLTokenizer; |
231 | explicit TokenPtr(HTMLToken*); |
232 | |
233 | HTMLToken* m_token { nullptr }; |
234 | }; |
235 | |
236 | inline HTMLTokenizer::TokenPtr::TokenPtr() |
237 | { |
238 | } |
239 | |
240 | inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token) |
241 | : m_token(token) |
242 | { |
243 | } |
244 | |
245 | inline HTMLTokenizer::TokenPtr::~TokenPtr() |
246 | { |
247 | if (m_token) |
248 | m_token->clear(); |
249 | } |
250 | |
251 | inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other) |
252 | : m_token(other.m_token) |
253 | { |
254 | other.m_token = nullptr; |
255 | } |
256 | |
257 | inline void HTMLTokenizer::TokenPtr::clear() |
258 | { |
259 | if (m_token) { |
260 | m_token->clear(); |
261 | m_token = nullptr; |
262 | } |
263 | } |
264 | |
265 | inline HTMLTokenizer::TokenPtr::operator bool() const |
266 | { |
267 | return m_token; |
268 | } |
269 | |
270 | inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const |
271 | { |
272 | ASSERT(m_token); |
273 | return *m_token; |
274 | } |
275 | |
276 | inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const |
277 | { |
278 | ASSERT(m_token); |
279 | return m_token; |
280 | } |
281 | |
282 | inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source) |
283 | { |
284 | return TokenPtr(processToken(source) ? &m_token : nullptr); |
285 | } |
286 | |
287 | inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset) |
288 | { |
289 | m_token.setAttributeBaseOffset(offset); |
290 | } |
291 | |
292 | inline size_t HTMLTokenizer::numberOfBufferedCharacters() const |
293 | { |
294 | // Notice that we add 2 to the length of the m_temporaryBuffer to |
295 | // account for the "</" characters, which are effecitvely buffered in |
296 | // the tokenizer's state machine. |
297 | return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0; |
298 | } |
299 | |
300 | inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value) |
301 | { |
302 | m_forceNullCharacterReplacement = value; |
303 | } |
304 | |
305 | inline bool HTMLTokenizer::shouldAllowCDATA() const |
306 | { |
307 | return m_shouldAllowCDATA; |
308 | } |
309 | |
310 | inline void HTMLTokenizer::setShouldAllowCDATA(bool value) |
311 | { |
312 | m_shouldAllowCDATA = value; |
313 | } |
314 | |
315 | inline bool HTMLTokenizer::isInDataState() const |
316 | { |
317 | return m_state == DataState; |
318 | } |
319 | |
320 | inline void HTMLTokenizer::setDataState() |
321 | { |
322 | m_state = DataState; |
323 | } |
324 | |
325 | inline void HTMLTokenizer::setPLAINTEXTState() |
326 | { |
327 | m_state = PLAINTEXTState; |
328 | } |
329 | |
330 | inline void HTMLTokenizer::setRAWTEXTState() |
331 | { |
332 | m_state = RAWTEXTState; |
333 | } |
334 | |
335 | inline void HTMLTokenizer::setRCDATAState() |
336 | { |
337 | m_state = RCDATAState; |
338 | } |
339 | |
340 | inline void HTMLTokenizer::setScriptDataState() |
341 | { |
342 | m_state = ScriptDataState; |
343 | } |
344 | |
345 | inline bool HTMLTokenizer::isNullCharacterSkippingState(State state) |
346 | { |
347 | return state == DataState || state == RCDATAState || state == RAWTEXTState; |
348 | } |
349 | |
350 | inline bool HTMLTokenizer::neverSkipNullCharacters() const |
351 | { |
352 | return m_forceNullCharacterReplacement; |
353 | } |
354 | |
355 | } // namespace WebCore |
356 | |