1/*
2 * Copyright (C) 2008-2016 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLTokenizer.h"
30
31#include "HTMLEntityParser.h"
32#include "HTMLNames.h"
33#include "MarkupTokenizerInlines.h"
34#include <wtf/text/StringBuilder.h>
35
36
37namespace WebCore {
38
39using namespace HTMLNames;
40
41static inline LChar convertASCIIAlphaToLower(UChar character)
42{
43 ASSERT(isASCIIAlpha(character));
44 return toASCIILowerUnchecked(character);
45}
46
47static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const char* string)
48{
49 unsigned size = vector.size();
50 for (unsigned i = 0; i < size; ++i) {
51 if (!string[i] || vector[i] != string[i])
52 return false;
53 }
54 return !string[size];
55}
56
57inline bool HTMLTokenizer::inEndTagBufferingState() const
58{
59 switch (m_state) {
60 case RCDATAEndTagOpenState:
61 case RCDATAEndTagNameState:
62 case RAWTEXTEndTagOpenState:
63 case RAWTEXTEndTagNameState:
64 case ScriptDataEndTagOpenState:
65 case ScriptDataEndTagNameState:
66 case ScriptDataEscapedEndTagOpenState:
67 case ScriptDataEscapedEndTagNameState:
68 return true;
69 default:
70 return false;
71 }
72}
73
74HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
75 : m_preprocessor(*this)
76 , m_options(options)
77{
78}
79
80inline void HTMLTokenizer::bufferASCIICharacter(UChar character)
81{
82 ASSERT(character != kEndOfFileMarker);
83 ASSERT(isASCII(character));
84 LChar narrowedCharacter = character;
85 m_token.appendToCharacter(narrowedCharacter);
86}
87
88inline void HTMLTokenizer::bufferCharacter(UChar character)
89{
90 ASSERT(character != kEndOfFileMarker);
91 m_token.appendToCharacter(character);
92}
93
94inline bool HTMLTokenizer::emitAndResumeInDataState(SegmentedString& source)
95{
96 saveEndTagNameIfNeeded();
97 m_state = DataState;
98 source.advancePastNonNewline();
99 return true;
100}
101
102inline bool HTMLTokenizer::emitAndReconsumeInDataState()
103{
104 saveEndTagNameIfNeeded();
105 m_state = DataState;
106 return true;
107}
108
109inline bool HTMLTokenizer::emitEndOfFile(SegmentedString& source)
110{
111 m_state = DataState;
112 if (haveBufferedCharacterToken())
113 return true;
114 source.advance();
115 m_token.clear();
116 m_token.makeEndOfFile();
117 return true;
118}
119
120inline void HTMLTokenizer::saveEndTagNameIfNeeded()
121{
122 ASSERT(m_token.type() != HTMLToken::Uninitialized);
123 if (m_token.type() == HTMLToken::StartTag)
124 m_appropriateEndTagName = m_token.name();
125}
126
127inline bool HTMLTokenizer::haveBufferedCharacterToken() const
128{
129 return m_token.type() == HTMLToken::Character;
130}
131
132inline bool HTMLTokenizer::processEntity(SegmentedString& source)
133{
134 bool notEnoughCharacters = false;
135 StringBuilder decodedEntity;
136 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
137 if (notEnoughCharacters)
138 return false;
139 if (!success) {
140 ASSERT(decodedEntity.isEmpty());
141 bufferASCIICharacter('&');
142 } else {
143 for (unsigned i = 0; i < decodedEntity.length(); ++i)
144 bufferCharacter(decodedEntity[i]);
145 }
146 return true;
147}
148
149void HTMLTokenizer::flushBufferedEndTag()
150{
151 m_token.beginEndTag(m_bufferedEndTagName);
152 m_bufferedEndTagName.clear();
153 m_appropriateEndTagName.clear();
154 m_temporaryBuffer.clear();
155}
156
157bool HTMLTokenizer::commitToPartialEndTag(SegmentedString& source, UChar character, State state)
158{
159 ASSERT(source.currentCharacter() == character);
160 appendToTemporaryBuffer(character);
161 source.advance();
162
163 if (haveBufferedCharacterToken()) {
164 // Emit the buffered character token.
165 // The next call to processToken will flush the buffered end tag and continue parsing it.
166 m_state = state;
167 return true;
168 }
169
170 flushBufferedEndTag();
171 return false;
172}
173
174bool HTMLTokenizer::commitToCompleteEndTag(SegmentedString& source)
175{
176 ASSERT(source.currentCharacter() == '>');
177 appendToTemporaryBuffer('>');
178 source.advancePastNonNewline();
179
180 m_state = DataState;
181
182 if (haveBufferedCharacterToken()) {
183 // Emit the character token we already have.
184 // The next call to processToken will flush the buffered end tag and emit it.
185 return true;
186 }
187
188 flushBufferedEndTag();
189 return true;
190}
191
192bool HTMLTokenizer::processToken(SegmentedString& source)
193{
194 if (!m_bufferedEndTagName.isEmpty() && !inEndTagBufferingState()) {
195 // We are back here after emitting a character token that came just before an end tag.
196 // To continue parsing the end tag we need to move the buffered tag name into the token.
197 flushBufferedEndTag();
198
199 // If we are in the data state, the end tag is already complete and we should emit it
200 // now, otherwise, we want to resume parsing the partial end tag.
201 if (m_state == DataState)
202 return true;
203 }
204
205 if (!m_preprocessor.peek(source, isNullCharacterSkippingState(m_state)))
206 return haveBufferedCharacterToken();
207 UChar character = m_preprocessor.nextInputCharacter();
208
209 // https://html.spec.whatwg.org/#tokenization
210 switch (m_state) {
211
212 BEGIN_STATE(DataState)
213 if (character == '&')
214 ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInDataState);
215 if (character == '<') {
216 if (haveBufferedCharacterToken())
217 RETURN_IN_CURRENT_STATE(true);
218 ADVANCE_PAST_NON_NEWLINE_TO(TagOpenState);
219 }
220 if (character == kEndOfFileMarker)
221 return emitEndOfFile(source);
222 bufferCharacter(character);
223 ADVANCE_TO(DataState);
224 END_STATE()
225
226 BEGIN_STATE(CharacterReferenceInDataState)
227 if (!processEntity(source))
228 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
229 SWITCH_TO(DataState);
230 END_STATE()
231
232 BEGIN_STATE(RCDATAState)
233 if (character == '&')
234 ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInRCDATAState);
235 if (character == '<')
236 ADVANCE_PAST_NON_NEWLINE_TO(RCDATALessThanSignState);
237 if (character == kEndOfFileMarker)
238 RECONSUME_IN(DataState);
239 bufferCharacter(character);
240 ADVANCE_TO(RCDATAState);
241 END_STATE()
242
243 BEGIN_STATE(CharacterReferenceInRCDATAState)
244 if (!processEntity(source))
245 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
246 SWITCH_TO(RCDATAState);
247 END_STATE()
248
249 BEGIN_STATE(RAWTEXTState)
250 if (character == '<')
251 ADVANCE_PAST_NON_NEWLINE_TO(RAWTEXTLessThanSignState);
252 if (character == kEndOfFileMarker)
253 RECONSUME_IN(DataState);
254 bufferCharacter(character);
255 ADVANCE_TO(RAWTEXTState);
256 END_STATE()
257
258 BEGIN_STATE(ScriptDataState)
259 if (character == '<')
260 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataLessThanSignState);
261 if (character == kEndOfFileMarker)
262 RECONSUME_IN(DataState);
263 bufferCharacter(character);
264 ADVANCE_TO(ScriptDataState);
265 END_STATE()
266
267 BEGIN_STATE(PLAINTEXTState)
268 if (character == kEndOfFileMarker)
269 RECONSUME_IN(DataState);
270 bufferCharacter(character);
271 ADVANCE_TO(PLAINTEXTState);
272 END_STATE()
273
274 BEGIN_STATE(TagOpenState)
275 if (character == '!')
276 ADVANCE_PAST_NON_NEWLINE_TO(MarkupDeclarationOpenState);
277 if (character == '/')
278 ADVANCE_PAST_NON_NEWLINE_TO(EndTagOpenState);
279 if (isASCIIAlpha(character)) {
280 m_token.beginStartTag(convertASCIIAlphaToLower(character));
281 ADVANCE_PAST_NON_NEWLINE_TO(TagNameState);
282 }
283 if (character == '?') {
284 parseError();
285 // The spec consumes the current character before switching
286 // to the bogus comment state, but it's easier to implement
287 // if we reconsume the current character.
288 RECONSUME_IN(BogusCommentState);
289 }
290 parseError();
291 bufferASCIICharacter('<');
292 RECONSUME_IN(DataState);
293 END_STATE()
294
295 BEGIN_STATE(EndTagOpenState)
296 if (isASCIIAlpha(character)) {
297 m_token.beginEndTag(convertASCIIAlphaToLower(character));
298 m_appropriateEndTagName.clear();
299 ADVANCE_PAST_NON_NEWLINE_TO(TagNameState);
300 }
301 if (character == '>') {
302 parseError();
303 ADVANCE_PAST_NON_NEWLINE_TO(DataState);
304 }
305 if (character == kEndOfFileMarker) {
306 parseError();
307 bufferASCIICharacter('<');
308 bufferASCIICharacter('/');
309 RECONSUME_IN(DataState);
310 }
311 parseError();
312 RECONSUME_IN(BogusCommentState);
313 END_STATE()
314
315 BEGIN_STATE(TagNameState)
316 if (isTokenizerWhitespace(character))
317 ADVANCE_TO(BeforeAttributeNameState);
318 if (character == '/')
319 ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);
320 if (character == '>')
321 return emitAndResumeInDataState(source);
322 if (m_options.usePreHTML5ParserQuirks && character == '<')
323 return emitAndReconsumeInDataState();
324 if (character == kEndOfFileMarker) {
325 parseError();
326 RECONSUME_IN(DataState);
327 }
328 m_token.appendToName(toASCIILower(character));
329 ADVANCE_PAST_NON_NEWLINE_TO(TagNameState);
330 END_STATE()
331
332 BEGIN_STATE(RCDATALessThanSignState)
333 if (character == '/') {
334 m_temporaryBuffer.clear();
335 ASSERT(m_bufferedEndTagName.isEmpty());
336 ADVANCE_PAST_NON_NEWLINE_TO(RCDATAEndTagOpenState);
337 }
338 bufferASCIICharacter('<');
339 RECONSUME_IN(RCDATAState);
340 END_STATE()
341
342 BEGIN_STATE(RCDATAEndTagOpenState)
343 if (isASCIIAlpha(character)) {
344 appendToTemporaryBuffer(character);
345 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
346 ADVANCE_PAST_NON_NEWLINE_TO(RCDATAEndTagNameState);
347 }
348 bufferASCIICharacter('<');
349 bufferASCIICharacter('/');
350 RECONSUME_IN(RCDATAState);
351 END_STATE()
352
353 BEGIN_STATE(RCDATAEndTagNameState)
354 if (isASCIIAlpha(character)) {
355 appendToTemporaryBuffer(character);
356 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
357 ADVANCE_PAST_NON_NEWLINE_TO(RCDATAEndTagNameState);
358 }
359 if (isTokenizerWhitespace(character)) {
360 if (isAppropriateEndTag()) {
361 if (commitToPartialEndTag(source, character, BeforeAttributeNameState))
362 return true;
363 SWITCH_TO(BeforeAttributeNameState);
364 }
365 } else if (character == '/') {
366 if (isAppropriateEndTag()) {
367 if (commitToPartialEndTag(source, '/', SelfClosingStartTagState))
368 return true;
369 SWITCH_TO(SelfClosingStartTagState);
370 }
371 } else if (character == '>') {
372 if (isAppropriateEndTag())
373 return commitToCompleteEndTag(source);
374 }
375 bufferASCIICharacter('<');
376 bufferASCIICharacter('/');
377 m_token.appendToCharacter(m_temporaryBuffer);
378 m_bufferedEndTagName.clear();
379 m_temporaryBuffer.clear();
380 RECONSUME_IN(RCDATAState);
381 END_STATE()
382
383 BEGIN_STATE(RAWTEXTLessThanSignState)
384 if (character == '/') {
385 m_temporaryBuffer.clear();
386 ASSERT(m_bufferedEndTagName.isEmpty());
387 ADVANCE_PAST_NON_NEWLINE_TO(RAWTEXTEndTagOpenState);
388 }
389 bufferASCIICharacter('<');
390 RECONSUME_IN(RAWTEXTState);
391 END_STATE()
392
393 BEGIN_STATE(RAWTEXTEndTagOpenState)
394 if (isASCIIAlpha(character)) {
395 appendToTemporaryBuffer(character);
396 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
397 ADVANCE_PAST_NON_NEWLINE_TO(RAWTEXTEndTagNameState);
398 }
399 bufferASCIICharacter('<');
400 bufferASCIICharacter('/');
401 RECONSUME_IN(RAWTEXTState);
402 END_STATE()
403
404 BEGIN_STATE(RAWTEXTEndTagNameState)
405 if (isASCIIAlpha(character)) {
406 appendToTemporaryBuffer(character);
407 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
408 ADVANCE_PAST_NON_NEWLINE_TO(RAWTEXTEndTagNameState);
409 }
410 if (isTokenizerWhitespace(character)) {
411 if (isAppropriateEndTag()) {
412 if (commitToPartialEndTag(source, character, BeforeAttributeNameState))
413 return true;
414 SWITCH_TO(BeforeAttributeNameState);
415 }
416 } else if (character == '/') {
417 if (isAppropriateEndTag()) {
418 if (commitToPartialEndTag(source, '/', SelfClosingStartTagState))
419 return true;
420 SWITCH_TO(SelfClosingStartTagState);
421 }
422 } else if (character == '>') {
423 if (isAppropriateEndTag())
424 return commitToCompleteEndTag(source);
425 }
426 bufferASCIICharacter('<');
427 bufferASCIICharacter('/');
428 m_token.appendToCharacter(m_temporaryBuffer);
429 m_bufferedEndTagName.clear();
430 m_temporaryBuffer.clear();
431 RECONSUME_IN(RAWTEXTState);
432 END_STATE()
433
434 BEGIN_STATE(ScriptDataLessThanSignState)
435 if (character == '/') {
436 m_temporaryBuffer.clear();
437 ASSERT(m_bufferedEndTagName.isEmpty());
438 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEndTagOpenState);
439 }
440 if (character == '!') {
441 bufferASCIICharacter('<');
442 bufferASCIICharacter('!');
443 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapeStartState);
444 }
445 bufferASCIICharacter('<');
446 RECONSUME_IN(ScriptDataState);
447 END_STATE()
448
449 BEGIN_STATE(ScriptDataEndTagOpenState)
450 if (isASCIIAlpha(character)) {
451 appendToTemporaryBuffer(character);
452 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
453 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEndTagNameState);
454 }
455 bufferASCIICharacter('<');
456 bufferASCIICharacter('/');
457 RECONSUME_IN(ScriptDataState);
458 END_STATE()
459
460 BEGIN_STATE(ScriptDataEndTagNameState)
461 if (isASCIIAlpha(character)) {
462 appendToTemporaryBuffer(character);
463 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
464 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEndTagNameState);
465 }
466 if (isTokenizerWhitespace(character)) {
467 if (isAppropriateEndTag()) {
468 if (commitToPartialEndTag(source, character, BeforeAttributeNameState))
469 return true;
470 SWITCH_TO(BeforeAttributeNameState);
471 }
472 } else if (character == '/') {
473 if (isAppropriateEndTag()) {
474 if (commitToPartialEndTag(source, '/', SelfClosingStartTagState))
475 return true;
476 SWITCH_TO(SelfClosingStartTagState);
477 }
478 } else if (character == '>') {
479 if (isAppropriateEndTag())
480 return commitToCompleteEndTag(source);
481 }
482 bufferASCIICharacter('<');
483 bufferASCIICharacter('/');
484 m_token.appendToCharacter(m_temporaryBuffer);
485 m_bufferedEndTagName.clear();
486 m_temporaryBuffer.clear();
487 RECONSUME_IN(ScriptDataState);
488 END_STATE()
489
490 BEGIN_STATE(ScriptDataEscapeStartState)
491 if (character == '-') {
492 bufferASCIICharacter('-');
493 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapeStartDashState);
494 } else
495 RECONSUME_IN(ScriptDataState);
496 END_STATE()
497
498 BEGIN_STATE(ScriptDataEscapeStartDashState)
499 if (character == '-') {
500 bufferASCIICharacter('-');
501 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedDashDashState);
502 } else
503 RECONSUME_IN(ScriptDataState);
504 END_STATE()
505
506 BEGIN_STATE(ScriptDataEscapedState)
507 if (character == '-') {
508 bufferASCIICharacter('-');
509 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedDashState);
510 }
511 if (character == '<')
512 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedLessThanSignState);
513 if (character == kEndOfFileMarker) {
514 parseError();
515 RECONSUME_IN(DataState);
516 }
517 bufferCharacter(character);
518 ADVANCE_TO(ScriptDataEscapedState);
519 END_STATE()
520
521 BEGIN_STATE(ScriptDataEscapedDashState)
522 if (character == '-') {
523 bufferASCIICharacter('-');
524 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedDashDashState);
525 }
526 if (character == '<')
527 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedLessThanSignState);
528 if (character == kEndOfFileMarker) {
529 parseError();
530 RECONSUME_IN(DataState);
531 }
532 bufferCharacter(character);
533 ADVANCE_TO(ScriptDataEscapedState);
534 END_STATE()
535
536 BEGIN_STATE(ScriptDataEscapedDashDashState)
537 if (character == '-') {
538 bufferASCIICharacter('-');
539 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedDashDashState);
540 }
541 if (character == '<')
542 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedLessThanSignState);
543 if (character == '>') {
544 bufferASCIICharacter('>');
545 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataState);
546 }
547 if (character == kEndOfFileMarker) {
548 parseError();
549 RECONSUME_IN(DataState);
550 }
551 bufferCharacter(character);
552 ADVANCE_TO(ScriptDataEscapedState);
553 END_STATE()
554
555 BEGIN_STATE(ScriptDataEscapedLessThanSignState)
556 if (character == '/') {
557 m_temporaryBuffer.clear();
558 ASSERT(m_bufferedEndTagName.isEmpty());
559 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedEndTagOpenState);
560 }
561 if (isASCIIAlpha(character)) {
562 bufferASCIICharacter('<');
563 bufferASCIICharacter(character);
564 m_temporaryBuffer.clear();
565 appendToTemporaryBuffer(convertASCIIAlphaToLower(character));
566 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapeStartState);
567 }
568 bufferASCIICharacter('<');
569 RECONSUME_IN(ScriptDataEscapedState);
570 END_STATE()
571
572 BEGIN_STATE(ScriptDataEscapedEndTagOpenState)
573 if (isASCIIAlpha(character)) {
574 appendToTemporaryBuffer(character);
575 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
576 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedEndTagNameState);
577 }
578 bufferASCIICharacter('<');
579 bufferASCIICharacter('/');
580 RECONSUME_IN(ScriptDataEscapedState);
581 END_STATE()
582
583 BEGIN_STATE(ScriptDataEscapedEndTagNameState)
584 if (isASCIIAlpha(character)) {
585 appendToTemporaryBuffer(character);
586 appendToPossibleEndTag(convertASCIIAlphaToLower(character));
587 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataEscapedEndTagNameState);
588 }
589 if (isTokenizerWhitespace(character)) {
590 if (isAppropriateEndTag()) {
591 if (commitToPartialEndTag(source, character, BeforeAttributeNameState))
592 return true;
593 SWITCH_TO(BeforeAttributeNameState);
594 }
595 } else if (character == '/') {
596 if (isAppropriateEndTag()) {
597 if (commitToPartialEndTag(source, '/', SelfClosingStartTagState))
598 return true;
599 SWITCH_TO(SelfClosingStartTagState);
600 }
601 } else if (character == '>') {
602 if (isAppropriateEndTag())
603 return commitToCompleteEndTag(source);
604 }
605 bufferASCIICharacter('<');
606 bufferASCIICharacter('/');
607 m_token.appendToCharacter(m_temporaryBuffer);
608 m_bufferedEndTagName.clear();
609 m_temporaryBuffer.clear();
610 RECONSUME_IN(ScriptDataEscapedState);
611 END_STATE()
612
613 BEGIN_STATE(ScriptDataDoubleEscapeStartState)
614 if (isTokenizerWhitespace(character) || character == '/' || character == '>') {
615 bufferASCIICharacter(character);
616 if (temporaryBufferIs("script"))
617 ADVANCE_TO(ScriptDataDoubleEscapedState);
618 else
619 ADVANCE_TO(ScriptDataEscapedState);
620 }
621 if (isASCIIAlpha(character)) {
622 bufferASCIICharacter(character);
623 appendToTemporaryBuffer(convertASCIIAlphaToLower(character));
624 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapeStartState);
625 }
626 RECONSUME_IN(ScriptDataEscapedState);
627 END_STATE()
628
629 BEGIN_STATE(ScriptDataDoubleEscapedState)
630 if (character == '-') {
631 bufferASCIICharacter('-');
632 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapedDashState);
633 }
634 if (character == '<') {
635 bufferASCIICharacter('<');
636 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapedLessThanSignState);
637 }
638 if (character == kEndOfFileMarker) {
639 parseError();
640 RECONSUME_IN(DataState);
641 }
642 bufferCharacter(character);
643 ADVANCE_TO(ScriptDataDoubleEscapedState);
644 END_STATE()
645
646 BEGIN_STATE(ScriptDataDoubleEscapedDashState)
647 if (character == '-') {
648 bufferASCIICharacter('-');
649 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapedDashDashState);
650 }
651 if (character == '<') {
652 bufferASCIICharacter('<');
653 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapedLessThanSignState);
654 }
655 if (character == kEndOfFileMarker) {
656 parseError();
657 RECONSUME_IN(DataState);
658 }
659 bufferCharacter(character);
660 ADVANCE_TO(ScriptDataDoubleEscapedState);
661 END_STATE()
662
663 BEGIN_STATE(ScriptDataDoubleEscapedDashDashState)
664 if (character == '-') {
665 bufferASCIICharacter('-');
666 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapedDashDashState);
667 }
668 if (character == '<') {
669 bufferASCIICharacter('<');
670 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapedLessThanSignState);
671 }
672 if (character == '>') {
673 bufferASCIICharacter('>');
674 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataState);
675 }
676 if (character == kEndOfFileMarker) {
677 parseError();
678 RECONSUME_IN(DataState);
679 }
680 bufferCharacter(character);
681 ADVANCE_TO(ScriptDataDoubleEscapedState);
682 END_STATE()
683
684 BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState)
685 if (character == '/') {
686 bufferASCIICharacter('/');
687 m_temporaryBuffer.clear();
688 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapeEndState);
689 }
690 RECONSUME_IN(ScriptDataDoubleEscapedState);
691 END_STATE()
692
693 BEGIN_STATE(ScriptDataDoubleEscapeEndState)
694 if (isTokenizerWhitespace(character) || character == '/' || character == '>') {
695 bufferASCIICharacter(character);
696 if (temporaryBufferIs("script"))
697 ADVANCE_TO(ScriptDataEscapedState);
698 else
699 ADVANCE_TO(ScriptDataDoubleEscapedState);
700 }
701 if (isASCIIAlpha(character)) {
702 bufferASCIICharacter(character);
703 appendToTemporaryBuffer(convertASCIIAlphaToLower(character));
704 ADVANCE_PAST_NON_NEWLINE_TO(ScriptDataDoubleEscapeEndState);
705 }
706 RECONSUME_IN(ScriptDataDoubleEscapedState);
707 END_STATE()
708
709 BEGIN_STATE(BeforeAttributeNameState)
710 if (isTokenizerWhitespace(character))
711 ADVANCE_TO(BeforeAttributeNameState);
712 if (character == '/')
713 ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);
714 if (character == '>')
715 return emitAndResumeInDataState(source);
716 if (m_options.usePreHTML5ParserQuirks && character == '<')
717 return emitAndReconsumeInDataState();
718 if (character == kEndOfFileMarker) {
719 parseError();
720 RECONSUME_IN(DataState);
721 }
722 if (character == '"' || character == '\'' || character == '<' || character == '=')
723 parseError();
724 m_token.beginAttribute(source.numberOfCharactersConsumed());
725 m_token.appendToAttributeName(toASCIILower(character));
726 ADVANCE_PAST_NON_NEWLINE_TO(AttributeNameState);
727 END_STATE()
728
729 BEGIN_STATE(AttributeNameState)
730 if (isTokenizerWhitespace(character))
731 ADVANCE_TO(AfterAttributeNameState);
732 if (character == '/')
733 ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);
734 if (character == '=')
735 ADVANCE_PAST_NON_NEWLINE_TO(BeforeAttributeValueState);
736 if (character == '>')
737 return emitAndResumeInDataState(source);
738 if (m_options.usePreHTML5ParserQuirks && character == '<')
739 return emitAndReconsumeInDataState();
740 if (character == kEndOfFileMarker) {
741 parseError();
742 RECONSUME_IN(DataState);
743 }
744 if (character == '"' || character == '\'' || character == '<' || character == '=')
745 parseError();
746 m_token.appendToAttributeName(toASCIILower(character));
747 ADVANCE_PAST_NON_NEWLINE_TO(AttributeNameState);
748 END_STATE()
749
750 BEGIN_STATE(AfterAttributeNameState)
751 if (isTokenizerWhitespace(character))
752 ADVANCE_TO(AfterAttributeNameState);
753 if (character == '/')
754 ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);
755 if (character == '=')
756 ADVANCE_PAST_NON_NEWLINE_TO(BeforeAttributeValueState);
757 if (character == '>')
758 return emitAndResumeInDataState(source);
759 if (m_options.usePreHTML5ParserQuirks && character == '<')
760 return emitAndReconsumeInDataState();
761 if (character == kEndOfFileMarker) {
762 parseError();
763 RECONSUME_IN(DataState);
764 }
765 if (character == '"' || character == '\'' || character == '<')
766 parseError();
767 m_token.beginAttribute(source.numberOfCharactersConsumed());
768 m_token.appendToAttributeName(toASCIILower(character));
769 ADVANCE_PAST_NON_NEWLINE_TO(AttributeNameState);
770 END_STATE()
771
772 BEGIN_STATE(BeforeAttributeValueState)
773 if (isTokenizerWhitespace(character))
774 ADVANCE_TO(BeforeAttributeValueState);
775 if (character == '"')
776 ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueDoubleQuotedState);
777 if (character == '&')
778 RECONSUME_IN(AttributeValueUnquotedState);
779 if (character == '\'')
780 ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueSingleQuotedState);
781 if (character == '>') {
782 parseError();
783 return emitAndResumeInDataState(source);
784 }
785 if (character == kEndOfFileMarker) {
786 parseError();
787 RECONSUME_IN(DataState);
788 }
789 if (character == '<' || character == '=' || character == '`')
790 parseError();
791 m_token.appendToAttributeValue(character);
792 ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueUnquotedState);
793 END_STATE()
794
795 BEGIN_STATE(AttributeValueDoubleQuotedState)
796 if (character == '"') {
797 m_token.endAttribute(source.numberOfCharactersConsumed());
798 ADVANCE_PAST_NON_NEWLINE_TO(AfterAttributeValueQuotedState);
799 }
800 if (character == '&') {
801 m_additionalAllowedCharacter = '"';
802 ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInAttributeValueState);
803 }
804 if (character == kEndOfFileMarker) {
805 parseError();
806 m_token.endAttribute(source.numberOfCharactersConsumed());
807 RECONSUME_IN(DataState);
808 }
809 m_token.appendToAttributeValue(character);
810 ADVANCE_TO(AttributeValueDoubleQuotedState);
811 END_STATE()
812
813 BEGIN_STATE(AttributeValueSingleQuotedState)
814 if (character == '\'') {
815 m_token.endAttribute(source.numberOfCharactersConsumed());
816 ADVANCE_PAST_NON_NEWLINE_TO(AfterAttributeValueQuotedState);
817 }
818 if (character == '&') {
819 m_additionalAllowedCharacter = '\'';
820 ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInAttributeValueState);
821 }
822 if (character == kEndOfFileMarker) {
823 parseError();
824 m_token.endAttribute(source.numberOfCharactersConsumed());
825 RECONSUME_IN(DataState);
826 }
827 m_token.appendToAttributeValue(character);
828 ADVANCE_TO(AttributeValueSingleQuotedState);
829 END_STATE()
830
831 BEGIN_STATE(AttributeValueUnquotedState)
832 if (isTokenizerWhitespace(character)) {
833 m_token.endAttribute(source.numberOfCharactersConsumed());
834 ADVANCE_TO(BeforeAttributeNameState);
835 }
836 if (character == '&') {
837 m_additionalAllowedCharacter = '>';
838 ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInAttributeValueState);
839 }
840 if (character == '>') {
841 m_token.endAttribute(source.numberOfCharactersConsumed());
842 return emitAndResumeInDataState(source);
843 }
844 if (character == kEndOfFileMarker) {
845 parseError();
846 m_token.endAttribute(source.numberOfCharactersConsumed());
847 RECONSUME_IN(DataState);
848 }
849 if (character == '"' || character == '\'' || character == '<' || character == '=' || character == '`')
850 parseError();
851 m_token.appendToAttributeValue(character);
852 ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueUnquotedState);
853 END_STATE()
854
855 BEGIN_STATE(CharacterReferenceInAttributeValueState)
856 bool notEnoughCharacters = false;
857 StringBuilder decodedEntity;
858 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
859 if (notEnoughCharacters)
860 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
861 if (!success) {
862 ASSERT(decodedEntity.isEmpty());
863 m_token.appendToAttributeValue('&');
864 } else {
865 for (unsigned i = 0; i < decodedEntity.length(); ++i)
866 m_token.appendToAttributeValue(decodedEntity[i]);
867 }
868 // We're supposed to switch back to the attribute value state that
869 // we were in when we were switched into this state. Rather than
870 // keeping track of this explicitly, we observe that the previous
871 // state can be determined by m_additionalAllowedCharacter.
872 if (m_additionalAllowedCharacter == '"')
873 SWITCH_TO(AttributeValueDoubleQuotedState);
874 if (m_additionalAllowedCharacter == '\'')
875 SWITCH_TO(AttributeValueSingleQuotedState);
876 ASSERT(m_additionalAllowedCharacter == '>');
877 SWITCH_TO(AttributeValueUnquotedState);
878 END_STATE()
879
880 BEGIN_STATE(AfterAttributeValueQuotedState)
881 if (isTokenizerWhitespace(character))
882 ADVANCE_TO(BeforeAttributeNameState);
883 if (character == '/')
884 ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);
885 if (character == '>')
886 return emitAndResumeInDataState(source);
887 if (m_options.usePreHTML5ParserQuirks && character == '<')
888 return emitAndReconsumeInDataState();
889 if (character == kEndOfFileMarker) {
890 parseError();
891 RECONSUME_IN(DataState);
892 }
893 parseError();
894 RECONSUME_IN(BeforeAttributeNameState);
895 END_STATE()
896
897 BEGIN_STATE(SelfClosingStartTagState)
898 if (character == '>') {
899 m_token.setSelfClosing();
900 return emitAndResumeInDataState(source);
901 }
902 if (character == kEndOfFileMarker) {
903 parseError();
904 RECONSUME_IN(DataState);
905 }
906 parseError();
907 RECONSUME_IN(BeforeAttributeNameState);
908 END_STATE()
909
910 BEGIN_STATE(BogusCommentState)
911 m_token.beginComment();
912 RECONSUME_IN(ContinueBogusCommentState);
913 END_STATE()
914
915 BEGIN_STATE(ContinueBogusCommentState)
916 if (character == '>')
917 return emitAndResumeInDataState(source);
918 if (character == kEndOfFileMarker)
919 return emitAndReconsumeInDataState();
920 m_token.appendToComment(character);
921 ADVANCE_TO(ContinueBogusCommentState);
922 END_STATE()
923
924 BEGIN_STATE(MarkupDeclarationOpenState)
925 if (character == '-') {
926 auto result = source.advancePast("--");
927 if (result == SegmentedString::DidMatch) {
928 m_token.beginComment();
929 SWITCH_TO(CommentStartState);
930 }
931 if (result == SegmentedString::NotEnoughCharacters)
932 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
933 } else if (isASCIIAlphaCaselessEqual(character, 'd')) {
934 auto result = source.advancePastLettersIgnoringASCIICase("doctype");
935 if (result == SegmentedString::DidMatch)
936 SWITCH_TO(DOCTYPEState);
937 if (result == SegmentedString::NotEnoughCharacters)
938 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
939 } else if (character == '[' && shouldAllowCDATA()) {
940 auto result = source.advancePast("[CDATA[");
941 if (result == SegmentedString::DidMatch)
942 SWITCH_TO(CDATASectionState);
943 if (result == SegmentedString::NotEnoughCharacters)
944 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
945 }
946 parseError();
947 RECONSUME_IN(BogusCommentState);
948 END_STATE()
949
950 BEGIN_STATE(CommentStartState)
951 if (character == '-')
952 ADVANCE_PAST_NON_NEWLINE_TO(CommentStartDashState);
953 if (character == '>') {
954 parseError();
955 return emitAndResumeInDataState(source);
956 }
957 if (character == kEndOfFileMarker) {
958 parseError();
959 return emitAndReconsumeInDataState();
960 }
961 m_token.appendToComment(character);
962 ADVANCE_TO(CommentState);
963 END_STATE()
964
965 BEGIN_STATE(CommentStartDashState)
966 if (character == '-')
967 ADVANCE_PAST_NON_NEWLINE_TO(CommentEndState);
968 if (character == '>') {
969 parseError();
970 return emitAndResumeInDataState(source);
971 }
972 if (character == kEndOfFileMarker) {
973 parseError();
974 return emitAndReconsumeInDataState();
975 }
976 m_token.appendToComment('-');
977 m_token.appendToComment(character);
978 ADVANCE_TO(CommentState);
979 END_STATE()
980
981 BEGIN_STATE(CommentState)
982 if (character == '-')
983 ADVANCE_PAST_NON_NEWLINE_TO(CommentEndDashState);
984 if (character == kEndOfFileMarker) {
985 parseError();
986 return emitAndReconsumeInDataState();
987 }
988 m_token.appendToComment(character);
989 ADVANCE_TO(CommentState);
990 END_STATE()
991
992 BEGIN_STATE(CommentEndDashState)
993 if (character == '-')
994 ADVANCE_PAST_NON_NEWLINE_TO(CommentEndState);
995 if (character == kEndOfFileMarker) {
996 parseError();
997 return emitAndReconsumeInDataState();
998 }
999 m_token.appendToComment('-');
1000 m_token.appendToComment(character);
1001 ADVANCE_TO(CommentState);
1002 END_STATE()
1003
1004 BEGIN_STATE(CommentEndState)
1005 if (character == '>')
1006 return emitAndResumeInDataState(source);
1007 if (character == '!') {
1008 parseError();
1009 ADVANCE_PAST_NON_NEWLINE_TO(CommentEndBangState);
1010 }
1011 if (character == '-') {
1012 parseError();
1013 m_token.appendToComment('-');
1014 ADVANCE_PAST_NON_NEWLINE_TO(CommentEndState);
1015 }
1016 if (character == kEndOfFileMarker) {
1017 parseError();
1018 return emitAndReconsumeInDataState();
1019 }
1020 parseError();
1021 m_token.appendToComment('-');
1022 m_token.appendToComment('-');
1023 m_token.appendToComment(character);
1024 ADVANCE_TO(CommentState);
1025 END_STATE()
1026
1027 BEGIN_STATE(CommentEndBangState)
1028 if (character == '-') {
1029 m_token.appendToComment('-');
1030 m_token.appendToComment('-');
1031 m_token.appendToComment('!');
1032 ADVANCE_PAST_NON_NEWLINE_TO(CommentEndDashState);
1033 }
1034 if (character == '>')
1035 return emitAndResumeInDataState(source);
1036 if (character == kEndOfFileMarker) {
1037 parseError();
1038 return emitAndReconsumeInDataState();
1039 }
1040 m_token.appendToComment('-');
1041 m_token.appendToComment('-');
1042 m_token.appendToComment('!');
1043 m_token.appendToComment(character);
1044 ADVANCE_TO(CommentState);
1045 END_STATE()
1046
1047 BEGIN_STATE(DOCTYPEState)
1048 if (isTokenizerWhitespace(character))
1049 ADVANCE_TO(BeforeDOCTYPENameState);
1050 if (character == kEndOfFileMarker) {
1051 parseError();
1052 m_token.beginDOCTYPE();
1053 m_token.setForceQuirks();
1054 return emitAndReconsumeInDataState();
1055 }
1056 parseError();
1057 RECONSUME_IN(BeforeDOCTYPENameState);
1058 END_STATE()
1059
1060 BEGIN_STATE(BeforeDOCTYPENameState)
1061 if (isTokenizerWhitespace(character))
1062 ADVANCE_TO(BeforeDOCTYPENameState);
1063 if (character == '>') {
1064 parseError();
1065 m_token.beginDOCTYPE();
1066 m_token.setForceQuirks();
1067 return emitAndResumeInDataState(source);
1068 }
1069 if (character == kEndOfFileMarker) {
1070 parseError();
1071 m_token.beginDOCTYPE();
1072 m_token.setForceQuirks();
1073 return emitAndReconsumeInDataState();
1074 }
1075 m_token.beginDOCTYPE(toASCIILower(character));
1076 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPENameState);
1077 END_STATE()
1078
1079 BEGIN_STATE(DOCTYPENameState)
1080 if (isTokenizerWhitespace(character))
1081 ADVANCE_TO(AfterDOCTYPENameState);
1082 if (character == '>')
1083 return emitAndResumeInDataState(source);
1084 if (character == kEndOfFileMarker) {
1085 parseError();
1086 m_token.setForceQuirks();
1087 return emitAndReconsumeInDataState();
1088 }
1089 m_token.appendToName(toASCIILower(character));
1090 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPENameState);
1091 END_STATE()
1092
1093 BEGIN_STATE(AfterDOCTYPENameState)
1094 if (isTokenizerWhitespace(character))
1095 ADVANCE_TO(AfterDOCTYPENameState);
1096 if (character == '>')
1097 return emitAndResumeInDataState(source);
1098 if (character == kEndOfFileMarker) {
1099 parseError();
1100 m_token.setForceQuirks();
1101 return emitAndReconsumeInDataState();
1102 }
1103 if (isASCIIAlphaCaselessEqual(character, 'p')) {
1104 auto result = source.advancePastLettersIgnoringASCIICase("public");
1105 if (result == SegmentedString::DidMatch)
1106 SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1107 if (result == SegmentedString::NotEnoughCharacters)
1108 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
1109 } else if (isASCIIAlphaCaselessEqual(character, 's')) {
1110 auto result = source.advancePastLettersIgnoringASCIICase("system");
1111 if (result == SegmentedString::DidMatch)
1112 SWITCH_TO(AfterDOCTYPESystemKeywordState);
1113 if (result == SegmentedString::NotEnoughCharacters)
1114 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken());
1115 }
1116 parseError();
1117 m_token.setForceQuirks();
1118 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1119 END_STATE()
1120
1121 BEGIN_STATE(AfterDOCTYPEPublicKeywordState)
1122 if (isTokenizerWhitespace(character))
1123 ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1124 if (character == '"') {
1125 parseError();
1126 m_token.setPublicIdentifierToEmptyString();
1127 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1128 }
1129 if (character == '\'') {
1130 parseError();
1131 m_token.setPublicIdentifierToEmptyString();
1132 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1133 }
1134 if (character == '>') {
1135 parseError();
1136 m_token.setForceQuirks();
1137 return emitAndResumeInDataState(source);
1138 }
1139 if (character == kEndOfFileMarker) {
1140 parseError();
1141 m_token.setForceQuirks();
1142 return emitAndReconsumeInDataState();
1143 }
1144 parseError();
1145 m_token.setForceQuirks();
1146 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1147 END_STATE()
1148
1149 BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState)
1150 if (isTokenizerWhitespace(character))
1151 ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1152 if (character == '"') {
1153 m_token.setPublicIdentifierToEmptyString();
1154 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1155 }
1156 if (character == '\'') {
1157 m_token.setPublicIdentifierToEmptyString();
1158 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1159 }
1160 if (character == '>') {
1161 parseError();
1162 m_token.setForceQuirks();
1163 return emitAndResumeInDataState(source);
1164 }
1165 if (character == kEndOfFileMarker) {
1166 parseError();
1167 m_token.setForceQuirks();
1168 return emitAndReconsumeInDataState();
1169 }
1170 parseError();
1171 m_token.setForceQuirks();
1172 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1173 END_STATE()
1174
1175 BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState)
1176 if (character == '"')
1177 ADVANCE_PAST_NON_NEWLINE_TO(AfterDOCTYPEPublicIdentifierState);
1178 if (character == '>') {
1179 parseError();
1180 m_token.setForceQuirks();
1181 return emitAndResumeInDataState(source);
1182 }
1183 if (character == kEndOfFileMarker) {
1184 parseError();
1185 m_token.setForceQuirks();
1186 return emitAndReconsumeInDataState();
1187 }
1188 m_token.appendToPublicIdentifier(character);
1189 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1190 END_STATE()
1191
1192 BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState)
1193 if (character == '\'')
1194 ADVANCE_PAST_NON_NEWLINE_TO(AfterDOCTYPEPublicIdentifierState);
1195 if (character == '>') {
1196 parseError();
1197 m_token.setForceQuirks();
1198 return emitAndResumeInDataState(source);
1199 }
1200 if (character == kEndOfFileMarker) {
1201 parseError();
1202 m_token.setForceQuirks();
1203 return emitAndReconsumeInDataState();
1204 }
1205 m_token.appendToPublicIdentifier(character);
1206 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1207 END_STATE()
1208
1209 BEGIN_STATE(AfterDOCTYPEPublicIdentifierState)
1210 if (isTokenizerWhitespace(character))
1211 ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1212 if (character == '>')
1213 return emitAndResumeInDataState(source);
1214 if (character == '"') {
1215 parseError();
1216 m_token.setSystemIdentifierToEmptyString();
1217 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1218 }
1219 if (character == '\'') {
1220 parseError();
1221 m_token.setSystemIdentifierToEmptyString();
1222 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1223 }
1224 if (character == kEndOfFileMarker) {
1225 parseError();
1226 m_token.setForceQuirks();
1227 return emitAndReconsumeInDataState();
1228 }
1229 parseError();
1230 m_token.setForceQuirks();
1231 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1232 END_STATE()
1233
1234 BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState)
1235 if (isTokenizerWhitespace(character))
1236 ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1237 if (character == '>')
1238 return emitAndResumeInDataState(source);
1239 if (character == '"') {
1240 m_token.setSystemIdentifierToEmptyString();
1241 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1242 }
1243 if (character == '\'') {
1244 m_token.setSystemIdentifierToEmptyString();
1245 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1246 }
1247 if (character == kEndOfFileMarker) {
1248 parseError();
1249 m_token.setForceQuirks();
1250 return emitAndReconsumeInDataState();
1251 }
1252 parseError();
1253 m_token.setForceQuirks();
1254 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1255 END_STATE()
1256
1257 BEGIN_STATE(AfterDOCTYPESystemKeywordState)
1258 if (isTokenizerWhitespace(character))
1259 ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1260 if (character == '"') {
1261 parseError();
1262 m_token.setSystemIdentifierToEmptyString();
1263 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1264 }
1265 if (character == '\'') {
1266 parseError();
1267 m_token.setSystemIdentifierToEmptyString();
1268 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1269 }
1270 if (character == '>') {
1271 parseError();
1272 m_token.setForceQuirks();
1273 return emitAndResumeInDataState(source);
1274 }
1275 if (character == kEndOfFileMarker) {
1276 parseError();
1277 m_token.setForceQuirks();
1278 return emitAndReconsumeInDataState();
1279 }
1280 parseError();
1281 m_token.setForceQuirks();
1282 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1283 END_STATE()
1284
1285 BEGIN_STATE(BeforeDOCTYPESystemIdentifierState)
1286 if (isTokenizerWhitespace(character))
1287 ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1288 if (character == '"') {
1289 m_token.setSystemIdentifierToEmptyString();
1290 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1291 }
1292 if (character == '\'') {
1293 m_token.setSystemIdentifierToEmptyString();
1294 ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1295 }
1296 if (character == '>') {
1297 parseError();
1298 m_token.setForceQuirks();
1299 return emitAndResumeInDataState(source);
1300 }
1301 if (character == kEndOfFileMarker) {
1302 parseError();
1303 m_token.setForceQuirks();
1304 return emitAndReconsumeInDataState();
1305 }
1306 parseError();
1307 m_token.setForceQuirks();
1308 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1309 END_STATE()
1310
1311 BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState)
1312 if (character == '"')
1313 ADVANCE_PAST_NON_NEWLINE_TO(AfterDOCTYPESystemIdentifierState);
1314 if (character == '>') {
1315 parseError();
1316 m_token.setForceQuirks();
1317 return emitAndResumeInDataState(source);
1318 }
1319 if (character == kEndOfFileMarker) {
1320 parseError();
1321 m_token.setForceQuirks();
1322 return emitAndReconsumeInDataState();
1323 }
1324 m_token.appendToSystemIdentifier(character);
1325 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1326 END_STATE()
1327
1328 BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState)
1329 if (character == '\'')
1330 ADVANCE_PAST_NON_NEWLINE_TO(AfterDOCTYPESystemIdentifierState);
1331 if (character == '>') {
1332 parseError();
1333 m_token.setForceQuirks();
1334 return emitAndResumeInDataState(source);
1335 }
1336 if (character == kEndOfFileMarker) {
1337 parseError();
1338 m_token.setForceQuirks();
1339 return emitAndReconsumeInDataState();
1340 }
1341 m_token.appendToSystemIdentifier(character);
1342 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1343 END_STATE()
1344
1345 BEGIN_STATE(AfterDOCTYPESystemIdentifierState)
1346 if (isTokenizerWhitespace(character))
1347 ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1348 if (character == '>')
1349 return emitAndResumeInDataState(source);
1350 if (character == kEndOfFileMarker) {
1351 parseError();
1352 m_token.setForceQuirks();
1353 return emitAndReconsumeInDataState();
1354 }
1355 parseError();
1356 ADVANCE_PAST_NON_NEWLINE_TO(BogusDOCTYPEState);
1357 END_STATE()
1358
1359 BEGIN_STATE(BogusDOCTYPEState)
1360 if (character == '>')
1361 return emitAndResumeInDataState(source);
1362 if (character == kEndOfFileMarker)
1363 return emitAndReconsumeInDataState();
1364 ADVANCE_TO(BogusDOCTYPEState);
1365 END_STATE()
1366
1367 BEGIN_STATE(CDATASectionState)
1368 if (character == ']')
1369 ADVANCE_PAST_NON_NEWLINE_TO(CDATASectionRightSquareBracketState);
1370 if (character == kEndOfFileMarker)
1371 RECONSUME_IN(DataState);
1372 bufferCharacter(character);
1373 ADVANCE_TO(CDATASectionState);
1374 END_STATE()
1375
1376 BEGIN_STATE(CDATASectionRightSquareBracketState)
1377 if (character == ']')
1378 ADVANCE_PAST_NON_NEWLINE_TO(CDATASectionDoubleRightSquareBracketState);
1379 bufferASCIICharacter(']');
1380 RECONSUME_IN(CDATASectionState);
1381 END_STATE()
1382
1383 BEGIN_STATE(CDATASectionDoubleRightSquareBracketState)
1384 if (character == '>')
1385 ADVANCE_PAST_NON_NEWLINE_TO(DataState);
1386 bufferASCIICharacter(']');
1387 bufferASCIICharacter(']');
1388 RECONSUME_IN(CDATASectionState);
1389 END_STATE()
1390
1391 }
1392
1393 ASSERT_NOT_REACHED();
1394 return false;
1395}
1396
1397String HTMLTokenizer::bufferedCharacters() const
1398{
1399 // FIXME: Add an assert about m_state.
1400 StringBuilder characters;
1401 characters.reserveCapacity(numberOfBufferedCharacters());
1402 characters.append('<');
1403 characters.append('/');
1404 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1405 return characters.toString();
1406}
1407
1408void HTMLTokenizer::updateStateFor(const AtomicString& tagName)
1409{
1410 if (tagName == textareaTag || tagName == titleTag)
1411 m_state = RCDATAState;
1412 else if (tagName == plaintextTag)
1413 m_state = PLAINTEXTState;
1414 else if (tagName == scriptTag)
1415 m_state = ScriptDataState;
1416 else if (tagName == styleTag
1417 || tagName == iframeTag
1418 || tagName == xmpTag
1419 || (tagName == noembedTag)
1420 || tagName == noframesTag
1421 || (tagName == noscriptTag && m_options.scriptEnabled))
1422 m_state = RAWTEXTState;
1423}
1424
1425inline void HTMLTokenizer::appendToTemporaryBuffer(UChar character)
1426{
1427 ASSERT(isASCII(character));
1428 m_temporaryBuffer.append(character);
1429}
1430
1431inline bool HTMLTokenizer::temporaryBufferIs(const char* expectedString)
1432{
1433 return vectorEqualsString(m_temporaryBuffer, expectedString);
1434}
1435
1436inline void HTMLTokenizer::appendToPossibleEndTag(UChar character)
1437{
1438 ASSERT(isASCII(character));
1439 m_bufferedEndTagName.append(character);
1440}
1441
1442inline bool HTMLTokenizer::isAppropriateEndTag() const
1443{
1444 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1445 return false;
1446
1447 unsigned size = m_bufferedEndTagName.size();
1448
1449 for (unsigned i = 0; i < size; i++) {
1450 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1451 return false;
1452 }
1453
1454 return true;
1455}
1456
1457inline void HTMLTokenizer::parseError()
1458{
1459}
1460
1461}
1462