1/*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 * Copyright (C) 2015-2017 Apple Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "HTMLDocumentParser.h"
29
30#include "CustomElementReactionQueue.h"
31#include "DocumentFragment.h"
32#include "DocumentLoader.h"
33#include "Frame.h"
34#include "HTMLDocument.h"
35#include "HTMLParserScheduler.h"
36#include "HTMLPreloadScanner.h"
37#include "HTMLScriptRunner.h"
38#include "HTMLTreeBuilder.h"
39#include "HTMLUnknownElement.h"
40#include "JSCustomElementInterface.h"
41#include "LinkLoader.h"
42#include "Microtasks.h"
43#include "NavigationScheduler.h"
44#include "ScriptElement.h"
45#include "ThrowOnDynamicMarkupInsertionCountIncrementer.h"
46
47namespace WebCore {
48
49using namespace HTMLNames;
50
// Constructs a parser for a whole HTMLDocument. In addition to the tokenizer and
// tree builder, this constructor creates a script runner (with this parser as its
// host), a parser scheduler and a resource preloader.
HTMLDocumentParser::HTMLDocumentParser(HTMLDocument& document)
    : ScriptableDocumentParser(document)
    , m_options(document)
    , m_tokenizer(m_options)
    , m_scriptRunner(std::make_unique<HTMLScriptRunner>(document, static_cast<HTMLScriptRunnerHost&>(*this)))
    , m_treeBuilder(std::make_unique<HTMLTreeBuilder>(*this, document, parserContentPolicy(), m_options))
    , m_parserScheduler(std::make_unique<HTMLParserScheduler>(*this))
    , m_xssAuditorDelegate(document)
    , m_preloader(std::make_unique<HTMLResourcePreloader>(document))
{
}
62
63Ref<HTMLDocumentParser> HTMLDocumentParser::create(HTMLDocument& document)
64{
65 return adoptRef(*new HTMLDocumentParser(document));
66}
67
// Constructs a parser that builds into |fragment| using |contextElement| as the
// parsing context. This constructor does not initialize m_scriptRunner,
// m_parserScheduler or m_preloader, so code shared with the document case has to
// tolerate their absence.
inline HTMLDocumentParser::HTMLDocumentParser(DocumentFragment& fragment, Element& contextElement, ParserContentPolicy rawPolicy)
    : ScriptableDocumentParser(fragment.document(), rawPolicy)
    , m_options(fragment.document())
    , m_tokenizer(m_options)
    , m_treeBuilder(std::make_unique<HTMLTreeBuilder>(*this, fragment, contextElement, parserContentPolicy(), m_options))
    , m_xssAuditorDelegate(fragment.document())
{
    // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
    // Only an HTML context element adjusts the tokenizer's starting state.
    if (contextElement.isHTMLElement())
        m_tokenizer.updateStateFor(contextElement.tagQName().localName());
    m_xssAuditor.initForFragment();
}
80
81inline Ref<HTMLDocumentParser> HTMLDocumentParser::create(DocumentFragment& fragment, Element& contextElement, ParserContentPolicy parserContentPolicy)
82{
83 return adoptRef(*new HTMLDocumentParser(fragment, contextElement, parserContentPolicy));
84}
85
// Destruction only checks invariants: the scheduler and both preload scanners must
// already have been released (detach() clears all three), and no pump session may
// still be open.
HTMLDocumentParser::~HTMLDocumentParser()
{
    ASSERT(!m_parserScheduler);
    ASSERT(!m_pumpSessionNestingLevel);
    ASSERT(!m_preloadScanner);
    ASSERT(!m_insertionPreloadScanner);
}
93
94void HTMLDocumentParser::detach()
95{
96 ScriptableDocumentParser::detach();
97
98 if (m_scriptRunner)
99 m_scriptRunner->detach();
100 // FIXME: It seems wrong that we would have a preload scanner here.
101 // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do.
102 m_preloadScanner = nullptr;
103 m_insertionPreloadScanner = nullptr;
104 m_parserScheduler = nullptr; // Deleting the scheduler will clear any timers.
105}
106
107void HTMLDocumentParser::stopParsing()
108{
109 DocumentParser::stopParsing();
110 m_parserScheduler = nullptr; // Deleting the scheduler will clear any timers.
111}
112
113// This kicks off "Once the user agent stops parsing" as described by:
114// https://html.spec.whatwg.org/multipage/syntax.html#the-end
115void HTMLDocumentParser::prepareToStopParsing()
116{
117 ASSERT(!hasInsertionPoint());
118
119 // pumpTokenizer can cause this parser to be detached from the Document,
120 // but we need to ensure it isn't deleted yet.
121 Ref<HTMLDocumentParser> protectedThis(*this);
122
123 // NOTE: This pump should only ever emit buffered character tokens,
124 // so ForceSynchronous vs. AllowYield should be meaningless.
125 pumpTokenizerIfPossible(ForceSynchronous);
126
127 if (isStopped())
128 return;
129
130 DocumentParser::prepareToStopParsing();
131
132 // We will not have a scriptRunner when parsing a DocumentFragment.
133 if (m_scriptRunner)
134 document()->setReadyState(Document::Interactive);
135
136 // Setting the ready state above can fire mutation event and detach us
137 // from underneath. In that case, just bail out.
138 if (isDetached())
139 return;
140
141 attemptToRunDeferredScriptsAndEnd();
142}
143
// Returns true while the pump session nesting level is non-zero, i.e. while at
// least one pumpTokenizer() call is on the stack.
// NOTE(review): the counter is handed to PumpSession in pumpTokenizer(); presumably
// PumpSession increments/decrements it -- confirm against its definition.
inline bool HTMLDocumentParser::inPumpSession() const
{
    return m_pumpSessionNestingLevel > 0;
}
148
149inline bool HTMLDocumentParser::shouldDelayEnd() const
150{
151 return inPumpSession() || isWaitingForScripts() || isScheduledForResume() || isExecutingScript();
152}
153
// Forwards the notification to the parser scheduler.
// m_parserScheduler is dereferenced unconditionally, so this must not be called on
// a parser whose scheduler is null (stopParsing() and detach() release it, and the
// fragment constructor never creates one).
void HTMLDocumentParser::didBeginYieldingParser()
{
    m_parserScheduler->didBeginYieldingParser();
}
158
// Forwards the notification to the parser scheduler.
// m_parserScheduler is dereferenced unconditionally, so this must not be called on
// a parser whose scheduler is null (stopParsing() and detach() release it, and the
// fragment constructor never creates one).
void HTMLDocumentParser::didEndYieldingParser()
{
    m_parserScheduler->didEndYieldingParser();
}
163
// Reports whether this parser is parsing a fragment, as answered by the tree builder.
bool HTMLDocumentParser::isParsingFragment() const
{
    return m_treeBuilder->isParsingFragment();
}
168
169bool HTMLDocumentParser::processingData() const
170{
171 return isScheduledForResume() || inPumpSession();
172}
173
174void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode)
175{
176 if (isStopped() || isWaitingForScripts())
177 return;
178
179 // Once a resume is scheduled, HTMLParserScheduler controls when we next pump.
180 if (isScheduledForResume()) {
181 ASSERT(mode == AllowYield);
182 return;
183 }
184
185 pumpTokenizer(mode);
186}
187
188bool HTMLDocumentParser::isScheduledForResume() const
189{
190 return m_parserScheduler && m_parserScheduler->isScheduledForResume();
191}
192
193// Used by HTMLParserScheduler
194void HTMLDocumentParser::resumeParsingAfterYield()
195{
196 // pumpTokenizer can cause this parser to be detached from the Document,
197 // but we need to ensure it isn't deleted yet.
198 Ref<HTMLDocumentParser> protectedThis(*this);
199
200 // We should never be here unless we can pump immediately.
201 // Call pumpTokenizer() directly so that ASSERTS will fire if we're wrong.
202 pumpTokenizer(AllowYield);
203 endIfDelayed();
204}
205
206void HTMLDocumentParser::runScriptsForPausedTreeBuilder()
207{
208 ASSERT(scriptingContentIsAllowed(parserContentPolicy()));
209
210 if (std::unique_ptr<CustomElementConstructionData> constructionData = m_treeBuilder->takeCustomElementConstructionData()) {
211 ASSERT(!m_treeBuilder->hasParserBlockingScriptWork());
212
213 // https://html.spec.whatwg.org/#create-an-element-for-the-token
214 {
215 // Prevent document.open/write during reactions by allocating the incrementer before the reactions stack.
216 ThrowOnDynamicMarkupInsertionCountIncrementer incrementer(*document());
217
218 MicrotaskQueue::mainThreadQueue().performMicrotaskCheckpoint();
219
220 CustomElementReactionStack reactionStack(document()->execState());
221 auto& elementInterface = constructionData->elementInterface.get();
222 auto newElement = elementInterface.constructElementWithFallback(*document(), constructionData->name);
223 m_treeBuilder->didCreateCustomOrFallbackElement(WTFMove(newElement), *constructionData);
224 }
225 return;
226 }
227
228 TextPosition scriptStartPosition = TextPosition::belowRangePosition();
229 if (auto scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition)) {
230 ASSERT(!m_treeBuilder->hasParserBlockingScriptWork());
231 // We will not have a scriptRunner when parsing a DocumentFragment.
232 if (m_scriptRunner)
233 m_scriptRunner->execute(scriptElement.releaseNonNull(), scriptStartPosition);
234 }
235}
236
237Document* HTMLDocumentParser::contextForParsingSession()
238{
239 // The parsing session should interact with the document only when parsing
240 // non-fragments. Otherwise, we might delay the load event mistakenly.
241 if (isParsingFragment())
242 return nullptr;
243 return document();
244}
245
246bool HTMLDocumentParser::pumpTokenizerLoop(SynchronousMode mode, bool parsingFragment, PumpSession& session)
247{
248 do {
249 if (UNLIKELY(isWaitingForScripts())) {
250 if (mode == AllowYield && m_parserScheduler->shouldYieldBeforeExecutingScript(session))
251 return true;
252 runScriptsForPausedTreeBuilder();
253 // If we're paused waiting for a script, we try to execute scripts before continuing.
254 if (isWaitingForScripts() || isStopped())
255 return false;
256 }
257
258 // FIXME: It's wrong for the HTMLDocumentParser to reach back to the Frame, but this approach is
259 // how the parser has always handled stopping when the page assigns window.location. What should
260 // happen instead is that assigning window.location causes the parser to stop parsing cleanly.
261 // The problem is we're not prepared to do that at every point where we run JavaScript.
262 if (UNLIKELY(!parsingFragment && document()->frame() && document()->frame()->navigationScheduler().locationChangePending()))
263 return false;
264
265 if (UNLIKELY(mode == AllowYield && m_parserScheduler->shouldYieldBeforeToken(session)))
266 return true;
267
268 if (!parsingFragment)
269 m_sourceTracker.startToken(m_input.current(), m_tokenizer);
270
271 auto token = m_tokenizer.nextToken(m_input.current());
272 if (!token)
273 return false;
274
275 if (!parsingFragment) {
276 m_sourceTracker.endToken(m_input.current(), m_tokenizer);
277
278 // We do not XSS filter innerHTML, which means we (intentionally) fail
279 // http/tests/security/xssAuditor/dom-write-innerHTML.html
280 if (auto xssInfo = m_xssAuditor.filterToken(FilterTokenRequest(*token, m_sourceTracker, m_tokenizer.shouldAllowCDATA())))
281 m_xssAuditorDelegate.didBlockScript(*xssInfo);
282 }
283
284 constructTreeFromHTMLToken(token);
285 } while (!isStopped());
286
287 return false;
288}
289
290void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
291{
292 ASSERT(!isStopped());
293 ASSERT(!isScheduledForResume());
294
295 // This is an attempt to check that this object is both attached to the Document and protected by something.
296 ASSERT(refCount() >= 2);
297
298 PumpSession session(m_pumpSessionNestingLevel, contextForParsingSession());
299
300 m_xssAuditor.init(document(), &m_xssAuditorDelegate);
301
302 bool shouldResume = pumpTokenizerLoop(mode, isParsingFragment(), session);
303
304 // Ensure we haven't been totally deref'ed after pumping. Any caller of this
305 // function should be holding a RefPtr to this to ensure we weren't deleted.
306 ASSERT(refCount() >= 1);
307
308 if (isStopped())
309 return;
310
311 if (shouldResume)
312 m_parserScheduler->scheduleForResume();
313
314 if (isWaitingForScripts()) {
315 ASSERT(m_tokenizer.isInDataState());
316 if (!m_preloadScanner) {
317 m_preloadScanner = std::make_unique<HTMLPreloadScanner>(m_options, document()->url(), document()->deviceScaleFactor());
318 m_preloadScanner->appendToEnd(m_input.current());
319 }
320 m_preloadScanner->scan(*m_preloader, *document());
321 }
322 // The viewport definition is known here, so we can load link preloads with media attributes.
323 if (document()->loader())
324 LinkLoader::loadLinksFromHeader(document()->loader()->response().httpHeaderField(HTTPHeaderName::Link), document()->url(), *document(), LinkLoader::MediaAttributeCheck::MediaAttributeNotEmpty);
325}
326
327void HTMLDocumentParser::constructTreeFromHTMLToken(HTMLTokenizer::TokenPtr& rawToken)
328{
329 AtomicHTMLToken token(*rawToken);
330
331 // We clear the rawToken in case constructTreeFromAtomicToken
332 // synchronously re-enters the parser. We don't clear the token immedately
333 // for Character tokens because the AtomicHTMLToken avoids copying the
334 // characters by keeping a pointer to the underlying buffer in the
335 // HTMLToken. Fortunately, Character tokens can't cause us to re-enter
336 // the parser.
337 //
338 // FIXME: Stop clearing the rawToken once we start running the parser off
339 // the main thread or once we stop allowing synchronous JavaScript
340 // execution from parseAttribute.
341 if (rawToken->type() != HTMLToken::Character) {
342 // Clearing the TokenPtr makes sure we don't clear the HTMLToken a second time
343 // later when the TokenPtr is destroyed.
344 rawToken.clear();
345 }
346
347 m_treeBuilder->constructTree(WTFMove(token));
348}
349
350bool HTMLDocumentParser::hasInsertionPoint()
351{
352 // FIXME: The wasCreatedByScript() branch here might not be fully correct.
353 // Our model of the EOF character differs slightly from the one in the spec
354 // because our treatment is uniform between network-sourced and script-sourced
355 // input streams whereas the spec treats them differently.
356 return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile());
357}
358
359void HTMLDocumentParser::insert(SegmentedString&& source)
360{
361 if (isStopped())
362 return;
363
364 // pumpTokenizer can cause this parser to be detached from the Document,
365 // but we need to ensure it isn't deleted yet.
366 Ref<HTMLDocumentParser> protectedThis(*this);
367
368 source.setExcludeLineNumbers();
369 m_input.insertAtCurrentInsertionPoint(WTFMove(source));
370 pumpTokenizerIfPossible(ForceSynchronous);
371
372 if (isWaitingForScripts()) {
373 // Check the document.write() output with a separate preload scanner as
374 // the main scanner can't deal with insertions.
375 if (!m_insertionPreloadScanner)
376 m_insertionPreloadScanner = std::make_unique<HTMLPreloadScanner>(m_options, document()->url(), document()->deviceScaleFactor());
377 m_insertionPreloadScanner->appendToEnd(source);
378 m_insertionPreloadScanner->scan(*m_preloader, *document());
379 }
380
381 endIfDelayed();
382}
383
384void HTMLDocumentParser::append(RefPtr<StringImpl>&& inputSource)
385{
386 if (isStopped())
387 return;
388
389 // pumpTokenizer can cause this parser to be detached from the Document,
390 // but we need to ensure it isn't deleted yet.
391 Ref<HTMLDocumentParser> protectedThis(*this);
392
393 String source { WTFMove(inputSource) };
394
395 if (m_preloadScanner) {
396 if (m_input.current().isEmpty() && !isWaitingForScripts()) {
397 // We have parsed until the end of the current input and so are now moving ahead of the preload scanner.
398 // Clear the scanner so we know to scan starting from the current input point if we block again.
399 m_preloadScanner = nullptr;
400 } else {
401 m_preloadScanner->appendToEnd(source);
402 if (isWaitingForScripts())
403 m_preloadScanner->scan(*m_preloader, *document());
404 }
405 }
406
407 m_input.appendToEnd(source);
408
409 if (inPumpSession()) {
410 // We've gotten data off the network in a nested write.
411 // We don't want to consume any more of the input stream now. Do
412 // not worry. We'll consume this data in a less-nested write().
413 return;
414 }
415
416 pumpTokenizerIfPossible(AllowYield);
417
418 endIfDelayed();
419}
420
// Final step of parsing: tells the tree builder that parsing has finished.
// Must only be called while still attached and with no resume scheduled.
void HTMLDocumentParser::end()
{
    ASSERT(!isDetached());
    ASSERT(!isScheduledForResume());

    // Informs the rest of WebCore that parsing is really finished (and deletes this).
    m_treeBuilder->finished();
}
429
430void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd()
431{
432 ASSERT(isStopping());
433 ASSERT(!hasInsertionPoint());
434 if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing())
435 return;
436 end();
437}
438
439void HTMLDocumentParser::attemptToEnd()
440{
441 // finish() indicates we will not receive any more data. If we are waiting on
442 // an external script to load, we can't finish parsing quite yet.
443
444 if (shouldDelayEnd()) {
445 m_endWasDelayed = true;
446 return;
447 }
448 prepareToStopParsing();
449}
450
451void HTMLDocumentParser::endIfDelayed()
452{
453 // If we've already been detached, don't bother ending.
454 if (isDetached())
455 return;
456
457 if (!m_endWasDelayed || shouldDelayEnd())
458 return;
459
460 m_endWasDelayed = false;
461 prepareToStopParsing();
462}
463
// Marks the input stream as complete (once) and attempts to end parsing.
void HTMLDocumentParser::finish()
{
    // FIXME: We should ASSERT(!m_parserStopped) here, since it does not
    // make sense to call any methods on DocumentParser once it's been stopped.
    // However, FrameLoader::stop calls DocumentParser::finish unconditionally.

    // We're not going to get any more data off the network, so we tell the
    // input stream we've reached the end of file. finish() can be called more
    // than once, if the first time does not call end().
    if (!m_input.haveSeenEndOfFile())
        m_input.markEndOfFile();

    attemptToEnd();
}
478
479bool HTMLDocumentParser::isExecutingScript() const
480{
481 return m_scriptRunner && m_scriptRunner->isExecutingScript();
482}
483
484TextPosition HTMLDocumentParser::textPosition() const
485{
486 auto& currentString = m_input.current();
487 return TextPosition(currentString.currentLine(), currentString.currentColumn());
488}
489
490bool HTMLDocumentParser::shouldAssociateConsoleMessagesWithTextPosition() const
491{
492 return inPumpSession() && !isExecutingScript();
493}
494
495bool HTMLDocumentParser::isWaitingForScripts() const
496{
497 // When the TreeBuilder encounters a </script> tag, it returns to the HTMLDocumentParser
498 // where the script is transfered from the treebuilder to the script runner.
499 // The script runner will hold the script until its loaded and run. During
500 // any of this time, we want to count ourselves as "waiting for a script" and thus
501 // run the preload scanner, as well as delay completion of parsing.
502 bool treeBuilderHasBlockingScript = m_treeBuilder->hasParserBlockingScriptWork();
503 bool scriptRunnerHasBlockingScript = m_scriptRunner && m_scriptRunner->hasParserBlockingScript();
504 // Since the parser is paused while a script runner has a blocking script, it should
505 // never be possible to end up with both objects holding a blocking script.
506 ASSERT(!(treeBuilderHasBlockingScript && scriptRunnerHasBlockingScript));
507 // If either object has a blocking script, the parser should be paused.
508 return treeBuilderHasBlockingScript || scriptRunnerHasBlockingScript;
509}
510
511void HTMLDocumentParser::resumeParsingAfterScriptExecution()
512{
513 ASSERT(!isExecutingScript());
514 ASSERT(!isWaitingForScripts());
515
516 // pumpTokenizer can cause this parser to be detached from the Document,
517 // but we need to ensure it isn't deleted yet.
518 Ref<HTMLDocumentParser> protectedThis(*this);
519
520 m_insertionPreloadScanner = nullptr;
521 pumpTokenizerIfPossible(AllowYield);
522 endIfDelayed();
523}
524
// Registers this parser as the client of |pendingScript|, which must not be loaded yet.
void HTMLDocumentParser::watchForLoad(PendingScript& pendingScript)
{
    ASSERT(!pendingScript.isLoaded());
    // setClient would call notifyFinished if the load were complete.
    // Callers do not expect to be re-entered from this call, so they should
    // not pass an already-loaded PendingScript.
    pendingScript.setClient(*this);
}
533
// Counterpart of watchForLoad(): clears the client set on |pendingScript|.
void HTMLDocumentParser::stopWatchingForLoad(PendingScript& pendingScript)
{
    pendingScript.clearClient();
}
538
539void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan()
540{
541 ASSERT(m_preloadScanner);
542 m_preloadScanner->appendToEnd(m_input.current());
543 m_preloadScanner->scan(*m_preloader, *document());
544}
545
546void HTMLDocumentParser::notifyFinished(PendingScript& pendingScript)
547{
548 // pumpTokenizer can cause this parser to be detached from the Document,
549 // but we need to ensure it isn't deleted yet.
550 Ref<HTMLDocumentParser> protectedThis(*this);
551
552 // After Document parser is stopped or detached, the parser-inserted deferred script execution should be ignored.
553 if (isStopped())
554 return;
555
556 ASSERT(m_scriptRunner);
557 ASSERT(!isExecutingScript());
558 if (isStopping()) {
559 attemptToRunDeferredScriptsAndEnd();
560 return;
561 }
562
563 m_scriptRunner->executeScriptsWaitingForLoad(pendingScript);
564 if (!isWaitingForScripts())
565 resumeParsingAfterScriptExecution();
566}
567
568bool HTMLDocumentParser::hasScriptsWaitingForStylesheets() const
569{
570 return m_scriptRunner && m_scriptRunner->hasScriptsWaitingForStylesheets();
571}
572
573void HTMLDocumentParser::executeScriptsWaitingForStylesheets()
574{
575 // Document only calls this when the Document owns the DocumentParser
576 // so this will not be called in the DocumentFragment case.
577 ASSERT(m_scriptRunner);
578 // Ignore calls unless we have a script blocking the parser waiting on a
579 // stylesheet load. Otherwise we are currently parsing and this
580 // is a re-entrant call from encountering a </ style> tag.
581 if (!m_scriptRunner->hasScriptsWaitingForStylesheets())
582 return;
583
584 // pumpTokenizer can cause this parser to be detached from the Document,
585 // but we need to ensure it isn't deleted yet.
586 Ref<HTMLDocumentParser> protectedThis(*this);
587 m_scriptRunner->executeScriptsWaitingForStylesheets();
588 if (!isWaitingForScripts())
589 resumeParsingAfterScriptExecution();
590}
591
592void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment& fragment, Element& contextElement, ParserContentPolicy parserContentPolicy)
593{
594 auto parser = create(fragment, contextElement, parserContentPolicy);
595 parser->insert(source); // Use insert() so that the parser will not yield.
596 parser->finish();
597 ASSERT(!parser->processingData());
598 parser->detach();
599}
600
601void HTMLDocumentParser::suspendScheduledTasks()
602{
603 if (m_parserScheduler)
604 m_parserScheduler->suspend();
605}
606
607void HTMLDocumentParser::resumeScheduledTasks()
608{
609 if (m_parserScheduler)
610 m_parserScheduler->resume();
611}
612
613}
614