1// Copyright 2015 The Chromium Authors. All rights reserved.
2// Copyright (C) 2016 Apple Inc. All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8// * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14// * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30#include "config.h"
31#include "CSSTokenizer.h"
32
33#include "CSSParserIdioms.h"
34#include "CSSParserObserverWrapper.h"
35#include "CSSParserTokenRange.h"
36#include "CSSTokenizerInputStream.h"
37#include "HTMLParserIdioms.h"
38#include <wtf/text/StringBuilder.h>
39#include <wtf/unicode/CharacterNames.h>
40
41namespace WebCore {
42
43CSSTokenizer::CSSTokenizer(const String& string)
44 : m_input(string)
45{
46 // According to the spec, we should perform preprocessing here.
47 // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing
48 //
49 // However, we can skip this step since:
50 // * We're using HTML spaces (which accept \r and \f as a valid white space)
51 // * Do not count white spaces
52 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement characters
53
54 if (string.isEmpty())
55 return;
56
57 // To avoid resizing we err on the side of reserving too much space.
58 // Most strings we tokenize have about 3.5 to 5 characters per token.
59 m_tokens.reserveInitialCapacity(string.length() / 3);
60
61 while (true) {
62 CSSParserToken token = nextToken();
63 if (token.type() == CommentToken)
64 continue;
65 if (token.type() == EOFToken)
66 return;
67 m_tokens.append(token);
68 }
69}
70
71CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper)
72 : m_input(string)
73{
74 if (string.isEmpty())
75 return;
76
77 unsigned offset = 0;
78 while (true) {
79 CSSParserToken token = nextToken();
80 if (token.type() == EOFToken)
81 break;
82 if (token.type() == CommentToken)
83 wrapper.addComment(offset, m_input.offset(), m_tokens.size());
84 else {
85 m_tokens.append(token);
86 wrapper.addToken(offset);
87 }
88 offset = m_input.offset();
89 }
90
91 wrapper.addToken(offset);
92 wrapper.finalizeConstruction(m_tokens.begin());
93}
94
95CSSParserTokenRange CSSTokenizer::tokenRange() const
96{
97 return m_tokens;
98}
99
100unsigned CSSTokenizer::tokenCount()
101{
102 return m_tokens.size();
103}
104
105static bool isNewLine(UChar cc)
106{
107 // We check \r and \f here, since we have no preprocessing stage
108 return (cc == '\r' || cc == '\n' || cc == '\f');
109}
110
111// http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
112static bool twoCharsAreValidEscape(UChar first, UChar second)
113{
114 return first == '\\' && !isNewLine(second);
115}
116
117void CSSTokenizer::reconsume(UChar c)
118{
119 m_input.pushBack(c);
120}
121
122UChar CSSTokenizer::consume()
123{
124 UChar current = m_input.nextInputChar();
125 m_input.advance();
126 return current;
127}
128
129CSSParserToken CSSTokenizer::whiteSpace(UChar /*cc*/)
130{
131 m_input.advanceUntilNonWhitespace();
132 return CSSParserToken(WhitespaceToken);
133}
134
135CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type)
136{
137 m_blockStack.append(type);
138 return CSSParserToken(type, CSSParserToken::BlockStart);
139}
140
141CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name)
142{
143 m_blockStack.append(blockType);
144 return CSSParserToken(type, name, CSSParserToken::BlockStart);
145}
146
147CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType)
148{
149 if (!m_blockStack.isEmpty() && m_blockStack.last() == startType) {
150 m_blockStack.removeLast();
151 return CSSParserToken(type, CSSParserToken::BlockEnd);
152 }
153 return CSSParserToken(type);
154}
155
156CSSParserToken CSSTokenizer::leftParenthesis(UChar /*cc*/)
157{
158 return blockStart(LeftParenthesisToken);
159}
160
161CSSParserToken CSSTokenizer::rightParenthesis(UChar /*cc*/)
162{
163 return blockEnd(RightParenthesisToken, LeftParenthesisToken);
164}
165
166CSSParserToken CSSTokenizer::leftBracket(UChar /*cc*/)
167{
168 return blockStart(LeftBracketToken);
169}
170
171CSSParserToken CSSTokenizer::rightBracket(UChar /*cc*/)
172{
173 return blockEnd(RightBracketToken, LeftBracketToken);
174}
175
176CSSParserToken CSSTokenizer::leftBrace(UChar /*cc*/)
177{
178 return blockStart(LeftBraceToken);
179}
180
181CSSParserToken CSSTokenizer::rightBrace(UChar /*cc*/)
182{
183 return blockEnd(RightBraceToken, LeftBraceToken);
184}
185
186CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc)
187{
188 if (nextCharsAreNumber(cc)) {
189 reconsume(cc);
190 return consumeNumericToken();
191 }
192 return CSSParserToken(DelimiterToken, cc);
193}
194
195CSSParserToken CSSTokenizer::asterisk(UChar cc)
196{
197 ASSERT_UNUSED(cc, cc == '*');
198 if (consumeIfNext('='))
199 return CSSParserToken(SubstringMatchToken);
200 return CSSParserToken(DelimiterToken, '*');
201}
202
203CSSParserToken CSSTokenizer::lessThan(UChar cc)
204{
205 ASSERT_UNUSED(cc, cc == '<');
206 if (m_input.peekWithoutReplacement(0) == '!'
207 && m_input.peekWithoutReplacement(1) == '-'
208 && m_input.peekWithoutReplacement(2) == '-') {
209 m_input.advance(3);
210 return CSSParserToken(CDOToken);
211 }
212 return CSSParserToken(DelimiterToken, '<');
213}
214
215CSSParserToken CSSTokenizer::comma(UChar /*cc*/)
216{
217 return CSSParserToken(CommaToken);
218}
219
220CSSParserToken CSSTokenizer::hyphenMinus(UChar cc)
221{
222 if (nextCharsAreNumber(cc)) {
223 reconsume(cc);
224 return consumeNumericToken();
225 }
226 if (m_input.peekWithoutReplacement(0) == '-'
227 && m_input.peekWithoutReplacement(1) == '>') {
228 m_input.advance(2);
229 return CSSParserToken(CDCToken);
230 }
231 if (nextCharsAreIdentifier(cc)) {
232 reconsume(cc);
233 return consumeIdentLikeToken();
234 }
235 return CSSParserToken(DelimiterToken, cc);
236}
237
238CSSParserToken CSSTokenizer::solidus(UChar cc)
239{
240 if (consumeIfNext('*')) {
241 // These get ignored, but we need a value to return.
242 consumeUntilCommentEndFound();
243 return CSSParserToken(CommentToken);
244 }
245
246 return CSSParserToken(DelimiterToken, cc);
247}
248
249CSSParserToken CSSTokenizer::colon(UChar /*cc*/)
250{
251 return CSSParserToken(ColonToken);
252}
253
254CSSParserToken CSSTokenizer::semiColon(UChar /*cc*/)
255{
256 return CSSParserToken(SemicolonToken);
257}
258
259CSSParserToken CSSTokenizer::hash(UChar cc)
260{
261 UChar nextChar = m_input.peekWithoutReplacement(0);
262 if (isNameCodePoint(nextChar) || twoCharsAreValidEscape(nextChar, m_input.peekWithoutReplacement(1))) {
263 HashTokenType type = nextCharsAreIdentifier() ? HashTokenId : HashTokenUnrestricted;
264 return CSSParserToken(type, consumeName());
265 }
266
267 return CSSParserToken(DelimiterToken, cc);
268}
269
270CSSParserToken CSSTokenizer::circumflexAccent(UChar cc)
271{
272 ASSERT_UNUSED(cc, cc == '^');
273 if (consumeIfNext('='))
274 return CSSParserToken(PrefixMatchToken);
275 return CSSParserToken(DelimiterToken, '^');
276}
277
278CSSParserToken CSSTokenizer::dollarSign(UChar cc)
279{
280 ASSERT_UNUSED(cc, cc == '$');
281 if (consumeIfNext('='))
282 return CSSParserToken(SuffixMatchToken);
283 return CSSParserToken(DelimiterToken, '$');
284}
285
286CSSParserToken CSSTokenizer::verticalLine(UChar cc)
287{
288 ASSERT_UNUSED(cc, cc == '|');
289 if (consumeIfNext('='))
290 return CSSParserToken(DashMatchToken);
291 if (consumeIfNext('|'))
292 return CSSParserToken(ColumnToken);
293 return CSSParserToken(DelimiterToken, '|');
294}
295
296CSSParserToken CSSTokenizer::tilde(UChar cc)
297{
298 ASSERT_UNUSED(cc, cc == '~');
299 if (consumeIfNext('='))
300 return CSSParserToken(IncludeMatchToken);
301 return CSSParserToken(DelimiterToken, '~');
302}
303
304CSSParserToken CSSTokenizer::commercialAt(UChar cc)
305{
306 ASSERT_UNUSED(cc, cc == '@');
307 if (nextCharsAreIdentifier())
308 return CSSParserToken(AtKeywordToken, consumeName());
309 return CSSParserToken(DelimiterToken, '@');
310}
311
312CSSParserToken CSSTokenizer::reverseSolidus(UChar cc)
313{
314 if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) {
315 reconsume(cc);
316 return consumeIdentLikeToken();
317 }
318 return CSSParserToken(DelimiterToken, cc);
319}
320
321CSSParserToken CSSTokenizer::asciiDigit(UChar cc)
322{
323 reconsume(cc);
324 return consumeNumericToken();
325}
326
327CSSParserToken CSSTokenizer::letterU(UChar cc)
328{
329 if (m_input.peekWithoutReplacement(0) == '+'
330 && (isASCIIHexDigit(m_input.peekWithoutReplacement(1))
331 || m_input.peekWithoutReplacement(1) == '?')) {
332 m_input.advance();
333 return consumeUnicodeRange();
334 }
335 reconsume(cc);
336 return consumeIdentLikeToken();
337}
338
339CSSParserToken CSSTokenizer::nameStart(UChar cc)
340{
341 reconsume(cc);
342 return consumeIdentLikeToken();
343}
344
345CSSParserToken CSSTokenizer::stringStart(UChar cc)
346{
347 return consumeStringTokenUntil(cc);
348}
349
350CSSParserToken CSSTokenizer::endOfFile(UChar /*cc*/)
351{
352 return CSSParserToken(EOFToken);
353}
354
355const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = {
356 &CSSTokenizer::endOfFile,
357 0,
358 0,
359 0,
360 0,
361 0,
362 0,
363 0,
364 0,
365 &CSSTokenizer::whiteSpace,
366 &CSSTokenizer::whiteSpace,
367 0,
368 &CSSTokenizer::whiteSpace,
369 &CSSTokenizer::whiteSpace,
370 0,
371 0,
372 0,
373 0,
374 0,
375 0,
376 0,
377 0,
378 0,
379 0,
380 0,
381 0,
382 0,
383 0,
384 0,
385 0,
386 0,
387 0,
388 &CSSTokenizer::whiteSpace,
389 0,
390 &CSSTokenizer::stringStart,
391 &CSSTokenizer::hash,
392 &CSSTokenizer::dollarSign,
393 0,
394 0,
395 &CSSTokenizer::stringStart,
396 &CSSTokenizer::leftParenthesis,
397 &CSSTokenizer::rightParenthesis,
398 &CSSTokenizer::asterisk,
399 &CSSTokenizer::plusOrFullStop,
400 &CSSTokenizer::comma,
401 &CSSTokenizer::hyphenMinus,
402 &CSSTokenizer::plusOrFullStop,
403 &CSSTokenizer::solidus,
404 &CSSTokenizer::asciiDigit,
405 &CSSTokenizer::asciiDigit,
406 &CSSTokenizer::asciiDigit,
407 &CSSTokenizer::asciiDigit,
408 &CSSTokenizer::asciiDigit,
409 &CSSTokenizer::asciiDigit,
410 &CSSTokenizer::asciiDigit,
411 &CSSTokenizer::asciiDigit,
412 &CSSTokenizer::asciiDigit,
413 &CSSTokenizer::asciiDigit,
414 &CSSTokenizer::colon,
415 &CSSTokenizer::semiColon,
416 &CSSTokenizer::lessThan,
417 0,
418 0,
419 0,
420 &CSSTokenizer::commercialAt,
421 &CSSTokenizer::nameStart,
422 &CSSTokenizer::nameStart,
423 &CSSTokenizer::nameStart,
424 &CSSTokenizer::nameStart,
425 &CSSTokenizer::nameStart,
426 &CSSTokenizer::nameStart,
427 &CSSTokenizer::nameStart,
428 &CSSTokenizer::nameStart,
429 &CSSTokenizer::nameStart,
430 &CSSTokenizer::nameStart,
431 &CSSTokenizer::nameStart,
432 &CSSTokenizer::nameStart,
433 &CSSTokenizer::nameStart,
434 &CSSTokenizer::nameStart,
435 &CSSTokenizer::nameStart,
436 &CSSTokenizer::nameStart,
437 &CSSTokenizer::nameStart,
438 &CSSTokenizer::nameStart,
439 &CSSTokenizer::nameStart,
440 &CSSTokenizer::nameStart,
441 &CSSTokenizer::letterU,
442 &CSSTokenizer::nameStart,
443 &CSSTokenizer::nameStart,
444 &CSSTokenizer::nameStart,
445 &CSSTokenizer::nameStart,
446 &CSSTokenizer::nameStart,
447 &CSSTokenizer::leftBracket,
448 &CSSTokenizer::reverseSolidus,
449 &CSSTokenizer::rightBracket,
450 &CSSTokenizer::circumflexAccent,
451 &CSSTokenizer::nameStart,
452 0,
453 &CSSTokenizer::nameStart,
454 &CSSTokenizer::nameStart,
455 &CSSTokenizer::nameStart,
456 &CSSTokenizer::nameStart,
457 &CSSTokenizer::nameStart,
458 &CSSTokenizer::nameStart,
459 &CSSTokenizer::nameStart,
460 &CSSTokenizer::nameStart,
461 &CSSTokenizer::nameStart,
462 &CSSTokenizer::nameStart,
463 &CSSTokenizer::nameStart,
464 &CSSTokenizer::nameStart,
465 &CSSTokenizer::nameStart,
466 &CSSTokenizer::nameStart,
467 &CSSTokenizer::nameStart,
468 &CSSTokenizer::nameStart,
469 &CSSTokenizer::nameStart,
470 &CSSTokenizer::nameStart,
471 &CSSTokenizer::nameStart,
472 &CSSTokenizer::nameStart,
473 &CSSTokenizer::letterU,
474 &CSSTokenizer::nameStart,
475 &CSSTokenizer::nameStart,
476 &CSSTokenizer::nameStart,
477 &CSSTokenizer::nameStart,
478 &CSSTokenizer::nameStart,
479 &CSSTokenizer::leftBrace,
480 &CSSTokenizer::verticalLine,
481 &CSSTokenizer::rightBrace,
482 &CSSTokenizer::tilde,
483 0,
484};
485#if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED
486const unsigned codePointsNumber = 128;
487#endif
488
489CSSParserToken CSSTokenizer::nextToken()
490{
491 // Unlike the HTMLTokenizer, the CSS Syntax spec is written
492 // as a stateless, (fixed-size) look-ahead tokenizer.
493 // We could move to the stateful model and instead create
494 // states for all the "next 3 codepoints are X" cases.
495 // State-machine tokenizers are easier to write to handle
496 // incremental tokenization of partial sources.
497 // However, for now we follow the spec exactly.
498 UChar cc = consume();
499 CodePoint codePointFunc = 0;
500
501 if (isASCII(cc)) {
502 ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
503 codePointFunc = codePoints[cc];
504 } else
505 codePointFunc = &CSSTokenizer::nameStart;
506
507 if (codePointFunc)
508 return ((this)->*(codePointFunc))(cc);
509 return CSSParserToken(DelimiterToken, cc);
510}
511
512// This method merges the following spec sections for efficiency
513// http://www.w3.org/TR/css3-syntax/#consume-a-number
514// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
515CSSParserToken CSSTokenizer::consumeNumber()
516{
517 ASSERT(nextCharsAreNumber());
518
519 NumericValueType type = IntegerValueType;
520 NumericSign sign = NoSign;
521 unsigned numberLength = 0;
522
523 UChar next = m_input.peekWithoutReplacement(0);
524 if (next == '+') {
525 ++numberLength;
526 sign = PlusSign;
527 } else if (next == '-') {
528 ++numberLength;
529 sign = MinusSign;
530 }
531
532 numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength);
533 next = m_input.peekWithoutReplacement(numberLength);
534 if (next == '.' && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 1))) {
535 type = NumberValueType;
536 numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 2);
537 next = m_input.peekWithoutReplacement(numberLength);
538 }
539
540 if (next == 'E' || next == 'e') {
541 next = m_input.peekWithoutReplacement(numberLength + 1);
542 if (isASCIIDigit(next)) {
543 type = NumberValueType;
544 numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 1);
545 } else if ((next == '+' || next == '-') && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 2))) {
546 type = NumberValueType;
547 numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 3);
548 }
549 }
550
551 double value = m_input.getDouble(0, numberLength);
552 m_input.advance(numberLength);
553
554 return CSSParserToken(NumberToken, value, type, sign);
555}
556
557// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
558CSSParserToken CSSTokenizer::consumeNumericToken()
559{
560 CSSParserToken token = consumeNumber();
561 if (nextCharsAreIdentifier())
562 token.convertToDimensionWithUnit(consumeName());
563 else if (consumeIfNext('%'))
564 token.convertToPercentage();
565 return token;
566}
567
568// http://dev.w3.org/csswg/css-syntax/#consume-ident-like-token
569CSSParserToken CSSTokenizer::consumeIdentLikeToken()
570{
571 StringView name = consumeName();
572 if (consumeIfNext('(')) {
573 if (equalIgnoringASCIICase(name, "url")) {
574 // The spec is slightly different so as to avoid dropping whitespace
575 // tokens, but they wouldn't be used and this is easier.
576 m_input.advanceUntilNonWhitespace();
577 UChar next = m_input.peekWithoutReplacement(0);
578 if (next != '"' && next != '\'')
579 return consumeUrlToken();
580 }
581 return blockStart(LeftParenthesisToken, FunctionToken, name);
582 }
583 return CSSParserToken(IdentToken, name);
584}
585
586// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
587CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
588{
589 // Strings without escapes get handled without allocations
590 for (unsigned size = 0; ; size++) {
591 UChar cc = m_input.peekWithoutReplacement(size);
592 if (cc == endingCodePoint) {
593 unsigned startOffset = m_input.offset();
594 m_input.advance(size + 1);
595 return CSSParserToken(StringToken, m_input.rangeAt(startOffset, size));
596 }
597 if (isNewLine(cc)) {
598 m_input.advance(size);
599 return CSSParserToken(BadStringToken);
600 }
601 if (cc == '\0' || cc == '\\')
602 break;
603 }
604
605 StringBuilder output;
606 while (true) {
607 UChar cc = consume();
608 if (cc == endingCodePoint || cc == kEndOfFileMarker)
609 return CSSParserToken(StringToken, registerString(output.toString()));
610 if (isNewLine(cc)) {
611 reconsume(cc);
612 return CSSParserToken(BadStringToken);
613 }
614 if (cc == '\\') {
615 if (m_input.nextInputChar() == kEndOfFileMarker)
616 continue;
617 if (isNewLine(m_input.peekWithoutReplacement(0)))
618 consumeSingleWhitespaceIfNext(); // This handles \r\n for us
619 else
620 output.append(consumeEscape());
621 } else
622 output.append(cc);
623 }
624}
625
626CSSParserToken CSSTokenizer::consumeUnicodeRange()
627{
628 ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?');
629 int lengthRemaining = 6;
630 UChar32 start = 0;
631
632 while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) {
633 start = start * 16 + toASCIIHexValue(consume());
634 --lengthRemaining;
635 }
636
637 UChar32 end = start;
638 if (lengthRemaining && consumeIfNext('?')) {
639 do {
640 start *= 16;
641 end = end * 16 + 0xF;
642 --lengthRemaining;
643 } while (lengthRemaining && consumeIfNext('?'));
644 } else if (m_input.peekWithoutReplacement(0) == '-' && isASCIIHexDigit(m_input.peekWithoutReplacement(1))) {
645 m_input.advance();
646 lengthRemaining = 6;
647 end = 0;
648 do {
649 end = end * 16 + toASCIIHexValue(consume());
650 --lengthRemaining;
651 } while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0)));
652 }
653
654 return CSSParserToken(UnicodeRangeToken, start, end);
655}
656
657// http://dev.w3.org/csswg/css-syntax/#non-printable-code-point
658static bool isNonPrintableCodePoint(UChar cc)
659{
660 return cc <= '\x8' || cc == '\xb' || (cc >= '\xe' && cc <= '\x1f') || cc == '\x7f';
661}
662
663// http://dev.w3.org/csswg/css-syntax/#consume-url-token
664CSSParserToken CSSTokenizer::consumeUrlToken()
665{
666 m_input.advanceUntilNonWhitespace();
667
668 // URL tokens without escapes get handled without allocations
669 for (unsigned size = 0; ; size++) {
670 UChar cc = m_input.peekWithoutReplacement(size);
671 if (cc == ')') {
672 unsigned startOffset = m_input.offset();
673 m_input.advance(size + 1);
674 return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, size));
675 }
676 if (cc <= ' ' || cc == '\\' || cc == '"' || cc == '\'' || cc == '(' || cc == '\x7f')
677 break;
678 }
679
680 StringBuilder result;
681 while (true) {
682 UChar cc = consume();
683 if (cc == ')' || cc == kEndOfFileMarker)
684 return CSSParserToken(UrlToken, registerString(result.toString()));
685
686 if (isHTMLSpace(cc)) {
687 m_input.advanceUntilNonWhitespace();
688 if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker)
689 return CSSParserToken(UrlToken, registerString(result.toString()));
690 break;
691 }
692
693 if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc))
694 break;
695
696 if (cc == '\\') {
697 if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) {
698 result.append(consumeEscape());
699 continue;
700 }
701 break;
702 }
703
704 result.append(cc);
705 }
706
707 consumeBadUrlRemnants();
708 return CSSParserToken(BadUrlToken);
709}
710
711// http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url
712void CSSTokenizer::consumeBadUrlRemnants()
713{
714 while (true) {
715 UChar cc = consume();
716 if (cc == ')' || cc == kEndOfFileMarker)
717 return;
718 if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
719 consumeEscape();
720 }
721}
722
723void CSSTokenizer::consumeSingleWhitespaceIfNext()
724{
725 // We check for \r\n and HTML spaces since we don't do preprocessing
726 UChar next = m_input.peekWithoutReplacement(0);
727 if (next == '\r' && m_input.peekWithoutReplacement(1) == '\n')
728 m_input.advance(2);
729 else if (isHTMLSpace(next))
730 m_input.advance();
731}
732
733void CSSTokenizer::consumeUntilCommentEndFound()
734{
735 UChar c = consume();
736 while (true) {
737 if (c == kEndOfFileMarker)
738 return;
739 if (c != '*') {
740 c = consume();
741 continue;
742 }
743 c = consume();
744 if (c == '/')
745 return;
746 }
747}
748
749bool CSSTokenizer::consumeIfNext(UChar character)
750{
751 // Since we're not doing replacement we can't tell the difference from
752 // a NUL in the middle and the kEndOfFileMarker, so character must not be
753 // NUL.
754 ASSERT(character);
755 if (m_input.peekWithoutReplacement(0) == character) {
756 m_input.advance();
757 return true;
758 }
759 return false;
760}
761
762// http://www.w3.org/TR/css3-syntax/#consume-a-name
763StringView CSSTokenizer::consumeName()
764{
765 // Names without escapes get handled without allocations
766 for (unsigned size = 0; ; ++size) {
767 UChar cc = m_input.peekWithoutReplacement(size);
768 if (isNameCodePoint(cc))
769 continue;
770 // peekWithoutReplacement will return NUL when we hit the end of the
771 // input. In that case we want to still use the rangeAt() fast path
772 // below.
773 if (cc == '\0' && m_input.offset() + size < m_input.length())
774 break;
775 if (cc == '\\')
776 break;
777 unsigned startOffset = m_input.offset();
778 m_input.advance(size);
779 return m_input.rangeAt(startOffset, size);
780 }
781
782 StringBuilder result;
783 while (true) {
784 UChar cc = consume();
785 if (isNameCodePoint(cc)) {
786 result.append(cc);
787 continue;
788 }
789 if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) {
790 result.append(consumeEscape());
791 continue;
792 }
793 reconsume(cc);
794 return registerString(result.toString());
795 }
796}
797
798// http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
799UChar32 CSSTokenizer::consumeEscape()
800{
801 UChar cc = consume();
802 ASSERT(!isNewLine(cc));
803 if (isASCIIHexDigit(cc)) {
804 unsigned consumedHexDigits = 1;
805 StringBuilder hexChars;
806 hexChars.append(cc);
807 while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) {
808 cc = consume();
809 hexChars.append(cc);
810 consumedHexDigits++;
811 };
812 consumeSingleWhitespaceIfNext();
813 bool ok = false;
814 UChar32 codePoint = hexChars.toString().toUIntStrict(&ok, 16);
815 ASSERT(ok);
816 if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF)
817 return replacementCharacter;
818 return codePoint;
819 }
820
821 if (cc == kEndOfFileMarker)
822 return replacementCharacter;
823 return cc;
824}
825
826bool CSSTokenizer::nextTwoCharsAreValidEscape()
827{
828 return twoCharsAreValidEscape(m_input.peekWithoutReplacement(0), m_input.peekWithoutReplacement(1));
829}
830
831// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
832bool CSSTokenizer::nextCharsAreNumber(UChar first)
833{
834 UChar second = m_input.peekWithoutReplacement(0);
835 if (isASCIIDigit(first))
836 return true;
837 if (first == '+' || first == '-')
838 return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1))));
839 if (first =='.')
840 return (isASCIIDigit(second));
841 return false;
842}
843
844bool CSSTokenizer::nextCharsAreNumber()
845{
846 UChar first = consume();
847 bool areNumber = nextCharsAreNumber(first);
848 reconsume(first);
849 return areNumber;
850}
851
852// http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier
853bool CSSTokenizer::nextCharsAreIdentifier(UChar first)
854{
855 UChar second = m_input.peekWithoutReplacement(0);
856 if (isNameStartCodePoint(first) || twoCharsAreValidEscape(first, second))
857 return true;
858
859 if (first == '-')
860 return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape();
861
862 return false;
863}
864
865bool CSSTokenizer::nextCharsAreIdentifier()
866{
867 UChar first = consume();
868 bool areIdentifier = nextCharsAreIdentifier(first);
869 reconsume(first);
870 return areIdentifier;
871}
872
873StringView CSSTokenizer::registerString(const String& string)
874{
875 m_stringPool.append(string);
876 return string;
877}
878
879} // namespace WebCore
880