| 1 | // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 | // Copyright (C) 2016 Apple Inc. All rights reserved. |
| 3 | // |
| 4 | // Redistribution and use in source and binary forms, with or without |
| 5 | // modification, are permitted provided that the following conditions are |
| 6 | // met: |
| 7 | // |
| 8 | // * Redistributions of source code must retain the above copyright |
| 9 | // notice, this list of conditions and the following disclaimer. |
| 10 | // * Redistributions in binary form must reproduce the above |
| 11 | // copyright notice, this list of conditions and the following disclaimer |
| 12 | // in the documentation and/or other materials provided with the |
| 13 | // distribution. |
| 14 | // * Neither the name of Google Inc. nor the names of its |
| 15 | // contributors may be used to endorse or promote products derived from |
| 16 | // this software without specific prior written permission. |
| 17 | // |
| 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | |
| 30 | #include "config.h" |
| 31 | #include "CSSTokenizer.h" |
| 32 | |
| 33 | #include "CSSParserIdioms.h" |
| 34 | #include "CSSParserObserverWrapper.h" |
| 35 | #include "CSSParserTokenRange.h" |
| 36 | #include "CSSTokenizerInputStream.h" |
| 37 | #include "HTMLParserIdioms.h" |
| 38 | #include <wtf/text/StringBuilder.h> |
| 39 | #include <wtf/unicode/CharacterNames.h> |
| 40 | |
| 41 | namespace WebCore { |
| 42 | |
| 43 | CSSTokenizer::CSSTokenizer(const String& string) |
| 44 | : m_input(string) |
| 45 | { |
| 46 | // According to the spec, we should perform preprocessing here. |
| 47 | // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing |
| 48 | // |
| 49 | // However, we can skip this step since: |
| 50 | // * We're using HTML spaces (which accept \r and \f as a valid white space) |
| 51 | // * Do not count white spaces |
| 52 | // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement characters |
| 53 | |
| 54 | if (string.isEmpty()) |
| 55 | return; |
| 56 | |
| 57 | // To avoid resizing we err on the side of reserving too much space. |
| 58 | // Most strings we tokenize have about 3.5 to 5 characters per token. |
| 59 | m_tokens.reserveInitialCapacity(string.length() / 3); |
| 60 | |
| 61 | while (true) { |
| 62 | CSSParserToken token = nextToken(); |
| 63 | if (token.type() == CommentToken) |
| 64 | continue; |
| 65 | if (token.type() == EOFToken) |
| 66 | return; |
| 67 | m_tokens.append(token); |
| 68 | } |
| 69 | } |
| 70 | |
| 71 | CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper) |
| 72 | : m_input(string) |
| 73 | { |
| 74 | if (string.isEmpty()) |
| 75 | return; |
| 76 | |
| 77 | unsigned offset = 0; |
| 78 | while (true) { |
| 79 | CSSParserToken token = nextToken(); |
| 80 | if (token.type() == EOFToken) |
| 81 | break; |
| 82 | if (token.type() == CommentToken) |
| 83 | wrapper.addComment(offset, m_input.offset(), m_tokens.size()); |
| 84 | else { |
| 85 | m_tokens.append(token); |
| 86 | wrapper.addToken(offset); |
| 87 | } |
| 88 | offset = m_input.offset(); |
| 89 | } |
| 90 | |
| 91 | wrapper.addToken(offset); |
| 92 | wrapper.finalizeConstruction(m_tokens.begin()); |
| 93 | } |
| 94 | |
| 95 | CSSParserTokenRange CSSTokenizer::tokenRange() const |
| 96 | { |
| 97 | return m_tokens; |
| 98 | } |
| 99 | |
| 100 | unsigned CSSTokenizer::tokenCount() |
| 101 | { |
| 102 | return m_tokens.size(); |
| 103 | } |
| 104 | |
| 105 | static bool isNewLine(UChar cc) |
| 106 | { |
| 107 | // We check \r and \f here, since we have no preprocessing stage |
| 108 | return (cc == '\r' || cc == '\n' || cc == '\f'); |
| 109 | } |
| 110 | |
| 111 | // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape |
| 112 | static bool twoCharsAreValidEscape(UChar first, UChar second) |
| 113 | { |
| 114 | return first == '\\' && !isNewLine(second); |
| 115 | } |
| 116 | |
| 117 | void CSSTokenizer::reconsume(UChar c) |
| 118 | { |
| 119 | m_input.pushBack(c); |
| 120 | } |
| 121 | |
| 122 | UChar CSSTokenizer::consume() |
| 123 | { |
| 124 | UChar current = m_input.nextInputChar(); |
| 125 | m_input.advance(); |
| 126 | return current; |
| 127 | } |
| 128 | |
| 129 | CSSParserToken CSSTokenizer::whiteSpace(UChar /*cc*/) |
| 130 | { |
| 131 | m_input.advanceUntilNonWhitespace(); |
| 132 | return CSSParserToken(WhitespaceToken); |
| 133 | } |
| 134 | |
| 135 | CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type) |
| 136 | { |
| 137 | m_blockStack.append(type); |
| 138 | return CSSParserToken(type, CSSParserToken::BlockStart); |
| 139 | } |
| 140 | |
| 141 | CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name) |
| 142 | { |
| 143 | m_blockStack.append(blockType); |
| 144 | return CSSParserToken(type, name, CSSParserToken::BlockStart); |
| 145 | } |
| 146 | |
| 147 | CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType) |
| 148 | { |
| 149 | if (!m_blockStack.isEmpty() && m_blockStack.last() == startType) { |
| 150 | m_blockStack.removeLast(); |
| 151 | return CSSParserToken(type, CSSParserToken::BlockEnd); |
| 152 | } |
| 153 | return CSSParserToken(type); |
| 154 | } |
| 155 | |
| 156 | CSSParserToken CSSTokenizer::leftParenthesis(UChar /*cc*/) |
| 157 | { |
| 158 | return blockStart(LeftParenthesisToken); |
| 159 | } |
| 160 | |
| 161 | CSSParserToken CSSTokenizer::rightParenthesis(UChar /*cc*/) |
| 162 | { |
| 163 | return blockEnd(RightParenthesisToken, LeftParenthesisToken); |
| 164 | } |
| 165 | |
| 166 | CSSParserToken CSSTokenizer::leftBracket(UChar /*cc*/) |
| 167 | { |
| 168 | return blockStart(LeftBracketToken); |
| 169 | } |
| 170 | |
| 171 | CSSParserToken CSSTokenizer::rightBracket(UChar /*cc*/) |
| 172 | { |
| 173 | return blockEnd(RightBracketToken, LeftBracketToken); |
| 174 | } |
| 175 | |
| 176 | CSSParserToken CSSTokenizer::leftBrace(UChar /*cc*/) |
| 177 | { |
| 178 | return blockStart(LeftBraceToken); |
| 179 | } |
| 180 | |
| 181 | CSSParserToken CSSTokenizer::rightBrace(UChar /*cc*/) |
| 182 | { |
| 183 | return blockEnd(RightBraceToken, LeftBraceToken); |
| 184 | } |
| 185 | |
| 186 | CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc) |
| 187 | { |
| 188 | if (nextCharsAreNumber(cc)) { |
| 189 | reconsume(cc); |
| 190 | return consumeNumericToken(); |
| 191 | } |
| 192 | return CSSParserToken(DelimiterToken, cc); |
| 193 | } |
| 194 | |
| 195 | CSSParserToken CSSTokenizer::asterisk(UChar cc) |
| 196 | { |
| 197 | ASSERT_UNUSED(cc, cc == '*'); |
| 198 | if (consumeIfNext('=')) |
| 199 | return CSSParserToken(SubstringMatchToken); |
| 200 | return CSSParserToken(DelimiterToken, '*'); |
| 201 | } |
| 202 | |
| 203 | CSSParserToken CSSTokenizer::lessThan(UChar cc) |
| 204 | { |
| 205 | ASSERT_UNUSED(cc, cc == '<'); |
| 206 | if (m_input.peekWithoutReplacement(0) == '!' |
| 207 | && m_input.peekWithoutReplacement(1) == '-' |
| 208 | && m_input.peekWithoutReplacement(2) == '-') { |
| 209 | m_input.advance(3); |
| 210 | return CSSParserToken(CDOToken); |
| 211 | } |
| 212 | return CSSParserToken(DelimiterToken, '<'); |
| 213 | } |
| 214 | |
| 215 | CSSParserToken CSSTokenizer::comma(UChar /*cc*/) |
| 216 | { |
| 217 | return CSSParserToken(CommaToken); |
| 218 | } |
| 219 | |
| 220 | CSSParserToken CSSTokenizer::hyphenMinus(UChar cc) |
| 221 | { |
| 222 | if (nextCharsAreNumber(cc)) { |
| 223 | reconsume(cc); |
| 224 | return consumeNumericToken(); |
| 225 | } |
| 226 | if (m_input.peekWithoutReplacement(0) == '-' |
| 227 | && m_input.peekWithoutReplacement(1) == '>') { |
| 228 | m_input.advance(2); |
| 229 | return CSSParserToken(CDCToken); |
| 230 | } |
| 231 | if (nextCharsAreIdentifier(cc)) { |
| 232 | reconsume(cc); |
| 233 | return consumeIdentLikeToken(); |
| 234 | } |
| 235 | return CSSParserToken(DelimiterToken, cc); |
| 236 | } |
| 237 | |
| 238 | CSSParserToken CSSTokenizer::solidus(UChar cc) |
| 239 | { |
| 240 | if (consumeIfNext('*')) { |
| 241 | // These get ignored, but we need a value to return. |
| 242 | consumeUntilCommentEndFound(); |
| 243 | return CSSParserToken(CommentToken); |
| 244 | } |
| 245 | |
| 246 | return CSSParserToken(DelimiterToken, cc); |
| 247 | } |
| 248 | |
| 249 | CSSParserToken CSSTokenizer::colon(UChar /*cc*/) |
| 250 | { |
| 251 | return CSSParserToken(ColonToken); |
| 252 | } |
| 253 | |
| 254 | CSSParserToken CSSTokenizer::semiColon(UChar /*cc*/) |
| 255 | { |
| 256 | return CSSParserToken(SemicolonToken); |
| 257 | } |
| 258 | |
| 259 | CSSParserToken CSSTokenizer::hash(UChar cc) |
| 260 | { |
| 261 | UChar nextChar = m_input.peekWithoutReplacement(0); |
| 262 | if (isNameCodePoint(nextChar) || twoCharsAreValidEscape(nextChar, m_input.peekWithoutReplacement(1))) { |
| 263 | HashTokenType type = nextCharsAreIdentifier() ? HashTokenId : HashTokenUnrestricted; |
| 264 | return CSSParserToken(type, consumeName()); |
| 265 | } |
| 266 | |
| 267 | return CSSParserToken(DelimiterToken, cc); |
| 268 | } |
| 269 | |
| 270 | CSSParserToken CSSTokenizer::circumflexAccent(UChar cc) |
| 271 | { |
| 272 | ASSERT_UNUSED(cc, cc == '^'); |
| 273 | if (consumeIfNext('=')) |
| 274 | return CSSParserToken(PrefixMatchToken); |
| 275 | return CSSParserToken(DelimiterToken, '^'); |
| 276 | } |
| 277 | |
| 278 | CSSParserToken CSSTokenizer::dollarSign(UChar cc) |
| 279 | { |
| 280 | ASSERT_UNUSED(cc, cc == '$'); |
| 281 | if (consumeIfNext('=')) |
| 282 | return CSSParserToken(SuffixMatchToken); |
| 283 | return CSSParserToken(DelimiterToken, '$'); |
| 284 | } |
| 285 | |
| 286 | CSSParserToken CSSTokenizer::verticalLine(UChar cc) |
| 287 | { |
| 288 | ASSERT_UNUSED(cc, cc == '|'); |
| 289 | if (consumeIfNext('=')) |
| 290 | return CSSParserToken(DashMatchToken); |
| 291 | if (consumeIfNext('|')) |
| 292 | return CSSParserToken(ColumnToken); |
| 293 | return CSSParserToken(DelimiterToken, '|'); |
| 294 | } |
| 295 | |
| 296 | CSSParserToken CSSTokenizer::tilde(UChar cc) |
| 297 | { |
| 298 | ASSERT_UNUSED(cc, cc == '~'); |
| 299 | if (consumeIfNext('=')) |
| 300 | return CSSParserToken(IncludeMatchToken); |
| 301 | return CSSParserToken(DelimiterToken, '~'); |
| 302 | } |
| 303 | |
| 304 | CSSParserToken CSSTokenizer::commercialAt(UChar cc) |
| 305 | { |
| 306 | ASSERT_UNUSED(cc, cc == '@'); |
| 307 | if (nextCharsAreIdentifier()) |
| 308 | return CSSParserToken(AtKeywordToken, consumeName()); |
| 309 | return CSSParserToken(DelimiterToken, '@'); |
| 310 | } |
| 311 | |
| 312 | CSSParserToken CSSTokenizer::reverseSolidus(UChar cc) |
| 313 | { |
| 314 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { |
| 315 | reconsume(cc); |
| 316 | return consumeIdentLikeToken(); |
| 317 | } |
| 318 | return CSSParserToken(DelimiterToken, cc); |
| 319 | } |
| 320 | |
| 321 | CSSParserToken CSSTokenizer::asciiDigit(UChar cc) |
| 322 | { |
| 323 | reconsume(cc); |
| 324 | return consumeNumericToken(); |
| 325 | } |
| 326 | |
| 327 | CSSParserToken CSSTokenizer::letterU(UChar cc) |
| 328 | { |
| 329 | if (m_input.peekWithoutReplacement(0) == '+' |
| 330 | && (isASCIIHexDigit(m_input.peekWithoutReplacement(1)) |
| 331 | || m_input.peekWithoutReplacement(1) == '?')) { |
| 332 | m_input.advance(); |
| 333 | return consumeUnicodeRange(); |
| 334 | } |
| 335 | reconsume(cc); |
| 336 | return consumeIdentLikeToken(); |
| 337 | } |
| 338 | |
| 339 | CSSParserToken CSSTokenizer::nameStart(UChar cc) |
| 340 | { |
| 341 | reconsume(cc); |
| 342 | return consumeIdentLikeToken(); |
| 343 | } |
| 344 | |
| 345 | CSSParserToken CSSTokenizer::stringStart(UChar cc) |
| 346 | { |
| 347 | return consumeStringTokenUntil(cc); |
| 348 | } |
| 349 | |
| 350 | CSSParserToken CSSTokenizer::endOfFile(UChar /*cc*/) |
| 351 | { |
| 352 | return CSSParserToken(EOFToken); |
| 353 | } |
| 354 | |
| 355 | const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = { |
| 356 | &CSSTokenizer::endOfFile, |
| 357 | 0, |
| 358 | 0, |
| 359 | 0, |
| 360 | 0, |
| 361 | 0, |
| 362 | 0, |
| 363 | 0, |
| 364 | 0, |
| 365 | &CSSTokenizer::whiteSpace, |
| 366 | &CSSTokenizer::whiteSpace, |
| 367 | 0, |
| 368 | &CSSTokenizer::whiteSpace, |
| 369 | &CSSTokenizer::whiteSpace, |
| 370 | 0, |
| 371 | 0, |
| 372 | 0, |
| 373 | 0, |
| 374 | 0, |
| 375 | 0, |
| 376 | 0, |
| 377 | 0, |
| 378 | 0, |
| 379 | 0, |
| 380 | 0, |
| 381 | 0, |
| 382 | 0, |
| 383 | 0, |
| 384 | 0, |
| 385 | 0, |
| 386 | 0, |
| 387 | 0, |
| 388 | &CSSTokenizer::whiteSpace, |
| 389 | 0, |
| 390 | &CSSTokenizer::stringStart, |
| 391 | &CSSTokenizer::hash, |
| 392 | &CSSTokenizer::dollarSign, |
| 393 | 0, |
| 394 | 0, |
| 395 | &CSSTokenizer::stringStart, |
| 396 | &CSSTokenizer::leftParenthesis, |
| 397 | &CSSTokenizer::rightParenthesis, |
| 398 | &CSSTokenizer::asterisk, |
| 399 | &CSSTokenizer::plusOrFullStop, |
| 400 | &CSSTokenizer::comma, |
| 401 | &CSSTokenizer::hyphenMinus, |
| 402 | &CSSTokenizer::plusOrFullStop, |
| 403 | &CSSTokenizer::solidus, |
| 404 | &CSSTokenizer::asciiDigit, |
| 405 | &CSSTokenizer::asciiDigit, |
| 406 | &CSSTokenizer::asciiDigit, |
| 407 | &CSSTokenizer::asciiDigit, |
| 408 | &CSSTokenizer::asciiDigit, |
| 409 | &CSSTokenizer::asciiDigit, |
| 410 | &CSSTokenizer::asciiDigit, |
| 411 | &CSSTokenizer::asciiDigit, |
| 412 | &CSSTokenizer::asciiDigit, |
| 413 | &CSSTokenizer::asciiDigit, |
| 414 | &CSSTokenizer::colon, |
| 415 | &CSSTokenizer::semiColon, |
| 416 | &CSSTokenizer::lessThan, |
| 417 | 0, |
| 418 | 0, |
| 419 | 0, |
| 420 | &CSSTokenizer::commercialAt, |
| 421 | &CSSTokenizer::nameStart, |
| 422 | &CSSTokenizer::nameStart, |
| 423 | &CSSTokenizer::nameStart, |
| 424 | &CSSTokenizer::nameStart, |
| 425 | &CSSTokenizer::nameStart, |
| 426 | &CSSTokenizer::nameStart, |
| 427 | &CSSTokenizer::nameStart, |
| 428 | &CSSTokenizer::nameStart, |
| 429 | &CSSTokenizer::nameStart, |
| 430 | &CSSTokenizer::nameStart, |
| 431 | &CSSTokenizer::nameStart, |
| 432 | &CSSTokenizer::nameStart, |
| 433 | &CSSTokenizer::nameStart, |
| 434 | &CSSTokenizer::nameStart, |
| 435 | &CSSTokenizer::nameStart, |
| 436 | &CSSTokenizer::nameStart, |
| 437 | &CSSTokenizer::nameStart, |
| 438 | &CSSTokenizer::nameStart, |
| 439 | &CSSTokenizer::nameStart, |
| 440 | &CSSTokenizer::nameStart, |
| 441 | &CSSTokenizer::letterU, |
| 442 | &CSSTokenizer::nameStart, |
| 443 | &CSSTokenizer::nameStart, |
| 444 | &CSSTokenizer::nameStart, |
| 445 | &CSSTokenizer::nameStart, |
| 446 | &CSSTokenizer::nameStart, |
| 447 | &CSSTokenizer::leftBracket, |
| 448 | &CSSTokenizer::reverseSolidus, |
| 449 | &CSSTokenizer::rightBracket, |
| 450 | &CSSTokenizer::circumflexAccent, |
| 451 | &CSSTokenizer::nameStart, |
| 452 | 0, |
| 453 | &CSSTokenizer::nameStart, |
| 454 | &CSSTokenizer::nameStart, |
| 455 | &CSSTokenizer::nameStart, |
| 456 | &CSSTokenizer::nameStart, |
| 457 | &CSSTokenizer::nameStart, |
| 458 | &CSSTokenizer::nameStart, |
| 459 | &CSSTokenizer::nameStart, |
| 460 | &CSSTokenizer::nameStart, |
| 461 | &CSSTokenizer::nameStart, |
| 462 | &CSSTokenizer::nameStart, |
| 463 | &CSSTokenizer::nameStart, |
| 464 | &CSSTokenizer::nameStart, |
| 465 | &CSSTokenizer::nameStart, |
| 466 | &CSSTokenizer::nameStart, |
| 467 | &CSSTokenizer::nameStart, |
| 468 | &CSSTokenizer::nameStart, |
| 469 | &CSSTokenizer::nameStart, |
| 470 | &CSSTokenizer::nameStart, |
| 471 | &CSSTokenizer::nameStart, |
| 472 | &CSSTokenizer::nameStart, |
| 473 | &CSSTokenizer::letterU, |
| 474 | &CSSTokenizer::nameStart, |
| 475 | &CSSTokenizer::nameStart, |
| 476 | &CSSTokenizer::nameStart, |
| 477 | &CSSTokenizer::nameStart, |
| 478 | &CSSTokenizer::nameStart, |
| 479 | &CSSTokenizer::leftBrace, |
| 480 | &CSSTokenizer::verticalLine, |
| 481 | &CSSTokenizer::rightBrace, |
| 482 | &CSSTokenizer::tilde, |
| 483 | 0, |
| 484 | }; |
| 485 | #if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED |
| 486 | const unsigned codePointsNumber = 128; |
| 487 | #endif |
| 488 | |
| 489 | CSSParserToken CSSTokenizer::nextToken() |
| 490 | { |
| 491 | // Unlike the HTMLTokenizer, the CSS Syntax spec is written |
| 492 | // as a stateless, (fixed-size) look-ahead tokenizer. |
| 493 | // We could move to the stateful model and instead create |
| 494 | // states for all the "next 3 codepoints are X" cases. |
| 495 | // State-machine tokenizers are easier to write to handle |
| 496 | // incremental tokenization of partial sources. |
| 497 | // However, for now we follow the spec exactly. |
| 498 | UChar cc = consume(); |
| 499 | CodePoint codePointFunc = 0; |
| 500 | |
| 501 | if (isASCII(cc)) { |
| 502 | ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber); |
| 503 | codePointFunc = codePoints[cc]; |
| 504 | } else |
| 505 | codePointFunc = &CSSTokenizer::nameStart; |
| 506 | |
| 507 | if (codePointFunc) |
| 508 | return ((this)->*(codePointFunc))(cc); |
| 509 | return CSSParserToken(DelimiterToken, cc); |
| 510 | } |
| 511 | |
| 512 | // This method merges the following spec sections for efficiency |
| 513 | // http://www.w3.org/TR/css3-syntax/#consume-a-number |
| 514 | // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number |
| 515 | CSSParserToken CSSTokenizer::() |
| 516 | { |
| 517 | ASSERT(nextCharsAreNumber()); |
| 518 | |
| 519 | NumericValueType type = IntegerValueType; |
| 520 | NumericSign sign = NoSign; |
| 521 | unsigned numberLength = 0; |
| 522 | |
| 523 | UChar next = m_input.peekWithoutReplacement(0); |
| 524 | if (next == '+') { |
| 525 | ++numberLength; |
| 526 | sign = PlusSign; |
| 527 | } else if (next == '-') { |
| 528 | ++numberLength; |
| 529 | sign = MinusSign; |
| 530 | } |
| 531 | |
| 532 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength); |
| 533 | next = m_input.peekWithoutReplacement(numberLength); |
| 534 | if (next == '.' && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 1))) { |
| 535 | type = NumberValueType; |
| 536 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 2); |
| 537 | next = m_input.peekWithoutReplacement(numberLength); |
| 538 | } |
| 539 | |
| 540 | if (next == 'E' || next == 'e') { |
| 541 | next = m_input.peekWithoutReplacement(numberLength + 1); |
| 542 | if (isASCIIDigit(next)) { |
| 543 | type = NumberValueType; |
| 544 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 1); |
| 545 | } else if ((next == '+' || next == '-') && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 2))) { |
| 546 | type = NumberValueType; |
| 547 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 3); |
| 548 | } |
| 549 | } |
| 550 | |
| 551 | double value = m_input.getDouble(0, numberLength); |
| 552 | m_input.advance(numberLength); |
| 553 | |
| 554 | return CSSParserToken(NumberToken, value, type, sign); |
| 555 | } |
| 556 | |
| 557 | // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token |
| 558 | CSSParserToken CSSTokenizer::() |
| 559 | { |
| 560 | CSSParserToken token = consumeNumber(); |
| 561 | if (nextCharsAreIdentifier()) |
| 562 | token.convertToDimensionWithUnit(consumeName()); |
| 563 | else if (consumeIfNext('%')) |
| 564 | token.convertToPercentage(); |
| 565 | return token; |
| 566 | } |
| 567 | |
| 568 | // http://dev.w3.org/csswg/css-syntax/#consume-ident-like-token |
| 569 | CSSParserToken CSSTokenizer::consumeIdentLikeToken() |
| 570 | { |
| 571 | StringView name = consumeName(); |
| 572 | if (consumeIfNext('(')) { |
| 573 | if (equalIgnoringASCIICase(name, "url" )) { |
| 574 | // The spec is slightly different so as to avoid dropping whitespace |
| 575 | // tokens, but they wouldn't be used and this is easier. |
| 576 | m_input.advanceUntilNonWhitespace(); |
| 577 | UChar next = m_input.peekWithoutReplacement(0); |
| 578 | if (next != '"' && next != '\'') |
| 579 | return consumeUrlToken(); |
| 580 | } |
| 581 | return blockStart(LeftParenthesisToken, FunctionToken, name); |
| 582 | } |
| 583 | return CSSParserToken(IdentToken, name); |
| 584 | } |
| 585 | |
| 586 | // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token |
| 587 | CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) |
| 588 | { |
| 589 | // Strings without escapes get handled without allocations |
| 590 | for (unsigned size = 0; ; size++) { |
| 591 | UChar cc = m_input.peekWithoutReplacement(size); |
| 592 | if (cc == endingCodePoint) { |
| 593 | unsigned startOffset = m_input.offset(); |
| 594 | m_input.advance(size + 1); |
| 595 | return CSSParserToken(StringToken, m_input.rangeAt(startOffset, size)); |
| 596 | } |
| 597 | if (isNewLine(cc)) { |
| 598 | m_input.advance(size); |
| 599 | return CSSParserToken(BadStringToken); |
| 600 | } |
| 601 | if (cc == '\0' || cc == '\\') |
| 602 | break; |
| 603 | } |
| 604 | |
| 605 | StringBuilder output; |
| 606 | while (true) { |
| 607 | UChar cc = consume(); |
| 608 | if (cc == endingCodePoint || cc == kEndOfFileMarker) |
| 609 | return CSSParserToken(StringToken, registerString(output.toString())); |
| 610 | if (isNewLine(cc)) { |
| 611 | reconsume(cc); |
| 612 | return CSSParserToken(BadStringToken); |
| 613 | } |
| 614 | if (cc == '\\') { |
| 615 | if (m_input.nextInputChar() == kEndOfFileMarker) |
| 616 | continue; |
| 617 | if (isNewLine(m_input.peekWithoutReplacement(0))) |
| 618 | consumeSingleWhitespaceIfNext(); // This handles \r\n for us |
| 619 | else |
| 620 | output.append(consumeEscape()); |
| 621 | } else |
| 622 | output.append(cc); |
| 623 | } |
| 624 | } |
| 625 | |
| 626 | CSSParserToken CSSTokenizer::consumeUnicodeRange() |
| 627 | { |
| 628 | ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?'); |
| 629 | int lengthRemaining = 6; |
| 630 | UChar32 start = 0; |
| 631 | |
| 632 | while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) { |
| 633 | start = start * 16 + toASCIIHexValue(consume()); |
| 634 | --lengthRemaining; |
| 635 | } |
| 636 | |
| 637 | UChar32 end = start; |
| 638 | if (lengthRemaining && consumeIfNext('?')) { |
| 639 | do { |
| 640 | start *= 16; |
| 641 | end = end * 16 + 0xF; |
| 642 | --lengthRemaining; |
| 643 | } while (lengthRemaining && consumeIfNext('?')); |
| 644 | } else if (m_input.peekWithoutReplacement(0) == '-' && isASCIIHexDigit(m_input.peekWithoutReplacement(1))) { |
| 645 | m_input.advance(); |
| 646 | lengthRemaining = 6; |
| 647 | end = 0; |
| 648 | do { |
| 649 | end = end * 16 + toASCIIHexValue(consume()); |
| 650 | --lengthRemaining; |
| 651 | } while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))); |
| 652 | } |
| 653 | |
| 654 | return CSSParserToken(UnicodeRangeToken, start, end); |
| 655 | } |
| 656 | |
| 657 | // http://dev.w3.org/csswg/css-syntax/#non-printable-code-point |
| 658 | static bool isNonPrintableCodePoint(UChar cc) |
| 659 | { |
| 660 | return cc <= '\x8' || cc == '\xb' || (cc >= '\xe' && cc <= '\x1f') || cc == '\x7f'; |
| 661 | } |
| 662 | |
| 663 | // http://dev.w3.org/csswg/css-syntax/#consume-url-token |
| 664 | CSSParserToken CSSTokenizer::consumeUrlToken() |
| 665 | { |
| 666 | m_input.advanceUntilNonWhitespace(); |
| 667 | |
| 668 | // URL tokens without escapes get handled without allocations |
| 669 | for (unsigned size = 0; ; size++) { |
| 670 | UChar cc = m_input.peekWithoutReplacement(size); |
| 671 | if (cc == ')') { |
| 672 | unsigned startOffset = m_input.offset(); |
| 673 | m_input.advance(size + 1); |
| 674 | return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, size)); |
| 675 | } |
| 676 | if (cc <= ' ' || cc == '\\' || cc == '"' || cc == '\'' || cc == '(' || cc == '\x7f') |
| 677 | break; |
| 678 | } |
| 679 | |
| 680 | StringBuilder result; |
| 681 | while (true) { |
| 682 | UChar cc = consume(); |
| 683 | if (cc == ')' || cc == kEndOfFileMarker) |
| 684 | return CSSParserToken(UrlToken, registerString(result.toString())); |
| 685 | |
| 686 | if (isHTMLSpace(cc)) { |
| 687 | m_input.advanceUntilNonWhitespace(); |
| 688 | if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker) |
| 689 | return CSSParserToken(UrlToken, registerString(result.toString())); |
| 690 | break; |
| 691 | } |
| 692 | |
| 693 | if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc)) |
| 694 | break; |
| 695 | |
| 696 | if (cc == '\\') { |
| 697 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { |
| 698 | result.append(consumeEscape()); |
| 699 | continue; |
| 700 | } |
| 701 | break; |
| 702 | } |
| 703 | |
| 704 | result.append(cc); |
| 705 | } |
| 706 | |
| 707 | consumeBadUrlRemnants(); |
| 708 | return CSSParserToken(BadUrlToken); |
| 709 | } |
| 710 | |
| 711 | // http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url |
| 712 | void CSSTokenizer::consumeBadUrlRemnants() |
| 713 | { |
| 714 | while (true) { |
| 715 | UChar cc = consume(); |
| 716 | if (cc == ')' || cc == kEndOfFileMarker) |
| 717 | return; |
| 718 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) |
| 719 | consumeEscape(); |
| 720 | } |
| 721 | } |
| 722 | |
| 723 | void CSSTokenizer::consumeSingleWhitespaceIfNext() |
| 724 | { |
| 725 | // We check for \r\n and HTML spaces since we don't do preprocessing |
| 726 | UChar next = m_input.peekWithoutReplacement(0); |
| 727 | if (next == '\r' && m_input.peekWithoutReplacement(1) == '\n') |
| 728 | m_input.advance(2); |
| 729 | else if (isHTMLSpace(next)) |
| 730 | m_input.advance(); |
| 731 | } |
| 732 | |
| 733 | void CSSTokenizer::() |
| 734 | { |
| 735 | UChar c = consume(); |
| 736 | while (true) { |
| 737 | if (c == kEndOfFileMarker) |
| 738 | return; |
| 739 | if (c != '*') { |
| 740 | c = consume(); |
| 741 | continue; |
| 742 | } |
| 743 | c = consume(); |
| 744 | if (c == '/') |
| 745 | return; |
| 746 | } |
| 747 | } |
| 748 | |
| 749 | bool CSSTokenizer::consumeIfNext(UChar character) |
| 750 | { |
| 751 | // Since we're not doing replacement we can't tell the difference from |
| 752 | // a NUL in the middle and the kEndOfFileMarker, so character must not be |
| 753 | // NUL. |
| 754 | ASSERT(character); |
| 755 | if (m_input.peekWithoutReplacement(0) == character) { |
| 756 | m_input.advance(); |
| 757 | return true; |
| 758 | } |
| 759 | return false; |
| 760 | } |
| 761 | |
| 762 | // http://www.w3.org/TR/css3-syntax/#consume-a-name |
| 763 | StringView CSSTokenizer::consumeName() |
| 764 | { |
| 765 | // Names without escapes get handled without allocations |
| 766 | for (unsigned size = 0; ; ++size) { |
| 767 | UChar cc = m_input.peekWithoutReplacement(size); |
| 768 | if (isNameCodePoint(cc)) |
| 769 | continue; |
| 770 | // peekWithoutReplacement will return NUL when we hit the end of the |
| 771 | // input. In that case we want to still use the rangeAt() fast path |
| 772 | // below. |
| 773 | if (cc == '\0' && m_input.offset() + size < m_input.length()) |
| 774 | break; |
| 775 | if (cc == '\\') |
| 776 | break; |
| 777 | unsigned startOffset = m_input.offset(); |
| 778 | m_input.advance(size); |
| 779 | return m_input.rangeAt(startOffset, size); |
| 780 | } |
| 781 | |
| 782 | StringBuilder result; |
| 783 | while (true) { |
| 784 | UChar cc = consume(); |
| 785 | if (isNameCodePoint(cc)) { |
| 786 | result.append(cc); |
| 787 | continue; |
| 788 | } |
| 789 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { |
| 790 | result.append(consumeEscape()); |
| 791 | continue; |
| 792 | } |
| 793 | reconsume(cc); |
| 794 | return registerString(result.toString()); |
| 795 | } |
| 796 | } |
| 797 | |
| 798 | // http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point |
| 799 | UChar32 CSSTokenizer::consumeEscape() |
| 800 | { |
| 801 | UChar cc = consume(); |
| 802 | ASSERT(!isNewLine(cc)); |
| 803 | if (isASCIIHexDigit(cc)) { |
| 804 | unsigned consumedHexDigits = 1; |
| 805 | StringBuilder hexChars; |
| 806 | hexChars.append(cc); |
| 807 | while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) { |
| 808 | cc = consume(); |
| 809 | hexChars.append(cc); |
| 810 | consumedHexDigits++; |
| 811 | }; |
| 812 | consumeSingleWhitespaceIfNext(); |
| 813 | bool ok = false; |
| 814 | UChar32 codePoint = hexChars.toString().toUIntStrict(&ok, 16); |
| 815 | ASSERT(ok); |
| 816 | if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF) |
| 817 | return replacementCharacter; |
| 818 | return codePoint; |
| 819 | } |
| 820 | |
| 821 | if (cc == kEndOfFileMarker) |
| 822 | return replacementCharacter; |
| 823 | return cc; |
| 824 | } |
| 825 | |
| 826 | bool CSSTokenizer::nextTwoCharsAreValidEscape() |
| 827 | { |
| 828 | return twoCharsAreValidEscape(m_input.peekWithoutReplacement(0), m_input.peekWithoutReplacement(1)); |
| 829 | } |
| 830 | |
| 831 | // http://www.w3.org/TR/css3-syntax/#starts-with-a-number |
| 832 | bool CSSTokenizer::nextCharsAreNumber(UChar first) |
| 833 | { |
| 834 | UChar second = m_input.peekWithoutReplacement(0); |
| 835 | if (isASCIIDigit(first)) |
| 836 | return true; |
| 837 | if (first == '+' || first == '-') |
| 838 | return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1)))); |
| 839 | if (first =='.') |
| 840 | return (isASCIIDigit(second)); |
| 841 | return false; |
| 842 | } |
| 843 | |
| 844 | bool CSSTokenizer::nextCharsAreNumber() |
| 845 | { |
| 846 | UChar first = consume(); |
| 847 | bool areNumber = nextCharsAreNumber(first); |
| 848 | reconsume(first); |
| 849 | return areNumber; |
| 850 | } |
| 851 | |
| 852 | // http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier |
| 853 | bool CSSTokenizer::nextCharsAreIdentifier(UChar first) |
| 854 | { |
| 855 | UChar second = m_input.peekWithoutReplacement(0); |
| 856 | if (isNameStartCodePoint(first) || twoCharsAreValidEscape(first, second)) |
| 857 | return true; |
| 858 | |
| 859 | if (first == '-') |
| 860 | return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape(); |
| 861 | |
| 862 | return false; |
| 863 | } |
| 864 | |
| 865 | bool CSSTokenizer::nextCharsAreIdentifier() |
| 866 | { |
| 867 | UChar first = consume(); |
| 868 | bool areIdentifier = nextCharsAreIdentifier(first); |
| 869 | reconsume(first); |
| 870 | return areIdentifier; |
| 871 | } |
| 872 | |
| 873 | StringView CSSTokenizer::registerString(const String& string) |
| 874 | { |
| 875 | m_stringPool.append(string); |
| 876 | return string; |
| 877 | } |
| 878 | |
| 879 | } // namespace WebCore |
| 880 | |