1 | // Copyright 2015 The Chromium Authors. All rights reserved. |
2 | // Copyright (C) 2016 Apple Inc. All rights reserved. |
3 | // |
4 | // Redistribution and use in source and binary forms, with or without |
5 | // modification, are permitted provided that the following conditions are |
6 | // met: |
7 | // |
8 | // * Redistributions of source code must retain the above copyright |
9 | // notice, this list of conditions and the following disclaimer. |
10 | // * Redistributions in binary form must reproduce the above |
11 | // copyright notice, this list of conditions and the following disclaimer |
12 | // in the documentation and/or other materials provided with the |
13 | // distribution. |
14 | // * Neither the name of Google Inc. nor the names of its |
15 | // contributors may be used to endorse or promote products derived from |
16 | // this software without specific prior written permission. |
17 | // |
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | |
30 | #include "config.h" |
31 | #include "CSSTokenizer.h" |
32 | |
33 | #include "CSSParserIdioms.h" |
34 | #include "CSSParserObserverWrapper.h" |
35 | #include "CSSParserTokenRange.h" |
36 | #include "CSSTokenizerInputStream.h" |
37 | #include "HTMLParserIdioms.h" |
38 | #include <wtf/text/StringBuilder.h> |
39 | #include <wtf/unicode/CharacterNames.h> |
40 | |
41 | namespace WebCore { |
42 | |
43 | CSSTokenizer::CSSTokenizer(const String& string) |
44 | : m_input(string) |
45 | { |
46 | // According to the spec, we should perform preprocessing here. |
47 | // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing |
48 | // |
49 | // However, we can skip this step since: |
50 | // * We're using HTML spaces (which accept \r and \f as a valid white space) |
51 | // * Do not count white spaces |
52 | // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement characters |
53 | |
54 | if (string.isEmpty()) |
55 | return; |
56 | |
57 | // To avoid resizing we err on the side of reserving too much space. |
58 | // Most strings we tokenize have about 3.5 to 5 characters per token. |
59 | m_tokens.reserveInitialCapacity(string.length() / 3); |
60 | |
61 | while (true) { |
62 | CSSParserToken token = nextToken(); |
63 | if (token.type() == CommentToken) |
64 | continue; |
65 | if (token.type() == EOFToken) |
66 | return; |
67 | m_tokens.append(token); |
68 | } |
69 | } |
70 | |
71 | CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper) |
72 | : m_input(string) |
73 | { |
74 | if (string.isEmpty()) |
75 | return; |
76 | |
77 | unsigned offset = 0; |
78 | while (true) { |
79 | CSSParserToken token = nextToken(); |
80 | if (token.type() == EOFToken) |
81 | break; |
82 | if (token.type() == CommentToken) |
83 | wrapper.addComment(offset, m_input.offset(), m_tokens.size()); |
84 | else { |
85 | m_tokens.append(token); |
86 | wrapper.addToken(offset); |
87 | } |
88 | offset = m_input.offset(); |
89 | } |
90 | |
91 | wrapper.addToken(offset); |
92 | wrapper.finalizeConstruction(m_tokens.begin()); |
93 | } |
94 | |
95 | CSSParserTokenRange CSSTokenizer::tokenRange() const |
96 | { |
97 | return m_tokens; |
98 | } |
99 | |
100 | unsigned CSSTokenizer::tokenCount() |
101 | { |
102 | return m_tokens.size(); |
103 | } |
104 | |
105 | static bool isNewLine(UChar cc) |
106 | { |
107 | // We check \r and \f here, since we have no preprocessing stage |
108 | return (cc == '\r' || cc == '\n' || cc == '\f'); |
109 | } |
110 | |
111 | // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape |
112 | static bool twoCharsAreValidEscape(UChar first, UChar second) |
113 | { |
114 | return first == '\\' && !isNewLine(second); |
115 | } |
116 | |
117 | void CSSTokenizer::reconsume(UChar c) |
118 | { |
119 | m_input.pushBack(c); |
120 | } |
121 | |
122 | UChar CSSTokenizer::consume() |
123 | { |
124 | UChar current = m_input.nextInputChar(); |
125 | m_input.advance(); |
126 | return current; |
127 | } |
128 | |
129 | CSSParserToken CSSTokenizer::whiteSpace(UChar /*cc*/) |
130 | { |
131 | m_input.advanceUntilNonWhitespace(); |
132 | return CSSParserToken(WhitespaceToken); |
133 | } |
134 | |
135 | CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type) |
136 | { |
137 | m_blockStack.append(type); |
138 | return CSSParserToken(type, CSSParserToken::BlockStart); |
139 | } |
140 | |
141 | CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name) |
142 | { |
143 | m_blockStack.append(blockType); |
144 | return CSSParserToken(type, name, CSSParserToken::BlockStart); |
145 | } |
146 | |
147 | CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType) |
148 | { |
149 | if (!m_blockStack.isEmpty() && m_blockStack.last() == startType) { |
150 | m_blockStack.removeLast(); |
151 | return CSSParserToken(type, CSSParserToken::BlockEnd); |
152 | } |
153 | return CSSParserToken(type); |
154 | } |
155 | |
156 | CSSParserToken CSSTokenizer::leftParenthesis(UChar /*cc*/) |
157 | { |
158 | return blockStart(LeftParenthesisToken); |
159 | } |
160 | |
161 | CSSParserToken CSSTokenizer::rightParenthesis(UChar /*cc*/) |
162 | { |
163 | return blockEnd(RightParenthesisToken, LeftParenthesisToken); |
164 | } |
165 | |
166 | CSSParserToken CSSTokenizer::leftBracket(UChar /*cc*/) |
167 | { |
168 | return blockStart(LeftBracketToken); |
169 | } |
170 | |
171 | CSSParserToken CSSTokenizer::rightBracket(UChar /*cc*/) |
172 | { |
173 | return blockEnd(RightBracketToken, LeftBracketToken); |
174 | } |
175 | |
176 | CSSParserToken CSSTokenizer::leftBrace(UChar /*cc*/) |
177 | { |
178 | return blockStart(LeftBraceToken); |
179 | } |
180 | |
181 | CSSParserToken CSSTokenizer::rightBrace(UChar /*cc*/) |
182 | { |
183 | return blockEnd(RightBraceToken, LeftBraceToken); |
184 | } |
185 | |
186 | CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc) |
187 | { |
188 | if (nextCharsAreNumber(cc)) { |
189 | reconsume(cc); |
190 | return consumeNumericToken(); |
191 | } |
192 | return CSSParserToken(DelimiterToken, cc); |
193 | } |
194 | |
195 | CSSParserToken CSSTokenizer::asterisk(UChar cc) |
196 | { |
197 | ASSERT_UNUSED(cc, cc == '*'); |
198 | if (consumeIfNext('=')) |
199 | return CSSParserToken(SubstringMatchToken); |
200 | return CSSParserToken(DelimiterToken, '*'); |
201 | } |
202 | |
203 | CSSParserToken CSSTokenizer::lessThan(UChar cc) |
204 | { |
205 | ASSERT_UNUSED(cc, cc == '<'); |
206 | if (m_input.peekWithoutReplacement(0) == '!' |
207 | && m_input.peekWithoutReplacement(1) == '-' |
208 | && m_input.peekWithoutReplacement(2) == '-') { |
209 | m_input.advance(3); |
210 | return CSSParserToken(CDOToken); |
211 | } |
212 | return CSSParserToken(DelimiterToken, '<'); |
213 | } |
214 | |
215 | CSSParserToken CSSTokenizer::comma(UChar /*cc*/) |
216 | { |
217 | return CSSParserToken(CommaToken); |
218 | } |
219 | |
220 | CSSParserToken CSSTokenizer::hyphenMinus(UChar cc) |
221 | { |
222 | if (nextCharsAreNumber(cc)) { |
223 | reconsume(cc); |
224 | return consumeNumericToken(); |
225 | } |
226 | if (m_input.peekWithoutReplacement(0) == '-' |
227 | && m_input.peekWithoutReplacement(1) == '>') { |
228 | m_input.advance(2); |
229 | return CSSParserToken(CDCToken); |
230 | } |
231 | if (nextCharsAreIdentifier(cc)) { |
232 | reconsume(cc); |
233 | return consumeIdentLikeToken(); |
234 | } |
235 | return CSSParserToken(DelimiterToken, cc); |
236 | } |
237 | |
238 | CSSParserToken CSSTokenizer::solidus(UChar cc) |
239 | { |
240 | if (consumeIfNext('*')) { |
241 | // These get ignored, but we need a value to return. |
242 | consumeUntilCommentEndFound(); |
243 | return CSSParserToken(CommentToken); |
244 | } |
245 | |
246 | return CSSParserToken(DelimiterToken, cc); |
247 | } |
248 | |
249 | CSSParserToken CSSTokenizer::colon(UChar /*cc*/) |
250 | { |
251 | return CSSParserToken(ColonToken); |
252 | } |
253 | |
254 | CSSParserToken CSSTokenizer::semiColon(UChar /*cc*/) |
255 | { |
256 | return CSSParserToken(SemicolonToken); |
257 | } |
258 | |
259 | CSSParserToken CSSTokenizer::hash(UChar cc) |
260 | { |
261 | UChar nextChar = m_input.peekWithoutReplacement(0); |
262 | if (isNameCodePoint(nextChar) || twoCharsAreValidEscape(nextChar, m_input.peekWithoutReplacement(1))) { |
263 | HashTokenType type = nextCharsAreIdentifier() ? HashTokenId : HashTokenUnrestricted; |
264 | return CSSParserToken(type, consumeName()); |
265 | } |
266 | |
267 | return CSSParserToken(DelimiterToken, cc); |
268 | } |
269 | |
270 | CSSParserToken CSSTokenizer::circumflexAccent(UChar cc) |
271 | { |
272 | ASSERT_UNUSED(cc, cc == '^'); |
273 | if (consumeIfNext('=')) |
274 | return CSSParserToken(PrefixMatchToken); |
275 | return CSSParserToken(DelimiterToken, '^'); |
276 | } |
277 | |
278 | CSSParserToken CSSTokenizer::dollarSign(UChar cc) |
279 | { |
280 | ASSERT_UNUSED(cc, cc == '$'); |
281 | if (consumeIfNext('=')) |
282 | return CSSParserToken(SuffixMatchToken); |
283 | return CSSParserToken(DelimiterToken, '$'); |
284 | } |
285 | |
286 | CSSParserToken CSSTokenizer::verticalLine(UChar cc) |
287 | { |
288 | ASSERT_UNUSED(cc, cc == '|'); |
289 | if (consumeIfNext('=')) |
290 | return CSSParserToken(DashMatchToken); |
291 | if (consumeIfNext('|')) |
292 | return CSSParserToken(ColumnToken); |
293 | return CSSParserToken(DelimiterToken, '|'); |
294 | } |
295 | |
296 | CSSParserToken CSSTokenizer::tilde(UChar cc) |
297 | { |
298 | ASSERT_UNUSED(cc, cc == '~'); |
299 | if (consumeIfNext('=')) |
300 | return CSSParserToken(IncludeMatchToken); |
301 | return CSSParserToken(DelimiterToken, '~'); |
302 | } |
303 | |
304 | CSSParserToken CSSTokenizer::commercialAt(UChar cc) |
305 | { |
306 | ASSERT_UNUSED(cc, cc == '@'); |
307 | if (nextCharsAreIdentifier()) |
308 | return CSSParserToken(AtKeywordToken, consumeName()); |
309 | return CSSParserToken(DelimiterToken, '@'); |
310 | } |
311 | |
312 | CSSParserToken CSSTokenizer::reverseSolidus(UChar cc) |
313 | { |
314 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { |
315 | reconsume(cc); |
316 | return consumeIdentLikeToken(); |
317 | } |
318 | return CSSParserToken(DelimiterToken, cc); |
319 | } |
320 | |
321 | CSSParserToken CSSTokenizer::asciiDigit(UChar cc) |
322 | { |
323 | reconsume(cc); |
324 | return consumeNumericToken(); |
325 | } |
326 | |
327 | CSSParserToken CSSTokenizer::letterU(UChar cc) |
328 | { |
329 | if (m_input.peekWithoutReplacement(0) == '+' |
330 | && (isASCIIHexDigit(m_input.peekWithoutReplacement(1)) |
331 | || m_input.peekWithoutReplacement(1) == '?')) { |
332 | m_input.advance(); |
333 | return consumeUnicodeRange(); |
334 | } |
335 | reconsume(cc); |
336 | return consumeIdentLikeToken(); |
337 | } |
338 | |
339 | CSSParserToken CSSTokenizer::nameStart(UChar cc) |
340 | { |
341 | reconsume(cc); |
342 | return consumeIdentLikeToken(); |
343 | } |
344 | |
345 | CSSParserToken CSSTokenizer::stringStart(UChar cc) |
346 | { |
347 | return consumeStringTokenUntil(cc); |
348 | } |
349 | |
350 | CSSParserToken CSSTokenizer::endOfFile(UChar /*cc*/) |
351 | { |
352 | return CSSParserToken(EOFToken); |
353 | } |
354 | |
355 | const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = { |
356 | &CSSTokenizer::endOfFile, |
357 | 0, |
358 | 0, |
359 | 0, |
360 | 0, |
361 | 0, |
362 | 0, |
363 | 0, |
364 | 0, |
365 | &CSSTokenizer::whiteSpace, |
366 | &CSSTokenizer::whiteSpace, |
367 | 0, |
368 | &CSSTokenizer::whiteSpace, |
369 | &CSSTokenizer::whiteSpace, |
370 | 0, |
371 | 0, |
372 | 0, |
373 | 0, |
374 | 0, |
375 | 0, |
376 | 0, |
377 | 0, |
378 | 0, |
379 | 0, |
380 | 0, |
381 | 0, |
382 | 0, |
383 | 0, |
384 | 0, |
385 | 0, |
386 | 0, |
387 | 0, |
388 | &CSSTokenizer::whiteSpace, |
389 | 0, |
390 | &CSSTokenizer::stringStart, |
391 | &CSSTokenizer::hash, |
392 | &CSSTokenizer::dollarSign, |
393 | 0, |
394 | 0, |
395 | &CSSTokenizer::stringStart, |
396 | &CSSTokenizer::leftParenthesis, |
397 | &CSSTokenizer::rightParenthesis, |
398 | &CSSTokenizer::asterisk, |
399 | &CSSTokenizer::plusOrFullStop, |
400 | &CSSTokenizer::comma, |
401 | &CSSTokenizer::hyphenMinus, |
402 | &CSSTokenizer::plusOrFullStop, |
403 | &CSSTokenizer::solidus, |
404 | &CSSTokenizer::asciiDigit, |
405 | &CSSTokenizer::asciiDigit, |
406 | &CSSTokenizer::asciiDigit, |
407 | &CSSTokenizer::asciiDigit, |
408 | &CSSTokenizer::asciiDigit, |
409 | &CSSTokenizer::asciiDigit, |
410 | &CSSTokenizer::asciiDigit, |
411 | &CSSTokenizer::asciiDigit, |
412 | &CSSTokenizer::asciiDigit, |
413 | &CSSTokenizer::asciiDigit, |
414 | &CSSTokenizer::colon, |
415 | &CSSTokenizer::semiColon, |
416 | &CSSTokenizer::lessThan, |
417 | 0, |
418 | 0, |
419 | 0, |
420 | &CSSTokenizer::commercialAt, |
421 | &CSSTokenizer::nameStart, |
422 | &CSSTokenizer::nameStart, |
423 | &CSSTokenizer::nameStart, |
424 | &CSSTokenizer::nameStart, |
425 | &CSSTokenizer::nameStart, |
426 | &CSSTokenizer::nameStart, |
427 | &CSSTokenizer::nameStart, |
428 | &CSSTokenizer::nameStart, |
429 | &CSSTokenizer::nameStart, |
430 | &CSSTokenizer::nameStart, |
431 | &CSSTokenizer::nameStart, |
432 | &CSSTokenizer::nameStart, |
433 | &CSSTokenizer::nameStart, |
434 | &CSSTokenizer::nameStart, |
435 | &CSSTokenizer::nameStart, |
436 | &CSSTokenizer::nameStart, |
437 | &CSSTokenizer::nameStart, |
438 | &CSSTokenizer::nameStart, |
439 | &CSSTokenizer::nameStart, |
440 | &CSSTokenizer::nameStart, |
441 | &CSSTokenizer::letterU, |
442 | &CSSTokenizer::nameStart, |
443 | &CSSTokenizer::nameStart, |
444 | &CSSTokenizer::nameStart, |
445 | &CSSTokenizer::nameStart, |
446 | &CSSTokenizer::nameStart, |
447 | &CSSTokenizer::leftBracket, |
448 | &CSSTokenizer::reverseSolidus, |
449 | &CSSTokenizer::rightBracket, |
450 | &CSSTokenizer::circumflexAccent, |
451 | &CSSTokenizer::nameStart, |
452 | 0, |
453 | &CSSTokenizer::nameStart, |
454 | &CSSTokenizer::nameStart, |
455 | &CSSTokenizer::nameStart, |
456 | &CSSTokenizer::nameStart, |
457 | &CSSTokenizer::nameStart, |
458 | &CSSTokenizer::nameStart, |
459 | &CSSTokenizer::nameStart, |
460 | &CSSTokenizer::nameStart, |
461 | &CSSTokenizer::nameStart, |
462 | &CSSTokenizer::nameStart, |
463 | &CSSTokenizer::nameStart, |
464 | &CSSTokenizer::nameStart, |
465 | &CSSTokenizer::nameStart, |
466 | &CSSTokenizer::nameStart, |
467 | &CSSTokenizer::nameStart, |
468 | &CSSTokenizer::nameStart, |
469 | &CSSTokenizer::nameStart, |
470 | &CSSTokenizer::nameStart, |
471 | &CSSTokenizer::nameStart, |
472 | &CSSTokenizer::nameStart, |
473 | &CSSTokenizer::letterU, |
474 | &CSSTokenizer::nameStart, |
475 | &CSSTokenizer::nameStart, |
476 | &CSSTokenizer::nameStart, |
477 | &CSSTokenizer::nameStart, |
478 | &CSSTokenizer::nameStart, |
479 | &CSSTokenizer::leftBrace, |
480 | &CSSTokenizer::verticalLine, |
481 | &CSSTokenizer::rightBrace, |
482 | &CSSTokenizer::tilde, |
483 | 0, |
484 | }; |
485 | #if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED |
486 | const unsigned codePointsNumber = 128; |
487 | #endif |
488 | |
489 | CSSParserToken CSSTokenizer::nextToken() |
490 | { |
491 | // Unlike the HTMLTokenizer, the CSS Syntax spec is written |
492 | // as a stateless, (fixed-size) look-ahead tokenizer. |
493 | // We could move to the stateful model and instead create |
494 | // states for all the "next 3 codepoints are X" cases. |
495 | // State-machine tokenizers are easier to write to handle |
496 | // incremental tokenization of partial sources. |
497 | // However, for now we follow the spec exactly. |
498 | UChar cc = consume(); |
499 | CodePoint codePointFunc = 0; |
500 | |
501 | if (isASCII(cc)) { |
502 | ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber); |
503 | codePointFunc = codePoints[cc]; |
504 | } else |
505 | codePointFunc = &CSSTokenizer::nameStart; |
506 | |
507 | if (codePointFunc) |
508 | return ((this)->*(codePointFunc))(cc); |
509 | return CSSParserToken(DelimiterToken, cc); |
510 | } |
511 | |
512 | // This method merges the following spec sections for efficiency |
513 | // http://www.w3.org/TR/css3-syntax/#consume-a-number |
514 | // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number |
515 | CSSParserToken CSSTokenizer::() |
516 | { |
517 | ASSERT(nextCharsAreNumber()); |
518 | |
519 | NumericValueType type = IntegerValueType; |
520 | NumericSign sign = NoSign; |
521 | unsigned numberLength = 0; |
522 | |
523 | UChar next = m_input.peekWithoutReplacement(0); |
524 | if (next == '+') { |
525 | ++numberLength; |
526 | sign = PlusSign; |
527 | } else if (next == '-') { |
528 | ++numberLength; |
529 | sign = MinusSign; |
530 | } |
531 | |
532 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength); |
533 | next = m_input.peekWithoutReplacement(numberLength); |
534 | if (next == '.' && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 1))) { |
535 | type = NumberValueType; |
536 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 2); |
537 | next = m_input.peekWithoutReplacement(numberLength); |
538 | } |
539 | |
540 | if (next == 'E' || next == 'e') { |
541 | next = m_input.peekWithoutReplacement(numberLength + 1); |
542 | if (isASCIIDigit(next)) { |
543 | type = NumberValueType; |
544 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 1); |
545 | } else if ((next == '+' || next == '-') && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 2))) { |
546 | type = NumberValueType; |
547 | numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 3); |
548 | } |
549 | } |
550 | |
551 | double value = m_input.getDouble(0, numberLength); |
552 | m_input.advance(numberLength); |
553 | |
554 | return CSSParserToken(NumberToken, value, type, sign); |
555 | } |
556 | |
557 | // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token |
558 | CSSParserToken CSSTokenizer::() |
559 | { |
560 | CSSParserToken token = consumeNumber(); |
561 | if (nextCharsAreIdentifier()) |
562 | token.convertToDimensionWithUnit(consumeName()); |
563 | else if (consumeIfNext('%')) |
564 | token.convertToPercentage(); |
565 | return token; |
566 | } |
567 | |
568 | // http://dev.w3.org/csswg/css-syntax/#consume-ident-like-token |
569 | CSSParserToken CSSTokenizer::consumeIdentLikeToken() |
570 | { |
571 | StringView name = consumeName(); |
572 | if (consumeIfNext('(')) { |
573 | if (equalIgnoringASCIICase(name, "url" )) { |
574 | // The spec is slightly different so as to avoid dropping whitespace |
575 | // tokens, but they wouldn't be used and this is easier. |
576 | m_input.advanceUntilNonWhitespace(); |
577 | UChar next = m_input.peekWithoutReplacement(0); |
578 | if (next != '"' && next != '\'') |
579 | return consumeUrlToken(); |
580 | } |
581 | return blockStart(LeftParenthesisToken, FunctionToken, name); |
582 | } |
583 | return CSSParserToken(IdentToken, name); |
584 | } |
585 | |
586 | // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token |
587 | CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) |
588 | { |
589 | // Strings without escapes get handled without allocations |
590 | for (unsigned size = 0; ; size++) { |
591 | UChar cc = m_input.peekWithoutReplacement(size); |
592 | if (cc == endingCodePoint) { |
593 | unsigned startOffset = m_input.offset(); |
594 | m_input.advance(size + 1); |
595 | return CSSParserToken(StringToken, m_input.rangeAt(startOffset, size)); |
596 | } |
597 | if (isNewLine(cc)) { |
598 | m_input.advance(size); |
599 | return CSSParserToken(BadStringToken); |
600 | } |
601 | if (cc == '\0' || cc == '\\') |
602 | break; |
603 | } |
604 | |
605 | StringBuilder output; |
606 | while (true) { |
607 | UChar cc = consume(); |
608 | if (cc == endingCodePoint || cc == kEndOfFileMarker) |
609 | return CSSParserToken(StringToken, registerString(output.toString())); |
610 | if (isNewLine(cc)) { |
611 | reconsume(cc); |
612 | return CSSParserToken(BadStringToken); |
613 | } |
614 | if (cc == '\\') { |
615 | if (m_input.nextInputChar() == kEndOfFileMarker) |
616 | continue; |
617 | if (isNewLine(m_input.peekWithoutReplacement(0))) |
618 | consumeSingleWhitespaceIfNext(); // This handles \r\n for us |
619 | else |
620 | output.append(consumeEscape()); |
621 | } else |
622 | output.append(cc); |
623 | } |
624 | } |
625 | |
626 | CSSParserToken CSSTokenizer::consumeUnicodeRange() |
627 | { |
628 | ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?'); |
629 | int lengthRemaining = 6; |
630 | UChar32 start = 0; |
631 | |
632 | while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) { |
633 | start = start * 16 + toASCIIHexValue(consume()); |
634 | --lengthRemaining; |
635 | } |
636 | |
637 | UChar32 end = start; |
638 | if (lengthRemaining && consumeIfNext('?')) { |
639 | do { |
640 | start *= 16; |
641 | end = end * 16 + 0xF; |
642 | --lengthRemaining; |
643 | } while (lengthRemaining && consumeIfNext('?')); |
644 | } else if (m_input.peekWithoutReplacement(0) == '-' && isASCIIHexDigit(m_input.peekWithoutReplacement(1))) { |
645 | m_input.advance(); |
646 | lengthRemaining = 6; |
647 | end = 0; |
648 | do { |
649 | end = end * 16 + toASCIIHexValue(consume()); |
650 | --lengthRemaining; |
651 | } while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))); |
652 | } |
653 | |
654 | return CSSParserToken(UnicodeRangeToken, start, end); |
655 | } |
656 | |
657 | // http://dev.w3.org/csswg/css-syntax/#non-printable-code-point |
658 | static bool isNonPrintableCodePoint(UChar cc) |
659 | { |
660 | return cc <= '\x8' || cc == '\xb' || (cc >= '\xe' && cc <= '\x1f') || cc == '\x7f'; |
661 | } |
662 | |
663 | // http://dev.w3.org/csswg/css-syntax/#consume-url-token |
664 | CSSParserToken CSSTokenizer::consumeUrlToken() |
665 | { |
666 | m_input.advanceUntilNonWhitespace(); |
667 | |
668 | // URL tokens without escapes get handled without allocations |
669 | for (unsigned size = 0; ; size++) { |
670 | UChar cc = m_input.peekWithoutReplacement(size); |
671 | if (cc == ')') { |
672 | unsigned startOffset = m_input.offset(); |
673 | m_input.advance(size + 1); |
674 | return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, size)); |
675 | } |
676 | if (cc <= ' ' || cc == '\\' || cc == '"' || cc == '\'' || cc == '(' || cc == '\x7f') |
677 | break; |
678 | } |
679 | |
680 | StringBuilder result; |
681 | while (true) { |
682 | UChar cc = consume(); |
683 | if (cc == ')' || cc == kEndOfFileMarker) |
684 | return CSSParserToken(UrlToken, registerString(result.toString())); |
685 | |
686 | if (isHTMLSpace(cc)) { |
687 | m_input.advanceUntilNonWhitespace(); |
688 | if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker) |
689 | return CSSParserToken(UrlToken, registerString(result.toString())); |
690 | break; |
691 | } |
692 | |
693 | if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc)) |
694 | break; |
695 | |
696 | if (cc == '\\') { |
697 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { |
698 | result.append(consumeEscape()); |
699 | continue; |
700 | } |
701 | break; |
702 | } |
703 | |
704 | result.append(cc); |
705 | } |
706 | |
707 | consumeBadUrlRemnants(); |
708 | return CSSParserToken(BadUrlToken); |
709 | } |
710 | |
711 | // http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url |
712 | void CSSTokenizer::consumeBadUrlRemnants() |
713 | { |
714 | while (true) { |
715 | UChar cc = consume(); |
716 | if (cc == ')' || cc == kEndOfFileMarker) |
717 | return; |
718 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) |
719 | consumeEscape(); |
720 | } |
721 | } |
722 | |
723 | void CSSTokenizer::consumeSingleWhitespaceIfNext() |
724 | { |
725 | // We check for \r\n and HTML spaces since we don't do preprocessing |
726 | UChar next = m_input.peekWithoutReplacement(0); |
727 | if (next == '\r' && m_input.peekWithoutReplacement(1) == '\n') |
728 | m_input.advance(2); |
729 | else if (isHTMLSpace(next)) |
730 | m_input.advance(); |
731 | } |
732 | |
733 | void CSSTokenizer::() |
734 | { |
735 | UChar c = consume(); |
736 | while (true) { |
737 | if (c == kEndOfFileMarker) |
738 | return; |
739 | if (c != '*') { |
740 | c = consume(); |
741 | continue; |
742 | } |
743 | c = consume(); |
744 | if (c == '/') |
745 | return; |
746 | } |
747 | } |
748 | |
749 | bool CSSTokenizer::consumeIfNext(UChar character) |
750 | { |
751 | // Since we're not doing replacement we can't tell the difference from |
752 | // a NUL in the middle and the kEndOfFileMarker, so character must not be |
753 | // NUL. |
754 | ASSERT(character); |
755 | if (m_input.peekWithoutReplacement(0) == character) { |
756 | m_input.advance(); |
757 | return true; |
758 | } |
759 | return false; |
760 | } |
761 | |
762 | // http://www.w3.org/TR/css3-syntax/#consume-a-name |
763 | StringView CSSTokenizer::consumeName() |
764 | { |
765 | // Names without escapes get handled without allocations |
766 | for (unsigned size = 0; ; ++size) { |
767 | UChar cc = m_input.peekWithoutReplacement(size); |
768 | if (isNameCodePoint(cc)) |
769 | continue; |
770 | // peekWithoutReplacement will return NUL when we hit the end of the |
771 | // input. In that case we want to still use the rangeAt() fast path |
772 | // below. |
773 | if (cc == '\0' && m_input.offset() + size < m_input.length()) |
774 | break; |
775 | if (cc == '\\') |
776 | break; |
777 | unsigned startOffset = m_input.offset(); |
778 | m_input.advance(size); |
779 | return m_input.rangeAt(startOffset, size); |
780 | } |
781 | |
782 | StringBuilder result; |
783 | while (true) { |
784 | UChar cc = consume(); |
785 | if (isNameCodePoint(cc)) { |
786 | result.append(cc); |
787 | continue; |
788 | } |
789 | if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { |
790 | result.append(consumeEscape()); |
791 | continue; |
792 | } |
793 | reconsume(cc); |
794 | return registerString(result.toString()); |
795 | } |
796 | } |
797 | |
798 | // http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point |
799 | UChar32 CSSTokenizer::consumeEscape() |
800 | { |
801 | UChar cc = consume(); |
802 | ASSERT(!isNewLine(cc)); |
803 | if (isASCIIHexDigit(cc)) { |
804 | unsigned consumedHexDigits = 1; |
805 | StringBuilder hexChars; |
806 | hexChars.append(cc); |
807 | while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) { |
808 | cc = consume(); |
809 | hexChars.append(cc); |
810 | consumedHexDigits++; |
811 | }; |
812 | consumeSingleWhitespaceIfNext(); |
813 | bool ok = false; |
814 | UChar32 codePoint = hexChars.toString().toUIntStrict(&ok, 16); |
815 | ASSERT(ok); |
816 | if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF) |
817 | return replacementCharacter; |
818 | return codePoint; |
819 | } |
820 | |
821 | if (cc == kEndOfFileMarker) |
822 | return replacementCharacter; |
823 | return cc; |
824 | } |
825 | |
826 | bool CSSTokenizer::nextTwoCharsAreValidEscape() |
827 | { |
828 | return twoCharsAreValidEscape(m_input.peekWithoutReplacement(0), m_input.peekWithoutReplacement(1)); |
829 | } |
830 | |
831 | // http://www.w3.org/TR/css3-syntax/#starts-with-a-number |
832 | bool CSSTokenizer::nextCharsAreNumber(UChar first) |
833 | { |
834 | UChar second = m_input.peekWithoutReplacement(0); |
835 | if (isASCIIDigit(first)) |
836 | return true; |
837 | if (first == '+' || first == '-') |
838 | return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1)))); |
839 | if (first =='.') |
840 | return (isASCIIDigit(second)); |
841 | return false; |
842 | } |
843 | |
844 | bool CSSTokenizer::nextCharsAreNumber() |
845 | { |
846 | UChar first = consume(); |
847 | bool areNumber = nextCharsAreNumber(first); |
848 | reconsume(first); |
849 | return areNumber; |
850 | } |
851 | |
852 | // http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier |
853 | bool CSSTokenizer::nextCharsAreIdentifier(UChar first) |
854 | { |
855 | UChar second = m_input.peekWithoutReplacement(0); |
856 | if (isNameStartCodePoint(first) || twoCharsAreValidEscape(first, second)) |
857 | return true; |
858 | |
859 | if (first == '-') |
860 | return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape(); |
861 | |
862 | return false; |
863 | } |
864 | |
865 | bool CSSTokenizer::nextCharsAreIdentifier() |
866 | { |
867 | UChar first = consume(); |
868 | bool areIdentifier = nextCharsAreIdentifier(first); |
869 | reconsume(first); |
870 | return areIdentifier; |
871 | } |
872 | |
873 | StringView CSSTokenizer::registerString(const String& string) |
874 | { |
875 | m_stringPool.append(string); |
876 | return string; |
877 | } |
878 | |
879 | } // namespace WebCore |
880 | |