1/*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006-2017 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6 * Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be)
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25#include "config.h"
26#include "Lexer.h"
27
28#include "BuiltinNames.h"
29#include "Identifier.h"
30#include "JSCInlines.h"
31#include "JSFunctionInlines.h"
32#include "KeywordLookup.h"
33#include "Lexer.lut.h"
34#include "Nodes.h"
35#include "ParseInt.h"
36#include "Parser.h"
37#include <ctype.h>
38#include <limits.h>
39#include <string.h>
40#include <wtf/Assertions.h>
41#include <wtf/HexNumber.h>
42#include <wtf/Variant.h>
43#include <wtf/dtoa.h>
44
45namespace JSC {
46
47bool isLexerKeyword(const Identifier& identifier)
48{
49 return JSC::mainTable.entry(identifier);
50}
51
// Classification of a Latin-1 character, as stored in typesOfLatin1Characters.
// The numeric order of the first enumerators is significant: isIdentStart() tests for
// CharacterIdentifierStart, and isIdentPart() accepts every value that is less than or
// equal to CharacterOtherIdentifierPart.
enum CharacterType {
    // Types for the main switch

    // The first three types are fixed, and also used for identifying
    // ASCII alpha and alphanumeric characters (see isIdentStart and isIdentPart).
    CharacterIdentifierStart,
    CharacterZero,
    CharacterNumber,

    // For single-byte characters grandfathered into Other_ID_Continue -- namely just U+00B7 MIDDLE DOT.
    // (http://unicode.org/reports/tr31/#Backward_Compatibility)
    CharacterOtherIdentifierPart,

    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterBackQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types
    CharacterWhiteSpace,
    CharacterPrivateIdentifierStart
};
100
// Maps each of the 256 Latin-1 code points (used directly as the index) to its CharacterType.
// isIdentStart(LChar) and isIdentPart(LChar) are both answered from this table.
static constexpr const unsigned short typesOfLatin1Characters[256] = {
/* 0 - Null */ CharacterInvalid,
/* 1 - Start of Heading */ CharacterInvalid,
/* 2 - Start of Text */ CharacterInvalid,
/* 3 - End of Text */ CharacterInvalid,
/* 4 - End of Transm. */ CharacterInvalid,
/* 5 - Enquiry */ CharacterInvalid,
/* 6 - Acknowledgment */ CharacterInvalid,
/* 7 - Bell */ CharacterInvalid,
/* 8 - Back Space */ CharacterInvalid,
/* 9 - Horizontal Tab */ CharacterWhiteSpace,
/* 10 - Line Feed */ CharacterLineTerminator,
/* 11 - Vertical Tab */ CharacterWhiteSpace,
/* 12 - Form Feed */ CharacterWhiteSpace,
/* 13 - Carriage Return */ CharacterLineTerminator,
/* 14 - Shift Out */ CharacterInvalid,
/* 15 - Shift In */ CharacterInvalid,
/* 16 - Data Line Escape */ CharacterInvalid,
/* 17 - Device Control 1 */ CharacterInvalid,
/* 18 - Device Control 2 */ CharacterInvalid,
/* 19 - Device Control 3 */ CharacterInvalid,
/* 20 - Device Control 4 */ CharacterInvalid,
/* 21 - Negative Ack. */ CharacterInvalid,
/* 22 - Synchronous Idle */ CharacterInvalid,
/* 23 - End of Transmit */ CharacterInvalid,
/* 24 - Cancel */ CharacterInvalid,
/* 25 - End of Medium */ CharacterInvalid,
/* 26 - Substitute */ CharacterInvalid,
/* 27 - Escape */ CharacterInvalid,
/* 28 - File Separator */ CharacterInvalid,
/* 29 - Group Separator */ CharacterInvalid,
/* 30 - Record Separator */ CharacterInvalid,
/* 31 - Unit Separator */ CharacterInvalid,
/* 32 - Space */ CharacterWhiteSpace,
/* 33 - ! */ CharacterExclamationMark,
/* 34 - " */ CharacterQuote,
/* 35 - # */ CharacterInvalid,
/* 36 - $ */ CharacterIdentifierStart,
/* 37 - % */ CharacterModulo,
/* 38 - & */ CharacterAnd,
/* 39 - ' */ CharacterQuote,
/* 40 - ( */ CharacterOpenParen,
/* 41 - ) */ CharacterCloseParen,
/* 42 - * */ CharacterMultiply,
/* 43 - + */ CharacterAdd,
/* 44 - , */ CharacterComma,
/* 45 - - */ CharacterSub,
/* 46 - . */ CharacterDot,
/* 47 - / */ CharacterSlash,
/* 48 - 0 */ CharacterZero,
/* 49 - 1 */ CharacterNumber,
/* 50 - 2 */ CharacterNumber,
/* 51 - 3 */ CharacterNumber,
/* 52 - 4 */ CharacterNumber,
/* 53 - 5 */ CharacterNumber,
/* 54 - 6 */ CharacterNumber,
/* 55 - 7 */ CharacterNumber,
/* 56 - 8 */ CharacterNumber,
/* 57 - 9 */ CharacterNumber,
/* 58 - : */ CharacterColon,
/* 59 - ; */ CharacterSemicolon,
/* 60 - < */ CharacterLess,
/* 61 - = */ CharacterEqual,
/* 62 - > */ CharacterGreater,
/* 63 - ? */ CharacterQuestion,
/* 64 - @ */ CharacterPrivateIdentifierStart,
/* 65 - A */ CharacterIdentifierStart,
/* 66 - B */ CharacterIdentifierStart,
/* 67 - C */ CharacterIdentifierStart,
/* 68 - D */ CharacterIdentifierStart,
/* 69 - E */ CharacterIdentifierStart,
/* 70 - F */ CharacterIdentifierStart,
/* 71 - G */ CharacterIdentifierStart,
/* 72 - H */ CharacterIdentifierStart,
/* 73 - I */ CharacterIdentifierStart,
/* 74 - J */ CharacterIdentifierStart,
/* 75 - K */ CharacterIdentifierStart,
/* 76 - L */ CharacterIdentifierStart,
/* 77 - M */ CharacterIdentifierStart,
/* 78 - N */ CharacterIdentifierStart,
/* 79 - O */ CharacterIdentifierStart,
/* 80 - P */ CharacterIdentifierStart,
/* 81 - Q */ CharacterIdentifierStart,
/* 82 - R */ CharacterIdentifierStart,
/* 83 - S */ CharacterIdentifierStart,
/* 84 - T */ CharacterIdentifierStart,
/* 85 - U */ CharacterIdentifierStart,
/* 86 - V */ CharacterIdentifierStart,
/* 87 - W */ CharacterIdentifierStart,
/* 88 - X */ CharacterIdentifierStart,
/* 89 - Y */ CharacterIdentifierStart,
/* 90 - Z */ CharacterIdentifierStart,
/* 91 - [ */ CharacterOpenBracket,
/* 92 - \ */ CharacterBackSlash,
/* 93 - ] */ CharacterCloseBracket,
/* 94 - ^ */ CharacterXor,
/* 95 - _ */ CharacterIdentifierStart,
/* 96 - ` */ CharacterBackQuote,
/* 97 - a */ CharacterIdentifierStart,
/* 98 - b */ CharacterIdentifierStart,
/* 99 - c */ CharacterIdentifierStart,
/* 100 - d */ CharacterIdentifierStart,
/* 101 - e */ CharacterIdentifierStart,
/* 102 - f */ CharacterIdentifierStart,
/* 103 - g */ CharacterIdentifierStart,
/* 104 - h */ CharacterIdentifierStart,
/* 105 - i */ CharacterIdentifierStart,
/* 106 - j */ CharacterIdentifierStart,
/* 107 - k */ CharacterIdentifierStart,
/* 108 - l */ CharacterIdentifierStart,
/* 109 - m */ CharacterIdentifierStart,
/* 110 - n */ CharacterIdentifierStart,
/* 111 - o */ CharacterIdentifierStart,
/* 112 - p */ CharacterIdentifierStart,
/* 113 - q */ CharacterIdentifierStart,
/* 114 - r */ CharacterIdentifierStart,
/* 115 - s */ CharacterIdentifierStart,
/* 116 - t */ CharacterIdentifierStart,
/* 117 - u */ CharacterIdentifierStart,
/* 118 - v */ CharacterIdentifierStart,
/* 119 - w */ CharacterIdentifierStart,
/* 120 - x */ CharacterIdentifierStart,
/* 121 - y */ CharacterIdentifierStart,
/* 122 - z */ CharacterIdentifierStart,
/* 123 - { */ CharacterOpenBrace,
/* 124 - | */ CharacterOr,
/* 125 - } */ CharacterCloseBrace,
/* 126 - ~ */ CharacterTilde,
/* 127 - Delete */ CharacterInvalid,
/* 128 - Cc category */ CharacterInvalid,
/* 129 - Cc category */ CharacterInvalid,
/* 130 - Cc category */ CharacterInvalid,
/* 131 - Cc category */ CharacterInvalid,
/* 132 - Cc category */ CharacterInvalid,
/* 133 - Cc category */ CharacterInvalid,
/* 134 - Cc category */ CharacterInvalid,
/* 135 - Cc category */ CharacterInvalid,
/* 136 - Cc category */ CharacterInvalid,
/* 137 - Cc category */ CharacterInvalid,
/* 138 - Cc category */ CharacterInvalid,
/* 139 - Cc category */ CharacterInvalid,
/* 140 - Cc category */ CharacterInvalid,
/* 141 - Cc category */ CharacterInvalid,
/* 142 - Cc category */ CharacterInvalid,
/* 143 - Cc category */ CharacterInvalid,
/* 144 - Cc category */ CharacterInvalid,
/* 145 - Cc category */ CharacterInvalid,
/* 146 - Cc category */ CharacterInvalid,
/* 147 - Cc category */ CharacterInvalid,
/* 148 - Cc category */ CharacterInvalid,
/* 149 - Cc category */ CharacterInvalid,
/* 150 - Cc category */ CharacterInvalid,
/* 151 - Cc category */ CharacterInvalid,
/* 152 - Cc category */ CharacterInvalid,
/* 153 - Cc category */ CharacterInvalid,
/* 154 - Cc category */ CharacterInvalid,
/* 155 - Cc category */ CharacterInvalid,
/* 156 - Cc category */ CharacterInvalid,
/* 157 - Cc category */ CharacterInvalid,
/* 158 - Cc category */ CharacterInvalid,
/* 159 - Cc category */ CharacterInvalid,
/* 160 - Zs category (nbsp) */ CharacterWhiteSpace,
/* 161 - Po category */ CharacterInvalid,
/* 162 - Sc category */ CharacterInvalid,
/* 163 - Sc category */ CharacterInvalid,
/* 164 - Sc category */ CharacterInvalid,
/* 165 - Sc category */ CharacterInvalid,
/* 166 - So category */ CharacterInvalid,
/* 167 - So category */ CharacterInvalid,
/* 168 - Sk category */ CharacterInvalid,
/* 169 - So category */ CharacterInvalid,
/* 170 - Ll category */ CharacterIdentifierStart,
/* 171 - Pi category */ CharacterInvalid,
/* 172 - Sm category */ CharacterInvalid,
/* 173 - Cf category */ CharacterInvalid,
/* 174 - So category */ CharacterInvalid,
/* 175 - Sk category */ CharacterInvalid,
/* 176 - So category */ CharacterInvalid,
/* 177 - Sm category */ CharacterInvalid,
/* 178 - No category */ CharacterInvalid,
/* 179 - No category */ CharacterInvalid,
/* 180 - Sk category */ CharacterInvalid,
/* 181 - Ll category */ CharacterIdentifierStart,
/* 182 - So category */ CharacterInvalid,
/* 183 - Po category */ CharacterOtherIdentifierPart,
/* 184 - Sk category */ CharacterInvalid,
/* 185 - No category */ CharacterInvalid,
/* 186 - Ll category */ CharacterIdentifierStart,
/* 187 - Pf category */ CharacterInvalid,
/* 188 - No category */ CharacterInvalid,
/* 189 - No category */ CharacterInvalid,
/* 190 - No category */ CharacterInvalid,
/* 191 - Po category */ CharacterInvalid,
/* 192 - Lu category */ CharacterIdentifierStart,
/* 193 - Lu category */ CharacterIdentifierStart,
/* 194 - Lu category */ CharacterIdentifierStart,
/* 195 - Lu category */ CharacterIdentifierStart,
/* 196 - Lu category */ CharacterIdentifierStart,
/* 197 - Lu category */ CharacterIdentifierStart,
/* 198 - Lu category */ CharacterIdentifierStart,
/* 199 - Lu category */ CharacterIdentifierStart,
/* 200 - Lu category */ CharacterIdentifierStart,
/* 201 - Lu category */ CharacterIdentifierStart,
/* 202 - Lu category */ CharacterIdentifierStart,
/* 203 - Lu category */ CharacterIdentifierStart,
/* 204 - Lu category */ CharacterIdentifierStart,
/* 205 - Lu category */ CharacterIdentifierStart,
/* 206 - Lu category */ CharacterIdentifierStart,
/* 207 - Lu category */ CharacterIdentifierStart,
/* 208 - Lu category */ CharacterIdentifierStart,
/* 209 - Lu category */ CharacterIdentifierStart,
/* 210 - Lu category */ CharacterIdentifierStart,
/* 211 - Lu category */ CharacterIdentifierStart,
/* 212 - Lu category */ CharacterIdentifierStart,
/* 213 - Lu category */ CharacterIdentifierStart,
/* 214 - Lu category */ CharacterIdentifierStart,
/* 215 - Sm category */ CharacterInvalid,
/* 216 - Lu category */ CharacterIdentifierStart,
/* 217 - Lu category */ CharacterIdentifierStart,
/* 218 - Lu category */ CharacterIdentifierStart,
/* 219 - Lu category */ CharacterIdentifierStart,
/* 220 - Lu category */ CharacterIdentifierStart,
/* 221 - Lu category */ CharacterIdentifierStart,
/* 222 - Lu category */ CharacterIdentifierStart,
/* 223 - Ll category */ CharacterIdentifierStart,
/* 224 - Ll category */ CharacterIdentifierStart,
/* 225 - Ll category */ CharacterIdentifierStart,
/* 226 - Ll category */ CharacterIdentifierStart,
/* 227 - Ll category */ CharacterIdentifierStart,
/* 228 - Ll category */ CharacterIdentifierStart,
/* 229 - Ll category */ CharacterIdentifierStart,
/* 230 - Ll category */ CharacterIdentifierStart,
/* 231 - Ll category */ CharacterIdentifierStart,
/* 232 - Ll category */ CharacterIdentifierStart,
/* 233 - Ll category */ CharacterIdentifierStart,
/* 234 - Ll category */ CharacterIdentifierStart,
/* 235 - Ll category */ CharacterIdentifierStart,
/* 236 - Ll category */ CharacterIdentifierStart,
/* 237 - Ll category */ CharacterIdentifierStart,
/* 238 - Ll category */ CharacterIdentifierStart,
/* 239 - Ll category */ CharacterIdentifierStart,
/* 240 - Ll category */ CharacterIdentifierStart,
/* 241 - Ll category */ CharacterIdentifierStart,
/* 242 - Ll category */ CharacterIdentifierStart,
/* 243 - Ll category */ CharacterIdentifierStart,
/* 244 - Ll category */ CharacterIdentifierStart,
/* 245 - Ll category */ CharacterIdentifierStart,
/* 246 - Ll category */ CharacterIdentifierStart,
/* 247 - Sm category */ CharacterInvalid,
/* 248 - Ll category */ CharacterIdentifierStart,
/* 249 - Ll category */ CharacterIdentifierStart,
/* 250 - Ll category */ CharacterIdentifierStart,
/* 251 - Ll category */ CharacterIdentifierStart,
/* 252 - Ll category */ CharacterIdentifierStart,
/* 253 - Ll category */ CharacterIdentifierStart,
/* 254 - Ll category */ CharacterIdentifierStart,
/* 255 - Ll category */ CharacterIdentifierStart
};
360
// This table provides the character that results from \X, where X is an ASCII character and
// its code (0-127) is the index into the table. A table value of 0 means that more processing
// needs to be done. Consulted through singleEscape().
static constexpr const LChar singleCharacterEscapeValuesForASCII[128] = {
/* 0 - Null */ 0,
/* 1 - Start of Heading */ 0,
/* 2 - Start of Text */ 0,
/* 3 - End of Text */ 0,
/* 4 - End of Transm. */ 0,
/* 5 - Enquiry */ 0,
/* 6 - Acknowledgment */ 0,
/* 7 - Bell */ 0,
/* 8 - Back Space */ 0,
/* 9 - Horizontal Tab */ 0,
/* 10 - Line Feed */ 0,
/* 11 - Vertical Tab */ 0,
/* 12 - Form Feed */ 0,
/* 13 - Carriage Return */ 0,
/* 14 - Shift Out */ 0,
/* 15 - Shift In */ 0,
/* 16 - Data Line Escape */ 0,
/* 17 - Device Control 1 */ 0,
/* 18 - Device Control 2 */ 0,
/* 19 - Device Control 3 */ 0,
/* 20 - Device Control 4 */ 0,
/* 21 - Negative Ack. */ 0,
/* 22 - Synchronous Idle */ 0,
/* 23 - End of Transmit */ 0,
/* 24 - Cancel */ 0,
/* 25 - End of Medium */ 0,
/* 26 - Substitute */ 0,
/* 27 - Escape */ 0,
/* 28 - File Separator */ 0,
/* 29 - Group Separator */ 0,
/* 30 - Record Separator */ 0,
/* 31 - Unit Separator */ 0,
/* 32 - Space */ ' ',
/* 33 - ! */ '!',
/* 34 - " */ '"',
/* 35 - # */ '#',
/* 36 - $ */ '$',
/* 37 - % */ '%',
/* 38 - & */ '&',
/* 39 - ' */ '\'',
/* 40 - ( */ '(',
/* 41 - ) */ ')',
/* 42 - * */ '*',
/* 43 - + */ '+',
/* 44 - , */ ',',
/* 45 - - */ '-',
/* 46 - . */ '.',
/* 47 - / */ '/',
/* 48 - 0 */ 0,
/* 49 - 1 */ 0,
/* 50 - 2 */ 0,
/* 51 - 3 */ 0,
/* 52 - 4 */ 0,
/* 53 - 5 */ 0,
/* 54 - 6 */ 0,
/* 55 - 7 */ 0,
/* 56 - 8 */ 0,
/* 57 - 9 */ 0,
/* 58 - : */ ':',
/* 59 - ; */ ';',
/* 60 - < */ '<',
/* 61 - = */ '=',
/* 62 - > */ '>',
/* 63 - ? */ '?',
/* 64 - @ */ '@',
/* 65 - A */ 'A',
/* 66 - B */ 'B',
/* 67 - C */ 'C',
/* 68 - D */ 'D',
/* 69 - E */ 'E',
/* 70 - F */ 'F',
/* 71 - G */ 'G',
/* 72 - H */ 'H',
/* 73 - I */ 'I',
/* 74 - J */ 'J',
/* 75 - K */ 'K',
/* 76 - L */ 'L',
/* 77 - M */ 'M',
/* 78 - N */ 'N',
/* 79 - O */ 'O',
/* 80 - P */ 'P',
/* 81 - Q */ 'Q',
/* 82 - R */ 'R',
/* 83 - S */ 'S',
/* 84 - T */ 'T',
/* 85 - U */ 'U',
/* 86 - V */ 'V',
/* 87 - W */ 'W',
/* 88 - X */ 'X',
/* 89 - Y */ 'Y',
/* 90 - Z */ 'Z',
/* 91 - [ */ '[',
/* 92 - \ */ '\\',
/* 93 - ] */ ']',
/* 94 - ^ */ '^',
/* 95 - _ */ '_',
/* 96 - ` */ '`',
/* 97 - a */ 'a',
/* 98 - b */ 0x08,
/* 99 - c */ 'c',
/* 100 - d */ 'd',
/* 101 - e */ 'e',
/* 102 - f */ 0x0C,
/* 103 - g */ 'g',
/* 104 - h */ 'h',
/* 105 - i */ 'i',
/* 106 - j */ 'j',
/* 107 - k */ 'k',
/* 108 - l */ 'l',
/* 109 - m */ 'm',
/* 110 - n */ 0x0A,
/* 111 - o */ 'o',
/* 112 - p */ 'p',
/* 113 - q */ 'q',
/* 114 - r */ 0x0D,
/* 115 - s */ 's',
/* 116 - t */ 0x09,
/* 117 - u */ 0,
/* 118 - v */ 0x0B,
/* 119 - w */ 'w',
/* 120 - x */ 0,
/* 121 - y */ 'y',
/* 122 - z */ 'z',
/* 123 - { */ '{',
/* 124 - | */ '|',
/* 125 - } */ '}',
/* 126 - ~ */ '~',
/* 127 - Delete */ 0
};
493
// Constructs a lexer bound to the given VM. No source text is attached here; setCode()
// installs the source pointers. m_parsingBuiltinFunction is set when builtinMode is
// JSParserBuiltinMode::Builtin.
template <typename T>
Lexer<T>::Lexer(VM* vm, JSParserBuiltinMode builtinMode, JSParserScriptMode scriptMode)
    : m_isReparsingFunction(false)
    , m_vm(vm)
    , m_parsingBuiltinFunction(builtinMode == JSParserBuiltinMode::Builtin)
    , m_scriptMode(scriptMode)
{
}
502
503static inline JSTokenType tokenTypeForIntegerLikeToken(double doubleValue)
504{
505 if ((doubleValue || !std::signbit(doubleValue)) && static_cast<int64_t>(doubleValue) == doubleValue)
506 return INTEGER;
507 return DOUBLE;
508}
509
510template <typename T>
511Lexer<T>::~Lexer()
512{
513}
514
515template <typename T>
516String Lexer<T>::invalidCharacterMessage() const
517{
518 switch (m_current) {
519 case 0:
520 return "Invalid character: '\\0'"_s;
521 case 10:
522 return "Invalid character: '\\n'"_s;
523 case 11:
524 return "Invalid character: '\\v'"_s;
525 case 13:
526 return "Invalid character: '\\r'"_s;
527 case 35:
528 return "Invalid character: '#'"_s;
529 case 64:
530 return "Invalid character: '@'"_s;
531 case 96:
532 return "Invalid character: '`'"_s;
533 default:
534 return makeString("Invalid character '\\u", hex(m_current, 4, Lowercase), '\'');
535 }
536}
537
538template <typename T>
539ALWAYS_INLINE const T* Lexer<T>::currentSourcePtr() const
540{
541 ASSERT(m_code <= m_codeEnd);
542 return m_code;
543}
544
545template <typename T>
546void Lexer<T>::setCode(const SourceCode& source, ParserArena* arena)
547{
548 m_arena = &arena->identifierArena();
549
550 m_lineNumber = source.firstLine().oneBasedInt();
551 m_lastToken = -1;
552
553 StringView sourceString = source.provider()->source();
554
555 if (!sourceString.isNull())
556 setCodeStart(sourceString);
557 else
558 m_codeStart = 0;
559
560 m_source = &source;
561 m_sourceOffset = source.startOffset();
562 m_codeStartPlusOffset = m_codeStart + source.startOffset();
563 m_code = m_codeStartPlusOffset;
564 m_codeEnd = m_codeStart + source.endOffset();
565 m_error = false;
566 m_atLineStart = true;
567 m_lineStart = m_code;
568 m_lexErrorMessage = String();
569 m_sourceURLDirective = String();
570 m_sourceMappingURLDirective = String();
571
572 m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
573 m_buffer16.reserveInitialCapacity(initialReadBufferCapacity);
574 m_bufferForRawTemplateString16.reserveInitialCapacity(initialReadBufferCapacity);
575
576 if (LIKELY(m_code < m_codeEnd))
577 m_current = *m_code;
578 else
579 m_current = 0;
580 ASSERT(currentOffset() == source.startOffset());
581}
582
// Advances the cursor by shiftAmount characters and reloads m_current from the new position.
// Unlike shift(), the new position is dereferenced without comparing it to m_codeEnd.
// NOTE(review): presumably callers only use this when at least shiftAmount characters are
// known to remain -- confirm against call sites.
template <typename T>
template <int shiftAmount> ALWAYS_INLINE void Lexer<T>::internalShift()
{
    m_code += shiftAmount;
    ASSERT(currentOffset() >= currentLineStartOffset());
    m_current = *m_code;
}
590
// Advances the cursor by one character. m_current becomes the character at the new position,
// or 0 when the new position is at or beyond m_codeEnd.
template <typename T>
ALWAYS_INLINE void Lexer<T>::shift()
{
    // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
    m_current = 0;
    ++m_code;
    if (LIKELY(m_code < m_codeEnd))
        m_current = *m_code;
}
600
601template <typename T>
602ALWAYS_INLINE bool Lexer<T>::atEnd() const
603{
604 ASSERT(!m_current || m_code < m_codeEnd);
605 return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
606}
607
608template <typename T>
609ALWAYS_INLINE T Lexer<T>::peek(int offset) const
610{
611 ASSERT(offset > 0 && offset < 5);
612 const T* code = m_code + offset;
613 return (code < m_codeEnd) ? *code : 0;
614}
615
616struct ParsedUnicodeEscapeValue {
617 ParsedUnicodeEscapeValue(UChar32 value)
618 : m_value(value)
619 {
620 ASSERT(isValid());
621 }
622
623 enum SpecialValueType { Incomplete = -2, Invalid = -1 };
624 ParsedUnicodeEscapeValue(SpecialValueType type)
625 : m_value(type)
626 {
627 }
628
629 bool isValid() const { return m_value >= 0; }
630 bool isIncomplete() const { return m_value == Incomplete; }
631
632 UChar32 value() const
633 {
634 ASSERT(isValid());
635 return m_value;
636 }
637
638private:
639 UChar32 m_value;
640};
641
// Parses the remainder of a unicode escape, with the cursor positioned just after the 'u'.
// Handles both the braced form (\u{H...}) and the fixed four-digit form (\uHHHH).
// Returns the code point on success; otherwise returns Invalid, or Incomplete when the
// input ran out before the escape could be finished. On failure the hex digits seen so far
// are consumed (see the NotEscapeSequence notes below).
template<typename CharacterType>
ParsedUnicodeEscapeValue Lexer<CharacterType>::parseUnicodeEscape()
{
    if (m_current == '{') {
        shift();
        UChar32 codePoint = 0;
        do {
            // shift() leaves m_current at 0 once the end is reached, so a zero here is
            // reported as an incomplete escape rather than an invalid one.
            if (!isASCIIHexDigit(m_current))
                return m_current ? ParsedUnicodeEscapeValue::Invalid : ParsedUnicodeEscapeValue::Incomplete;
            codePoint = (codePoint << 4) | toASCIIHexValue(m_current);
            if (codePoint > UCHAR_MAX_VALUE) {
                // For raw template literal syntax, we consume `NotEscapeSequence`.
                // Here, we consume NotCodePoint's HexDigits.
                //
                // NotEscapeSequence ::
                //     u { [lookahead not one of HexDigit]
                //     u { NotCodePoint
                //     u { CodePoint [lookahead != }]
                //
                // NotCodePoint ::
                //     HexDigits but not if MV of HexDigits <= 0x10FFFF
                //
                // CodePoint ::
                //     HexDigits but not if MV of HexDigits > 0x10FFFF
                shift();
                while (isASCIIHexDigit(m_current))
                    shift();

                return atEnd() ? ParsedUnicodeEscapeValue::Incomplete : ParsedUnicodeEscapeValue::Invalid;
            }
            shift();
        } while (m_current != '}');
        shift();
        return codePoint;
    }

    auto character2 = peek(1);
    auto character3 = peek(2);
    auto character4 = peek(3);
    if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(character2) || !isASCIIHexDigit(character3) || !isASCIIHexDigit(character4))) {
        // NOTE(review): with `>=`, exactly four remaining (non-hex) characters are also
        // reported as Incomplete -- confirm that is intended.
        auto result = (m_code + 4) >= m_codeEnd ? ParsedUnicodeEscapeValue::Incomplete : ParsedUnicodeEscapeValue::Invalid;

        // For raw template literal syntax, we consume `NotEscapeSequence`.
        //
        // NotEscapeSequence ::
        //     u [lookahead not one of HexDigit][lookahead != {]
        //     u HexDigit [lookahead not one of HexDigit]
        //     u HexDigit HexDigit [lookahead not one of HexDigit]
        //     u HexDigit HexDigit HexDigit [lookahead not one of HexDigit]
        while (isASCIIHexDigit(m_current))
            shift();

        return result;
    }

    // All four digits are valid: convert them, then consume the four characters.
    auto result = convertUnicode(m_current, character2, character3, character4);
    shift();
    shift();
    shift();
    shift();
    return result;
}
704
705template <typename T>
706void Lexer<T>::shiftLineTerminator()
707{
708 ASSERT(isLineTerminator(m_current));
709
710 m_positionBeforeLastNewline = currentPosition();
711 T prev = m_current;
712 shift();
713
714 if (prev == '\r' && m_current == '\n')
715 shift();
716
717 ++m_lineNumber;
718}
719
720template <typename T>
721ALWAYS_INLINE bool Lexer<T>::lastTokenWasRestrKeyword() const
722{
723 return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
724}
725
726template <typename T>
727ALWAYS_INLINE void Lexer<T>::skipWhitespace()
728{
729 while (isWhiteSpace(m_current))
730 shift();
731}
732
733static NEVER_INLINE bool isNonLatin1IdentStart(UChar c)
734{
735 return u_hasBinaryProperty(c, UCHAR_ID_START);
736}
737
738static inline bool isIdentStart(LChar c)
739{
740 return typesOfLatin1Characters[c] == CharacterIdentifierStart;
741}
742
743static inline bool isIdentStart(UChar32 c)
744{
745 return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
746}
747
748static NEVER_INLINE bool isNonLatin1IdentPart(UChar32 c)
749{
750 return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == 0x200C || c == 0x200D;
751}
752
753static ALWAYS_INLINE bool isIdentPart(LChar c)
754{
755 // Character types are divided into two groups depending on whether they can be part of an
756 // identifier or not. Those whose type value is less or equal than CharacterOtherIdentifierPart can be
757 // part of an identifier. (See the CharacterType definition for more details.)
758 return typesOfLatin1Characters[c] <= CharacterOtherIdentifierPart;
759}
760
761static ALWAYS_INLINE bool isIdentPart(UChar32 c)
762{
763 return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
764}
765
766static ALWAYS_INLINE bool isIdentPart(UChar c)
767{
768 return isIdentPart(static_cast<UChar32>(c));
769}
770
771template<typename CharacterType> ALWAYS_INLINE bool isIdentPartIncludingEscapeTemplate(const CharacterType* code, const CharacterType* codeEnd)
772{
773 if (isIdentPart(code[0]))
774 return true;
775
776 // Shortest sequence handled below is \u{0}, which is 5 characters.
777 if (!(code[0] == '\\' && codeEnd - code >= 5 && code[1] == 'u'))
778 return false;
779
780 if (code[2] == '{') {
781 UChar32 codePoint = 0;
782 const CharacterType* pointer;
783 for (pointer = &code[3]; pointer < codeEnd; ++pointer) {
784 auto digit = *pointer;
785 if (!isASCIIHexDigit(digit))
786 break;
787 codePoint = (codePoint << 4) | toASCIIHexValue(digit);
788 if (codePoint > UCHAR_MAX_VALUE)
789 return false;
790 }
791 return isIdentPart(codePoint) && pointer < codeEnd && *pointer == '}';
792 }
793
794 // Shortest sequence handled below is \uXXXX, which is 6 characters.
795 if (codeEnd - code < 6)
796 return false;
797
798 auto character1 = code[2];
799 auto character2 = code[3];
800 auto character3 = code[4];
801 auto character4 = code[5];
802 return isASCIIHexDigit(character1) && isASCIIHexDigit(character2) && isASCIIHexDigit(character3) && isASCIIHexDigit(character4)
803 && isIdentPart(Lexer<LChar>::convertUnicode(character1, character2, character3, character4));
804}
805
806static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd)
807{
808 return isIdentPartIncludingEscapeTemplate(code, codeEnd);
809}
810
811static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd)
812{
813 return isIdentPartIncludingEscapeTemplate(code, codeEnd);
814}
815
816static inline LChar singleEscape(int c)
817{
818 if (c < 128) {
819 ASSERT(static_cast<size_t>(c) < WTF_ARRAY_LENGTH(singleCharacterEscapeValuesForASCII));
820 return singleCharacterEscapeValuesForASCII[c];
821 }
822 return 0;
823}
824
825template <typename T>
826inline void Lexer<T>::record8(int c)
827{
828 ASSERT(c >= 0);
829 ASSERT(c <= 0xFF);
830 m_buffer8.append(static_cast<LChar>(c));
831}
832
// Debug-only check that `c` lies in [0, 0xFF].
template <typename T>
inline void assertCharIsIn8BitRange(T c)
{
    UNUSED_PARAM(c);
    ASSERT(c >= 0 && c <= 0xFF);
}
840
841template <>
842inline void assertCharIsIn8BitRange(UChar c)
843{
844 UNUSED_PARAM(c);
845 ASSERT(c <= 0xFF);
846}
847
// LChar specialization: nothing is checked.
template <>
inline void assertCharIsIn8BitRange(LChar)
{
}
852
853template <typename T>
854inline void Lexer<T>::append8(const T* p, size_t length)
855{
856 size_t currentSize = m_buffer8.size();
857 m_buffer8.grow(currentSize + length);
858 LChar* rawBuffer = m_buffer8.data() + currentSize;
859
860 for (size_t i = 0; i < length; i++) {
861 T c = p[i];
862 assertCharIsIn8BitRange(c);
863 rawBuffer[i] = c;
864 }
865}
866
867template <typename T>
868inline void Lexer<T>::append16(const LChar* p, size_t length)
869{
870 size_t currentSize = m_buffer16.size();
871 m_buffer16.grow(currentSize + length);
872 UChar* rawBuffer = m_buffer16.data() + currentSize;
873
874 for (size_t i = 0; i < length; i++)
875 rawBuffer[i] = p[i];
876}
877
// Appends one source character to the 16-bit token buffer.
template <typename T>
inline void Lexer<T>::record16(T c)
{
    m_buffer16.append(c);
}
883
884template <typename T>
885inline void Lexer<T>::record16(int c)
886{
887 ASSERT(c >= 0);
888 ASSERT(c <= static_cast<int>(USHRT_MAX));
889 m_buffer16.append(static_cast<UChar>(c));
890}
891
892template<typename CharacterType> inline void Lexer<CharacterType>::recordUnicodeCodePoint(UChar32 codePoint)
893{
894 ASSERT(codePoint >= 0);
895 ASSERT(codePoint <= UCHAR_MAX_VALUE);
896 if (U_IS_BMP(codePoint))
897 record16(codePoint);
898 else {
899 UChar codeUnits[2] = { U16_LEAD(codePoint), U16_TRAIL(codePoint) };
900 append16(codeUnits, 2);
901 }
902}
903
904#if !ASSERT_DISABLED
905bool isSafeBuiltinIdentifier(VM& vm, const Identifier* ident)
906{
907 if (!ident)
908 return true;
909 /* Just block any use of suspicious identifiers. This is intended to
910 * be used as a safety net while implementing builtins.
911 */
912 // FIXME: How can a debug-only assertion be a safety net?
913 if (*ident == vm.propertyNames->builtinNames().callPublicName())
914 return false;
915 if (*ident == vm.propertyNames->builtinNames().applyPublicName())
916 return false;
917 if (*ident == vm.propertyNames->eval)
918 return false;
919 if (*ident == vm.propertyNames->Function)
920 return false;
921 return true;
922}
923#endif
924
// 8-bit fast path for lexing an identifier or keyword starting at the cursor.
// - When at least maxTokenLength characters remain and reserved words are not ignored,
//   parseKeyword() gets the first chance to recognise a keyword.
// - Otherwise the identifier characters are consumed directly; a backslash hands the work to
//   parseIdentifierSlowCase() after rewinding to the identifier start.
// - While parsing builtin functions, a leading '@' marks a private name, and identifiers
//   rejected by isSafeBuiltinIdentifier() produce ERRORTOK.
// Returns the token type; tokenData->ident receives the identifier (or nullptr when no
// identifier is created).
template <>
template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<LChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
{
    tokenData->escaped = false;
    const ptrdiff_t remaining = m_codeEnd - m_code;
    if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
        JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
        if (keyword != IDENT) {
            ASSERT((!shouldCreateIdentifier) || tokenData->ident);
            // RESERVED_IF_STRICT words are plain identifiers outside strict mode.
            return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
        }
    }

    // '@' introduces a private name, but only while parsing builtin functions.
    bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
    if (isPrivateName)
        shift();

    const LChar* identifierStart = currentSourcePtr();
    unsigned identifierLineStart = currentLineStartOffset();

    while (isIdentPart(m_current))
        shift();

    if (UNLIKELY(m_current == '\\')) {
        // An escape was found: rewind to the identifier start and use the slow path.
        setOffsetFromSourcePtr(identifierStart, identifierLineStart);
        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
    }

    const Identifier* ident = nullptr;

    if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
        int identifierLength = currentSourcePtr() - identifierStart;
        ident = makeIdentifier(identifierStart, identifierLength);
        if (m_parsingBuiltinFunction) {
            if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) {
                m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
                return ERRORTOK;
            }
            if (isPrivateName)
                ident = &m_arena->makeIdentifier(m_vm, m_vm->propertyNames->lookUpPrivateName(*ident));
            else if (*ident == m_vm->propertyNames->undefinedKeyword)
                // NOTE(review): this store is overwritten by the unconditional
                // `tokenData->ident = ident` below, so it has no lasting effect -- confirm
                // whether `ident` itself was meant to be replaced here.
                tokenData->ident = &m_vm->propertyNames->undefinedPrivateName;
            if (!ident)
                return INVALID_PRIVATE_NAME_ERRORTOK;
        }
        tokenData->ident = ident;
    } else
        tokenData->ident = nullptr;

    // Near the end of the source parseKeyword() was skipped above, so keywords are resolved
    // here through the keyword table instead.
    if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) {
        ASSERT(shouldCreateIdentifier);
        // NOTE(review): this inner test repeats a condition already established by the
        // enclosing `if`, so the trailing `return IDENT` in this block looks unreachable.
        if (remaining < maxTokenLength) {
            const HashTableValue* entry = JSC::mainTable.entry(*ident);
            ASSERT((remaining < maxTokenLength) || !entry);
            if (!entry)
                return IDENT;
            JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
            return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
        }
        return IDENT;
    }

    return IDENT;
}
989
// Fast-path identifier lexer for 16-bit (UChar) source text.
//
// Consumes the identifier starting at m_current and returns its token type. tokenData->escaped is
// cleared. tokenData->ident receives the identifier when shouldCreateIdentifier is set or when a
// builtin function is being lexed, and nullptr otherwise. If a backslash is met while scanning, the
// lexer rewinds to the start of the identifier and defers to parseIdentifierSlowCase().
//
// lexerFlags: LexerFlagsIgnoreReservedWords disables all keyword recognition.
// strictMode: decides whether a RESERVED_IF_STRICT keyword is reported as such or as IDENT.
template <>
template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
{
    tokenData->escaped = false;
    const ptrdiff_t remaining = m_codeEnd - m_code;
    // Keyword fast path, only taken when at least maxTokenLength characters remain.
    // NOTE(review): presumably parseKeyword reads ahead without bounds checks, hence the guard -- confirm.
    if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
        JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
        if (keyword != IDENT) {
            ASSERT((!shouldCreateIdentifier) || tokenData->ident);
            // Words reserved only in strict mode are plain identifiers in sloppy mode.
            return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
        }
    }

    // Inside builtin functions a leading '@' introduces a private name; the '@' itself is skipped.
    bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
    if (isPrivateName)
        shift();

    const UChar* identifierStart = currentSourcePtr();
    int identifierLineStart = currentLineStartOffset();

    // OR of every character consumed; used below to tell whether all of them fit in 8 bits.
    UChar orAllChars = 0;

    while (isIdentPart(m_current)) {
        orAllChars |= m_current;
        shift();
    }

    // A backslash means the identifier contains an escape: rewind and let the slow case re-lex it.
    if (UNLIKELY(m_current == '\\')) {
        ASSERT(!isPrivateName);
        setOffsetFromSourcePtr(identifierStart, identifierLineStart);
        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
    }

    bool isAll8Bit = false;

    // No bit above 0xff was set in any character, so the identifier can be built from 8-bit data.
    if (!(orAllChars & ~0xff))
        isAll8Bit = true;

    const Identifier* ident = nullptr;

    // Builtin functions always need the identifier, because it is validated below.
    if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
        int identifierLength = currentSourcePtr() - identifierStart;
        if (isAll8Bit)
            ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
        else
            ident = makeIdentifier(identifierStart, identifierLength);
        if (m_parsingBuiltinFunction) {
            // Only identifiers accepted by isSafeBuiltinIdentifier, or private names, may appear in builtins.
            if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) {
                m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
                return ERRORTOK;
            }
            if (isPrivateName)
                ident = &m_arena->makeIdentifier(m_vm, m_vm->propertyNames->lookUpPrivateName(*ident));
            else if (*ident == m_vm->propertyNames->undefinedKeyword)
                // NOTE(review): this store is overwritten by the unconditional
                // `tokenData->ident = ident` below, so it currently has no effect -- confirm intent.
                tokenData->ident = &m_vm->propertyNames->undefinedPrivateName;
            if (!ident)
                return INVALID_PRIVATE_NAME_ERRORTOK;
        }
        tokenData->ident = ident;
    } else
        tokenData->ident = nullptr;

    // Keyword lookup for the case where the parseKeyword fast path above was skipped because fewer
    // than maxTokenLength characters remained. Private names are never keywords.
    if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) {
        ASSERT(shouldCreateIdentifier);
        // This inner test repeats the condition already established by the enclosing if.
        if (remaining < maxTokenLength) {
            const HashTableValue* entry = JSC::mainTable.entry(*ident);
            ASSERT((remaining < maxTokenLength) || !entry);
            if (!entry)
                return IDENT;
            JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
            return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
        }
        return IDENT;
    }

    return IDENT;
}
1067
1068template<typename CharacterType> template<bool shouldCreateIdentifier> JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
1069{
1070 tokenData->escaped = true;
1071 auto identifierStart = currentSourcePtr();
1072 bool bufferRequired = false;
1073
1074 while (true) {
1075 if (LIKELY(isIdentPart(m_current))) {
1076 shift();
1077 continue;
1078 }
1079 if (LIKELY(m_current != '\\'))
1080 break;
1081
1082 // \uXXXX unicode characters.
1083 bufferRequired = true;
1084 if (identifierStart != currentSourcePtr())
1085 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
1086 shift();
1087 if (UNLIKELY(m_current != 'u'))
1088 return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
1089 shift();
1090 auto character = parseUnicodeEscape();
1091 if (UNLIKELY(!character.isValid()))
1092 return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
1093 if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character.value()) : !isIdentStart(character.value())))
1094 return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
1095 if (shouldCreateIdentifier)
1096 recordUnicodeCodePoint(character.value());
1097 identifierStart = currentSourcePtr();
1098 }
1099
1100 int identifierLength;
1101 const Identifier* ident = nullptr;
1102 if (shouldCreateIdentifier) {
1103 if (!bufferRequired) {
1104 identifierLength = currentSourcePtr() - identifierStart;
1105 ident = makeIdentifier(identifierStart, identifierLength);
1106 } else {
1107 if (identifierStart != currentSourcePtr())
1108 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
1109 ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1110 }
1111
1112 tokenData->ident = ident;
1113 } else
1114 tokenData->ident = nullptr;
1115
1116 m_buffer16.shrink(0);
1117
1118 if (LIKELY(!(lexerFlags & LexerFlagsIgnoreReservedWords))) {
1119 ASSERT(shouldCreateIdentifier);
1120 const HashTableValue* entry = JSC::mainTable.entry(*ident);
1121 if (!entry)
1122 return IDENT;
1123 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
1124 if ((token != RESERVED_IF_STRICT) || strictMode)
1125 return bufferRequired ? UNEXPECTED_ESCAPE_ERRORTOK : token;
1126 }
1127
1128 return IDENT;
1129}
1130
1131static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
1132{
1133 return character < 0xE;
1134}
1135
1136static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
1137{
1138 return character < 0xE || character > 0xFF;
1139}
1140
// Fast-path string literal lexer that accumulates the string contents in the 8-bit buffer.
//
// On entry m_current is the opening quote. The body is scanned until the matching quote is found;
// m_current is left on that closing quote. Single-character escapes, escaped line terminators and
// \xHH escapes are handled here. Any other escape, or any character flagged by
// characterRequiresParseStringSlowCase(), makes the lexer rewind to the opening quote, discard
// m_buffer8 and start over in parseStringSlowCase().
//
// When shouldBuildStrings is set, tokenData->ident receives the string contents; otherwise it is 0.
// Returns StringParsedSuccessfully, or StringUnterminated / StringCannotBeParsed for a bad \x escape
// (with m_lexErrorMessage set), or whatever parseStringSlowCase() returns.
template <typename T>
template <bool shouldBuildStrings> ALWAYS_INLINE typename Lexer<T>::StringParseResult Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
{
    // Remember where the literal started so the slow case can re-lex it from the opening quote.
    int startingOffset = currentOffset();
    int startingLineStartOffset = currentLineStartOffset();
    int startingLineNumber = lineNumber();
    T stringQuoteCharacter = m_current;
    shift();

    // Start of the current run of unescaped characters not yet copied into the buffer.
    const T* stringStart = currentSourcePtr();

    while (m_current != stringQuoteCharacter) {
        if (UNLIKELY(m_current == '\\')) {
            // Flush the pending run before handling the escape.
            if (stringStart != currentSourcePtr() && shouldBuildStrings)
                append8(stringStart, currentSourcePtr() - stringStart);
            shift();

            LChar escape = singleEscape(m_current);

            // Most common escape sequences first.
            if (escape) {
                if (shouldBuildStrings)
                    record8(escape);
                shift();
            } else if (UNLIKELY(isLineTerminator(m_current)))
                // Backslash followed by a line terminator: nothing is recorded.
                shiftLineTerminator();
            else if (m_current == 'x') {
                shift();
                if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
                    m_lexErrorMessage = "\\x can only be followed by a hex character sequence"_s;
                    // Running out of input (possibly after one hex digit) counts as unterminated.
                    return (atEnd() || (isASCIIHexDigit(m_current) && (m_code + 1 == m_codeEnd))) ? StringUnterminated : StringCannotBeParsed;
                }
                T prev = m_current;
                shift();
                if (shouldBuildStrings)
                    record8(convertHex(prev, m_current));
                shift();
            } else {
                // Any other escape: rewind to the opening quote and redo the literal in the slow case.
                setOffset(startingOffset, startingLineStartOffset);
                setLineNumber(startingLineNumber);
                m_buffer8.shrink(0);
                return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
            }
            stringStart = currentSourcePtr();
            continue;
        }

        // Characters the fast path cannot handle also force a restart in the slow case.
        if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
            setOffset(startingOffset, startingLineStartOffset);
            setLineNumber(startingLineNumber);
            m_buffer8.shrink(0);
            return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
        }

        shift();
    }

    // Flush the final run of unescaped characters.
    if (currentSourcePtr() != stringStart && shouldBuildStrings)
        append8(stringStart, currentSourcePtr() - stringStart);
    if (shouldBuildStrings) {
        tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size());
        m_buffer8.shrink(0);
    } else
        tokenData->ident = 0;

    return StringParsedSuccessfully;
}
1208
// Parses the part of an escape sequence that follows the backslash, for escapes that are neither a
// single-character escape nor an escaped line terminator (callers handle those first).
//
// Handles \xHH, \u escapes (via parseUnicodeEscape()), numeric escapes (only \0 in strict mode,
// octal sequences otherwise), and finally treats any other character as escaping itself. When
// shouldBuildStrings is set, the decoded value is recorded with record16() /
// recordUnicodeCodePoint().
//
// escapeParseMode: in String mode, "\u" immediately followed by stringQuoteCharacter is accepted as
// a literal 'u'.
// Returns StringParsedSuccessfully, or StringUnterminated / StringCannotBeParsed with
// m_lexErrorMessage set.
template <typename T>
template <bool shouldBuildStrings, LexerEscapeParseMode escapeParseMode> ALWAYS_INLINE auto Lexer<T>::parseComplexEscape(bool strictMode, T stringQuoteCharacter) -> StringParseResult
{
    if (m_current == 'x') {
        shift();
        if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
            // For raw template literal syntax, we consume `NotEscapeSequence`.
            //
            // NotEscapeSequence ::
            //  x [lookahead not one of HexDigit]
            //  x HexDigit [lookahead not one of HexDigit]
            if (isASCIIHexDigit(m_current))
                shift();
            ASSERT(!isASCIIHexDigit(m_current));

            m_lexErrorMessage = "\\x can only be followed by a hex character sequence"_s;
            return atEnd() ? StringUnterminated : StringCannotBeParsed;
        }

        // Two hex digits are available: combine them into one character.
        T prev = m_current;
        shift();
        if (shouldBuildStrings)
            record16(convertHex(prev, m_current));
        shift();

        return StringParsedSuccessfully;
    }

    if (m_current == 'u') {
        shift();

        // In a string literal, "\u" directly before the closing quote is kept as a plain 'u'.
        if (escapeParseMode == LexerEscapeParseMode::String && m_current == stringQuoteCharacter) {
            if (shouldBuildStrings)
                record16('u');
            return StringParsedSuccessfully;
        }

        auto character = parseUnicodeEscape();
        if (character.isValid()) {
            if (shouldBuildStrings)
                recordUnicodeCodePoint(character.value());
            return StringParsedSuccessfully;
        }

        m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence"_s;
        return atEnd() ? StringUnterminated : StringCannotBeParsed;
    }

    if (strictMode) {
        if (isASCIIDigit(m_current)) {
            // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
            int character1 = m_current;
            shift();
            if (character1 != '0' || isASCIIDigit(m_current)) {
                // For raw template literal syntax, we consume `NotEscapeSequence`.
                //
                // NotEscapeSequence ::
                //  0 DecimalDigit
                //  DecimalDigit but not 0
                if (character1 == '0')
                    shift();

                m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'"_s;
                return atEnd() ? StringUnterminated : StringCannotBeParsed;
            }
            if (shouldBuildStrings)
                record16(0);
            return StringParsedSuccessfully;
        }
    } else {
        if (isASCIIOctalDigit(m_current)) {
            // Octal character sequences
            T character1 = m_current;
            shift();
            if (isASCIIOctalDigit(m_current)) {
                // Two octal characters
                T character2 = m_current;
                shift();
                // A third digit is only consumed when the first digit is 0-3.
                if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
                    if (shouldBuildStrings)
                        record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
                    shift();
                } else {
                    if (shouldBuildStrings)
                        record16((character1 - '0') * 8 + character2 - '0');
                }
            } else {
                if (shouldBuildStrings)
                    record16(character1 - '0');
            }
            return StringParsedSuccessfully;
        }
    }

    // Any other escaped character stands for itself.
    if (!atEnd()) {
        if (shouldBuildStrings)
            record16(m_current);
        shift();
        return StringParsedSuccessfully;
    }

    m_lexErrorMessage = "Unterminated string constant"_s;
    return StringUnterminated;
}
1313
// General string literal lexer that accumulates the string contents in the 16-bit buffer.
//
// On entry m_current is the opening quote. The body is scanned until the matching quote is found;
// m_current is left on that closing quote. Escapes other than single-character escapes and escaped
// line terminators are delegated to parseComplexEscape() in String mode.
//
// When shouldBuildStrings is set, tokenData->ident receives the string contents; otherwise it is 0.
// Returns StringParsedSuccessfully, a failure propagated from parseComplexEscape(), or
// StringUnterminated / StringCannotBeParsed when the input ends or an unescaped '\r' / '\n' is met.
template <typename T>
template <bool shouldBuildStrings> auto Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode) -> StringParseResult
{
    T stringQuoteCharacter = m_current;
    shift();

    // Start of the current run of unescaped characters not yet copied into the buffer.
    const T* stringStart = currentSourcePtr();

    while (m_current != stringQuoteCharacter) {
        if (UNLIKELY(m_current == '\\')) {
            // Flush the pending run before handling the escape.
            if (stringStart != currentSourcePtr() && shouldBuildStrings)
                append16(stringStart, currentSourcePtr() - stringStart);
            shift();

            LChar escape = singleEscape(m_current);

            // Most common escape sequences first
            if (escape) {
                if (shouldBuildStrings)
                    record16(escape);
                shift();
            } else if (UNLIKELY(isLineTerminator(m_current)))
                // Backslash followed by a line terminator: nothing is recorded.
                shiftLineTerminator();
            else {
                StringParseResult result = parseComplexEscape<shouldBuildStrings, LexerEscapeParseMode::String>(strictMode, stringQuoteCharacter);
                if (result != StringParsedSuccessfully)
                    return result;
            }

            stringStart = currentSourcePtr();
            continue;
        }
        // Fast check for characters that require special handling.
        // Catches 0, \n, and \r as efficiently as possible, and lets through all common ASCII characters.
        static_assert(std::is_unsigned<T>::value, "Lexer expects an unsigned character type");
        if (UNLIKELY(m_current < 0xE)) {
            // New-line or end of input is not allowed
            if (atEnd() || m_current == '\r' || m_current == '\n') {
                m_lexErrorMessage = "Unexpected EOF"_s;
                return atEnd() ? StringUnterminated : StringCannotBeParsed;
            }
            // Anything else is just a normal character
        }
        shift();
    }

    // Flush the final run of unescaped characters.
    if (currentSourcePtr() != stringStart && shouldBuildStrings)
        append16(stringStart, currentSourcePtr() - stringStart);
    if (shouldBuildStrings)
        tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    else
        tokenData->ident = 0;

    m_buffer16.shrink(0);
    return StringParsedSuccessfully;
}
1370
// Lexes one template literal segment, from m_current up to either the closing backquote or the
// next "${".
//
// The cooked string is accumulated in m_buffer16. When rawStringsBuildMode is BuildRawStrings, the
// raw string is accumulated in m_bufferForRawTemplateString16. In both, <CR> and <CR><LF> are
// normalized to <LF>. Escapes are always parsed with strict-mode rules.
//
// Outputs on success:
//   tokenData->cooked - the cooked string, or nullptr if an escape could not be parsed while raw
//                       strings are being built.
//   tokenData->raw    - the raw string, or nullptr when raw strings are not being built.
//   tokenData->isTail - true when the segment ended at the closing backquote.
// The terminator (the backquote, or "${") is consumed before returning.
//
// Returns StringParsedSuccessfully, StringUnterminated at end of input, or the failure reported by
// parseComplexEscape() when it is not tolerated.
template <typename T>
typename Lexer<T>::StringParseResult Lexer<T>::parseTemplateLiteral(JSTokenData* tokenData, RawStringsBuildMode rawStringsBuildMode)
{
    bool parseCookedFailed = false;
    // Starts of the runs not yet copied into the cooked and the raw buffer respectively.
    const T* stringStart = currentSourcePtr();
    const T* rawStringStart = currentSourcePtr();

    while (m_current != '`') {
        if (UNLIKELY(m_current == '\\')) {
            // Flush the pending cooked run before handling the escape.
            if (stringStart != currentSourcePtr())
                append16(stringStart, currentSourcePtr() - stringStart);
            shift();

            LChar escape = singleEscape(m_current);

            // Most common escape sequences first.
            if (escape) {
                record16(escape);
                shift();
            } else if (UNLIKELY(isLineTerminator(m_current))) {
                // Normalize <CR>, <CR><LF> to <LF>.
                if (m_current == '\r') {
                    ASSERT_WITH_MESSAGE(rawStringStart != currentSourcePtr(), "We should have at least shifted the escape.");

                    // The raw string keeps the backslash but gets '\n' in place of the <CR> sequence.
                    if (rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings) {
                        m_bufferForRawTemplateString16.append(rawStringStart, currentSourcePtr() - rawStringStart);
                        m_bufferForRawTemplateString16.append('\n');
                    }

                    shiftLineTerminator();
                    rawStringStart = currentSourcePtr();
                } else
                    shiftLineTerminator();
            } else {
                bool strictMode = true;
                StringParseResult result = parseComplexEscape<true, LexerEscapeParseMode::Template>(strictMode, '`');
                if (result != StringParsedSuccessfully) {
                    // When raw strings are being built, an unparsable escape only invalidates the
                    // cooked string; scanning continues so the raw string can still be produced.
                    if (rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings && result == StringCannotBeParsed)
                        parseCookedFailed = true;
                    else
                        return result;
                }
            }

            stringStart = currentSourcePtr();
            continue;
        }

        // "${" starts a substitution and ends this segment.
        if (m_current == '$' && peek(1) == '{')
            break;

        // Fast check for characters that require special handling.
        // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
        // as possible, and lets through all common ASCII characters.
        if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
            // End of input is not allowed.
            // Unlike String, line terminator is allowed.
            if (atEnd()) {
                m_lexErrorMessage = "Unexpected EOF"_s;
                return StringUnterminated;
            }

            if (isLineTerminator(m_current)) {
                if (m_current == '\r') {
                    // Normalize <CR>, <CR><LF> to <LF>.
                    if (stringStart != currentSourcePtr())
                        append16(stringStart, currentSourcePtr() - stringStart);
                    if (rawStringStart != currentSourcePtr() && rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings)
                        m_bufferForRawTemplateString16.append(rawStringStart, currentSourcePtr() - rawStringStart);

                    record16('\n');
                    if (rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings)
                        m_bufferForRawTemplateString16.append('\n');
                    shiftLineTerminator();
                    stringStart = currentSourcePtr();
                    rawStringStart = currentSourcePtr();
                } else
                    shiftLineTerminator();
                continue;
            }
            // Anything else is just a normal character
        }

        shift();
    }

    // The loop ends on either the closing backquote or on "${".
    bool isTail = m_current == '`';

    // Flush the final cooked and raw runs.
    if (currentSourcePtr() != stringStart)
        append16(stringStart, currentSourcePtr() - stringStart);
    if (rawStringStart != currentSourcePtr() && rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings)
        m_bufferForRawTemplateString16.append(rawStringStart, currentSourcePtr() - rawStringStart);

    if (!parseCookedFailed)
        tokenData->cooked = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    else
        tokenData->cooked = nullptr;

    // Line terminator normalization (e.g. <CR> => <LF>) should be applied to both the raw and cooked representations.
    if (rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings)
        tokenData->raw = makeIdentifier(m_bufferForRawTemplateString16.data(), m_bufferForRawTemplateString16.size());
    else
        tokenData->raw = nullptr;

    tokenData->isTail = isTail;

    m_buffer16.shrink(0);
    m_bufferForRawTemplateString16.shrink(0);

    if (isTail) {
        // Skip `
        shift();
    } else {
        // Skip $ and {
        shift();
        shift();
    }

    return StringParsedSuccessfully;
}
1491
1492template <typename T>
1493ALWAYS_INLINE auto Lexer<T>::parseHex() -> NumberParseResult
1494{
1495 // Optimization: most hexadecimal values fit into 4 bytes.
1496 uint32_t hexValue = 0;
1497 int maximumDigits = 7;
1498
1499 do {
1500 hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
1501 shift();
1502 --maximumDigits;
1503 } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
1504
1505 if (LIKELY(maximumDigits >= 0 && m_current != 'n'))
1506 return hexValue;
1507
1508 // No more place in the hexValue buffer.
1509 // The values are shifted out and placed into the m_buffer8 vector.
1510 for (int i = 0; i < 8; ++i) {
1511 int digit = hexValue >> 28;
1512 if (digit < 10)
1513 record8(digit + '0');
1514 else
1515 record8(digit - 10 + 'a');
1516 hexValue <<= 4;
1517 }
1518
1519 while (isASCIIHexDigit(m_current)) {
1520 record8(m_current);
1521 shift();
1522 }
1523
1524 if (UNLIKELY(Options::useBigInt() && m_current == 'n'))
1525 return makeIdentifier(m_buffer8.data(), m_buffer8.size());
1526
1527 return parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
1528}
1529
1530template <typename T>
1531ALWAYS_INLINE auto Lexer<T>::parseBinary() -> Optional<NumberParseResult>
1532{
1533 // Optimization: most binary values fit into 4 bytes.
1534 uint32_t binaryValue = 0;
1535 const unsigned maximumDigits = 32;
1536 int digit = maximumDigits - 1;
1537 // Temporary buffer for the digits. Makes easier
1538 // to reconstruct the input characters when needed.
1539 LChar digits[maximumDigits];
1540
1541 do {
1542 binaryValue = (binaryValue << 1) + (m_current - '0');
1543 digits[digit] = m_current;
1544 shift();
1545 --digit;
1546 } while (isASCIIBinaryDigit(m_current) && digit >= 0);
1547
1548 if (LIKELY(!isASCIIDigit(m_current) && digit >= 0 && m_current != 'n'))
1549 return Variant<double, const Identifier*> { binaryValue };
1550
1551 for (int i = maximumDigits - 1; i > digit; --i)
1552 record8(digits[i]);
1553
1554 while (isASCIIBinaryDigit(m_current)) {
1555 record8(m_current);
1556 shift();
1557 }
1558
1559 if (UNLIKELY(Options::useBigInt() && m_current == 'n'))
1560 return Variant<double, const Identifier*> { makeIdentifier(m_buffer8.data(), m_buffer8.size()) };
1561
1562 if (isASCIIDigit(m_current))
1563 return WTF::nullopt;
1564
1565 return Variant<double, const Identifier*> { parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 2) };
1566}
1567
1568template <typename T>
1569ALWAYS_INLINE auto Lexer<T>::parseOctal() -> Optional<NumberParseResult>
1570{
1571 // Optimization: most octal values fit into 4 bytes.
1572 uint32_t octalValue = 0;
1573 const unsigned maximumDigits = 10;
1574 int digit = maximumDigits - 1;
1575 // Temporary buffer for the digits. Makes easier
1576 // to reconstruct the input characters when needed.
1577 LChar digits[maximumDigits];
1578
1579 do {
1580 octalValue = octalValue * 8 + (m_current - '0');
1581 digits[digit] = m_current;
1582 shift();
1583 --digit;
1584 } while (isASCIIOctalDigit(m_current) && digit >= 0);
1585
1586 if (LIKELY(!isASCIIDigit(m_current) && digit >= 0 && m_current != 'n'))
1587 return Variant<double, const Identifier*> { octalValue };
1588
1589
1590 for (int i = maximumDigits - 1; i > digit; --i)
1591 record8(digits[i]);
1592
1593 while (isASCIIOctalDigit(m_current)) {
1594 record8(m_current);
1595 shift();
1596 }
1597
1598 if (UNLIKELY(Options::useBigInt() && m_current == 'n'))
1599 return Variant<double, const Identifier*> { makeIdentifier(m_buffer8.data(), m_buffer8.size()) };
1600
1601 if (isASCIIDigit(m_current))
1602 return WTF::nullopt;
1603
1604 return Variant<double, const Identifier*> { parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8) };
1605}
1606
1607template <typename T>
1608ALWAYS_INLINE auto Lexer<T>::parseDecimal() -> Optional<NumberParseResult>
1609{
1610 // Optimization: most decimal values fit into 4 bytes.
1611 uint32_t decimalValue = 0;
1612
1613 // Since parseOctal may be executed before parseDecimal,
1614 // the m_buffer8 may hold ascii digits.
1615 if (!m_buffer8.size()) {
1616 const unsigned maximumDigits = 10;
1617 int digit = maximumDigits - 1;
1618 // Temporary buffer for the digits. Makes easier
1619 // to reconstruct the input characters when needed.
1620 LChar digits[maximumDigits];
1621
1622 do {
1623 decimalValue = decimalValue * 10 + (m_current - '0');
1624 digits[digit] = m_current;
1625 shift();
1626 --digit;
1627 } while (isASCIIDigit(m_current) && digit >= 0);
1628
1629 if (digit >= 0 && m_current != '.' && !isASCIIAlphaCaselessEqual(m_current, 'e') && m_current != 'n')
1630 return Variant<double, const Identifier*> { decimalValue };
1631
1632 for (int i = maximumDigits - 1; i > digit; --i)
1633 record8(digits[i]);
1634 }
1635
1636 while (isASCIIDigit(m_current)) {
1637 record8(m_current);
1638 shift();
1639 }
1640
1641 if (UNLIKELY(Options::useBigInt() && m_current == 'n'))
1642 return Variant<double, const Identifier*> { makeIdentifier(m_buffer8.data(), m_buffer8.size()) };
1643
1644 return WTF::nullopt;
1645}
1646
1647template <typename T>
1648ALWAYS_INLINE void Lexer<T>::parseNumberAfterDecimalPoint()
1649{
1650 record8('.');
1651 while (isASCIIDigit(m_current)) {
1652 record8(m_current);
1653 shift();
1654 }
1655}
1656
1657template <typename T>
1658ALWAYS_INLINE bool Lexer<T>::parseNumberAfterExponentIndicator()
1659{
1660 record8('e');
1661 shift();
1662 if (m_current == '+' || m_current == '-') {
1663 record8(m_current);
1664 shift();
1665 }
1666
1667 if (!isASCIIDigit(m_current))
1668 return false;
1669
1670 do {
1671 record8(m_current);
1672 shift();
1673 } while (isASCIIDigit(m_current));
1674 return true;
1675}
1676
1677template <typename T>
1678ALWAYS_INLINE bool Lexer<T>::parseMultilineComment()
1679{
1680 while (true) {
1681 while (UNLIKELY(m_current == '*')) {
1682 shift();
1683 if (m_current == '/') {
1684 shift();
1685 return true;
1686 }
1687 }
1688
1689 if (atEnd())
1690 return false;
1691
1692 if (isLineTerminator(m_current)) {
1693 shiftLineTerminator();
1694 m_hasLineTerminatorBeforeToken = true;
1695 } else
1696 shift();
1697 }
1698}
1699
1700template <typename T>
1701ALWAYS_INLINE void Lexer<T>::parseCommentDirective()
1702{
1703 // sourceURL and sourceMappingURL directives.
1704 if (!consume("source"))
1705 return;
1706
1707 if (consume("URL=")) {
1708 m_sourceURLDirective = parseCommentDirectiveValue();
1709 return;
1710 }
1711
1712 if (consume("MappingURL=")) {
1713 m_sourceMappingURLDirective = parseCommentDirectiveValue();
1714 return;
1715 }
1716}
1717
1718template <typename T>
1719ALWAYS_INLINE String Lexer<T>::parseCommentDirectiveValue()
1720{
1721 skipWhitespace();
1722 const T* stringStart = currentSourcePtr();
1723 while (!isWhiteSpace(m_current) && !isLineTerminator(m_current) && m_current != '"' && m_current != '\'' && !atEnd())
1724 shift();
1725 const T* stringEnd = currentSourcePtr();
1726 skipWhitespace();
1727
1728 if (!isLineTerminator(m_current) && !atEnd())
1729 return String();
1730
1731 append8(stringStart, stringEnd - stringStart);
1732 String result = String(m_buffer8.data(), m_buffer8.size());
1733 m_buffer8.shrink(0);
1734 return result;
1735}
1736
1737template <typename T>
1738template <unsigned length>
1739ALWAYS_INLINE bool Lexer<T>::consume(const char (&input)[length])
1740{
1741 unsigned lengthToCheck = length - 1; // Ignore the ending NULL byte in the string literal.
1742
1743 unsigned i = 0;
1744 for (; i < lengthToCheck && m_current == input[i]; i++)
1745 shift();
1746
1747 return i == lengthToCheck;
1748}
1749
1750template <typename T>
1751bool Lexer<T>::nextTokenIsColon()
1752{
1753 const T* code = m_code;
1754 while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
1755 code++;
1756
1757 return code < m_codeEnd && *code == ':';
1758}
1759
1760template <typename T>
1761void Lexer<T>::fillTokenInfo(JSToken* tokenRecord, JSTokenType token, int lineNumber, int endOffset, int lineStartOffset, JSTextPosition endPosition)
1762{
1763 JSTokenLocation* tokenLocation = &tokenRecord->m_location;
1764 tokenLocation->line = lineNumber;
1765 tokenLocation->endOffset = endOffset;
1766 tokenLocation->lineStartOffset = lineStartOffset;
1767 ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1768 tokenRecord->m_endPosition = endPosition;
1769 m_lastToken = token;
1770}
1771
1772template <typename T>
1773JSTokenType Lexer<T>::lexWithoutClearingLineTerminator(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
1774{
1775 JSTokenData* tokenData = &tokenRecord->m_data;
1776 JSTokenLocation* tokenLocation = &tokenRecord->m_location;
1777 m_lastTokenLocation = JSTokenLocation(tokenRecord->m_location);
1778
1779 ASSERT(!m_error);
1780 ASSERT(m_buffer8.isEmpty());
1781 ASSERT(m_buffer16.isEmpty());
1782
1783 JSTokenType token = ERRORTOK;
1784
1785start:
1786 skipWhitespace();
1787
1788 tokenLocation->startOffset = currentOffset();
1789 ASSERT(currentOffset() >= currentLineStartOffset());
1790 tokenRecord->m_startPosition = currentPosition();
1791
1792 if (atEnd()) {
1793 token = EOFTOK;
1794 goto returnToken;
1795 }
1796
1797 CharacterType type;
1798 if (LIKELY(isLatin1(m_current)))
1799 type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
1800 else if (isNonLatin1IdentStart(m_current))
1801 type = CharacterIdentifierStart;
1802 else if (isLineTerminator(m_current))
1803 type = CharacterLineTerminator;
1804 else
1805 type = CharacterInvalid;
1806
1807 switch (type) {
1808 case CharacterGreater:
1809 shift();
1810 if (m_current == '>') {
1811 shift();
1812 if (m_current == '>') {
1813 shift();
1814 if (m_current == '=') {
1815 shift();
1816 token = URSHIFTEQUAL;
1817 break;
1818 }
1819 token = URSHIFT;
1820 break;
1821 }
1822 if (m_current == '=') {
1823 shift();
1824 token = RSHIFTEQUAL;
1825 break;
1826 }
1827 token = RSHIFT;
1828 break;
1829 }
1830 if (m_current == '=') {
1831 shift();
1832 token = GE;
1833 break;
1834 }
1835 token = GT;
1836 break;
1837 case CharacterEqual: {
1838 if (peek(1) == '>') {
1839 token = ARROWFUNCTION;
1840 tokenData->line = lineNumber();
1841 tokenData->offset = currentOffset();
1842 tokenData->lineStartOffset = currentLineStartOffset();
1843 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1844 shift();
1845 shift();
1846 break;
1847 }
1848
1849 shift();
1850 if (m_current == '=') {
1851 shift();
1852 if (m_current == '=') {
1853 shift();
1854 token = STREQ;
1855 break;
1856 }
1857 token = EQEQ;
1858 break;
1859 }
1860 token = EQUAL;
1861 break;
1862 }
1863 case CharacterLess:
1864 shift();
1865 if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
1866 if (m_scriptMode == JSParserScriptMode::Classic) {
1867 // <!-- marks the beginning of a line comment (for www usage)
1868 goto inSingleLineComment;
1869 }
1870 }
1871 if (m_current == '<') {
1872 shift();
1873 if (m_current == '=') {
1874 shift();
1875 token = LSHIFTEQUAL;
1876 break;
1877 }
1878 token = LSHIFT;
1879 break;
1880 }
1881 if (m_current == '=') {
1882 shift();
1883 token = LE;
1884 break;
1885 }
1886 token = LT;
1887 break;
1888 case CharacterExclamationMark:
1889 shift();
1890 if (m_current == '=') {
1891 shift();
1892 if (m_current == '=') {
1893 shift();
1894 token = STRNEQ;
1895 break;
1896 }
1897 token = NE;
1898 break;
1899 }
1900 token = EXCLAMATION;
1901 break;
1902 case CharacterAdd:
1903 shift();
1904 if (m_current == '+') {
1905 shift();
1906 token = (!m_hasLineTerminatorBeforeToken) ? PLUSPLUS : AUTOPLUSPLUS;
1907 break;
1908 }
1909 if (m_current == '=') {
1910 shift();
1911 token = PLUSEQUAL;
1912 break;
1913 }
1914 token = PLUS;
1915 break;
1916 case CharacterSub:
1917 shift();
1918 if (m_current == '-') {
1919 shift();
1920 if ((m_atLineStart || m_hasLineTerminatorBeforeToken) && m_current == '>') {
1921 if (m_scriptMode == JSParserScriptMode::Classic) {
1922 shift();
1923 goto inSingleLineComment;
1924 }
1925 }
1926 token = (!m_hasLineTerminatorBeforeToken) ? MINUSMINUS : AUTOMINUSMINUS;
1927 break;
1928 }
1929 if (m_current == '=') {
1930 shift();
1931 token = MINUSEQUAL;
1932 break;
1933 }
1934 token = MINUS;
1935 break;
1936 case CharacterMultiply:
1937 shift();
1938 if (m_current == '=') {
1939 shift();
1940 token = MULTEQUAL;
1941 break;
1942 }
1943 if (m_current == '*') {
1944 shift();
1945 if (m_current == '=') {
1946 shift();
1947 token = POWEQUAL;
1948 break;
1949 }
1950 token = POW;
1951 break;
1952 }
1953 token = TIMES;
1954 break;
1955 case CharacterSlash:
1956 shift();
1957 if (m_current == '/') {
1958 shift();
1959 goto inSingleLineCommentCheckForDirectives;
1960 }
1961 if (m_current == '*') {
1962 shift();
1963 if (parseMultilineComment())
1964 goto start;
1965 m_lexErrorMessage = "Multiline comment was not closed properly"_s;
1966 token = UNTERMINATED_MULTILINE_COMMENT_ERRORTOK;
1967 goto returnError;
1968 }
1969 if (m_current == '=') {
1970 shift();
1971 token = DIVEQUAL;
1972 break;
1973 }
1974 token = DIVIDE;
1975 break;
1976 case CharacterAnd:
1977 shift();
1978 if (m_current == '&') {
1979 shift();
1980 token = AND;
1981 break;
1982 }
1983 if (m_current == '=') {
1984 shift();
1985 token = ANDEQUAL;
1986 break;
1987 }
1988 token = BITAND;
1989 break;
1990 case CharacterXor:
1991 shift();
1992 if (m_current == '=') {
1993 shift();
1994 token = XOREQUAL;
1995 break;
1996 }
1997 token = BITXOR;
1998 break;
1999 case CharacterModulo:
2000 shift();
2001 if (m_current == '=') {
2002 shift();
2003 token = MODEQUAL;
2004 break;
2005 }
2006 token = MOD;
2007 break;
2008 case CharacterOr:
2009 shift();
2010 if (m_current == '=') {
2011 shift();
2012 token = OREQUAL;
2013 break;
2014 }
2015 if (m_current == '|') {
2016 shift();
2017 token = OR;
2018 break;
2019 }
2020 token = BITOR;
2021 break;
2022 case CharacterOpenParen:
2023 token = OPENPAREN;
2024 tokenData->line = lineNumber();
2025 tokenData->offset = currentOffset();
2026 tokenData->lineStartOffset = currentLineStartOffset();
2027 shift();
2028 break;
2029 case CharacterCloseParen:
2030 token = CLOSEPAREN;
2031 shift();
2032 break;
2033 case CharacterOpenBracket:
2034 token = OPENBRACKET;
2035 shift();
2036 break;
2037 case CharacterCloseBracket:
2038 token = CLOSEBRACKET;
2039 shift();
2040 break;
2041 case CharacterComma:
2042 token = COMMA;
2043 shift();
2044 break;
2045 case CharacterColon:
2046 token = COLON;
2047 shift();
2048 break;
2049 case CharacterQuestion:
2050 token = QUESTION;
2051 shift();
2052 break;
2053 case CharacterTilde:
2054 token = TILDE;
2055 shift();
2056 break;
2057 case CharacterSemicolon:
2058 shift();
2059 token = SEMICOLON;
2060 break;
2061 case CharacterBackQuote:
2062 shift();
2063 token = BACKQUOTE;
2064 break;
2065 case CharacterOpenBrace:
2066 tokenData->line = lineNumber();
2067 tokenData->offset = currentOffset();
2068 tokenData->lineStartOffset = currentLineStartOffset();
2069 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
2070 shift();
2071 token = OPENBRACE;
2072 break;
2073 case CharacterCloseBrace:
2074 tokenData->line = lineNumber();
2075 tokenData->offset = currentOffset();
2076 tokenData->lineStartOffset = currentLineStartOffset();
2077 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
2078 shift();
2079 token = CLOSEBRACE;
2080 break;
2081 case CharacterDot:
2082 shift();
2083 if (!isASCIIDigit(m_current)) {
2084 if (UNLIKELY((m_current == '.') && (peek(1) == '.'))) {
2085 shift();
2086 shift();
2087 token = DOTDOTDOT;
2088 break;
2089 }
2090 token = DOT;
2091 break;
2092 }
2093 parseNumberAfterDecimalPoint();
2094 token = DOUBLE;
2095 if (isASCIIAlphaCaselessEqual(m_current, 'e')) {
2096 if (!parseNumberAfterExponentIndicator()) {
2097 m_lexErrorMessage = "Non-number found after exponent indicator"_s;
2098 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
2099 goto returnError;
2100 }
2101 }
2102 size_t parsedLength;
2103 tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
2104 if (token == INTEGER)
2105 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2106
2107 if (UNLIKELY(isIdentStart(m_current))) {
2108 m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
2109 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
2110 goto returnError;
2111 }
2112 m_buffer8.shrink(0);
2113 break;
2114 case CharacterZero:
2115 shift();
2116 if (isASCIIAlphaCaselessEqual(m_current, 'x')) {
2117 if (!isASCIIHexDigit(peek(1))) {
2118 m_lexErrorMessage = "No hexadecimal digits after '0x'"_s;
2119 token = UNTERMINATED_HEX_NUMBER_ERRORTOK;
2120 goto returnError;
2121 }
2122
2123 // Shift out the 'x' prefix.
2124 shift();
2125
2126 auto parseNumberResult = parseHex();
2127 if (WTF::holds_alternative<double>(parseNumberResult))
2128 tokenData->doubleValue = WTF::get<double>(parseNumberResult);
2129 else {
2130 token = BIGINT;
2131 shift();
2132 tokenData->bigIntString = WTF::get<const Identifier*>(parseNumberResult);
2133 tokenData->radix = 16;
2134 }
2135
2136 if (isIdentStart(m_current)) {
2137 m_lexErrorMessage = "No space between hexadecimal literal and identifier"_s;
2138 token = UNTERMINATED_HEX_NUMBER_ERRORTOK;
2139 goto returnError;
2140 }
2141 if (LIKELY(token != BIGINT))
2142 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2143 m_buffer8.shrink(0);
2144 break;
2145 }
2146 if (isASCIIAlphaCaselessEqual(m_current, 'b')) {
2147 if (!isASCIIBinaryDigit(peek(1))) {
2148 m_lexErrorMessage = "No binary digits after '0b'"_s;
2149 token = UNTERMINATED_BINARY_NUMBER_ERRORTOK;
2150 goto returnError;
2151 }
2152
2153 // Shift out the 'b' prefix.
2154 shift();
2155
2156 auto parseNumberResult = parseBinary();
2157 if (!parseNumberResult)
2158 tokenData->doubleValue = 0;
2159 else if (WTF::holds_alternative<double>(*parseNumberResult))
2160 tokenData->doubleValue = WTF::get<double>(*parseNumberResult);
2161 else {
2162 token = BIGINT;
2163 shift();
2164 tokenData->bigIntString = WTF::get<const Identifier*>(*parseNumberResult);
2165 tokenData->radix = 2;
2166 }
2167
2168 if (isIdentStart(m_current)) {
2169 m_lexErrorMessage = "No space between binary literal and identifier"_s;
2170 token = UNTERMINATED_BINARY_NUMBER_ERRORTOK;
2171 goto returnError;
2172 }
2173 if (LIKELY(token != BIGINT))
2174 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2175 m_buffer8.shrink(0);
2176 break;
2177 }
2178
2179 if (isASCIIAlphaCaselessEqual(m_current, 'o')) {
2180 if (!isASCIIOctalDigit(peek(1))) {
2181 m_lexErrorMessage = "No octal digits after '0o'"_s;
2182 token = UNTERMINATED_OCTAL_NUMBER_ERRORTOK;
2183 goto returnError;
2184 }
2185
2186 // Shift out the 'o' prefix.
2187 shift();
2188
2189 auto parseNumberResult = parseOctal();
2190 if (!parseNumberResult)
2191 tokenData->doubleValue = 0;
2192 else if (WTF::holds_alternative<double>(*parseNumberResult))
2193 tokenData->doubleValue = WTF::get<double>(*parseNumberResult);
2194 else {
2195 token = BIGINT;
2196 shift();
2197 tokenData->bigIntString = WTF::get<const Identifier*>(*parseNumberResult);
2198 tokenData->radix = 8;
2199 }
2200
2201 if (isIdentStart(m_current)) {
2202 m_lexErrorMessage = "No space between octal literal and identifier"_s;
2203 token = UNTERMINATED_OCTAL_NUMBER_ERRORTOK;
2204 goto returnError;
2205 }
2206 if (LIKELY(token != BIGINT))
2207 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2208 m_buffer8.shrink(0);
2209 break;
2210 }
2211
2212 record8('0');
2213 if (strictMode && isASCIIDigit(m_current)) {
2214 m_lexErrorMessage = "Decimal integer literals with a leading zero are forbidden in strict mode"_s;
2215 token = UNTERMINATED_OCTAL_NUMBER_ERRORTOK;
2216 goto returnError;
2217 }
2218 if (isASCIIOctalDigit(m_current)) {
2219 auto parseNumberResult = parseOctal();
2220 if (parseNumberResult && WTF::holds_alternative<double>(*parseNumberResult)) {
2221 tokenData->doubleValue = WTF::get<double>(*parseNumberResult);
2222 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2223 }
2224 }
2225 FALLTHROUGH;
2226 case CharacterNumber:
2227 if (LIKELY(token != INTEGER && token != DOUBLE)) {
2228 auto parseNumberResult = parseDecimal();
2229 if (parseNumberResult && WTF::holds_alternative<double>(*parseNumberResult)) {
2230 tokenData->doubleValue = WTF::get<double>(*parseNumberResult);
2231 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2232 } else {
2233 if (parseNumberResult) {
2234 ASSERT(WTF::get<const Identifier*>(*parseNumberResult));
2235 token = BIGINT;
2236 shift();
2237 tokenData->bigIntString = WTF::get<const Identifier*>(*parseNumberResult);
2238 tokenData->radix = 10;
2239 } else {
2240 token = INTEGER;
2241 if (m_current == '.') {
2242 shift();
2243 parseNumberAfterDecimalPoint();
2244 token = DOUBLE;
2245 }
2246 if (isASCIIAlphaCaselessEqual(m_current, 'e')) {
2247 if (!parseNumberAfterExponentIndicator()) {
2248 m_lexErrorMessage = "Non-number found after exponent indicator"_s;
2249 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
2250 goto returnError;
2251 }
2252 }
2253 size_t parsedLength;
2254 tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
2255 if (token == INTEGER)
2256 token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
2257 }
2258 }
2259 }
2260
2261 if (UNLIKELY(isIdentStart(m_current))) {
2262 m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
2263 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
2264 goto returnError;
2265 }
2266 m_buffer8.shrink(0);
2267 break;
2268 case CharacterQuote: {
2269 StringParseResult result = StringCannotBeParsed;
2270 if (lexerFlags & LexerFlagsDontBuildStrings)
2271 result = parseString<false>(tokenData, strictMode);
2272 else
2273 result = parseString<true>(tokenData, strictMode);
2274
2275 if (UNLIKELY(result != StringParsedSuccessfully)) {
2276 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
2277 goto returnError;
2278 }
2279 shift();
2280 token = STRING;
2281 break;
2282 }
2283 case CharacterIdentifierStart:
2284 ASSERT(isIdentStart(m_current));
2285 FALLTHROUGH;
2286 case CharacterBackSlash:
2287 parseIdent:
2288 if (lexerFlags & LexexFlagsDontBuildKeywords)
2289 token = parseIdentifier<false>(tokenData, lexerFlags, strictMode);
2290 else
2291 token = parseIdentifier<true>(tokenData, lexerFlags, strictMode);
2292 break;
2293 case CharacterLineTerminator:
2294 ASSERT(isLineTerminator(m_current));
2295 shiftLineTerminator();
2296 m_atLineStart = true;
2297 m_hasLineTerminatorBeforeToken = true;
2298 m_lineStart = m_code;
2299 goto start;
2300 case CharacterPrivateIdentifierStart:
2301 if (m_parsingBuiltinFunction)
2302 goto parseIdent;
2303
2304 FALLTHROUGH;
2305 case CharacterOtherIdentifierPart:
2306 case CharacterInvalid:
2307 m_lexErrorMessage = invalidCharacterMessage();
2308 token = ERRORTOK;
2309 goto returnError;
2310 default:
2311 RELEASE_ASSERT_NOT_REACHED();
2312 m_lexErrorMessage = "Internal Error"_s;
2313 token = ERRORTOK;
2314 goto returnError;
2315 }
2316
2317 m_atLineStart = false;
2318 goto returnToken;
2319
2320inSingleLineCommentCheckForDirectives:
2321 // Script comment directives like "//# sourceURL=test.js".
2322 if (UNLIKELY((m_current == '#' || m_current == '@') && isWhiteSpace(peek(1)))) {
2323 shift();
2324 shift();
2325 parseCommentDirective();
2326 }
2327 // Fall through to complete single line comment parsing.
2328
2329inSingleLineComment:
2330 {
2331 auto lineNumber = m_lineNumber;
2332 auto endOffset = currentOffset();
2333 auto lineStartOffset = currentLineStartOffset();
2334 auto endPosition = currentPosition();
2335
2336 while (!isLineTerminator(m_current)) {
2337 if (atEnd()) {
2338 token = EOFTOK;
2339 fillTokenInfo(tokenRecord, token, lineNumber, endOffset, lineStartOffset, endPosition);
2340 return token;
2341 }
2342 shift();
2343 }
2344 shiftLineTerminator();
2345 m_atLineStart = true;
2346 m_hasLineTerminatorBeforeToken = true;
2347 m_lineStart = m_code;
2348 if (!lastTokenWasRestrKeyword())
2349 goto start;
2350
2351 token = SEMICOLON;
2352 fillTokenInfo(tokenRecord, token, lineNumber, endOffset, lineStartOffset, endPosition);
2353 return token;
2354 }
2355
2356returnToken:
2357 fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
2358 return token;
2359
2360returnError:
2361 m_error = true;
2362 fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
2363 RELEASE_ASSERT(token & ErrorTokenFlag);
2364 return token;
2365}
2366
2367template <typename T>
2368static inline void orCharacter(UChar&, UChar);
2369
2370template <>
2371inline void orCharacter<LChar>(UChar&, UChar) { }
2372
2373template <>
2374inline void orCharacter<UChar>(UChar& orAccumulator, UChar character)
2375{
2376 orAccumulator |= character;
2377}
2378
2379template <typename T>
2380JSTokenType Lexer<T>::scanRegExp(JSToken* tokenRecord, UChar patternPrefix)
2381{
2382 JSTokenData* tokenData = &tokenRecord->m_data;
2383 ASSERT(m_buffer16.isEmpty());
2384
2385 bool lastWasEscape = false;
2386 bool inBrackets = false;
2387 UChar charactersOredTogether = 0;
2388
2389 if (patternPrefix) {
2390 ASSERT(!isLineTerminator(patternPrefix));
2391 ASSERT(patternPrefix != '/');
2392 ASSERT(patternPrefix != '[');
2393 record16(patternPrefix);
2394 }
2395
2396 while (true) {
2397 if (isLineTerminator(m_current) || atEnd()) {
2398 m_buffer16.shrink(0);
2399 JSTokenType token = UNTERMINATED_REGEXP_LITERAL_ERRORTOK;
2400 fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
2401 m_error = true;
2402 m_lexErrorMessage = makeString("Unterminated regular expression literal '", getToken(*tokenRecord), "'");
2403 return token;
2404 }
2405
2406 T prev = m_current;
2407
2408 shift();
2409
2410 if (prev == '/' && !lastWasEscape && !inBrackets)
2411 break;
2412
2413 record16(prev);
2414 orCharacter<T>(charactersOredTogether, prev);
2415
2416 if (lastWasEscape) {
2417 lastWasEscape = false;
2418 continue;
2419 }
2420
2421 switch (prev) {
2422 case '[':
2423 inBrackets = true;
2424 break;
2425 case ']':
2426 inBrackets = false;
2427 break;
2428 case '\\':
2429 lastWasEscape = true;
2430 break;
2431 }
2432 }
2433
2434 tokenData->pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
2435
2436 m_buffer16.shrink(0);
2437 charactersOredTogether = 0;
2438
2439 while (isIdentPart(m_current)) {
2440 record16(m_current);
2441 orCharacter<T>(charactersOredTogether, m_current);
2442 shift();
2443 }
2444
2445 tokenData->flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
2446 m_buffer16.shrink(0);
2447
2448 // Since RegExp always ends with /, m_atLineStart always becomes false.
2449 m_atLineStart = false;
2450
2451 JSTokenType token = REGEXP;
2452 fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
2453 return token;
2454}
2455
2456template <typename T>
2457JSTokenType Lexer<T>::scanTemplateString(JSToken* tokenRecord, RawStringsBuildMode rawStringsBuildMode)
2458{
2459 JSTokenData* tokenData = &tokenRecord->m_data;
2460 ASSERT(!m_error);
2461 ASSERT(m_buffer16.isEmpty());
2462
2463 // Leading backquote ` (for template head) or closing brace } (for template trailing) are already shifted in the previous token scan.
2464 // So in this re-scan phase, shift() is not needed here.
2465 StringParseResult result = parseTemplateLiteral(tokenData, rawStringsBuildMode);
2466 JSTokenType token = ERRORTOK;
2467 if (UNLIKELY(result != StringParsedSuccessfully)) {
2468 token = result == StringUnterminated ? UNTERMINATED_TEMPLATE_LITERAL_ERRORTOK : INVALID_TEMPLATE_LITERAL_ERRORTOK;
2469 m_error = true;
2470 } else
2471 token = TEMPLATE;
2472
2473 // Since TemplateString always ends with ` or }, m_atLineStart always becomes false.
2474 m_atLineStart = false;
2475 fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
2476 return token;
2477}
2478
2479template <typename T>
2480void Lexer<T>::clear()
2481{
2482 m_arena = 0;
2483
2484 Vector<LChar> newBuffer8;
2485 m_buffer8.swap(newBuffer8);
2486
2487 Vector<UChar> newBuffer16;
2488 m_buffer16.swap(newBuffer16);
2489
2490 Vector<UChar> newBufferForRawTemplateString16;
2491 m_bufferForRawTemplateString16.swap(newBufferForRawTemplateString16);
2492
2493 m_isReparsingFunction = false;
2494}
2495
2496// Instantiate the two flavors of Lexer we need instead of putting most of this file in Lexer.h
2497template class Lexer<LChar>;
2498template class Lexer<UChar>;
2499
2500} // namespace JSC
2501