1 | /* |
2 | * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org) |
3 | * Copyright (c) 2007, 2008, 2016-2017 Apple Inc. All rights reserved. |
4 | * Copyright (C) 2009 Torch Mobile, Inc. |
5 | * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged |
6 | * |
7 | * This library is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU Lesser General Public |
9 | * License as published by the Free Software Foundation; either |
10 | * version 2 of the License, or (at your option) any later version. |
11 | * |
12 | * This library is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | * Lesser General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU Lesser General Public |
18 | * License along with this library; if not, write to the Free Software |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | * |
21 | */ |
22 | |
23 | #include "config.h" |
24 | #include "RegExp.h" |
25 | |
26 | #include "ExceptionHelpers.h" |
27 | #include "Lexer.h" |
28 | #include "JSCInlines.h" |
29 | #include "RegExpCache.h" |
30 | #include "RegExpInlines.h" |
31 | #include "YarrJIT.h" |
32 | #include <wtf/Assertions.h> |
33 | |
34 | namespace JSC { |
35 | |
36 | const ClassInfo RegExp::s_info = { "RegExp" , nullptr, nullptr, nullptr, CREATE_METHOD_TABLE(RegExp) }; |
37 | |
38 | #if REGEXP_FUNC_TEST_DATA_GEN |
39 | const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData" ; |
40 | RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0; |
41 | |
42 | RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get() |
43 | { |
44 | if (!s_instance) |
45 | s_instance = new RegExpFunctionalTestCollector(); |
46 | |
47 | return s_instance; |
48 | } |
49 | |
50 | void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, const String& s, int startOffset, int* ovector, int result) |
51 | { |
52 | if ((!m_lastRegExp) || (m_lastRegExp != regExp)) { |
53 | m_lastRegExp = regExp; |
54 | fputc('/', m_file); |
55 | outputEscapedString(regExp->pattern(), true); |
56 | fputc('/', m_file); |
57 | if (regExp->global()) |
58 | fputc('g', m_file); |
59 | if (regExp->ignoreCase()) |
60 | fputc('i', m_file); |
61 | if (regExp->multiline()) |
62 | fputc('m', m_file); |
63 | if (regExp->dotAll()) |
64 | fputc('s', m_file); |
65 | if (regExp->unicode()) |
66 | fputc('u', m_file); |
67 | if (regExp->sticky()) |
68 | fputc('y', m_file); |
69 | fprintf(m_file, "\n" ); |
70 | } |
71 | |
72 | fprintf(m_file, " \"" ); |
73 | outputEscapedString(s); |
74 | fprintf(m_file, "\", %d, %d, (" , startOffset, result); |
75 | for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) { |
76 | int subpatternBegin = ovector[i * 2]; |
77 | int subpatternEnd = ovector[i * 2 + 1]; |
78 | if (subpatternBegin == -1) |
79 | subpatternEnd = -1; |
80 | fprintf(m_file, "%d, %d" , subpatternBegin, subpatternEnd); |
81 | if (i < regExp->numSubpatterns()) |
82 | fputs(", " , m_file); |
83 | } |
84 | |
85 | fprintf(m_file, ")\n" ); |
86 | fflush(m_file); |
87 | } |
88 | |
89 | RegExpFunctionalTestCollector::RegExpFunctionalTestCollector() |
90 | { |
91 | m_file = fopen(s_fileName, "r+" ); |
92 | if (!m_file) |
93 | m_file = fopen(s_fileName, "w+" ); |
94 | |
95 | fseek(m_file, 0L, SEEK_END); |
96 | } |
97 | |
98 | RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector() |
99 | { |
100 | fclose(m_file); |
101 | s_instance = 0; |
102 | } |
103 | |
104 | void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash) |
105 | { |
106 | int len = s.length(); |
107 | |
108 | for (int i = 0; i < len; ++i) { |
109 | UChar c = s[i]; |
110 | |
111 | switch (c) { |
112 | case '\0': |
113 | fputs("\\0" , m_file); |
114 | break; |
115 | case '\a': |
116 | fputs("\\a" , m_file); |
117 | break; |
118 | case '\b': |
119 | fputs("\\b" , m_file); |
120 | break; |
121 | case '\f': |
122 | fputs("\\f" , m_file); |
123 | break; |
124 | case '\n': |
125 | fputs("\\n" , m_file); |
126 | break; |
127 | case '\r': |
128 | fputs("\\r" , m_file); |
129 | break; |
130 | case '\t': |
131 | fputs("\\t" , m_file); |
132 | break; |
133 | case '\v': |
134 | fputs("\\v" , m_file); |
135 | break; |
136 | case '/': |
137 | if (escapeSlash) |
138 | fputs("\\/" , m_file); |
139 | else |
140 | fputs("/" , m_file); |
141 | break; |
142 | case '\"': |
143 | fputs("\\\"" , m_file); |
144 | break; |
145 | case '\\': |
146 | fputs("\\\\" , m_file); |
147 | break; |
148 | case '\?': |
149 | fputs("\?" , m_file); |
150 | break; |
151 | default: |
152 | if (c > 0x7f) |
153 | fprintf(m_file, "\\u%04x" , c); |
154 | else |
155 | fputc(c, m_file); |
156 | break; |
157 | } |
158 | } |
159 | } |
160 | #endif |
161 | |
162 | RegExp::RegExp(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags) |
163 | : JSCell(vm, vm.regExpStructure.get()) |
164 | , m_patternString(patternString) |
165 | , m_flags(flags) |
166 | { |
167 | ASSERT(m_flags != Yarr::Flags::DeletedValue); |
168 | } |
169 | |
170 | void RegExp::finishCreation(VM& vm) |
171 | { |
172 | Base::finishCreation(vm); |
173 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm.stackLimit()); |
174 | if (!isValid()) { |
175 | m_state = ParseError; |
176 | return; |
177 | } |
178 | |
179 | m_numSubpatterns = pattern.m_numSubpatterns; |
180 | if (!pattern.m_captureGroupNames.isEmpty() || !pattern.m_namedGroupToParenIndex.isEmpty()) { |
181 | m_rareData = std::make_unique<RareData>(); |
182 | m_rareData->m_captureGroupNames.swap(pattern.m_captureGroupNames); |
183 | m_rareData->m_namedGroupToParenIndex.swap(pattern.m_namedGroupToParenIndex); |
184 | } |
185 | } |
186 | |
187 | void RegExp::destroy(JSCell* cell) |
188 | { |
189 | RegExp* thisObject = static_cast<RegExp*>(cell); |
190 | #if REGEXP_FUNC_TEST_DATA_GEN |
191 | RegExpFunctionalTestCollector::get()->clearRegExp(this); |
192 | #endif |
193 | thisObject->RegExp::~RegExp(); |
194 | } |
195 | |
196 | size_t RegExp::estimatedSize(JSCell* cell, VM& vm) |
197 | { |
198 | RegExp* thisObject = static_cast<RegExp*>(cell); |
199 | size_t regexDataSize = thisObject->m_regExpBytecode ? thisObject->m_regExpBytecode->estimatedSizeInBytes() : 0; |
200 | #if ENABLE(YARR_JIT) |
201 | if (auto* jitCode = thisObject->m_regExpJITCode.get()) |
202 | regexDataSize += jitCode->size(); |
203 | #endif |
204 | return Base::estimatedSize(cell, vm) + regexDataSize; |
205 | } |
206 | |
207 | RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags) |
208 | { |
209 | RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags); |
210 | regExp->finishCreation(vm); |
211 | return regExp; |
212 | } |
213 | |
214 | RegExp* RegExp::create(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags) |
215 | { |
216 | return vm.regExpCache()->lookupOrCreate(patternString, flags); |
217 | } |
218 | |
219 | |
220 | static std::unique_ptr<Yarr::BytecodePattern> byteCodeCompilePattern(VM* vm, Yarr::YarrPattern& pattern) |
221 | { |
222 | return Yarr::byteCompile(pattern, &vm->m_regExpAllocator, &vm->m_regExpAllocatorLock); |
223 | } |
224 | |
225 | void RegExp::byteCodeCompileIfNecessary(VM* vm) |
226 | { |
227 | if (m_regExpBytecode) |
228 | return; |
229 | |
230 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit()); |
231 | if (hasError(m_constructionErrorCode)) { |
232 | RELEASE_ASSERT_NOT_REACHED(); |
233 | #if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE) |
234 | m_state = ParseError; |
235 | return; |
236 | #endif |
237 | } |
238 | ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); |
239 | |
240 | m_regExpBytecode = byteCodeCompilePattern(vm, pattern); |
241 | } |
242 | |
243 | void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize) |
244 | { |
245 | auto locker = holdLock(cellLock()); |
246 | |
247 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit()); |
248 | if (hasError(m_constructionErrorCode)) { |
249 | m_state = ParseError; |
250 | return; |
251 | } |
252 | ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); |
253 | |
254 | if (!hasCode()) { |
255 | ASSERT(m_state == NotCompiled); |
256 | vm->regExpCache()->addToStrongCache(this); |
257 | m_state = ByteCode; |
258 | } |
259 | |
260 | #if ENABLE(YARR_JIT) |
261 | if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT() |
262 | #if !ENABLE(YARR_JIT_BACKREFERENCES) |
263 | && !pattern.m_containsBackreferences |
264 | #endif |
265 | ) { |
266 | auto& jitCode = ensureRegExpJITCode(); |
267 | Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode); |
268 | if (!jitCode.failureReason()) { |
269 | m_state = JITCode; |
270 | return; |
271 | } |
272 | } |
273 | #else |
274 | UNUSED_PARAM(charSize); |
275 | #endif |
276 | |
277 | if (Options::dumpCompiledRegExpPatterns()) |
278 | dataLog("Can't JIT this regular expression: \"" , m_patternString, "\"\n" ); |
279 | |
280 | m_state = ByteCode; |
281 | m_regExpBytecode = byteCodeCompilePattern(vm, pattern); |
282 | } |
283 | |
284 | int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int>& ovector) |
285 | { |
286 | return matchInline(vm, s, startOffset, ovector); |
287 | } |
288 | |
289 | bool RegExp::matchConcurrently( |
290 | VM& vm, const String& s, unsigned startOffset, int& position, Vector<int>& ovector) |
291 | { |
292 | auto locker = holdLock(cellLock()); |
293 | |
294 | if (!hasCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16)) |
295 | return false; |
296 | |
297 | position = match(vm, s, startOffset, ovector); |
298 | return true; |
299 | } |
300 | |
301 | void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize) |
302 | { |
303 | auto locker = holdLock(cellLock()); |
304 | |
305 | Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit()); |
306 | if (hasError(m_constructionErrorCode)) { |
307 | m_state = ParseError; |
308 | return; |
309 | } |
310 | ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); |
311 | |
312 | if (!hasCode()) { |
313 | ASSERT(m_state == NotCompiled); |
314 | vm->regExpCache()->addToStrongCache(this); |
315 | m_state = ByteCode; |
316 | } |
317 | |
318 | #if ENABLE(YARR_JIT) |
319 | if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT() |
320 | #if !ENABLE(YARR_JIT_BACKREFERENCES) |
321 | && !pattern.m_containsBackreferences |
322 | #endif |
323 | ) { |
324 | auto& jitCode = ensureRegExpJITCode(); |
325 | Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode, Yarr::MatchOnly); |
326 | if (!jitCode.failureReason()) { |
327 | m_state = JITCode; |
328 | return; |
329 | } |
330 | } |
331 | #else |
332 | UNUSED_PARAM(charSize); |
333 | #endif |
334 | |
335 | if (Options::dumpCompiledRegExpPatterns()) |
336 | dataLog("Can't JIT this regular expression: \"" , m_patternString, "\"\n" ); |
337 | |
338 | m_state = ByteCode; |
339 | m_regExpBytecode = byteCodeCompilePattern(vm, pattern); |
340 | } |
341 | |
342 | MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset) |
343 | { |
344 | return matchInline(vm, s, startOffset); |
345 | } |
346 | |
347 | bool RegExp::matchConcurrently(VM& vm, const String& s, unsigned startOffset, MatchResult& result) |
348 | { |
349 | auto locker = holdLock(cellLock()); |
350 | |
351 | if (!hasMatchOnlyCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16)) |
352 | return false; |
353 | |
354 | result = match(vm, s, startOffset); |
355 | return true; |
356 | } |
357 | |
358 | void RegExp::deleteCode() |
359 | { |
360 | auto locker = holdLock(cellLock()); |
361 | |
362 | if (!hasCode()) |
363 | return; |
364 | m_state = NotCompiled; |
365 | #if ENABLE(YARR_JIT) |
366 | if (m_regExpJITCode) |
367 | m_regExpJITCode->clear(); |
368 | #endif |
369 | m_regExpBytecode = nullptr; |
370 | } |
371 | |
372 | #if ENABLE(YARR_JIT_DEBUG) |
373 | void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult) |
374 | { |
375 | int offsetVectorSize = (m_numSubpatterns + 1) * 2; |
376 | Vector<int> interpreterOvector; |
377 | interpreterOvector.resize(offsetVectorSize); |
378 | int* interpreterOffsetVector = interpreterOvector.data(); |
379 | int interpreterResult = 0; |
380 | int differences = 0; |
381 | |
382 | // Initialize interpreterOffsetVector with the return value (index 0) and the |
383 | // first subpattern start indicies (even index values) set to -1. |
384 | // No need to init the subpattern end indicies. |
385 | for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++) |
386 | interpreterOffsetVector[j] = -1; |
387 | |
388 | interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(interpreterOffsetVector)); |
389 | |
390 | if (jitResult != interpreterResult) |
391 | differences++; |
392 | |
393 | for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) |
394 | if ((offsetVector[j] != interpreterOffsetVector[j]) |
395 | || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))) |
396 | differences++; |
397 | |
398 | if (differences) { |
399 | dataLogF("RegExp Discrepency for /%s/\n string input " , pattern().utf8().data()); |
400 | unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset); |
401 | |
402 | dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n" , s.utf8().data() + startOffset); |
403 | |
404 | if (jitResult != interpreterResult) { |
405 | dataLogF(" JIT result = %d, interpreted result = %d\n" , jitResult, interpreterResult); |
406 | differences--; |
407 | } else { |
408 | dataLogF(" Correct result = %d\n" , jitResult); |
409 | } |
410 | |
411 | if (differences) { |
412 | for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) { |
413 | if (offsetVector[j] != interpreterOffsetVector[j]) |
414 | dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n" , j, offsetVector[j], j, interpreterOffsetVector[j]); |
415 | if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])) |
416 | dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n" , j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]); |
417 | } |
418 | } |
419 | } |
420 | } |
421 | #endif |
422 | |
423 | #if ENABLE(REGEXP_TRACING) |
424 | void RegExp::printTraceData() |
425 | { |
426 | char formattedPattern[41]; |
427 | char rawPattern[41]; |
428 | |
429 | strncpy(rawPattern, pattern().utf8().data(), 40); |
430 | rawPattern[40]= '\0'; |
431 | |
432 | int pattLen = strlen(rawPattern); |
433 | |
434 | snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s..." , rawPattern); |
435 | |
436 | #if ENABLE(YARR_JIT) |
437 | const size_t jitAddrSize = 20; |
438 | char jit8BitMatchOnlyAddr[jitAddrSize] { }; |
439 | char jit16BitMatchOnlyAddr[jitAddrSize] { }; |
440 | char jit8BitMatchAddr[jitAddrSize] { }; |
441 | char jit16BitMatchAddr[jitAddrSize] { }; |
442 | switch (m_state) { |
443 | case ParseError: |
444 | case NotCompiled: |
445 | break; |
446 | case ByteCode: |
447 | snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback " ); |
448 | snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- " ); |
449 | snprintf(jit8BitMatchAddr, jitAddrSize, "fallback " ); |
450 | snprintf(jit16BitMatchAddr, jitAddrSize, "---- " ); |
451 | break; |
452 | case JITCode: { |
453 | Yarr::YarrCodeBlock& codeBlock = *m_regExpJITCode.get(); |
454 | snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchOnlyAddr())); |
455 | snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchOnlyAddr())); |
456 | snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchAddr())); |
457 | snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx" , reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchAddr())); |
458 | break; |
459 | } |
460 | } |
461 | #else |
462 | const char* jit8BitMatchOnlyAddr = "JIT Off" ; |
463 | const char* jit16BitMatchOnlyAddr = "" ; |
464 | const char* jit8BitMatchAddr = "JIT Off" ; |
465 | const char* jit16BitMatchAddr = "" ; |
466 | #endif |
467 | unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount); |
468 | unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount); |
469 | |
470 | printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n" , formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen); |
471 | printf(" %16.16s %16.16s %10d %10d %10u\n" , jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen); |
472 | } |
473 | #endif |
474 | |
475 | static CString regexpToSourceString(const RegExp* regExp) |
476 | { |
477 | char postfix[7] = { '/', 0, 0, 0, 0, 0, 0 }; |
478 | int index = 1; |
479 | if (regExp->global()) |
480 | postfix[index++] = 'g'; |
481 | if (regExp->ignoreCase()) |
482 | postfix[index++] = 'i'; |
483 | if (regExp->multiline()) |
484 | postfix[index] = 'm'; |
485 | if (regExp->dotAll()) |
486 | postfix[index++] = 's'; |
487 | if (regExp->unicode()) |
488 | postfix[index++] = 'u'; |
489 | if (regExp->sticky()) |
490 | postfix[index++] = 'y'; |
491 | |
492 | return toCString("/" , regExp->pattern().impl(), postfix); |
493 | } |
494 | |
495 | void RegExp::dumpToStream(const JSCell* cell, PrintStream& out) |
496 | { |
497 | out.print(regexpToSourceString(jsCast<const RegExp*>(cell))); |
498 | } |
499 | |
500 | } // namespace JSC |
501 | |