RegExp.cpp source code [webkit/Source/JavaScriptCore/runtime/RegExp.cpp]

1	/*
2	* Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3	* Copyright (c) 2007, 2008, 2016-2017 Apple Inc. All rights reserved.
4	* Copyright (C) 2009 Torch Mobile, Inc.
5	* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6	*
7	* This library is free software; you can redistribute it and/or
8	* modify it under the terms of the GNU Lesser General Public
9	* License as published by the Free Software Foundation; either
10	* version 2 of the License, or (at your option) any later version.
11	*
12	* This library is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	* Lesser General Public License for more details.
16	*
17	* You should have received a copy of the GNU Lesser General Public
18	* License along with this library; if not, write to the Free Software
19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20	*
21	*/
22
23	#include "config.h"
24	#include "RegExp.h"
25
26	#include "ExceptionHelpers.h"
27	#include "Lexer.h"
28	#include "JSCInlines.h"
29	#include "RegExpCache.h"
30	#include "RegExpInlines.h"
31	#include "YarrJIT.h"
32	#include <wtf/Assertions.h>
33
34	namespace JSC {
35
36	const ClassInfo RegExp::s_info = { "RegExp", nullptr, nullptr, nullptr, CREATE_METHOD_TABLE(RegExp) };
37
38	#if REGEXP_FUNC_TEST_DATA_GEN
39	const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
40	RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = `0`;
41
42	RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
43	{
44	if (!s_instance)
45	s_instance = new RegExpFunctionalTestCollector();
46
47	return s_instance;
48	}
49
50	void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, const String& s, int startOffset, int* ovector, int result)
51	{
52	if ((!m_lastRegExp) \|\| (m_lastRegExp != regExp)) {
53	m_lastRegExp = regExp;
54	fputc(`'/'`, m_file);
55	outputEscapedString(regExp->pattern(), true);
56	fputc(`'/'`, m_file);
57	if (regExp->global())
58	fputc(`'g'`, m_file);
59	if (regExp->ignoreCase())
60	fputc(`'i'`, m_file);
61	if (regExp->multiline())
62	fputc(`'m'`, m_file);
63	if (regExp->dotAll())
64	fputc(`'s'`, m_file);
65	if (regExp->unicode())
66	fputc(`'u'`, m_file);
67	if (regExp->sticky())
68	fputc(`'y'`, m_file);
69	fprintf(m_file, "\n");
70	}
71
72	fprintf(m_file, " \"");
73	outputEscapedString(s);
74	fprintf(m_file, "\", %d, %d, (", startOffset, result);
75	for (unsigned i = `0`; i <= regExp->numSubpatterns(); i++) {
76	int subpatternBegin = ovector[i * `2`];
77	int subpatternEnd = ovector[i * `2` + `1`];
78	if (subpatternBegin == -`1`)
79	subpatternEnd = -`1`;
80	fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
81	if (i < regExp->numSubpatterns())
82	fputs(", ", m_file);
83	}
84
85	fprintf(m_file, ")\n");
86	fflush(m_file);
87	}
88
89	RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
90	{
91	m_file = fopen(s_fileName, "r+");
92	if (!m_file)
93	m_file = fopen(s_fileName, "w+");
94
95	fseek(m_file, `0L`, SEEK_END);
96	}
97
98	RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
99	{
100	fclose(m_file);
101	s_instance = `0`;
102	}
103
104	void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
105	{
106	int len = s.length();
107
108	for (int i = `0`; i < len; ++i) {
109	UChar c = s[i];
110
111	switch (c) {
112	case `'\0'`:
113	fputs("\\0", m_file);
114	break;
115	case `'\a'`:
116	fputs("\\a", m_file);
117	break;
118	case `'\b'`:
119	fputs("\\b", m_file);
120	break;
121	case `'\f'`:
122	fputs("\\f", m_file);
123	break;
124	case `'\n'`:
125	fputs("\\n", m_file);
126	break;
127	case `'\r'`:
128	fputs("\\r", m_file);
129	break;
130	case `'\t'`:
131	fputs("\\t", m_file);
132	break;
133	case `'\v'`:
134	fputs("\\v", m_file);
135	break;
136	case `'/'`:
137	if (escapeSlash)
138	fputs("\\/", m_file);
139	else
140	fputs("/", m_file);
141	break;
142	case `'\"'`:
143	fputs("\\\"", m_file);
144	break;
145	case `'\\'`:
146	fputs("\\\\", m_file);
147	break;
148	case `'\?'`:
149	fputs("\?", m_file);
150	break;
151	default:
152	if (c > `0x7f`)
153	fprintf(m_file, "\\u%04x", c);
154	else
155	fputc(c, m_file);
156	break;
157	}
158	}
159	}
160	#endif
161
162	RegExp::RegExp(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags)
163	: JSCell (vm, vm.regExpStructure.get())
164	, m_patternString (patternString)
165	, m_flags (flags)
166	{
167	ASSERT(m_flags != Yarr::Flags::DeletedValue);
168	}
169
170	void RegExp::finishCreation(VM& vm)
171	{
172	Base::finishCreation(vm);
173	Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm.stackLimit());
174	if (!isValid()) {
175	m_state = ParseError;
176	return;
177	}
178
179	m_numSubpatterns = pattern.m_numSubpatterns;
180	if (!pattern.m_captureGroupNames.isEmpty() \|\| !pattern.m_namedGroupToParenIndex.isEmpty()) {
181	m_rareData = std::make_unique<RareData>();
182	m_rareData ->m_captureGroupNames.swap(pattern.m_captureGroupNames);
183	m_rareData ->m_namedGroupToParenIndex.swap(pattern.m_namedGroupToParenIndex);
184	}
185	}
186
187	void RegExp::destroy(JSCell* cell)
188	{
189	RegExp* thisObject = static_cast<RegExp*>(cell);
190	#if REGEXP_FUNC_TEST_DATA_GEN
191	RegExpFunctionalTestCollector::get()->clearRegExp(this);
192	#endif
193	thisObject->RegExp::~RegExp();
194	}
195
196	size_t RegExp::estimatedSize(JSCell* cell, VM& vm)
197	{
198	RegExp* thisObject = static_cast<RegExp*>(cell);
199	size_t regexDataSize = thisObject->m_regExpBytecode ? thisObject->m_regExpBytecode ->estimatedSizeInBytes() : `0`;
200	#if ENABLE(YARR_JIT)
201	if (auto* jitCode = thisObject->m_regExpJITCode.get())
202	regexDataSize += jitCode->size();
203	#endif
204	return Base::estimatedSize(cell, vm) + regexDataSize;
205	}
206
207	RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags)
208	{
209	RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp (vm, patternString, flags);
210	regExp->finishCreation(vm);
211	return regExp;
212	}
213
214	RegExp* RegExp::create(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags)
215	{
216	return vm.regExpCache()->lookupOrCreate(patternString, flags);
217	}
218
219
220	static std::unique_ptr<Yarr::BytecodePattern> byteCodeCompilePattern(VM* vm, Yarr::YarrPattern& pattern)
221	{
222	return Yarr::byteCompile(pattern, &vm->m_regExpAllocator, &vm->m_regExpAllocatorLock);
223	}
224
225	void RegExp::byteCodeCompileIfNecessary(VM* vm)
226	{
227	if (m_regExpBytecode)
228	return;
229
230	Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit());
231	if (hasError(m_constructionErrorCode)) {
232	RELEASE_ASSERT_NOT_REACHED();
233	#if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE)
234	m_state = ParseError;
235	return;
236	#endif
237	}
238	ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
239
240	m_regExpBytecode = byteCodeCompilePattern(vm, pattern);
241	}
242
243	void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
244	{
245	auto locker = holdLock(cellLock());
246
247	Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit());
248	if (hasError(m_constructionErrorCode)) {
249	m_state = ParseError;
250	return;
251	}
252	ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
253
254	if (!hasCode()) {
255	ASSERT(m_state == NotCompiled);
256	vm->regExpCache()->addToStrongCache(this);
257	m_state = ByteCode;
258	}
259
260	#if ENABLE(YARR_JIT)
261	if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT()
262	#if !ENABLE(YARR_JIT_BACKREFERENCES)
263	&& !pattern.m_containsBackreferences
264	#endif
265	) {
266	auto& jitCode = ensureRegExpJITCode();
267	Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode);
268	if (!jitCode.failureReason()) {
269	m_state = JITCode;
270	return;
271	}
272	}
273	#else
274	UNUSED_PARAM(charSize);
275	#endif
276
277	if (Options::dumpCompiledRegExpPatterns())
278	dataLog("Can't JIT this regular expression: \"", m_patternString, "\"\n");
279
280	m_state = ByteCode;
281	m_regExpBytecode = byteCodeCompilePattern(vm, pattern);
282	}
283
284	int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int>& ovector)
285	{
286	return matchInline(vm, s, startOffset, ovector);
287	}
288
289	bool RegExp::matchConcurrently(
290	VM& vm, const String& s, unsigned startOffset, int& position, Vector<int>& ovector)
291	{
292	auto locker = holdLock(cellLock());
293
294	if (!hasCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16))
295	return false;
296
297	position = match(vm, s, startOffset, ovector);
298	return true;
299	}
300
301	void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
302	{
303	auto locker = holdLock(cellLock());
304
305	Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit());
306	if (hasError(m_constructionErrorCode)) {
307	m_state = ParseError;
308	return;
309	}
310	ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
311
312	if (!hasCode()) {
313	ASSERT(m_state == NotCompiled);
314	vm->regExpCache()->addToStrongCache(this);
315	m_state = ByteCode;
316	}
317
318	#if ENABLE(YARR_JIT)
319	if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT()
320	#if !ENABLE(YARR_JIT_BACKREFERENCES)
321	&& !pattern.m_containsBackreferences
322	#endif
323	) {
324	auto& jitCode = ensureRegExpJITCode();
325	Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode, Yarr::MatchOnly);
326	if (!jitCode.failureReason()) {
327	m_state = JITCode;
328	return;
329	}
330	}
331	#else
332	UNUSED_PARAM(charSize);
333	#endif
334
335	if (Options::dumpCompiledRegExpPatterns())
336	dataLog("Can't JIT this regular expression: \"", m_patternString, "\"\n");
337
338	m_state = ByteCode;
339	m_regExpBytecode = byteCodeCompilePattern(vm, pattern);
340	}
341
342	MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
343	{
344	return matchInline(vm, s, startOffset);
345	}
346
347	bool RegExp::matchConcurrently(VM& vm, const String& s, unsigned startOffset, MatchResult& result)
348	{
349	auto locker = holdLock(cellLock());
350
351	if (!hasMatchOnlyCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16))
352	return false;
353
354	result = match(vm, s, startOffset);
355	return true;
356	}
357
358	void RegExp::deleteCode()
359	{
360	auto locker = holdLock(cellLock());
361
362	if (!hasCode())
363	return;
364	m_state = NotCompiled;
365	#if ENABLE(YARR_JIT)
366	if (m_regExpJITCode)
367	m_regExpJITCode ->clear();
368	#endif
369	m_regExpBytecode = nullptr;
370	}
371
372	#if ENABLE(YARR_JIT_DEBUG)
373	void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
374	{
375	int offsetVectorSize = (m_numSubpatterns + `1`) * `2`;
376	Vector<int> interpreterOvector;
377	interpreterOvector.resize(offsetVectorSize);
378	int* interpreterOffsetVector = interpreterOvector.data();
379	int interpreterResult = `0`;
380	int differences = `0`;
381
382	// Initialize interpreterOffsetVector with the return value (index 0) and the
383	// first subpattern start indicies (even index values) set to -1.
384	// No need to init the subpattern end indicies.
385	for (unsigned j = `0`, i = `0`; i < m_numSubpatterns + `1`; j += `2`, i++)
386	interpreterOffsetVector[j] = -`1`;
387
388	interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(interpreterOffsetVector));
389
390	if (jitResult != interpreterResult)
391	differences++;
392
393	for (unsigned j = `2`, i = `0`; i < m_numSubpatterns; j +=`2`, i++)
394	if ((offsetVector[j] != interpreterOffsetVector[j])
395	\|\| ((offsetVector[j] >= `0`) && (offsetVector[j+`1`] != interpreterOffsetVector[j+`1`])))
396	differences++;
397
398	if (differences) {
399	dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
400	unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
401
402	dataLogF((segmentLen < `150`) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
403
404	if (jitResult != interpreterResult) {
405	dataLogF(" JIT result = %d, interpreted result = %d\n", jitResult, interpreterResult);
406	differences--;
407	} else {
408	dataLogF(" Correct result = %d\n", jitResult);
409	}
410
411	if (differences) {
412	for (unsigned j = `2`, i = `0`; i < m_numSubpatterns; j +=`2`, i++) {
413	if (offsetVector[j] != interpreterOffsetVector[j])
414	dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
415	if ((offsetVector[j] >= `0`) && (offsetVector[j+`1`] != interpreterOffsetVector[j+`1`]))
416	dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+`1`, offsetVector[j+`1`], j+`1`, interpreterOffsetVector[j+`1`]);
417	}
418	}
419	}
420	}
421	#endif
422
423	#if ENABLE(REGEXP_TRACING)
424	void RegExp::printTraceData()
425	{
426	char formattedPattern[`41`];
427	char rawPattern[`41`];
428
429	strncpy(rawPattern, pattern().utf8().data(), `40`);
430	rawPattern[`40`]= `'\0'`;
431
432	int pattLen = strlen(rawPattern);
433
434	snprintf(formattedPattern, `41`, (pattLen <= `38`) ? "/%.38s/" : "/%.36s...", rawPattern);
435
436	#if ENABLE(YARR_JIT)
437	const size_t jitAddrSize = `20`;
438	char jit8BitMatchOnlyAddr[jitAddrSize] { };
439	char jit16BitMatchOnlyAddr[jitAddrSize] { };
440	char jit8BitMatchAddr[jitAddrSize] { };
441	char jit16BitMatchAddr[jitAddrSize] { };
442	switch (m_state) {
443	case ParseError:
444	case NotCompiled:
445	break;
446	case ByteCode:
447	snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback ");
448	snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- ");
449	snprintf(jit8BitMatchAddr, jitAddrSize, "fallback ");
450	snprintf(jit16BitMatchAddr, jitAddrSize, "---- ");
451	break;
452	case JITCode: {
453	Yarr::YarrCodeBlock& codeBlock = *m_regExpJITCode.get();
454	snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchOnlyAddr()));
455	snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchOnlyAddr()));
456	snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchAddr()));
457	snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchAddr()));
458	break;
459	}
460	}
461	#else
462	const char* jit8BitMatchOnlyAddr = "JIT Off";
463	const char* jit16BitMatchOnlyAddr = "";
464	const char* jit8BitMatchAddr = "JIT Off";
465	const char* jit16BitMatchAddr = "";
466	#endif
467	unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount);
468	unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount);
469
470	printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen);
471	printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen);
472	}
473	#endif
474
475	static CString regexpToSourceString(const RegExp* regExp)
476	{
477	char postfix[`7`] = { `'/'`, `0`, `0`, `0`, `0`, `0`, `0` };
478	int index = `1`;
479	if (regExp->global())
480	postfix[index++] = `'g'`;
481	if (regExp->ignoreCase())
482	postfix[index++] = `'i'`;
483	if (regExp->multiline())
484	postfix[index] = `'m'`;
485	if (regExp->dotAll())
486	postfix[index++] = `'s'`;
487	if (regExp->unicode())
488	postfix[index++] = `'u'`;
489	if (regExp->sticky())
490	postfix[index++] = `'y'`;
491
492	return toCString("/", regExp->pattern().impl(), postfix);
493	}
494
495	void RegExp::dumpToStream(const JSCell* cell, PrintStream& out)
496	{
497	out.print(regexpToSourceString(jsCast<const RegExp*>(cell)));
498	}
499
500	} // namespace JSC
501

Browse the source code of webkit/Source/JavaScriptCore/runtime/RegExp.cpp