1 | /* |
2 | * Copyright 2005 Maksim Orlovich <maksim@kde.org> |
3 | * Copyright (C) 2006, 2013 Apple Inc. All rights reserved. |
4 | * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org> |
5 | * |
6 | * Redistribution and use in source and binary forms, with or without |
7 | * modification, are permitted provided that the following conditions |
8 | * are met: |
9 | * |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
17 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
18 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
19 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
20 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
21 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
22 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
23 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
25 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | */ |
27 | |
28 | #include "config.h" |
29 | #include "XPathParser.h" |
30 | |
31 | #include "XPathEvaluator.h" |
32 | #include "XPathNSResolver.h" |
33 | #include "XPathPath.h" |
34 | #include "XPathStep.h" |
35 | #include <wtf/NeverDestroyed.h> |
36 | #include <wtf/StdLibExtras.h> |
37 | #include <wtf/text/StringHash.h> |
38 | |
39 | using namespace WebCore::XPath; |
40 | |
41 | extern int xpathyyparse(WebCore::XPath::Parser&); |
42 | |
43 | #include "XPathGrammar.h" |
44 | |
45 | namespace WebCore { |
46 | namespace XPath { |
47 | |
48 | struct Parser::Token { |
49 | int type; |
50 | String string; |
51 | Step::Axis axis; |
52 | NumericOp::Opcode numericOpcode; |
53 | EqTestOp::Opcode equalityTestOpcode; |
54 | |
55 | Token(int type) : type(type) { } |
56 | Token(int type, const String& string) : type(type), string(string) { } |
57 | Token(int type, Step::Axis axis) : type(type), axis(axis) { } |
58 | Token(int type, NumericOp::Opcode opcode) : type(type), numericOpcode(opcode) { } |
59 | Token(int type, EqTestOp::Opcode opcode) : type(type), equalityTestOpcode(opcode) { } |
60 | }; |
61 | |
62 | enum XMLCat { NameStart, NameCont, NotPartOfName }; |
63 | |
64 | static XMLCat charCat(UChar character) |
65 | { |
66 | if (character == '_') |
67 | return NameStart; |
68 | |
69 | if (character == '.' || character == '-') |
70 | return NameCont; |
71 | unsigned characterTypeMask = U_GET_GC_MASK(character); |
72 | if (characterTypeMask & (U_GC_LU_MASK | U_GC_LL_MASK | U_GC_LO_MASK | U_GC_LT_MASK | U_GC_NL_MASK)) |
73 | return NameStart; |
74 | if (characterTypeMask & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_ND_MASK)) |
75 | return NameCont; |
76 | return NotPartOfName; |
77 | } |
78 | |
79 | static HashMap<String, Step::Axis> createAxisNamesMap() |
80 | { |
81 | struct AxisName { |
82 | const char* name; |
83 | Step::Axis axis; |
84 | }; |
85 | const AxisName axisNameList[] = { |
86 | { "ancestor" , Step::AncestorAxis }, |
87 | { "ancestor-or-self" , Step::AncestorOrSelfAxis }, |
88 | { "attribute" , Step::AttributeAxis }, |
89 | { "child" , Step::ChildAxis }, |
90 | { "descendant" , Step::DescendantAxis }, |
91 | { "descendant-or-self" , Step::DescendantOrSelfAxis }, |
92 | { "following" , Step::FollowingAxis }, |
93 | { "following-sibling" , Step::FollowingSiblingAxis }, |
94 | { "namespace" , Step::NamespaceAxis }, |
95 | { "parent" , Step::ParentAxis }, |
96 | { "preceding" , Step::PrecedingAxis }, |
97 | { "preceding-sibling" , Step::PrecedingSiblingAxis }, |
98 | { "self" , Step::SelfAxis } |
99 | }; |
100 | HashMap<String, Step::Axis> map; |
101 | for (auto& axisName : axisNameList) |
102 | map.add(axisName.name, axisName.axis); |
103 | return map; |
104 | } |
105 | |
106 | static bool parseAxisName(const String& name, Step::Axis& type) |
107 | { |
108 | static const auto axisNames = makeNeverDestroyed(createAxisNamesMap()); |
109 | auto it = axisNames.get().find(name); |
110 | if (it == axisNames.get().end()) |
111 | return false; |
112 | type = it->value; |
113 | return true; |
114 | } |
115 | |
116 | // Returns whether the current token can possibly be a binary operator, given |
117 | // the previous token. Necessary to disambiguate some of the operators |
118 | // (* (multiply), div, and, or, mod) in the [32] Operator rule |
119 | // (check http://www.w3.org/TR/xpath#exprlex). |
120 | bool Parser::isBinaryOperatorContext() const |
121 | { |
122 | switch (m_lastTokenType) { |
123 | case 0: |
124 | case '@': case AXISNAME: case '(': case '[': case ',': |
125 | case AND: case OR: case MULOP: |
126 | case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: |
127 | case EQOP: case RELOP: |
128 | return false; |
129 | default: |
130 | return true; |
131 | } |
132 | } |
133 | |
134 | void Parser::skipWS() |
135 | { |
136 | while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos])) |
137 | ++m_nextPos; |
138 | } |
139 | |
140 | Parser::Token Parser::makeTokenAndAdvance(int code, int advance) |
141 | { |
142 | m_nextPos += advance; |
143 | return Token(code); |
144 | } |
145 | |
146 | Parser::Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance) |
147 | { |
148 | m_nextPos += advance; |
149 | return Token(code, val); |
150 | } |
151 | |
152 | Parser::Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance) |
153 | { |
154 | m_nextPos += advance; |
155 | return Token(code, val); |
156 | } |
157 | |
158 | // Returns next char if it's there and interesting, 0 otherwise. |
159 | char Parser::peekAheadHelper() |
160 | { |
161 | if (m_nextPos + 1 >= m_data.length()) |
162 | return 0; |
163 | UChar next = m_data[m_nextPos + 1]; |
164 | if (next >= 0xff) |
165 | return 0; |
166 | return next; |
167 | } |
168 | |
169 | char Parser::peekCurHelper() |
170 | { |
171 | if (m_nextPos >= m_data.length()) |
172 | return 0; |
173 | UChar next = m_data[m_nextPos]; |
174 | if (next >= 0xff) |
175 | return 0; |
176 | return next; |
177 | } |
178 | |
179 | Parser::Token Parser::lexString() |
180 | { |
181 | UChar delimiter = m_data[m_nextPos]; |
182 | int startPos = m_nextPos + 1; |
183 | |
184 | for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { |
185 | if (m_data[m_nextPos] == delimiter) { |
186 | String value = m_data.substring(startPos, m_nextPos - startPos); |
187 | if (value.isNull()) |
188 | value = emptyString(); |
189 | ++m_nextPos; // Consume the char. |
190 | return Token(LITERAL, value); |
191 | } |
192 | } |
193 | |
194 | // Ouch, went off the end -- report error. |
195 | return Token(XPATH_ERROR); |
196 | } |
197 | |
198 | Parser::Token Parser::lexNumber() |
199 | { |
200 | int startPos = m_nextPos; |
201 | bool seenDot = false; |
202 | |
203 | // Go until end or a non-digits character. |
204 | for (; m_nextPos < m_data.length(); ++m_nextPos) { |
205 | UChar aChar = m_data[m_nextPos]; |
206 | if (aChar >= 0xff) break; |
207 | |
208 | if (!isASCIIDigit(aChar)) { |
209 | if (aChar == '.' && !seenDot) |
210 | seenDot = true; |
211 | else |
212 | break; |
213 | } |
214 | } |
215 | |
216 | return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos)); |
217 | } |
218 | |
219 | bool Parser::lexNCName(String& name) |
220 | { |
221 | int startPos = m_nextPos; |
222 | if (m_nextPos >= m_data.length()) |
223 | return false; |
224 | |
225 | if (charCat(m_data[m_nextPos]) != NameStart) |
226 | return false; |
227 | |
228 | // Keep going until we get a character that's not good for names. |
229 | for (; m_nextPos < m_data.length(); ++m_nextPos) |
230 | if (charCat(m_data[m_nextPos]) == NotPartOfName) |
231 | break; |
232 | |
233 | name = m_data.substring(startPos, m_nextPos - startPos); |
234 | return true; |
235 | } |
236 | |
237 | bool Parser::lexQName(String& name) |
238 | { |
239 | String n1; |
240 | if (!lexNCName(n1)) |
241 | return false; |
242 | |
243 | skipWS(); |
244 | |
245 | // If the next character is :, what we just got it the prefix, if not, |
246 | // it's the whole thing. |
247 | if (peekAheadHelper() != ':') { |
248 | name = n1; |
249 | return true; |
250 | } |
251 | |
252 | String n2; |
253 | if (!lexNCName(n2)) |
254 | return false; |
255 | |
256 | name = n1 + ":" + n2; |
257 | return true; |
258 | } |
259 | |
260 | inline Parser::Token Parser::nextTokenInternal() |
261 | { |
262 | skipWS(); |
263 | |
264 | if (m_nextPos >= m_data.length()) |
265 | return Token(0); |
266 | |
267 | char code = peekCurHelper(); |
268 | switch (code) { |
269 | case '(': case ')': case '[': case ']': |
270 | case '@': case ',': case '|': |
271 | return makeTokenAndAdvance(code); |
272 | case '\'': |
273 | case '\"': |
274 | return lexString(); |
275 | case '0': case '1': case '2': case '3': case '4': |
276 | case '5': case '6': case '7': case '8': case '9': |
277 | return lexNumber(); |
278 | case '.': { |
279 | char next = peekAheadHelper(); |
280 | if (next == '.') |
281 | return makeTokenAndAdvance(DOTDOT, 2); |
282 | if (isASCIIDigit(next)) |
283 | return lexNumber(); |
284 | return makeTokenAndAdvance('.'); |
285 | } |
286 | case '/': |
287 | if (peekAheadHelper() == '/') |
288 | return makeTokenAndAdvance(SLASHSLASH, 2); |
289 | return makeTokenAndAdvance('/'); |
290 | case '+': |
291 | return makeTokenAndAdvance(PLUS); |
292 | case '-': |
293 | return makeTokenAndAdvance(MINUS); |
294 | case '=': |
295 | return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ); |
296 | case '!': |
297 | if (peekAheadHelper() == '=') |
298 | return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2); |
299 | return Token(XPATH_ERROR); |
300 | case '<': |
301 | if (peekAheadHelper() == '=') |
302 | return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2); |
303 | return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT); |
304 | case '>': |
305 | if (peekAheadHelper() == '=') |
306 | return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2); |
307 | return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT); |
308 | case '*': |
309 | if (isBinaryOperatorContext()) |
310 | return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul); |
311 | ++m_nextPos; |
312 | return Token(NAMETEST, "*" ); |
313 | case '$': { // $ QName |
314 | m_nextPos++; |
315 | String name; |
316 | if (!lexQName(name)) |
317 | return Token(XPATH_ERROR); |
318 | return Token(VARIABLEREFERENCE, name); |
319 | } |
320 | } |
321 | |
322 | String name; |
323 | if (!lexNCName(name)) |
324 | return Token(XPATH_ERROR); |
325 | |
326 | skipWS(); |
327 | // If we're in an operator context, check for any operator names |
328 | if (isBinaryOperatorContext()) { |
329 | if (name == "and" ) //### hash? |
330 | return Token(AND); |
331 | if (name == "or" ) |
332 | return Token(OR); |
333 | if (name == "mod" ) |
334 | return Token(MULOP, NumericOp::OP_Mod); |
335 | if (name == "div" ) |
336 | return Token(MULOP, NumericOp::OP_Div); |
337 | } |
338 | |
339 | // See whether we are at a : |
340 | if (peekCurHelper() == ':') { |
341 | m_nextPos++; |
342 | // Any chance it's an axis name? |
343 | if (peekCurHelper() == ':') { |
344 | m_nextPos++; |
345 | |
346 | //It might be an axis name. |
347 | Step::Axis axis; |
348 | if (parseAxisName(name, axis)) |
349 | return Token(AXISNAME, axis); |
350 | // Ugh, :: is only valid in axis names -> error |
351 | return Token(XPATH_ERROR); |
352 | } |
353 | |
354 | // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest |
355 | skipWS(); |
356 | if (peekCurHelper() == '*') { |
357 | m_nextPos++; |
358 | return Token(NAMETEST, name + ":*" ); |
359 | } |
360 | |
361 | // Make a full qname. |
362 | String n2; |
363 | if (!lexNCName(n2)) |
364 | return Token(XPATH_ERROR); |
365 | |
366 | name = name + ":" + n2; |
367 | } |
368 | |
369 | skipWS(); |
370 | |
371 | if (peekCurHelper() == '(') { |
372 | // note: we don't swallow the '(' here! |
373 | |
374 | // Either node type oor function name. |
375 | |
376 | if (name == "processing-instruction" ) |
377 | return Token(PI); |
378 | if (name == "node" ) |
379 | return Token(NODE); |
380 | if (name == "text" ) |
381 | return Token(TEXT_); |
382 | if (name == "comment" ) |
383 | return Token(COMMENT); |
384 | |
385 | return Token(FUNCTIONNAME, name); |
386 | } |
387 | |
388 | // At this point, it must be NAMETEST. |
389 | return Token(NAMETEST, name); |
390 | } |
391 | |
392 | inline Parser::Token Parser::nextToken() |
393 | { |
394 | Token token = nextTokenInternal(); |
395 | m_lastTokenType = token.type; |
396 | return token; |
397 | } |
398 | |
399 | Parser::Parser(const String& statement, RefPtr<XPathNSResolver>&& resolver) |
400 | : m_data(statement) |
401 | , m_resolver(WTFMove(resolver)) |
402 | { |
403 | } |
404 | |
405 | int Parser::lex(YYSTYPE& yylval) |
406 | { |
407 | Token token = nextToken(); |
408 | |
409 | switch (token.type) { |
410 | case AXISNAME: |
411 | yylval.axis = token.axis; |
412 | break; |
413 | case MULOP: |
414 | yylval.numericOpcode = token.numericOpcode; |
415 | break; |
416 | case RELOP: |
417 | case EQOP: |
418 | yylval.equalityTestOpcode = token.equalityTestOpcode; |
419 | break; |
420 | case NODETYPE: |
421 | case FUNCTIONNAME: |
422 | case LITERAL: |
423 | case VARIABLEREFERENCE: |
424 | case NUMBER: |
425 | case NAMETEST: |
426 | yylval.string = token.string.releaseImpl().leakRef(); |
427 | break; |
428 | } |
429 | |
430 | return token.type; |
431 | } |
432 | |
433 | bool Parser::expandQualifiedName(const String& qualifiedName, String& localName, String& namespaceURI) |
434 | { |
435 | size_t colon = qualifiedName.find(':'); |
436 | if (colon != notFound) { |
437 | if (!m_resolver) { |
438 | m_sawNamespaceError = true; |
439 | return false; |
440 | } |
441 | namespaceURI = m_resolver->lookupNamespaceURI(qualifiedName.left(colon)); |
442 | if (namespaceURI.isNull()) { |
443 | m_sawNamespaceError = true; |
444 | return false; |
445 | } |
446 | localName = qualifiedName.substring(colon + 1); |
447 | } else |
448 | localName = qualifiedName; |
449 | return true; |
450 | } |
451 | |
452 | ExceptionOr<std::unique_ptr<Expression>> Parser::parseStatement(const String& statement, RefPtr<XPathNSResolver>&& resolver) |
453 | { |
454 | Parser parser { statement, WTFMove(resolver) }; |
455 | |
456 | int parseError = xpathyyparse(parser); |
457 | |
458 | if (parser.m_sawNamespaceError) |
459 | return Exception { NamespaceError }; |
460 | |
461 | if (parseError) |
462 | return Exception { SyntaxError }; |
463 | |
464 | return WTFMove(parser.m_result); |
465 | } |
466 | |
467 | } } |
468 | |