1 | /* |
2 | * Copyright (C) 2004-2019 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2005 Alexey Proskuryakov. |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #include "config.h" |
28 | #include "TextIterator.h" |
29 | |
30 | #include "ComposedTreeIterator.h" |
31 | #include "Document.h" |
32 | #include "Editing.h" |
33 | #include "FontCascade.h" |
34 | #include "Frame.h" |
35 | #include "HTMLBodyElement.h" |
36 | #include "HTMLElement.h" |
37 | #include "HTMLFrameOwnerElement.h" |
38 | #include "HTMLInputElement.h" |
39 | #include "HTMLLegendElement.h" |
40 | #include "HTMLMeterElement.h" |
41 | #include "HTMLNames.h" |
42 | #include "HTMLParagraphElement.h" |
43 | #include "HTMLProgressElement.h" |
44 | #include "HTMLSlotElement.h" |
45 | #include "HTMLTextAreaElement.h" |
46 | #include "HTMLTextFormControlElement.h" |
47 | #include "InlineTextBox.h" |
48 | #include "NodeTraversal.h" |
49 | #include "RenderImage.h" |
50 | #include "RenderIterator.h" |
51 | #include "RenderTableCell.h" |
52 | #include "RenderTableRow.h" |
53 | #include "RenderTextControl.h" |
54 | #include "RenderTextFragment.h" |
55 | #include "ShadowRoot.h" |
56 | #include "SimpleLineLayout.h" |
57 | #include "SimpleLineLayoutFunctions.h" |
58 | #include "SimpleLineLayoutResolver.h" |
59 | #include "TextBoundaries.h" |
60 | #include "TextControlInnerElements.h" |
61 | #include "VisiblePosition.h" |
62 | #include "VisibleUnits.h" |
63 | #include <unicode/unorm2.h> |
64 | #include <wtf/Function.h> |
65 | #include <wtf/text/CString.h> |
66 | #include <wtf/text/StringBuilder.h> |
67 | #include <wtf/text/TextBreakIterator.h> |
68 | #include <wtf/unicode/CharacterNames.h> |
69 | |
70 | #if !UCONFIG_NO_COLLATION |
71 | #include <unicode/usearch.h> |
72 | #include <wtf/text/TextBreakIteratorInternalICU.h> |
73 | #endif |
74 | |
75 | namespace WebCore { |
76 | |
77 | using namespace WTF::Unicode; |
78 | using namespace HTMLNames; |
79 | |
80 | // Buffer that knows how to compare with a search target. |
81 | // Keeps enough of the previous text to be able to search in the future, but no more. |
82 | // Non-breaking spaces are always equal to normal spaces. |
83 | // Case folding is also done if the CaseInsensitive option is specified. |
84 | // Matches are further filtered if the AtWordStarts option is specified, although some |
85 | // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well. |
86 | class SearchBuffer { |
87 | WTF_MAKE_NONCOPYABLE(SearchBuffer); |
88 | public: |
89 | SearchBuffer(const String& target, FindOptions); |
90 | ~SearchBuffer(); |
91 | |
92 | // Returns number of characters appended; guaranteed to be in the range [1, length]. |
93 | size_t append(StringView); |
94 | bool needsMoreContext() const; |
95 | void prependContext(StringView); |
96 | void reachedBreak(); |
97 | |
98 | // Result is the size in characters of what was found. |
99 | // And <startOffset> is the number of characters back to the start of what was found. |
100 | size_t search(size_t& startOffset); |
101 | bool atBreak() const; |
102 | |
103 | #if !UCONFIG_NO_COLLATION |
104 | |
105 | private: |
106 | bool isBadMatch(const UChar*, size_t length) const; |
107 | bool isWordStartMatch(size_t start, size_t length) const; |
108 | bool isWordEndMatch(size_t start, size_t length) const; |
109 | |
110 | const String m_target; |
111 | const StringView::UpconvertedCharacters m_targetCharacters; |
112 | FindOptions m_options; |
113 | |
114 | Vector<UChar> m_buffer; |
115 | size_t m_overlap; |
116 | size_t m_prefixLength; |
117 | bool m_atBreak; |
118 | bool m_needsMoreContext; |
119 | |
120 | const bool m_targetRequiresKanaWorkaround; |
121 | Vector<UChar> m_normalizedTarget; |
122 | mutable Vector<UChar> m_normalizedMatch; |
123 | |
124 | #else |
125 | |
126 | private: |
127 | void append(UChar, bool isCharacterStart); |
128 | size_t length() const; |
129 | |
130 | String m_target; |
131 | FindOptions m_options; |
132 | |
133 | Vector<UChar> m_buffer; |
134 | Vector<bool> m_isCharacterStartBuffer; |
135 | bool m_isBufferFull; |
136 | size_t m_cursor; |
137 | |
138 | #endif |
139 | }; |
140 | |
141 | // -------- |
142 | |
143 | static const unsigned bitsInWord = sizeof(unsigned) * 8; |
144 | static const unsigned bitInWordMask = bitsInWord - 1; |
145 | |
146 | BitStack::BitStack() |
147 | : m_size(0) |
148 | { |
149 | } |
150 | |
151 | BitStack::~BitStack() = default; |
152 | |
153 | void BitStack::push(bool bit) |
154 | { |
155 | unsigned index = m_size / bitsInWord; |
156 | unsigned shift = m_size & bitInWordMask; |
157 | if (!shift && index == m_words.size()) { |
158 | m_words.grow(index + 1); |
159 | m_words[index] = 0; |
160 | } |
161 | unsigned& word = m_words[index]; |
162 | unsigned mask = 1U << shift; |
163 | if (bit) |
164 | word |= mask; |
165 | else |
166 | word &= ~mask; |
167 | ++m_size; |
168 | } |
169 | |
170 | void BitStack::pop() |
171 | { |
172 | if (m_size) |
173 | --m_size; |
174 | } |
175 | |
176 | bool BitStack::top() const |
177 | { |
178 | if (!m_size) |
179 | return false; |
180 | unsigned shift = (m_size - 1) & bitInWordMask; |
181 | return m_words.last() & (1U << shift); |
182 | } |
183 | |
184 | unsigned BitStack::size() const |
185 | { |
186 | return m_size; |
187 | } |
188 | |
189 | // -------- |
190 | |
191 | // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees. |
192 | static Node* nextInPreOrderCrossingShadowBoundaries(Node& rangeEndContainer, int rangeEndOffset) |
193 | { |
194 | if (rangeEndOffset >= 0 && !rangeEndContainer.isCharacterDataNode()) { |
195 | if (Node* next = rangeEndContainer.traverseToChildAt(rangeEndOffset)) |
196 | return next; |
197 | } |
198 | for (Node* node = &rangeEndContainer; node; node = node->parentOrShadowHostNode()) { |
199 | if (Node* next = node->nextSibling()) |
200 | return next; |
201 | } |
202 | return nullptr; |
203 | } |
204 | |
205 | static inline bool fullyClipsContents(Node& node) |
206 | { |
207 | auto* renderer = node.renderer(); |
208 | if (!renderer) { |
209 | if (!is<Element>(node)) |
210 | return false; |
211 | return !downcast<Element>(node).hasDisplayContents(); |
212 | } |
213 | if (!is<RenderBox>(*renderer)) |
214 | return false; |
215 | auto& box = downcast<RenderBox>(*renderer); |
216 | if (!box.hasOverflowClip()) |
217 | return false; |
218 | |
219 | // Quirk to keep copy/paste in the CodeMirror editor version used in Jenkins working. |
220 | if (is<HTMLTextAreaElement>(node)) |
221 | return box.size().isEmpty(); |
222 | |
223 | return box.contentSize().isEmpty(); |
224 | } |
225 | |
226 | static inline bool ignoresContainerClip(Node& node) |
227 | { |
228 | auto* renderer = node.renderer(); |
229 | if (!renderer || renderer->isTextOrLineBreak()) |
230 | return false; |
231 | return renderer->style().hasOutOfFlowPosition(); |
232 | } |
233 | |
234 | static void pushFullyClippedState(BitStack& stack, Node& node) |
235 | { |
236 | // Push true if this node full clips its contents, or if a parent already has fully |
237 | // clipped and this is not a node that ignores its container's clip. |
238 | stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node))); |
239 | } |
240 | |
241 | static void setUpFullyClippedStack(BitStack& stack, Node& node) |
242 | { |
243 | // Put the nodes in a vector so we can iterate in reverse order. |
244 | // FIXME: This (and TextIterator in general) should use ComposedTreeIterator. |
245 | Vector<Node*, 100> ancestry; |
246 | for (Node* parent = node.parentOrShadowHostNode(); parent; parent = parent->parentOrShadowHostNode()) |
247 | ancestry.append(parent); |
248 | |
249 | // Call pushFullyClippedState on each node starting with the earliest ancestor. |
250 | size_t size = ancestry.size(); |
251 | for (size_t i = 0; i < size; ++i) |
252 | pushFullyClippedState(stack, *ancestry[size - i - 1]); |
253 | pushFullyClippedState(stack, node); |
254 | } |
255 | |
256 | static bool isClippedByFrameAncestor(const Document& document, TextIteratorBehavior behavior) |
257 | { |
258 | if (!(behavior & TextIteratorClipsToFrameAncestors)) |
259 | return false; |
260 | |
261 | for (auto* owner = document.ownerElement(); owner; owner = owner->document().ownerElement()) { |
262 | BitStack ownerClipStack; |
263 | setUpFullyClippedStack(ownerClipStack, *owner); |
264 | if (ownerClipStack.top()) |
265 | return true; |
266 | } |
267 | return false; |
268 | } |
269 | |
270 | // FIXME: editingIgnoresContent and isRendererReplacedElement try to do the same job. |
271 | // It's not good to have both of them. |
272 | bool isRendererReplacedElement(RenderObject* renderer) |
273 | { |
274 | if (!renderer) |
275 | return false; |
276 | |
277 | bool isAttachment = false; |
278 | #if ENABLE(ATTACHMENT_ELEMENT) |
279 | isAttachment = renderer->isAttachment(); |
280 | #endif |
281 | if (renderer->isImage() || renderer->isWidget() || renderer->isMedia() || isAttachment) |
282 | return true; |
283 | |
284 | if (is<Element>(renderer->node())) { |
285 | Element& element = downcast<Element>(*renderer->node()); |
286 | if (is<HTMLFormControlElement>(element) || is<HTMLLegendElement>(element) || is<HTMLProgressElement>(element) || element.hasTagName(meterTag)) |
287 | return true; |
288 | if (equalLettersIgnoringASCIICase(element.attributeWithoutSynchronization(roleAttr), "img" )) |
289 | return true; |
290 | } |
291 | |
292 | return false; |
293 | } |
294 | |
295 | // -------- |
296 | |
297 | inline void TextIteratorCopyableText::reset() |
298 | { |
299 | m_singleCharacter = 0; |
300 | m_string = String(); |
301 | m_offset = 0; |
302 | m_length = 0; |
303 | } |
304 | |
305 | inline void TextIteratorCopyableText::set(String&& string) |
306 | { |
307 | m_singleCharacter = 0; |
308 | m_string = WTFMove(string); |
309 | m_offset = 0; |
310 | m_length = m_string.length(); |
311 | } |
312 | |
313 | inline void TextIteratorCopyableText::set(String&& string, unsigned offset, unsigned length) |
314 | { |
315 | ASSERT(offset < string.length()); |
316 | ASSERT(length); |
317 | ASSERT(length <= string.length() - offset); |
318 | |
319 | m_singleCharacter = 0; |
320 | m_string = WTFMove(string); |
321 | m_offset = offset; |
322 | m_length = length; |
323 | } |
324 | |
325 | inline void TextIteratorCopyableText::set(UChar singleCharacter) |
326 | { |
327 | m_singleCharacter = singleCharacter; |
328 | m_string = String(); |
329 | m_offset = 0; |
330 | m_length = 0; |
331 | } |
332 | |
333 | void TextIteratorCopyableText::appendToStringBuilder(StringBuilder& builder) const |
334 | { |
335 | if (m_singleCharacter) |
336 | builder.append(m_singleCharacter); |
337 | else |
338 | builder.append(m_string, m_offset, m_length); |
339 | } |
340 | |
341 | // -------- |
342 | |
343 | |
344 | TextIterator::TextIterator(Position start, Position end, TextIteratorBehavior behavior) |
345 | : m_behavior(behavior) |
346 | { |
347 | if (start.isNull() || end.isNull()) |
348 | return; |
349 | ASSERT(comparePositions(start, end) <= 0); |
350 | |
351 | RELEASE_ASSERT(behavior & TextIteratorTraversesFlatTree || start.treeScope() == end.treeScope()); |
352 | |
353 | start.document()->updateLayoutIgnorePendingStylesheets(); |
354 | |
355 | // FIXME: Use Position / PositionIterator instead to avoid offset computation. |
356 | m_startContainer = start.containerNode(); |
357 | m_startOffset = start.computeOffsetInContainerNode(); |
358 | |
359 | m_endContainer = end.containerNode(); |
360 | m_endOffset = end.computeOffsetInContainerNode(); |
361 | |
362 | m_node = start.firstNode().get(); |
363 | if (!m_node) |
364 | return; |
365 | |
366 | init(); |
367 | } |
368 | |
369 | TextIterator::TextIterator(const Range* range, TextIteratorBehavior behavior) |
370 | : m_behavior(behavior) |
371 | { |
372 | if (!range) |
373 | return; |
374 | |
375 | range->ownerDocument().updateLayoutIgnorePendingStylesheets(); |
376 | |
377 | m_startContainer = &range->startContainer(); |
378 | |
379 | // Callers should be handing us well-formed ranges. If we discover that this isn't |
380 | // the case, we could consider changing this assertion to an early return. |
381 | ASSERT(range->boundaryPointsValid()); |
382 | |
383 | m_startOffset = range->startOffset(); |
384 | m_endContainer = &range->endContainer(); |
385 | m_endOffset = range->endOffset(); |
386 | |
387 | m_node = range->firstNode(); |
388 | if (!m_node) |
389 | return; |
390 | |
391 | init(); |
392 | } |
393 | |
394 | void TextIterator::init() |
395 | { |
396 | if (isClippedByFrameAncestor(m_node->document(), m_behavior)) |
397 | return; |
398 | |
399 | setUpFullyClippedStack(m_fullyClippedStack, *m_node); |
400 | |
401 | m_offset = m_node == m_startContainer ? m_startOffset : 0; |
402 | |
403 | m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(*m_endContainer, m_endOffset); |
404 | |
405 | m_positionNode = m_node; |
406 | |
407 | advance(); |
408 | } |
409 | |
410 | TextIterator::~TextIterator() = default; |
411 | |
412 | // FIXME: Use ComposedTreeIterator instead. These functions are more expensive because they might do O(n) work. |
413 | static inline Node* firstChild(TextIteratorBehavior options, Node& node) |
414 | { |
415 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
416 | return firstChildInComposedTreeIgnoringUserAgentShadow(node); |
417 | return node.firstChild(); |
418 | } |
419 | |
420 | static inline Node* nextSibling(TextIteratorBehavior options, Node& node) |
421 | { |
422 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
423 | return nextSiblingInComposedTreeIgnoringUserAgentShadow(node); |
424 | return node.nextSibling(); |
425 | } |
426 | |
427 | static inline Node* nextNode(TextIteratorBehavior options, Node& node) |
428 | { |
429 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
430 | return nextInComposedTreeIgnoringUserAgentShadow(node); |
431 | return NodeTraversal::next(node); |
432 | } |
433 | |
434 | static inline bool isDescendantOf(TextIteratorBehavior options, Node& node, Node& possibleAncestor) |
435 | { |
436 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
437 | return node.isDescendantOrShadowDescendantOf(&possibleAncestor); |
438 | return node.isDescendantOf(&possibleAncestor); |
439 | } |
440 | |
441 | static inline Node* parentNodeOrShadowHost(TextIteratorBehavior options, Node& node) |
442 | { |
443 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
444 | return node.parentInComposedTree(); |
445 | return node.parentOrShadowHostNode(); |
446 | } |
447 | |
448 | static inline bool hasDisplayContents(Node& node) |
449 | { |
450 | return is<Element>(node) && downcast<Element>(node).hasDisplayContents(); |
451 | } |
452 | |
453 | void TextIterator::advance() |
454 | { |
455 | ASSERT(!atEnd()); |
456 | |
457 | // reset the run information |
458 | m_positionNode = nullptr; |
459 | m_copyableText.reset(); |
460 | m_text = StringView(); |
461 | |
462 | // handle remembered node that needed a newline after the text node's newline |
463 | if (m_nodeForAdditionalNewline) { |
464 | // Emit the extra newline, and position it *inside* m_node, after m_node's |
465 | // contents, in case it's a block, in the same way that we position the first |
466 | // newline. The range for the emitted newline should start where the line |
467 | // break begins. |
468 | // FIXME: It would be cleaner if we emitted two newlines during the last |
469 | // iteration, instead of using m_needsAnotherNewline. |
470 | emitCharacter('\n', *m_nodeForAdditionalNewline->parentNode(), m_nodeForAdditionalNewline, 1, 1); |
471 | m_nodeForAdditionalNewline = nullptr; |
472 | return; |
473 | } |
474 | |
475 | if (!m_textBox && m_remainingTextBox) { |
476 | m_textBox = m_remainingTextBox; |
477 | m_remainingTextBox = nullptr; |
478 | m_firstLetterText = nullptr; |
479 | m_offset = 0; |
480 | } |
481 | // handle remembered text box |
482 | if (m_textBox) { |
483 | handleTextBox(); |
484 | if (m_positionNode) |
485 | return; |
486 | } |
487 | |
488 | while (m_node && m_node != m_pastEndNode) { |
489 | // if the range ends at offset 0 of an element, represent the |
490 | // position, but not the content, of that element e.g. if the |
491 | // node is a blockflow element, emit a newline that |
492 | // precedes the element |
493 | if (m_node == m_endContainer && !m_endOffset) { |
494 | representNodeOffsetZero(); |
495 | m_node = nullptr; |
496 | return; |
497 | } |
498 | |
499 | auto* renderer = m_node->renderer(); |
500 | if (!m_handledNode) { |
501 | if (!renderer) { |
502 | m_handledNode = true; |
503 | m_handledChildren = !hasDisplayContents(*m_node); |
504 | } else { |
505 | // handle current node according to its type |
506 | if (renderer->isText() && m_node->isTextNode()) |
507 | m_handledNode = handleTextNode(); |
508 | else if (isRendererReplacedElement(renderer)) |
509 | m_handledNode = handleReplacedElement(); |
510 | else |
511 | m_handledNode = handleNonTextNode(); |
512 | if (m_positionNode) |
513 | return; |
514 | } |
515 | } |
516 | |
517 | // find a new current node to handle in depth-first manner, |
518 | // calling exitNode() as we come back thru a parent node |
519 | Node* next = m_handledChildren ? nullptr : firstChild(m_behavior, *m_node); |
520 | m_offset = 0; |
521 | if (!next) { |
522 | next = nextSibling(m_behavior, *m_node); |
523 | if (!next) { |
524 | bool pastEnd = nextNode(m_behavior, *m_node) == m_pastEndNode; |
525 | Node* parentNode = parentNodeOrShadowHost(m_behavior, *m_node); |
526 | while (!next && parentNode) { |
527 | if ((pastEnd && parentNode == m_endContainer) || isDescendantOf(m_behavior, *m_endContainer, *parentNode)) |
528 | return; |
529 | bool haveRenderer = m_node->renderer(); |
530 | Node* exitedNode = m_node; |
531 | m_node = parentNode; |
532 | m_fullyClippedStack.pop(); |
533 | parentNode = parentNodeOrShadowHost(m_behavior, *m_node); |
534 | if (haveRenderer) |
535 | exitNode(exitedNode); |
536 | if (m_positionNode) { |
537 | m_handledNode = true; |
538 | m_handledChildren = true; |
539 | return; |
540 | } |
541 | next = nextSibling(m_behavior, *m_node); |
542 | if (next && m_node->renderer()) |
543 | exitNode(m_node); |
544 | } |
545 | } |
546 | m_fullyClippedStack.pop(); |
547 | } |
548 | |
549 | // set the new current node |
550 | m_node = next; |
551 | if (m_node) |
552 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
553 | m_handledNode = false; |
554 | m_handledChildren = false; |
555 | m_handledFirstLetter = false; |
556 | m_firstLetterText = nullptr; |
557 | |
558 | // how would this ever be? |
559 | if (m_positionNode) |
560 | return; |
561 | } |
562 | } |
563 | |
564 | static bool hasVisibleTextNode(RenderText& renderer) |
565 | { |
566 | if (renderer.style().visibility() == Visibility::Visible) |
567 | return true; |
568 | if (is<RenderTextFragment>(renderer)) { |
569 | if (auto firstLetter = downcast<RenderTextFragment>(renderer).firstLetter()) { |
570 | if (firstLetter->style().visibility() == Visibility::Visible) |
571 | return true; |
572 | } |
573 | } |
574 | return false; |
575 | } |
576 | |
577 | static unsigned textNodeOffsetInFlow(const Text& firstTextNodeInRange) |
578 | { |
579 | // Calculate the text offset for simple lines. |
580 | RenderObject* renderer = firstTextNodeInRange.renderer(); |
581 | if (!renderer) |
582 | return 0; |
583 | unsigned textOffset = 0; |
584 | for (renderer = renderer->previousSibling(); renderer; renderer = renderer->previousSibling()) { |
585 | if (is<RenderText>(renderer)) |
586 | textOffset += downcast<RenderText>(renderer)->text().length(); |
587 | } |
588 | return textOffset; |
589 | } |
590 | |
591 | static bool isNewLineOrTabCharacter(UChar character) |
592 | { |
593 | return character == '\n' || character == '\t'; |
594 | } |
595 | |
596 | bool TextIterator::handleTextNode() |
597 | { |
598 | Text& textNode = downcast<Text>(*m_node); |
599 | |
600 | if (m_fullyClippedStack.top() && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
601 | return false; |
602 | |
603 | auto& renderer = *textNode.renderer(); |
604 | m_lastTextNode = &textNode; |
605 | String rendererText = renderer.text(); |
606 | |
607 | // handle pre-formatted text |
608 | if (!renderer.style().collapseWhiteSpace()) { |
609 | int runStart = m_offset; |
610 | if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) { |
611 | emitCharacter(' ', textNode, nullptr, runStart, runStart); |
612 | return false; |
613 | } |
614 | if (!m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset) { |
615 | handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer)); |
616 | if (m_firstLetterText) { |
617 | String firstLetter = m_firstLetterText->text(); |
618 | emitText(textNode, *m_firstLetterText, m_offset, m_offset + firstLetter.length()); |
619 | m_firstLetterText = nullptr; |
620 | m_textBox = nullptr; |
621 | return false; |
622 | } |
623 | } |
624 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
625 | return false; |
626 | int rendererTextLength = rendererText.length(); |
627 | int end = (&textNode == m_endContainer) ? m_endOffset : INT_MAX; |
628 | int runEnd = std::min(rendererTextLength, end); |
629 | |
630 | if (runStart >= runEnd) |
631 | return true; |
632 | |
633 | emitText(textNode, renderer, runStart, runEnd); |
634 | return true; |
635 | } |
636 | |
637 | if (const auto* layout = renderer.simpleLineLayout()) { |
638 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
639 | return true; |
640 | ASSERT(renderer.parent()); |
641 | ASSERT(is<RenderBlockFlow>(*renderer.parent())); |
642 | const auto& blockFlow = downcast<RenderBlockFlow>(*renderer.parent()); |
643 | // Use the simple layout runs to iterate over the text content. |
644 | bool isNewTextNode = m_previousSimpleTextNodeInFlow && m_previousSimpleTextNodeInFlow != &textNode; |
645 | // Simple line layout run positions are all absolute to the parent flow. |
646 | // Offsetting is required when multiple renderers are present. |
647 | m_accumulatedSimpleTextLengthInFlow += isNewTextNode ? m_previousSimpleTextNodeInFlow->renderer()->text().length() : 0; |
648 | m_previousSimpleTextNodeInFlow = &textNode; |
649 | |
650 | unsigned endPosition = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : rendererText.length(); |
651 | if (!m_flowRunResolverCache || &m_flowRunResolverCache->flow() != &blockFlow) { |
652 | m_accumulatedSimpleTextLengthInFlow = m_flowRunResolverCache ? 0 : textNodeOffsetInFlow(textNode); |
653 | m_flowRunResolverCache = std::make_unique<SimpleLineLayout::RunResolver>(blockFlow, *layout); |
654 | } |
655 | // Skip to m_offset position. |
656 | auto range = m_flowRunResolverCache->rangeForRenderer(renderer); |
657 | auto it = range.begin(); |
658 | auto end = range.end(); |
659 | auto startPosition = static_cast<unsigned>(m_offset) + m_accumulatedSimpleTextLengthInFlow; |
660 | while (it != end && (*it).end() <= startPosition) |
661 | ++it; |
662 | if (m_nextRunNeedsWhitespace && rendererText[m_offset - 1] == '\n') { |
663 | emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1); |
664 | return it == end; |
665 | } |
666 | if (it == end) { |
667 | // Collapsed trailing whitespace. |
668 | m_offset = endPosition; |
669 | m_lastTextNodeEndedWithCollapsedSpace = true; |
670 | return true; |
671 | } |
672 | if (m_nextRunNeedsWhitespace) { |
673 | emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1); |
674 | return false; |
675 | } |
676 | // If the position we are looking for is to the left of the renderer's first run, it could mean that |
677 | // the runs and the renderers are out of sync (e.g. we skipped a renderer in between). |
678 | // Better bail out at this point. |
679 | auto run = *it; |
680 | if (run.start() > startPosition) { |
681 | ASSERT(m_flowRunResolverCache); |
682 | if (&(rendererForPosition(m_flowRunResolverCache->flowContents(), startPosition)) != &renderer) { |
683 | ASSERT_NOT_REACHED(); |
684 | return true; |
685 | } |
686 | } |
687 | ASSERT(run.end() - run.start() <= rendererText.length()); |
688 | // contentStart skips leading whitespace. |
689 | unsigned contentStart = std::max<unsigned>(m_offset, run.start() - m_accumulatedSimpleTextLengthInFlow); |
690 | unsigned contentEnd = std::min(endPosition, run.end() - m_accumulatedSimpleTextLengthInFlow); |
691 | ASSERT_WITH_SECURITY_IMPLICATION(contentStart <= contentEnd); |
692 | // Check if whitespace adjustment is needed when crossing renderer boundary. |
693 | if (isNewTextNode) { |
694 | bool lastCharacterIsNotWhitespace = m_lastCharacter && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter); |
695 | bool addTrailingWhitespaceForPrevious = m_lastTextNodeEndedWithCollapsedSpace && lastCharacterIsNotWhitespace; |
696 | bool leadingWhitespaceIsNeededForCurrent = contentStart > static_cast<unsigned>(m_offset) && lastCharacterIsNotWhitespace; |
697 | if (addTrailingWhitespaceForPrevious || leadingWhitespaceIsNeededForCurrent) { |
698 | emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1); |
699 | return false; |
700 | } |
701 | } |
702 | // \n \t single whitespace characters need replacing so that the new line/tab characters don't show up. |
703 | unsigned stopPosition = contentStart; |
704 | while (stopPosition < contentEnd && !isNewLineOrTabCharacter(rendererText[stopPosition])) |
705 | ++stopPosition; |
706 | // Emit the text up to the new line/tab character. |
707 | if (stopPosition < contentEnd) { |
708 | if (stopPosition == contentStart) { |
709 | emitCharacter(' ', textNode, nullptr, contentStart, contentStart + 1); |
710 | m_offset = contentStart + 1; |
711 | return false; |
712 | } |
713 | emitText(textNode, renderer, contentStart, stopPosition); |
714 | m_offset = stopPosition + 1; |
715 | m_nextRunNeedsWhitespace = true; |
716 | return false; |
717 | } |
718 | emitText(textNode, renderer, contentStart, contentEnd); |
719 | // When line ending with collapsed whitespace is present, we need to carry over one whitespace: foo(end of line)bar -> foo bar (otherwise we would end up with foobar). |
720 | m_nextRunNeedsWhitespace = run.isEndOfLine() && contentEnd < endPosition && renderer.style().isCollapsibleWhiteSpace(rendererText[contentEnd]); |
721 | m_offset = contentEnd; |
722 | return static_cast<unsigned>(m_offset) == endPosition; |
723 | } |
724 | |
725 | if (renderer.firstTextBox()) |
726 | m_textBox = renderer.firstTextBox(); |
727 | |
728 | bool shouldHandleFirstLetter = !m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset; |
729 | if (shouldHandleFirstLetter) |
730 | handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer)); |
731 | |
732 | if (!renderer.firstTextBox() && rendererText.length() && !shouldHandleFirstLetter) { |
733 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
734 | return false; |
735 | m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space |
736 | return true; |
737 | } |
738 | |
739 | // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text) |
740 | auto& boxesRenderer = m_firstLetterText ? *m_firstLetterText : renderer; |
741 | if (boxesRenderer.containsReversedText()) { |
742 | m_sortedTextBoxes.clear(); |
743 | for (InlineTextBox* textBox = boxesRenderer.firstTextBox(); textBox; textBox = textBox->nextTextBox()) |
744 | m_sortedTextBoxes.append(textBox); |
745 | std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart); |
746 | m_sortedTextBoxesPosition = 0; |
747 | m_textBox = m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0]; |
748 | } |
749 | |
750 | handleTextBox(); |
751 | return true; |
752 | } |
753 | |
754 | void TextIterator::handleTextBox() |
755 | { |
756 | Text& textNode = downcast<Text>(*m_node); |
757 | |
758 | auto& renderer = m_firstLetterText ? *m_firstLetterText : *textNode.renderer(); |
759 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) { |
760 | m_textBox = nullptr; |
761 | return; |
762 | } |
763 | String rendererText = renderer.text(); |
764 | unsigned start = m_offset; |
765 | unsigned end = (&textNode == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX; |
766 | while (m_textBox) { |
767 | unsigned textBoxStart = m_textBox->start(); |
768 | unsigned runStart = std::max(textBoxStart, start); |
769 | |
770 | // Check for collapsed space at the start of this run. |
771 | InlineTextBox* firstTextBox = renderer.containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0]) : renderer.firstTextBox(); |
772 | bool needSpace = m_lastTextNodeEndedWithCollapsedSpace || (m_textBox == firstTextBox && textBoxStart == runStart && runStart); |
773 | if (needSpace && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter) && m_lastCharacter) { |
774 | if (m_lastTextNode == &textNode && runStart && rendererText[runStart - 1] == ' ') { |
775 | unsigned spaceRunStart = runStart - 1; |
776 | while (spaceRunStart && rendererText[spaceRunStart - 1] == ' ') |
777 | --spaceRunStart; |
778 | emitText(textNode, renderer, spaceRunStart, spaceRunStart + 1); |
779 | } else |
780 | emitCharacter(' ', textNode, nullptr, runStart, runStart); |
781 | return; |
782 | } |
783 | unsigned textBoxEnd = textBoxStart + m_textBox->len(); |
784 | unsigned runEnd = std::min(textBoxEnd, end); |
785 | |
786 | // Determine what the next text box will be, but don't advance yet |
787 | InlineTextBox* nextTextBox = nullptr; |
788 | if (renderer.containsReversedText()) { |
789 | if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size()) |
790 | nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1]; |
791 | } else |
792 | nextTextBox = m_textBox->nextTextBox(); |
793 | ASSERT(!nextTextBox || &nextTextBox->renderer() == &renderer); |
794 | |
795 | if (runStart < runEnd) { |
796 | // Handle either a single newline character (which becomes a space), |
797 | // or a run of characters that does not include a newline. |
798 | // This effectively translates newlines to spaces without copying the text. |
799 | if (rendererText[runStart] == '\n') { |
800 | emitCharacter(' ', textNode, nullptr, runStart, runStart + 1); |
801 | m_offset = runStart + 1; |
802 | } else { |
803 | size_t subrunEnd = rendererText.find('\n', runStart); |
804 | if (subrunEnd == notFound || subrunEnd > runEnd) { |
805 | subrunEnd = runEnd; |
806 | bool lastSpaceCollapsedByNextNonTextBox = !nextTextBox && (m_behavior & TextIteratorBehavesAsIfNodesFollowing) && rendererText.length() > runEnd; |
807 | if (lastSpaceCollapsedByNextNonTextBox) |
808 | ++subrunEnd; // runEnd stopped before last space. Increment by one to restore the space. |
809 | } |
810 | m_offset = subrunEnd; |
811 | emitText(textNode, renderer, runStart, subrunEnd); |
812 | } |
813 | |
814 | // If we are doing a subrun that doesn't go to the end of the text box, |
815 | // come back again to finish handling this text box; don't advance to the next one. |
816 | if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd) |
817 | return; |
818 | |
819 | // Advance and return |
820 | unsigned nextRunStart = nextTextBox ? nextTextBox->start() : rendererText.length(); |
821 | if (nextRunStart > runEnd) |
822 | m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end |
823 | m_textBox = nextTextBox; |
824 | if (renderer.containsReversedText()) |
825 | ++m_sortedTextBoxesPosition; |
826 | return; |
827 | } |
828 | // Advance and continue |
829 | m_textBox = nextTextBox; |
830 | if (renderer.containsReversedText()) |
831 | ++m_sortedTextBoxesPosition; |
832 | } |
833 | if (!m_textBox && m_remainingTextBox) { |
834 | m_textBox = m_remainingTextBox; |
835 | m_remainingTextBox = nullptr; |
836 | m_firstLetterText = nullptr; |
837 | m_offset = 0; |
838 | handleTextBox(); |
839 | } |
840 | } |
841 | |
842 | static inline RenderText* firstRenderTextInFirstLetter(RenderBoxModelObject* firstLetter) |
843 | { |
844 | if (!firstLetter) |
845 | return nullptr; |
846 | |
847 | // FIXME: Should this check descendent objects? |
848 | return childrenOfType<RenderText>(*firstLetter).first(); |
849 | } |
850 | |
851 | void TextIterator::handleTextNodeFirstLetter(RenderTextFragment& renderer) |
852 | { |
853 | if (auto* firstLetter = renderer.firstLetter()) { |
854 | if (firstLetter->style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
855 | return; |
856 | if (auto* firstLetterText = firstRenderTextInFirstLetter(firstLetter)) { |
857 | m_handledFirstLetter = true; |
858 | m_remainingTextBox = m_textBox; |
859 | m_textBox = firstLetterText->firstTextBox(); |
860 | m_sortedTextBoxes.clear(); |
861 | m_firstLetterText = firstLetterText; |
862 | } |
863 | } |
864 | m_handledFirstLetter = true; |
865 | } |
866 | |
867 | bool TextIterator::handleReplacedElement() |
868 | { |
869 | if (m_fullyClippedStack.top()) |
870 | return false; |
871 | |
872 | auto& renderer = *m_node->renderer(); |
873 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
874 | return false; |
875 | |
876 | if (m_lastTextNodeEndedWithCollapsedSpace) { |
877 | emitCharacter(' ', *m_lastTextNode->parentNode(), m_lastTextNode, 1, 1); |
878 | return false; |
879 | } |
880 | |
881 | if ((m_behavior & TextIteratorEntersTextControls) && is<RenderTextControl>(renderer)) { |
882 | if (auto innerTextElement = downcast<RenderTextControl>(renderer).textFormControlElement().innerTextElement()) { |
883 | m_node = innerTextElement->containingShadowRoot(); |
884 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
885 | m_offset = 0; |
886 | return false; |
887 | } |
888 | } |
889 | |
890 | m_hasEmitted = true; |
891 | |
892 | if ((m_behavior & TextIteratorEmitsObjectReplacementCharacters) && renderer.isReplaced()) { |
893 | emitCharacter(objectReplacementCharacter, *m_node->parentNode(), m_node, 0, 1); |
894 | // Don't process subtrees for embedded objects. If the text there is required, |
895 | // it must be explicitly asked by specifying a range falling inside its boundaries. |
896 | m_handledChildren = true; |
897 | return true; |
898 | } |
899 | |
900 | if (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) { |
901 | // We want replaced elements to behave like punctuation for boundary |
902 | // finding, and to simply take up space for the selection preservation |
903 | // code in moveParagraphs, so we use a comma. |
904 | emitCharacter(',', *m_node->parentNode(), m_node, 0, 1); |
905 | return true; |
906 | } |
907 | |
908 | m_positionNode = m_node->parentNode(); |
909 | m_positionOffsetBaseNode = m_node; |
910 | m_positionStartOffset = 0; |
911 | m_positionEndOffset = 1; |
912 | |
913 | if ((m_behavior & TextIteratorEmitsImageAltText) && is<RenderImage>(renderer)) { |
914 | String altText = downcast<RenderImage>(renderer).altText(); |
915 | if (unsigned length = altText.length()) { |
916 | m_lastCharacter = altText[length - 1]; |
917 | m_copyableText.set(WTFMove(altText)); |
918 | m_text = m_copyableText.text(); |
919 | return true; |
920 | } |
921 | } |
922 | |
923 | m_copyableText.reset(); |
924 | m_text = StringView(); |
925 | m_lastCharacter = 0; |
926 | return true; |
927 | } |
928 | |
929 | static bool shouldEmitTabBeforeNode(Node& node) |
930 | { |
931 | auto* renderer = node.renderer(); |
932 | |
933 | // Table cells are delimited by tabs. |
934 | if (!renderer || !isTableCell(&node)) |
935 | return false; |
936 | |
937 | // Want a tab before every cell other than the first one. |
938 | RenderTableCell& cell = downcast<RenderTableCell>(*renderer); |
939 | RenderTable* table = cell.table(); |
940 | return table && (table->cellBefore(&cell) || table->cellAbove(&cell)); |
941 | } |
942 | |
943 | static bool shouldEmitNewlineForNode(Node* node, bool emitsOriginalText) |
944 | { |
945 | auto* renderer = node->renderer(); |
946 | if (!(renderer ? renderer->isBR() : node->hasTagName(brTag))) |
947 | return false; |
948 | return emitsOriginalText || !(node->isInShadowTree() && is<HTMLInputElement>(*node->shadowHost())); |
949 | } |
950 | |
951 | static bool (HTMLElement& element) |
952 | { |
953 | return element.hasTagName(h1Tag) |
954 | || element.hasTagName(h2Tag) |
955 | || element.hasTagName(h3Tag) |
956 | || element.hasTagName(h4Tag) |
957 | || element.hasTagName(h5Tag) |
958 | || element.hasTagName(h6Tag); |
959 | } |
960 | |
961 | static bool shouldEmitNewlinesBeforeAndAfterNode(Node& node) |
962 | { |
963 | // Block flow (versus inline flow) is represented by having |
964 | // a newline both before and after the element. |
965 | auto* renderer = node.renderer(); |
966 | if (!renderer) { |
967 | if (!is<HTMLElement>(node)) |
968 | return false; |
969 | auto& element = downcast<HTMLElement>(node); |
970 | return hasHeaderTag(element) |
971 | || element.hasTagName(blockquoteTag) |
972 | || element.hasTagName(ddTag) |
973 | || element.hasTagName(divTag) |
974 | || element.hasTagName(dlTag) |
975 | || element.hasTagName(dtTag) |
976 | || element.hasTagName(hrTag) |
977 | || element.hasTagName(liTag) |
978 | || element.hasTagName(listingTag) |
979 | || element.hasTagName(olTag) |
980 | || element.hasTagName(pTag) |
981 | || element.hasTagName(preTag) |
982 | || element.hasTagName(trTag) |
983 | || element.hasTagName(ulTag); |
984 | } |
985 | |
986 | // Need to make an exception for table cells, because they are blocks, but we |
987 | // want them tab-delimited rather than having newlines before and after. |
988 | if (isTableCell(&node)) |
989 | return false; |
990 | |
991 | // Need to make an exception for table row elements, because they are neither |
992 | // "inline" or "RenderBlock", but we want newlines for them. |
993 | if (is<RenderTableRow>(*renderer)) { |
994 | RenderTable* table = downcast<RenderTableRow>(*renderer).table(); |
995 | if (table && !table->isInline()) |
996 | return true; |
997 | } |
998 | |
999 | return !renderer->isInline() |
1000 | && is<RenderBlock>(*renderer) |
1001 | && !renderer->isFloatingOrOutOfFlowPositioned() |
1002 | && !renderer->isBody() |
1003 | && !renderer->isRubyText(); |
1004 | } |
1005 | |
1006 | static bool shouldEmitNewlineAfterNode(Node& node, bool emitsCharactersBetweenAllVisiblePositions = false) |
1007 | { |
1008 | // FIXME: It should be better but slower to create a VisiblePosition here. |
1009 | if (!shouldEmitNewlinesBeforeAndAfterNode(node)) |
1010 | return false; |
1011 | |
1012 | // Don't emit a new line at the end of the document unless we're matching the behavior of VisiblePosition. |
1013 | if (emitsCharactersBetweenAllVisiblePositions) |
1014 | return true; |
1015 | Node* subsequentNode = &node; |
1016 | while ((subsequentNode = NodeTraversal::nextSkippingChildren(*subsequentNode))) { |
1017 | if (subsequentNode->renderer()) |
1018 | return true; |
1019 | } |
1020 | return false; |
1021 | } |
1022 | |
1023 | static bool shouldEmitNewlineBeforeNode(Node& node) |
1024 | { |
1025 | return shouldEmitNewlinesBeforeAndAfterNode(node); |
1026 | } |
1027 | |
1028 | static bool (Node& node) |
1029 | { |
1030 | // When there is a significant collapsed bottom margin, emit an extra |
1031 | // newline for a more realistic result. We end up getting the right |
1032 | // result even without margin collapsing. For example: <div><p>text</p></div> |
1033 | // will work right even if both the <div> and the <p> have bottom margins. |
1034 | |
1035 | auto* renderer = node.renderer(); |
1036 | if (!is<RenderBox>(renderer)) |
1037 | return false; |
1038 | |
1039 | // NOTE: We only do this for a select set of nodes, and WinIE appears not to do this at all. |
1040 | if (!is<HTMLElement>(node)) |
1041 | return false; |
1042 | |
1043 | HTMLElement& element = downcast<HTMLElement>(node); |
1044 | if (!hasHeaderTag(element) && !is<HTMLParagraphElement>(element)) |
1045 | return false; |
1046 | |
1047 | auto& renderBox = downcast<RenderBox>(*renderer); |
1048 | if (!renderBox.height()) |
1049 | return false; |
1050 | |
1051 | int bottomMargin = renderBox.collapsedMarginAfter(); |
1052 | int fontSize = renderBox.style().fontDescription().computedPixelSize(); |
1053 | return bottomMargin * 2 >= fontSize; |
1054 | } |
1055 | |
1056 | static int collapsedSpaceLength(RenderText& renderer, int textEnd) |
1057 | { |
1058 | StringImpl& text = renderer.text(); |
1059 | unsigned length = text.length(); |
1060 | for (unsigned i = textEnd; i < length; ++i) { |
1061 | if (!renderer.style().isCollapsibleWhiteSpace(text[i])) |
1062 | return i - textEnd; |
1063 | } |
1064 | return length - textEnd; |
1065 | } |
1066 | |
1067 | static int maxOffsetIncludingCollapsedSpaces(Node& node) |
1068 | { |
1069 | int offset = caretMaxOffset(node); |
1070 | if (auto* renderer = node.renderer()) { |
1071 | if (is<RenderText>(*renderer)) |
1072 | offset += collapsedSpaceLength(downcast<RenderText>(*renderer), offset); |
1073 | } |
1074 | return offset; |
1075 | } |
1076 | |
1077 | // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic). |
1078 | bool TextIterator::shouldRepresentNodeOffsetZero() |
1079 | { |
1080 | if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isTable()) |
1081 | return true; |
1082 | |
1083 | // Leave element positioned flush with start of a paragraph |
1084 | // (e.g. do not insert tab before a table cell at the start of a paragraph) |
1085 | if (m_lastCharacter == '\n') |
1086 | return false; |
1087 | |
1088 | // Otherwise, show the position if we have emitted any characters |
1089 | if (m_hasEmitted) |
1090 | return true; |
1091 | |
1092 | // We've not emitted anything yet. Generally, there is no need for any positioning then. |
1093 | // The only exception is when the element is visually not in the same line as |
1094 | // the start of the range (e.g. the range starts at the end of the previous paragraph). |
1095 | // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we |
1096 | // make quicker checks to possibly avoid that. Another check that we could make is |
1097 | // is whether the inline vs block flow changed since the previous visible element. |
1098 | // I think we're already in a special enough case that that won't be needed, tho. |
1099 | |
1100 | // No character needed if this is the first node in the range. |
1101 | if (m_node == m_startContainer) |
1102 | return false; |
1103 | |
1104 | // If we are outside the start container's subtree, assume we need to emit. |
1105 | // FIXME: m_startContainer could be an inline block |
1106 | if (!m_node->isDescendantOf(m_startContainer)) |
1107 | return true; |
1108 | |
1109 | // If we started as m_startContainer offset 0 and the current node is a descendant of |
1110 | // the start container, we already had enough context to correctly decide whether to |
1111 | // emit after a preceding block. We chose not to emit (m_hasEmitted is false), |
1112 | // so don't second guess that now. |
1113 | // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably |
1114 | // immaterial since we likely would have already emitted something by now. |
1115 | if (m_startOffset == 0) |
1116 | return false; |
1117 | |
1118 | // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning. |
1119 | // Additionally, if the range we are iterating over contains huge sections of unrendered content, |
1120 | // we would create VisiblePositions on every call to this function without this check. |
1121 | if (!m_node->renderer() || m_node->renderer()->style().visibility() != Visibility::Visible |
1122 | || (is<RenderBlockFlow>(*m_node->renderer()) && !downcast<RenderBlockFlow>(*m_node->renderer()).height() && !is<HTMLBodyElement>(*m_node))) |
1123 | return false; |
1124 | |
1125 | // The startPos.isNotNull() check is needed because the start could be before the body, |
1126 | // and in that case we'll get null. We don't want to put in newlines at the start in that case. |
1127 | // The currPos.isNotNull() check is needed because positions in non-HTML content |
1128 | // (like SVG) do not have visible positions, and we don't want to emit for them either. |
1129 | VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM); |
1130 | VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM); |
1131 | return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos); |
1132 | } |
1133 | |
1134 | bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node& node) |
1135 | { |
1136 | return node.renderer() && node.renderer()->isTable() && (node.renderer()->isInline() || (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)); |
1137 | } |
1138 | |
1139 | void TextIterator::representNodeOffsetZero() |
1140 | { |
1141 | // Emit a character to show the positioning of m_node. |
1142 | |
1143 | // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can |
1144 | // create VisiblePositions, which is expensive. So, we perform the inexpensive checks |
1145 | // on m_node to see if it necessitates emitting a character first and will early return |
1146 | // before encountering shouldRepresentNodeOffsetZero()s worse case behavior. |
1147 | if (shouldEmitTabBeforeNode(*m_node)) { |
1148 | if (shouldRepresentNodeOffsetZero()) |
1149 | emitCharacter('\t', *m_node->parentNode(), m_node, 0, 0); |
1150 | } else if (shouldEmitNewlineBeforeNode(*m_node)) { |
1151 | if (shouldRepresentNodeOffsetZero()) |
1152 | emitCharacter('\n', *m_node->parentNode(), m_node, 0, 0); |
1153 | } else if (shouldEmitSpaceBeforeAndAfterNode(*m_node)) { |
1154 | if (shouldRepresentNodeOffsetZero()) |
1155 | emitCharacter(' ', *m_node->parentNode(), m_node, 0, 0); |
1156 | } |
1157 | } |
1158 | |
1159 | bool TextIterator::handleNonTextNode() |
1160 | { |
1161 | if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText)) |
1162 | emitCharacter('\n', *m_node->parentNode(), m_node, 0, 1); |
1163 | else if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isHR()) |
1164 | emitCharacter(' ', *m_node->parentNode(), m_node, 0, 1); |
1165 | else |
1166 | representNodeOffsetZero(); |
1167 | |
1168 | return true; |
1169 | } |
1170 | |
1171 | void TextIterator::exitNode(Node* exitedNode) |
1172 | { |
1173 | // prevent emitting a newline when exiting a collapsed block at beginning of the range |
1174 | // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could |
1175 | // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and |
1176 | // therefore look like a blank line. |
1177 | if (!m_hasEmitted) |
1178 | return; |
1179 | |
1180 | // Emit with a position *inside* m_node, after m_node's contents, in |
1181 | // case it is a block, because the run should start where the |
1182 | // emitted character is positioned visually. |
1183 | Node* baseNode = exitedNode; |
1184 | // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making |
1185 | // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use |
1186 | // TextIterator in _web_attributedStringFromRange. |
1187 | // See <rdar://problem/5428427> for an example of how this mismatch will cause problems. |
1188 | if (m_lastTextNode && shouldEmitNewlineAfterNode(*m_node, m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)) { |
1189 | // use extra newline to represent margin bottom, as needed |
1190 | bool addNewline = shouldEmitExtraNewlineForNode(*m_node); |
1191 | |
1192 | // FIXME: We need to emit a '\n' as we leave an empty block(s) that |
1193 | // contain a VisiblePosition when doing selection preservation. |
1194 | if (m_lastCharacter != '\n') { |
1195 | // insert a newline with a position following this block's contents. |
1196 | emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1); |
1197 | // remember whether to later add a newline for the current node |
1198 | ASSERT(!m_nodeForAdditionalNewline); |
1199 | if (addNewline) |
1200 | m_nodeForAdditionalNewline = baseNode; |
1201 | } else if (addNewline) |
1202 | // insert a newline with a position following this block's contents. |
1203 | emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1); |
1204 | } |
1205 | |
1206 | // If nothing was emitted, see if we need to emit a space. |
1207 | if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(*m_node)) |
1208 | emitCharacter(' ', *baseNode->parentNode(), baseNode, 1, 1); |
1209 | } |
1210 | |
1211 | void TextIterator::emitCharacter(UChar character, Node& characterNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset) |
1212 | { |
1213 | m_hasEmitted = true; |
1214 | |
1215 | // remember information with which to construct the TextIterator::range() |
1216 | m_positionNode = &characterNode; |
1217 | m_positionOffsetBaseNode = offsetBaseNode; |
1218 | m_positionStartOffset = textStartOffset; |
1219 | m_positionEndOffset = textEndOffset; |
1220 | |
1221 | m_copyableText.set(character); |
1222 | m_text = m_copyableText.text(); |
1223 | m_lastCharacter = character; |
1224 | m_lastTextNodeEndedWithCollapsedSpace = false; |
1225 | m_nextRunNeedsWhitespace = false; |
1226 | } |
1227 | |
1228 | void TextIterator::emitText(Text& textNode, RenderText& renderer, int textStartOffset, int textEndOffset) |
1229 | { |
1230 | ASSERT(textStartOffset >= 0); |
1231 | ASSERT(textEndOffset >= 0); |
1232 | ASSERT(textStartOffset <= textEndOffset); |
1233 | |
1234 | // FIXME: This probably yields the wrong offsets when text-transform: lowercase turns a single character into two characters. |
1235 | String string = (m_behavior & TextIteratorEmitsOriginalText) ? renderer.originalText() |
1236 | : ((m_behavior & TextIteratorEmitsTextsWithoutTranscoding) ? renderer.textWithoutConvertingBackslashToYenSymbol() : renderer.text()); |
1237 | |
1238 | ASSERT(string.length() >= static_cast<unsigned>(textEndOffset)); |
1239 | |
1240 | m_positionNode = &textNode; |
1241 | m_positionOffsetBaseNode = nullptr; |
1242 | m_positionStartOffset = textStartOffset; |
1243 | m_positionEndOffset = textEndOffset; |
1244 | |
1245 | m_lastCharacter = string[textEndOffset - 1]; |
1246 | m_copyableText.set(WTFMove(string), textStartOffset, textEndOffset - textStartOffset); |
1247 | m_text = m_copyableText.text(); |
1248 | |
1249 | m_lastTextNodeEndedWithCollapsedSpace = false; |
1250 | m_nextRunNeedsWhitespace = false; |
1251 | m_hasEmitted = true; |
1252 | } |
1253 | |
1254 | Ref<Range> TextIterator::range() const |
1255 | { |
1256 | ASSERT(!atEnd()); |
1257 | |
1258 | // use the current run information, if we have it |
1259 | if (m_positionOffsetBaseNode) { |
1260 | unsigned index = m_positionOffsetBaseNode->computeNodeIndex(); |
1261 | m_positionStartOffset += index; |
1262 | m_positionEndOffset += index; |
1263 | m_positionOffsetBaseNode = nullptr; |
1264 | } |
1265 | return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); |
1266 | } |
1267 | |
1268 | Node* TextIterator::node() const |
1269 | { |
1270 | Ref<Range> = range(); |
1271 | |
1272 | Node& node = textRange->startContainer(); |
1273 | if (node.isCharacterDataNode()) |
1274 | return &node; |
1275 | |
1276 | return node.traverseToChildAt(textRange->startOffset()); |
1277 | } |
1278 | |
1279 | // -------- |
1280 | |
1281 | SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range& range) |
1282 | { |
1283 | range.ownerDocument().updateLayoutIgnorePendingStylesheets(); |
1284 | |
1285 | Node* startNode = &range.startContainer(); |
1286 | Node* endNode = &range.endContainer(); |
1287 | int startOffset = range.startOffset(); |
1288 | int endOffset = range.endOffset(); |
1289 | |
1290 | if (!startNode->isCharacterDataNode()) { |
1291 | if (startOffset >= 0 && startOffset < static_cast<int>(startNode->countChildNodes())) { |
1292 | startNode = startNode->traverseToChildAt(startOffset); |
1293 | startOffset = 0; |
1294 | } |
1295 | } |
1296 | if (!endNode->isCharacterDataNode()) { |
1297 | if (endOffset > 0 && endOffset <= static_cast<int>(endNode->countChildNodes())) { |
1298 | endNode = endNode->traverseToChildAt(endOffset - 1); |
1299 | endOffset = lastOffsetInNode(endNode); |
1300 | } |
1301 | } |
1302 | |
1303 | m_node = endNode; |
1304 | setUpFullyClippedStack(m_fullyClippedStack, *m_node); |
1305 | m_offset = endOffset; |
1306 | m_handledNode = false; |
1307 | m_handledChildren = endOffset == 0; |
1308 | |
1309 | m_startContainer = startNode; |
1310 | m_startOffset = startOffset; |
1311 | m_endContainer = endNode; |
1312 | m_endOffset = endOffset; |
1313 | |
1314 | m_positionNode = endNode; |
1315 | |
1316 | m_lastTextNode = nullptr; |
1317 | m_lastCharacter = '\n'; |
1318 | |
1319 | m_havePassedStartContainer = false; |
1320 | |
1321 | advance(); |
1322 | } |
1323 | |
1324 | void SimplifiedBackwardsTextIterator::advance() |
1325 | { |
1326 | ASSERT(!atEnd()); |
1327 | |
1328 | m_positionNode = nullptr; |
1329 | m_copyableText.reset(); |
1330 | m_text = StringView(); |
1331 | |
1332 | while (m_node && !m_havePassedStartContainer) { |
1333 | // Don't handle node if we start iterating at [node, 0]. |
1334 | if (!m_handledNode && !(m_node == m_endContainer && !m_endOffset)) { |
1335 | auto* renderer = m_node->renderer(); |
1336 | if (renderer && renderer->isText() && m_node->isTextNode()) { |
1337 | if (renderer->style().visibility() == Visibility::Visible && m_offset > 0) |
1338 | m_handledNode = handleTextNode(); |
1339 | } else if (renderer && (renderer->isImage() || renderer->isWidget())) { |
1340 | if (renderer->style().visibility() == Visibility::Visible && m_offset > 0) |
1341 | m_handledNode = handleReplacedElement(); |
1342 | } else |
1343 | m_handledNode = handleNonTextNode(); |
1344 | if (m_positionNode) |
1345 | return; |
1346 | } |
1347 | |
1348 | if (!m_handledChildren && m_node->hasChildNodes()) { |
1349 | m_node = m_node->lastChild(); |
1350 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
1351 | } else { |
1352 | // Exit empty containers as we pass over them or containers |
1353 | // where [container, 0] is where we started iterating. |
1354 | if (!m_handledNode && canHaveChildrenForEditing(*m_node) && m_node->parentNode() && (!m_node->lastChild() || (m_node == m_endContainer && !m_endOffset))) { |
1355 | exitNode(); |
1356 | if (m_positionNode) { |
1357 | m_handledNode = true; |
1358 | m_handledChildren = true; |
1359 | return; |
1360 | } |
1361 | } |
1362 | |
1363 | // Exit all other containers. |
1364 | while (!m_node->previousSibling()) { |
1365 | if (!advanceRespectingRange(m_node->parentOrShadowHostNode())) |
1366 | break; |
1367 | m_fullyClippedStack.pop(); |
1368 | exitNode(); |
1369 | if (m_positionNode) { |
1370 | m_handledNode = true; |
1371 | m_handledChildren = true; |
1372 | return; |
1373 | } |
1374 | } |
1375 | |
1376 | m_fullyClippedStack.pop(); |
1377 | if (advanceRespectingRange(m_node->previousSibling())) |
1378 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
1379 | else |
1380 | m_node = nullptr; |
1381 | } |
1382 | |
1383 | // For the purpose of word boundary detection, |
1384 | // we should iterate all visible text and trailing (collapsed) whitespaces. |
1385 | m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(*m_node) : 0; |
1386 | m_handledNode = false; |
1387 | m_handledChildren = false; |
1388 | |
1389 | if (m_positionNode) |
1390 | return; |
1391 | } |
1392 | } |
1393 | |
1394 | bool SimplifiedBackwardsTextIterator::handleTextNode() |
1395 | { |
1396 | Text& textNode = downcast<Text>(*m_node); |
1397 | |
1398 | m_lastTextNode = &textNode; |
1399 | |
1400 | int startOffset; |
1401 | int offsetInNode; |
1402 | RenderText* renderer = handleFirstLetter(startOffset, offsetInNode); |
1403 | if (!renderer) |
1404 | return true; |
1405 | |
1406 | String text = renderer->text(); |
1407 | if (!renderer->hasRenderedText() && text.length()) |
1408 | return true; |
1409 | |
1410 | if (startOffset + offsetInNode == m_offset) { |
1411 | ASSERT(!m_shouldHandleFirstLetter); |
1412 | return true; |
1413 | } |
1414 | |
1415 | m_positionEndOffset = m_offset; |
1416 | m_offset = startOffset + offsetInNode; |
1417 | m_positionNode = m_node; |
1418 | m_positionStartOffset = m_offset; |
1419 | |
1420 | ASSERT(m_positionStartOffset < m_positionEndOffset); |
1421 | ASSERT(m_positionStartOffset - offsetInNode >= 0); |
1422 | ASSERT(m_positionEndOffset - offsetInNode > 0); |
1423 | ASSERT(m_positionEndOffset - offsetInNode <= static_cast<int>(text.length())); |
1424 | |
1425 | m_lastCharacter = text[m_positionEndOffset - offsetInNode - 1]; |
1426 | m_copyableText.set(WTFMove(text), m_positionStartOffset - offsetInNode, m_positionEndOffset - m_positionStartOffset); |
1427 | m_text = m_copyableText.text(); |
1428 | |
1429 | return !m_shouldHandleFirstLetter; |
1430 | } |
1431 | |
1432 | RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode) |
1433 | { |
1434 | RenderText& renderer = downcast<RenderText>(*m_node->renderer()); |
1435 | startOffset = (m_node == m_startContainer) ? m_startOffset : 0; |
1436 | |
1437 | if (!is<RenderTextFragment>(renderer)) { |
1438 | offsetInNode = 0; |
1439 | return &renderer; |
1440 | } |
1441 | |
1442 | RenderTextFragment& fragment = downcast<RenderTextFragment>(renderer); |
1443 | int offsetAfterFirstLetter = fragment.start(); |
1444 | if (startOffset >= offsetAfterFirstLetter) { |
1445 | ASSERT(!m_shouldHandleFirstLetter); |
1446 | offsetInNode = offsetAfterFirstLetter; |
1447 | return &renderer; |
1448 | } |
1449 | |
1450 | if (!m_shouldHandleFirstLetter && startOffset + offsetAfterFirstLetter < m_offset) { |
1451 | m_shouldHandleFirstLetter = true; |
1452 | offsetInNode = offsetAfterFirstLetter; |
1453 | return &renderer; |
1454 | } |
1455 | |
1456 | m_shouldHandleFirstLetter = false; |
1457 | offsetInNode = 0; |
1458 | RenderText* firstLetterRenderer = firstRenderTextInFirstLetter(fragment.firstLetter()); |
1459 | |
1460 | m_offset = firstLetterRenderer->caretMaxOffset(); |
1461 | m_offset += collapsedSpaceLength(*firstLetterRenderer, m_offset); |
1462 | |
1463 | return firstLetterRenderer; |
1464 | } |
1465 | |
1466 | bool SimplifiedBackwardsTextIterator::handleReplacedElement() |
1467 | { |
1468 | unsigned index = m_node->computeNodeIndex(); |
1469 | // We want replaced elements to behave like punctuation for boundary |
1470 | // finding, and to simply take up space for the selection preservation |
1471 | // code in moveParagraphs, so we use a comma. Unconditionally emit |
1472 | // here because this iterator is only used for boundary finding. |
1473 | emitCharacter(',', *m_node->parentNode(), index, index + 1); |
1474 | return true; |
1475 | } |
1476 | |
1477 | bool SimplifiedBackwardsTextIterator::handleNonTextNode() |
1478 | { |
1479 | // We can use a linefeed in place of a tab because this simple iterator is only used to |
1480 | // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs. |
1481 | if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineAfterNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) { |
1482 | if (m_lastCharacter != '\n') { |
1483 | // Corresponds to the same check in TextIterator::exitNode. |
1484 | unsigned index = m_node->computeNodeIndex(); |
1485 | // The start of this emitted range is wrong. Ensuring correctness would require |
1486 | // VisiblePositions and so would be slow. previousBoundary expects this. |
1487 | emitCharacter('\n', *m_node->parentNode(), index + 1, index + 1); |
1488 | } |
1489 | } |
1490 | return true; |
1491 | } |
1492 | |
1493 | void SimplifiedBackwardsTextIterator::exitNode() |
1494 | { |
1495 | if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineBeforeNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) { |
1496 | // The start of this emitted range is wrong. Ensuring correctness would require |
1497 | // VisiblePositions and so would be slow. previousBoundary expects this. |
1498 | emitCharacter('\n', *m_node, 0, 0); |
1499 | } |
1500 | } |
1501 | |
1502 | void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node& node, int startOffset, int endOffset) |
1503 | { |
1504 | m_positionNode = &node; |
1505 | m_positionStartOffset = startOffset; |
1506 | m_positionEndOffset = endOffset; |
1507 | m_copyableText.set(c); |
1508 | m_text = m_copyableText.text(); |
1509 | m_lastCharacter = c; |
1510 | } |
1511 | |
1512 | bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next) |
1513 | { |
1514 | if (!next) |
1515 | return false; |
1516 | m_havePassedStartContainer |= m_node == m_startContainer; |
1517 | if (m_havePassedStartContainer) |
1518 | return false; |
1519 | m_node = next; |
1520 | return true; |
1521 | } |
1522 | |
1523 | Ref<Range> SimplifiedBackwardsTextIterator::range() const |
1524 | { |
1525 | ASSERT(!atEnd()); |
1526 | |
1527 | return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); |
1528 | } |
1529 | |
1530 | // -------- |
1531 | |
1532 | CharacterIterator::CharacterIterator(const Range& range, TextIteratorBehavior behavior) |
1533 | : m_underlyingIterator(&range, behavior) |
1534 | { |
1535 | while (!atEnd() && !m_underlyingIterator.text().length()) |
1536 | m_underlyingIterator.advance(); |
1537 | } |
1538 | |
1539 | CharacterIterator::CharacterIterator(Position start, Position end, TextIteratorBehavior behavior) |
1540 | : m_underlyingIterator(start, end, behavior) |
1541 | { |
1542 | while (!atEnd() && !m_underlyingIterator.text().length()) |
1543 | m_underlyingIterator.advance(); |
1544 | } |
1545 | |
1546 | Ref<Range> CharacterIterator::range() const |
1547 | { |
1548 | Ref<Range> range = m_underlyingIterator.range(); |
1549 | if (!m_underlyingIterator.atEnd()) { |
1550 | if (m_underlyingIterator.text().length() <= 1) { |
1551 | ASSERT(m_runOffset == 0); |
1552 | } else { |
1553 | Node& node = range->startContainer(); |
1554 | ASSERT(&node == &range->endContainer()); |
1555 | int offset = range->startOffset() + m_runOffset; |
1556 | range->setStart(node, offset); |
1557 | range->setEnd(node, offset + 1); |
1558 | } |
1559 | } |
1560 | return range; |
1561 | } |
1562 | |
1563 | void CharacterIterator::advance(int count) |
1564 | { |
1565 | if (count <= 0) { |
1566 | ASSERT(count == 0); |
1567 | return; |
1568 | } |
1569 | |
1570 | m_atBreak = false; |
1571 | |
1572 | // easy if there is enough left in the current m_underlyingIterator run |
1573 | int remaining = m_underlyingIterator.text().length() - m_runOffset; |
1574 | if (count < remaining) { |
1575 | m_runOffset += count; |
1576 | m_offset += count; |
1577 | return; |
1578 | } |
1579 | |
1580 | // exhaust the current m_underlyingIterator run |
1581 | count -= remaining; |
1582 | m_offset += remaining; |
1583 | |
1584 | // move to a subsequent m_underlyingIterator run |
1585 | for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) { |
1586 | int runLength = m_underlyingIterator.text().length(); |
1587 | if (!runLength) |
1588 | m_atBreak = true; |
1589 | else { |
1590 | // see whether this is m_underlyingIterator to use |
1591 | if (count < runLength) { |
1592 | m_runOffset = count; |
1593 | m_offset += count; |
1594 | return; |
1595 | } |
1596 | |
1597 | // exhaust this m_underlyingIterator run |
1598 | count -= runLength; |
1599 | m_offset += runLength; |
1600 | } |
1601 | } |
1602 | |
1603 | // ran to the end of the m_underlyingIterator... no more runs left |
1604 | m_atBreak = true; |
1605 | m_runOffset = 0; |
1606 | } |
1607 | |
1608 | static Ref<Range> characterSubrange(Document& document, CharacterIterator& it, int offset, int length) |
1609 | { |
1610 | it.advance(offset); |
1611 | if (it.atEnd()) |
1612 | return Range::create(document); |
1613 | |
1614 | Ref<Range> start = it.range(); |
1615 | |
1616 | if (length > 1) |
1617 | it.advance(length - 1); |
1618 | if (it.atEnd()) |
1619 | return Range::create(document); |
1620 | |
1621 | Ref<Range> end = it.range(); |
1622 | |
1623 | return Range::create(document, &start->startContainer(), start->startOffset(), &end->endContainer(), end->endOffset()); |
1624 | } |
1625 | |
1626 | BackwardsCharacterIterator::BackwardsCharacterIterator(const Range& range) |
1627 | : m_underlyingIterator(range) |
1628 | , m_offset(0) |
1629 | , m_runOffset(0) |
1630 | , m_atBreak(true) |
1631 | { |
1632 | while (!atEnd() && !m_underlyingIterator.text().length()) |
1633 | m_underlyingIterator.advance(); |
1634 | } |
1635 | |
1636 | Ref<Range> BackwardsCharacterIterator::range() const |
1637 | { |
1638 | Ref<Range> r = m_underlyingIterator.range(); |
1639 | if (!m_underlyingIterator.atEnd()) { |
1640 | if (m_underlyingIterator.text().length() <= 1) |
1641 | ASSERT(m_runOffset == 0); |
1642 | else { |
1643 | Node& node = r->startContainer(); |
1644 | ASSERT(&node == &r->endContainer()); |
1645 | int offset = r->endOffset() - m_runOffset; |
1646 | r->setStart(node, offset - 1); |
1647 | r->setEnd(node, offset); |
1648 | } |
1649 | } |
1650 | return r; |
1651 | } |
1652 | |
1653 | void BackwardsCharacterIterator::advance(int count) |
1654 | { |
1655 | if (count <= 0) { |
1656 | ASSERT(!count); |
1657 | return; |
1658 | } |
1659 | |
1660 | m_atBreak = false; |
1661 | |
1662 | int remaining = m_underlyingIterator.text().length() - m_runOffset; |
1663 | if (count < remaining) { |
1664 | m_runOffset += count; |
1665 | m_offset += count; |
1666 | return; |
1667 | } |
1668 | |
1669 | count -= remaining; |
1670 | m_offset += remaining; |
1671 | |
1672 | for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) { |
1673 | int runLength = m_underlyingIterator.text().length(); |
1674 | if (runLength == 0) |
1675 | m_atBreak = true; |
1676 | else { |
1677 | if (count < runLength) { |
1678 | m_runOffset = count; |
1679 | m_offset += count; |
1680 | return; |
1681 | } |
1682 | |
1683 | count -= runLength; |
1684 | m_offset += runLength; |
1685 | } |
1686 | } |
1687 | |
1688 | m_atBreak = true; |
1689 | m_runOffset = 0; |
1690 | } |
1691 | |
1692 | // -------- |
1693 | |
1694 | WordAwareIterator::WordAwareIterator(const Range& range) |
1695 | : m_underlyingIterator(&range) |
1696 | , m_didLookAhead(true) // so we consider the first chunk from the text iterator |
1697 | { |
1698 | advance(); // get in position over the first chunk of text |
1699 | } |
1700 | |
1701 | // We're always in one of these modes: |
1702 | // - The current chunk in the text iterator is our current chunk |
1703 | // (typically its a piece of whitespace, or text that ended with whitespace) |
1704 | // - The previous chunk in the text iterator is our current chunk |
1705 | // (we looked ahead to the next chunk and found a word boundary) |
1706 | // - We built up our own chunk of text from many chunks from the text iterator |
1707 | |
1708 | // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries. |
1709 | |
1710 | void WordAwareIterator::advance() |
1711 | { |
1712 | m_previousText.reset(); |
1713 | m_buffer.clear(); |
1714 | |
1715 | // If last time we did a look-ahead, start with that looked-ahead chunk now |
1716 | if (!m_didLookAhead) { |
1717 | ASSERT(!m_underlyingIterator.atEnd()); |
1718 | m_underlyingIterator.advance(); |
1719 | } |
1720 | m_didLookAhead = false; |
1721 | |
1722 | // Go to next non-empty chunk |
1723 | while (!m_underlyingIterator.atEnd() && !m_underlyingIterator.text().length()) |
1724 | m_underlyingIterator.advance(); |
1725 | if (m_underlyingIterator.atEnd()) |
1726 | return; |
1727 | |
1728 | while (1) { |
1729 | // If this chunk ends in whitespace we can just use it as our chunk. |
1730 | if (isSpaceOrNewline(m_underlyingIterator.text()[m_underlyingIterator.text().length() - 1])) |
1731 | return; |
1732 | |
1733 | // If this is the first chunk that failed, save it in previousText before look ahead |
1734 | if (m_buffer.isEmpty()) |
1735 | m_previousText = m_underlyingIterator.copyableText(); |
1736 | |
1737 | // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff |
1738 | m_underlyingIterator.advance(); |
1739 | if (m_underlyingIterator.atEnd() || !m_underlyingIterator.text().length() || isSpaceOrNewline(m_underlyingIterator.text()[0])) { |
1740 | m_didLookAhead = true; |
1741 | return; |
1742 | } |
1743 | |
1744 | if (m_buffer.isEmpty()) { |
1745 | // Start gobbling chunks until we get to a suitable stopping point |
1746 | append(m_buffer, m_previousText.text()); |
1747 | m_previousText.reset(); |
1748 | } |
1749 | append(m_buffer, m_underlyingIterator.text()); |
1750 | } |
1751 | } |
1752 | |
1753 | StringView WordAwareIterator::text() const |
1754 | { |
1755 | if (!m_buffer.isEmpty()) |
1756 | return StringView(m_buffer.data(), m_buffer.size()); |
1757 | if (m_previousText.text().length()) |
1758 | return m_previousText.text(); |
1759 | return m_underlyingIterator.text(); |
1760 | } |
1761 | |
1762 | // -------- |
1763 | |
1764 | static inline UChar foldQuoteMark(UChar c) |
1765 | { |
1766 | switch (c) { |
1767 | case hebrewPunctuationGershayim: |
1768 | case leftDoubleQuotationMark: |
1769 | case leftLowDoubleQuotationMark: |
1770 | case rightDoubleQuotationMark: |
1771 | return '"'; |
1772 | case hebrewPunctuationGeresh: |
1773 | case leftSingleQuotationMark: |
1774 | case leftLowSingleQuotationMark: |
1775 | case rightSingleQuotationMark: |
1776 | return '\''; |
1777 | default: |
1778 | return c; |
1779 | } |
1780 | } |
1781 | |
1782 | // FIXME: We'd like to tailor the searcher to fold quote marks for us instead |
1783 | // of doing it in a separate replacement pass here, but ICU doesn't offer a way |
1784 | // to add tailoring on top of the locale-specific tailoring as of this writing. |
1785 | static inline String foldQuoteMarks(String string) |
1786 | { |
1787 | string.replace(hebrewPunctuationGeresh, '\''); |
1788 | string.replace(hebrewPunctuationGershayim, '"'); |
1789 | string.replace(leftDoubleQuotationMark, '"'); |
1790 | string.replace(leftLowDoubleQuotationMark, '"'); |
1791 | string.replace(leftSingleQuotationMark, '\''); |
1792 | string.replace(leftLowSingleQuotationMark, '\''); |
1793 | string.replace(rightDoubleQuotationMark, '"'); |
1794 | string.replace(rightSingleQuotationMark, '\''); |
1795 | |
1796 | return string; |
1797 | } |
1798 | |
1799 | #if !UCONFIG_NO_COLLATION |
1800 | |
1801 | const size_t minimumSearchBufferSize = 8192; |
1802 | |
1803 | #ifndef NDEBUG |
1804 | static bool searcherInUse; |
1805 | #endif |
1806 | |
1807 | static UStringSearch* createSearcher() |
1808 | { |
1809 | // Provide a non-empty pattern and non-empty text so usearch_open will not fail, |
1810 | // but it doesn't matter exactly what it is, since we don't perform any searches |
1811 | // without setting both the pattern and the text. |
1812 | UErrorCode status = U_ZERO_ERROR; |
1813 | String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search" ); |
1814 | UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status); |
1815 | ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING); |
1816 | return searcher; |
1817 | } |
1818 | |
1819 | static UStringSearch* searcher() |
1820 | { |
1821 | static UStringSearch* searcher = createSearcher(); |
1822 | return searcher; |
1823 | } |
1824 | |
1825 | static inline void lockSearcher() |
1826 | { |
1827 | #ifndef NDEBUG |
1828 | ASSERT(!searcherInUse); |
1829 | searcherInUse = true; |
1830 | #endif |
1831 | } |
1832 | |
1833 | static inline void unlockSearcher() |
1834 | { |
1835 | #ifndef NDEBUG |
1836 | ASSERT(searcherInUse); |
1837 | searcherInUse = false; |
1838 | #endif |
1839 | } |
1840 | |
1841 | // ICU's search ignores the distinction between small kana letters and ones |
1842 | // that are not small, and also characters that differ only in the voicing |
1843 | // marks when considering only primary collation strength differences. |
1844 | // This is not helpful for end users, since these differences make words |
1845 | // distinct, so for our purposes we need these to be considered. |
1846 | // The Unicode folks do not think the collation algorithm should be |
1847 | // changed. To work around this, we would like to tailor the ICU searcher, |
1848 | // but we can't get that to work yet. So instead, we check for cases where |
1849 | // these differences occur, and skip those matches. |
1850 | |
1851 | // We refer to the above technique as the "kana workaround". The next few |
1852 | // functions are helper functinos for the kana workaround. |
1853 | |
1854 | static inline bool isKanaLetter(UChar character) |
1855 | { |
1856 | // Hiragana letters. |
1857 | if (character >= 0x3041 && character <= 0x3096) |
1858 | return true; |
1859 | |
1860 | // Katakana letters. |
1861 | if (character >= 0x30A1 && character <= 0x30FA) |
1862 | return true; |
1863 | if (character >= 0x31F0 && character <= 0x31FF) |
1864 | return true; |
1865 | |
1866 | // Halfwidth katakana letters. |
1867 | if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) |
1868 | return true; |
1869 | |
1870 | return false; |
1871 | } |
1872 | |
1873 | static inline bool isSmallKanaLetter(UChar character) |
1874 | { |
1875 | ASSERT(isKanaLetter(character)); |
1876 | |
1877 | switch (character) { |
1878 | case 0x3041: // HIRAGANA LETTER SMALL A |
1879 | case 0x3043: // HIRAGANA LETTER SMALL I |
1880 | case 0x3045: // HIRAGANA LETTER SMALL U |
1881 | case 0x3047: // HIRAGANA LETTER SMALL E |
1882 | case 0x3049: // HIRAGANA LETTER SMALL O |
1883 | case 0x3063: // HIRAGANA LETTER SMALL TU |
1884 | case 0x3083: // HIRAGANA LETTER SMALL YA |
1885 | case 0x3085: // HIRAGANA LETTER SMALL YU |
1886 | case 0x3087: // HIRAGANA LETTER SMALL YO |
1887 | case 0x308E: // HIRAGANA LETTER SMALL WA |
1888 | case 0x3095: // HIRAGANA LETTER SMALL KA |
1889 | case 0x3096: // HIRAGANA LETTER SMALL KE |
1890 | case 0x30A1: // KATAKANA LETTER SMALL A |
1891 | case 0x30A3: // KATAKANA LETTER SMALL I |
1892 | case 0x30A5: // KATAKANA LETTER SMALL U |
1893 | case 0x30A7: // KATAKANA LETTER SMALL E |
1894 | case 0x30A9: // KATAKANA LETTER SMALL O |
1895 | case 0x30C3: // KATAKANA LETTER SMALL TU |
1896 | case 0x30E3: // KATAKANA LETTER SMALL YA |
1897 | case 0x30E5: // KATAKANA LETTER SMALL YU |
1898 | case 0x30E7: // KATAKANA LETTER SMALL YO |
1899 | case 0x30EE: // KATAKANA LETTER SMALL WA |
1900 | case 0x30F5: // KATAKANA LETTER SMALL KA |
1901 | case 0x30F6: // KATAKANA LETTER SMALL KE |
1902 | case 0x31F0: // KATAKANA LETTER SMALL KU |
1903 | case 0x31F1: // KATAKANA LETTER SMALL SI |
1904 | case 0x31F2: // KATAKANA LETTER SMALL SU |
1905 | case 0x31F3: // KATAKANA LETTER SMALL TO |
1906 | case 0x31F4: // KATAKANA LETTER SMALL NU |
1907 | case 0x31F5: // KATAKANA LETTER SMALL HA |
1908 | case 0x31F6: // KATAKANA LETTER SMALL HI |
1909 | case 0x31F7: // KATAKANA LETTER SMALL HU |
1910 | case 0x31F8: // KATAKANA LETTER SMALL HE |
1911 | case 0x31F9: // KATAKANA LETTER SMALL HO |
1912 | case 0x31FA: // KATAKANA LETTER SMALL MU |
1913 | case 0x31FB: // KATAKANA LETTER SMALL RA |
1914 | case 0x31FC: // KATAKANA LETTER SMALL RI |
1915 | case 0x31FD: // KATAKANA LETTER SMALL RU |
1916 | case 0x31FE: // KATAKANA LETTER SMALL RE |
1917 | case 0x31FF: // KATAKANA LETTER SMALL RO |
1918 | case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A |
1919 | case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I |
1920 | case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U |
1921 | case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E |
1922 | case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O |
1923 | case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA |
1924 | case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU |
1925 | case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO |
1926 | case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU |
1927 | return true; |
1928 | } |
1929 | return false; |
1930 | } |
1931 | |
1932 | enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark }; |
1933 | |
1934 | static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) |
1935 | { |
1936 | ASSERT(isKanaLetter(character)); |
1937 | |
1938 | switch (character) { |
1939 | case 0x304C: // HIRAGANA LETTER GA |
1940 | case 0x304E: // HIRAGANA LETTER GI |
1941 | case 0x3050: // HIRAGANA LETTER GU |
1942 | case 0x3052: // HIRAGANA LETTER GE |
1943 | case 0x3054: // HIRAGANA LETTER GO |
1944 | case 0x3056: // HIRAGANA LETTER ZA |
1945 | case 0x3058: // HIRAGANA LETTER ZI |
1946 | case 0x305A: // HIRAGANA LETTER ZU |
1947 | case 0x305C: // HIRAGANA LETTER ZE |
1948 | case 0x305E: // HIRAGANA LETTER ZO |
1949 | case 0x3060: // HIRAGANA LETTER DA |
1950 | case 0x3062: // HIRAGANA LETTER DI |
1951 | case 0x3065: // HIRAGANA LETTER DU |
1952 | case 0x3067: // HIRAGANA LETTER DE |
1953 | case 0x3069: // HIRAGANA LETTER DO |
1954 | case 0x3070: // HIRAGANA LETTER BA |
1955 | case 0x3073: // HIRAGANA LETTER BI |
1956 | case 0x3076: // HIRAGANA LETTER BU |
1957 | case 0x3079: // HIRAGANA LETTER BE |
1958 | case 0x307C: // HIRAGANA LETTER BO |
1959 | case 0x3094: // HIRAGANA LETTER VU |
1960 | case 0x30AC: // KATAKANA LETTER GA |
1961 | case 0x30AE: // KATAKANA LETTER GI |
1962 | case 0x30B0: // KATAKANA LETTER GU |
1963 | case 0x30B2: // KATAKANA LETTER GE |
1964 | case 0x30B4: // KATAKANA LETTER GO |
1965 | case 0x30B6: // KATAKANA LETTER ZA |
1966 | case 0x30B8: // KATAKANA LETTER ZI |
1967 | case 0x30BA: // KATAKANA LETTER ZU |
1968 | case 0x30BC: // KATAKANA LETTER ZE |
1969 | case 0x30BE: // KATAKANA LETTER ZO |
1970 | case 0x30C0: // KATAKANA LETTER DA |
1971 | case 0x30C2: // KATAKANA LETTER DI |
1972 | case 0x30C5: // KATAKANA LETTER DU |
1973 | case 0x30C7: // KATAKANA LETTER DE |
1974 | case 0x30C9: // KATAKANA LETTER DO |
1975 | case 0x30D0: // KATAKANA LETTER BA |
1976 | case 0x30D3: // KATAKANA LETTER BI |
1977 | case 0x30D6: // KATAKANA LETTER BU |
1978 | case 0x30D9: // KATAKANA LETTER BE |
1979 | case 0x30DC: // KATAKANA LETTER BO |
1980 | case 0x30F4: // KATAKANA LETTER VU |
1981 | case 0x30F7: // KATAKANA LETTER VA |
1982 | case 0x30F8: // KATAKANA LETTER VI |
1983 | case 0x30F9: // KATAKANA LETTER VE |
1984 | case 0x30FA: // KATAKANA LETTER VO |
1985 | return VoicedSoundMark; |
1986 | case 0x3071: // HIRAGANA LETTER PA |
1987 | case 0x3074: // HIRAGANA LETTER PI |
1988 | case 0x3077: // HIRAGANA LETTER PU |
1989 | case 0x307A: // HIRAGANA LETTER PE |
1990 | case 0x307D: // HIRAGANA LETTER PO |
1991 | case 0x30D1: // KATAKANA LETTER PA |
1992 | case 0x30D4: // KATAKANA LETTER PI |
1993 | case 0x30D7: // KATAKANA LETTER PU |
1994 | case 0x30DA: // KATAKANA LETTER PE |
1995 | case 0x30DD: // KATAKANA LETTER PO |
1996 | return SemiVoicedSoundMark; |
1997 | } |
1998 | return NoVoicedSoundMark; |
1999 | } |
2000 | |
2001 | static inline bool isCombiningVoicedSoundMark(UChar character) |
2002 | { |
2003 | switch (character) { |
2004 | case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK |
2005 | case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK |
2006 | return true; |
2007 | } |
2008 | return false; |
2009 | } |
2010 | |
2011 | static inline bool containsKanaLetters(const String& pattern) |
2012 | { |
2013 | if (pattern.is8Bit()) |
2014 | return false; |
2015 | const UChar* characters = pattern.characters16(); |
2016 | unsigned length = pattern.length(); |
2017 | for (unsigned i = 0; i < length; ++i) { |
2018 | if (isKanaLetter(characters[i])) |
2019 | return true; |
2020 | } |
2021 | return false; |
2022 | } |
2023 | |
2024 | static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer) |
2025 | { |
2026 | UErrorCode status = U_ZERO_ERROR; |
2027 | const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); |
2028 | ASSERT(U_SUCCESS(status)); |
2029 | |
2030 | buffer.resize(length); |
2031 | |
2032 | auto normalizedLength = unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status); |
2033 | ASSERT(U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR); |
2034 | |
2035 | buffer.resize(normalizedLength); |
2036 | |
2037 | if (U_SUCCESS(status)) |
2038 | return; |
2039 | |
2040 | status = U_ZERO_ERROR; |
2041 | unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status); |
2042 | ASSERT(U_SUCCESS(status)); |
2043 | } |
2044 | |
2045 | static bool isNonLatin1Separator(UChar32 character) |
2046 | { |
2047 | ASSERT_ARG(character, character >= 256); |
2048 | |
2049 | return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); |
2050 | } |
2051 | |
2052 | static inline bool isSeparator(UChar32 character) |
2053 | { |
2054 | static const bool latin1SeparatorTable[256] = { |
2055 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
2056 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
2057 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / |
2058 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? |
2059 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ |
2060 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ |
2061 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` |
2062 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ |
2063 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
2064 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
2065 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
2066 | 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
2067 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
2068 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
2069 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
2070 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 |
2071 | }; |
2072 | |
2073 | if (character < 256) |
2074 | return latin1SeparatorTable[character]; |
2075 | |
2076 | return isNonLatin1Separator(character); |
2077 | } |
2078 | |
2079 | inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) |
2080 | : m_target(foldQuoteMarks(target)) |
2081 | , m_targetCharacters(StringView(m_target).upconvertedCharacters()) |
2082 | , m_options(options) |
2083 | , m_prefixLength(0) |
2084 | , m_atBreak(true) |
2085 | , m_needsMoreContext(options.contains(AtWordStarts)) |
2086 | , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target)) |
2087 | { |
2088 | ASSERT(!m_target.isEmpty()); |
2089 | |
2090 | size_t targetLength = m_target.length(); |
2091 | m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize)); |
2092 | m_overlap = m_buffer.capacity() / 4; |
2093 | |
2094 | if (m_options.contains(AtWordStarts) && targetLength) { |
2095 | UChar32 targetFirstCharacter; |
2096 | U16_GET(m_target, 0, 0u, targetLength, targetFirstCharacter); |
2097 | // Characters in the separator category never really occur at the beginning of a word, |
2098 | // so if the target begins with such a character, we just ignore the AtWordStart option. |
2099 | if (isSeparator(targetFirstCharacter)) { |
2100 | m_options.remove(AtWordStarts); |
2101 | m_needsMoreContext = false; |
2102 | } |
2103 | } |
2104 | |
2105 | // Grab the single global searcher. |
2106 | // If we ever have a reason to do more than once search buffer at once, we'll have |
2107 | // to move to multiple searchers. |
2108 | lockSearcher(); |
2109 | |
2110 | UStringSearch* searcher = WebCore::searcher(); |
2111 | UCollator* collator = usearch_getCollator(searcher); |
2112 | |
2113 | UCollationStrength strength; |
2114 | USearchAttributeValue comparator; |
2115 | if (m_options.contains(CaseInsensitive)) { |
2116 | // Without loss of generality, have 'e' match {'e', 'E', 'é', 'É'} and 'é' match {'é', 'É'}. |
2117 | strength = UCOL_SECONDARY; |
2118 | comparator = USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD; |
2119 | } else { |
2120 | // Without loss of generality, have 'e' match {'e'} and 'é' match {'é'}. |
2121 | strength = UCOL_TERTIARY; |
2122 | comparator = USEARCH_STANDARD_ELEMENT_COMPARISON; |
2123 | } |
2124 | if (ucol_getStrength(collator) != strength) { |
2125 | ucol_setStrength(collator, strength); |
2126 | usearch_reset(searcher); |
2127 | } |
2128 | |
2129 | UErrorCode status = U_ZERO_ERROR; |
2130 | usearch_setAttribute(searcher, USEARCH_ELEMENT_COMPARISON, comparator, &status); |
2131 | ASSERT(status == U_ZERO_ERROR); |
2132 | |
2133 | usearch_setPattern(searcher, m_targetCharacters, targetLength, &status); |
2134 | ASSERT(status == U_ZERO_ERROR); |
2135 | |
2136 | // The kana workaround requires a normalized copy of the target string. |
2137 | if (m_targetRequiresKanaWorkaround) |
2138 | normalizeCharacters(m_targetCharacters, targetLength, m_normalizedTarget); |
2139 | } |
2140 | |
2141 | inline SearchBuffer::~SearchBuffer() |
2142 | { |
2143 | // Leave the static object pointing to a valid string. |
2144 | UErrorCode status = U_ZERO_ERROR; |
2145 | usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status); |
2146 | ASSERT(status == U_ZERO_ERROR); |
2147 | usearch_setText(WebCore::searcher(), &newlineCharacter, 1, &status); |
2148 | ASSERT(status == U_ZERO_ERROR); |
2149 | |
2150 | unlockSearcher(); |
2151 | } |
2152 | |
2153 | inline size_t SearchBuffer::append(StringView text) |
2154 | { |
2155 | ASSERT(text.length()); |
2156 | |
2157 | if (m_atBreak) { |
2158 | m_buffer.shrink(0); |
2159 | m_prefixLength = 0; |
2160 | m_atBreak = false; |
2161 | } else if (m_buffer.size() == m_buffer.capacity()) { |
2162 | memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); |
2163 | m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap); |
2164 | m_buffer.shrink(m_overlap); |
2165 | } |
2166 | |
2167 | size_t oldLength = m_buffer.size(); |
2168 | size_t usableLength = std::min<size_t>(m_buffer.capacity() - oldLength, text.length()); |
2169 | ASSERT(usableLength); |
2170 | m_buffer.grow(oldLength + usableLength); |
2171 | for (unsigned i = 0; i < usableLength; ++i) |
2172 | m_buffer[oldLength + i] = foldQuoteMark(text[i]); |
2173 | return usableLength; |
2174 | } |
2175 | |
2176 | inline bool SearchBuffer::needsMoreContext() const |
2177 | { |
2178 | return m_needsMoreContext; |
2179 | } |
2180 | |
2181 | inline void SearchBuffer::prependContext(StringView text) |
2182 | { |
2183 | ASSERT(m_needsMoreContext); |
2184 | ASSERT(m_prefixLength == m_buffer.size()); |
2185 | |
2186 | if (!text.length()) |
2187 | return; |
2188 | |
2189 | m_atBreak = false; |
2190 | |
2191 | size_t wordBoundaryContextStart = text.length(); |
2192 | if (wordBoundaryContextStart) { |
2193 | U16_BACK_1(text, 0, wordBoundaryContextStart); |
2194 | wordBoundaryContextStart = startOfLastWordBoundaryContext(text.substring(0, wordBoundaryContextStart)); |
2195 | } |
2196 | |
2197 | size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength, text.length() - wordBoundaryContextStart); |
2198 | WTF::append(m_buffer, text.substring(text.length() - usableLength, usableLength)); |
2199 | m_prefixLength += usableLength; |
2200 | |
2201 | if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity()) |
2202 | m_needsMoreContext = false; |
2203 | } |
2204 | |
2205 | inline bool SearchBuffer::atBreak() const |
2206 | { |
2207 | return m_atBreak; |
2208 | } |
2209 | |
2210 | inline void SearchBuffer::reachedBreak() |
2211 | { |
2212 | m_atBreak = true; |
2213 | } |
2214 | |
2215 | inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const |
2216 | { |
2217 | // This function implements the kana workaround. If usearch treats |
2218 | // it as a match, but we do not want to, then it's a "bad match". |
2219 | if (!m_targetRequiresKanaWorkaround) |
2220 | return false; |
2221 | |
2222 | // Normalize into a match buffer. We reuse a single buffer rather than |
2223 | // creating a new one each time. |
2224 | normalizeCharacters(match, matchLength, m_normalizedMatch); |
2225 | |
2226 | const UChar* a = m_normalizedTarget.begin(); |
2227 | const UChar* aEnd = m_normalizedTarget.end(); |
2228 | |
2229 | const UChar* b = m_normalizedMatch.begin(); |
2230 | const UChar* bEnd = m_normalizedMatch.end(); |
2231 | |
2232 | while (true) { |
2233 | // Skip runs of non-kana-letter characters. This is necessary so we can |
2234 | // correctly handle strings where the target and match have different-length |
2235 | // runs of characters that match, while still double checking the correctness |
2236 | // of matches of kana letters with other kana letters. |
2237 | while (a != aEnd && !isKanaLetter(*a)) |
2238 | ++a; |
2239 | while (b != bEnd && !isKanaLetter(*b)) |
2240 | ++b; |
2241 | |
2242 | // If we reached the end of either the target or the match, we should have |
2243 | // reached the end of both; both should have the same number of kana letters. |
2244 | if (a == aEnd || b == bEnd) { |
2245 | ASSERT(a == aEnd); |
2246 | ASSERT(b == bEnd); |
2247 | return false; |
2248 | } |
2249 | |
2250 | // Check for differences in the kana letter character itself. |
2251 | if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b)) |
2252 | return true; |
2253 | if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b)) |
2254 | return true; |
2255 | ++a; |
2256 | ++b; |
2257 | |
2258 | // Check for differences in combining voiced sound marks found after the letter. |
2259 | while (1) { |
2260 | if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) { |
2261 | if (b != bEnd && isCombiningVoicedSoundMark(*b)) |
2262 | return true; |
2263 | break; |
2264 | } |
2265 | if (!(b != bEnd && isCombiningVoicedSoundMark(*b))) |
2266 | return true; |
2267 | if (*a != *b) |
2268 | return true; |
2269 | ++a; |
2270 | ++b; |
2271 | } |
2272 | } |
2273 | } |
2274 | |
2275 | inline bool SearchBuffer::isWordEndMatch(size_t start, size_t length) const |
2276 | { |
2277 | ASSERT(length); |
2278 | ASSERT(m_options.contains(AtWordEnds)); |
2279 | |
2280 | int endWord; |
2281 | // Start searching at the end of matched search, so that multiple word matches succeed. |
2282 | findEndWordBoundary(StringView(m_buffer.data(), m_buffer.size()), start + length - 1, &endWord); |
2283 | return static_cast<size_t>(endWord) == (start + length); |
2284 | } |
2285 | |
2286 | inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const |
2287 | { |
2288 | ASSERT(m_options.contains(AtWordStarts)); |
2289 | |
2290 | if (!start) |
2291 | return true; |
2292 | |
2293 | int size = m_buffer.size(); |
2294 | int offset = start; |
2295 | UChar32 firstCharacter; |
2296 | U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); |
2297 | |
2298 | if (m_options.contains(TreatMedialCapitalAsWordStart)) { |
2299 | UChar32 previousCharacter; |
2300 | U16_PREV(m_buffer.data(), 0, offset, previousCharacter); |
2301 | |
2302 | if (isSeparator(firstCharacter)) { |
2303 | // The start of a separator run is a word start (".org" in "webkit.org"). |
2304 | if (!isSeparator(previousCharacter)) |
2305 | return true; |
2306 | } else if (isASCIIUpper(firstCharacter)) { |
2307 | // The start of an uppercase run is a word start ("Kit" in "WebKit"). |
2308 | if (!isASCIIUpper(previousCharacter)) |
2309 | return true; |
2310 | // The last character of an uppercase run followed by a non-separator, non-digit |
2311 | // is a word start ("Request" in "XMLHTTPRequest"). |
2312 | offset = start; |
2313 | U16_FWD_1(m_buffer.data(), offset, size); |
2314 | UChar32 nextCharacter = 0; |
2315 | if (offset < size) |
2316 | U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); |
2317 | if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) |
2318 | return true; |
2319 | } else if (isASCIIDigit(firstCharacter)) { |
2320 | // The start of a digit run is a word start ("2" in "WebKit2"). |
2321 | if (!isASCIIDigit(previousCharacter)) |
2322 | return true; |
2323 | } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { |
2324 | // The start of a non-separator, non-uppercase, non-digit run is a word start, |
2325 | // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). |
2326 | return true; |
2327 | } |
2328 | } |
2329 | |
2330 | // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes |
2331 | // a word, so treat the position before any CJK character as a word start. |
2332 | if (FontCascade::isCJKIdeographOrSymbol(firstCharacter)) |
2333 | return true; |
2334 | |
2335 | size_t wordBreakSearchStart = start + length; |
2336 | while (wordBreakSearchStart > start) |
2337 | wordBreakSearchStart = findNextWordFromIndex(StringView(m_buffer.data(), m_buffer.size()), wordBreakSearchStart, false /* backwards */); |
2338 | return wordBreakSearchStart == start; |
2339 | } |
2340 | |
2341 | inline size_t SearchBuffer::search(size_t& start) |
2342 | { |
2343 | size_t size = m_buffer.size(); |
2344 | if (m_atBreak) { |
2345 | if (!size) |
2346 | return 0; |
2347 | } else { |
2348 | if (size != m_buffer.capacity()) |
2349 | return 0; |
2350 | } |
2351 | |
2352 | UStringSearch* searcher = WebCore::searcher(); |
2353 | |
2354 | UErrorCode status = U_ZERO_ERROR; |
2355 | usearch_setText(searcher, m_buffer.data(), size, &status); |
2356 | ASSERT(status == U_ZERO_ERROR); |
2357 | |
2358 | usearch_setOffset(searcher, m_prefixLength, &status); |
2359 | ASSERT(status == U_ZERO_ERROR); |
2360 | |
2361 | int matchStart = usearch_next(searcher, &status); |
2362 | ASSERT(status == U_ZERO_ERROR); |
2363 | |
2364 | nextMatch: |
2365 | if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { |
2366 | ASSERT(matchStart == USEARCH_DONE); |
2367 | return 0; |
2368 | } |
2369 | |
2370 | // Matches that start in the overlap area are only tentative. |
2371 | // The same match may appear later, matching more characters, |
2372 | // possibly including a combining character that's not yet in the buffer. |
2373 | if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { |
2374 | size_t overlap = m_overlap; |
2375 | if (m_options.contains(AtWordStarts)) { |
2376 | // Ensure that there is sufficient context before matchStart the next time around for |
2377 | // determining if it is at a word boundary. |
2378 | unsigned wordBoundaryContextStart = matchStart; |
2379 | U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); |
2380 | wordBoundaryContextStart = startOfLastWordBoundaryContext(StringView(m_buffer.data(), wordBoundaryContextStart)); |
2381 | overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart)); |
2382 | } |
2383 | memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); |
2384 | m_prefixLength -= std::min(m_prefixLength, size - overlap); |
2385 | m_buffer.shrink(overlap); |
2386 | return 0; |
2387 | } |
2388 | |
2389 | size_t matchedLength = usearch_getMatchedLength(searcher); |
2390 | ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size); |
2391 | |
2392 | // If this match is "bad", move on to the next match. |
2393 | if (isBadMatch(m_buffer.data() + matchStart, matchedLength) |
2394 | || (m_options.contains(AtWordStarts) && !isWordStartMatch(matchStart, matchedLength)) |
2395 | || (m_options.contains(AtWordEnds) && !isWordEndMatch(matchStart, matchedLength))) { |
2396 | matchStart = usearch_next(searcher, &status); |
2397 | ASSERT(status == U_ZERO_ERROR); |
2398 | goto nextMatch; |
2399 | } |
2400 | |
2401 | size_t newSize = size - (matchStart + 1); |
2402 | memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); |
2403 | m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1); |
2404 | m_buffer.shrink(newSize); |
2405 | |
2406 | start = size - matchStart; |
2407 | return matchedLength; |
2408 | } |
2409 | |
2410 | #else |
2411 | |
2412 | inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) |
2413 | : m_target(options & CaseInsensitive ? target.foldCase() : target) |
2414 | , m_options(options) |
2415 | , m_buffer(m_target.length()) |
2416 | , m_isCharacterStartBuffer(m_target.length()) |
2417 | , m_isBufferFull(false) |
2418 | , m_cursor(0) |
2419 | { |
2420 | ASSERT(!m_target.isEmpty()); |
2421 | m_target.replace(noBreakSpace, ' '); |
2422 | foldQuoteMarks(m_target); |
2423 | } |
2424 | |
2425 | inline SearchBuffer::~SearchBuffer() = default; |
2426 | |
2427 | inline void SearchBuffer::reachedBreak() |
2428 | { |
2429 | m_cursor = 0; |
2430 | m_isBufferFull = false; |
2431 | } |
2432 | |
2433 | inline bool SearchBuffer::atBreak() const |
2434 | { |
2435 | return !m_cursor && !m_isBufferFull; |
2436 | } |
2437 | |
2438 | inline void SearchBuffer::append(UChar c, bool isStart) |
2439 | { |
2440 | m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMark(c); |
2441 | m_isCharacterStartBuffer[m_cursor] = isStart; |
2442 | if (++m_cursor == m_target.length()) { |
2443 | m_cursor = 0; |
2444 | m_isBufferFull = true; |
2445 | } |
2446 | } |
2447 | |
2448 | inline size_t SearchBuffer::append(const UChar* characters, size_t length) |
2449 | { |
2450 | ASSERT(length); |
2451 | if (!(m_options & CaseInsensitive)) { |
2452 | append(characters[0], true); |
2453 | return 1; |
2454 | } |
2455 | const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough |
2456 | UChar foldedCharacters[maxFoldedCharacters]; |
2457 | UErrorCode status = U_ZERO_ERROR; |
2458 | int numFoldedCharacters = u_strFoldCase(foldedCharacters, maxFoldedCharacters, characters, 1, U_FOLD_CASE_DEFAULT, &status); |
2459 | ASSERT(U_SUCCESS(status)); |
2460 | ASSERT(numFoldedCharacters); |
2461 | ASSERT(numFoldedCharacters <= maxFoldedCharacters); |
2462 | if (U_SUCCESS(status) && numFoldedCharacters) { |
2463 | numFoldedCharacters = std::min(numFoldedCharacters, maxFoldedCharacters); |
2464 | append(foldedCharacters[0], true); |
2465 | for (int i = 1; i < numFoldedCharacters; ++i) |
2466 | append(foldedCharacters[i], false); |
2467 | } |
2468 | return 1; |
2469 | } |
2470 | |
2471 | inline bool SearchBuffer::needsMoreContext() const |
2472 | { |
2473 | return false; |
2474 | } |
2475 | |
2476 | void SearchBuffer::prependContext(const UChar*, size_t) |
2477 | { |
2478 | ASSERT_NOT_REACHED(); |
2479 | } |
2480 | |
2481 | inline size_t SearchBuffer::search(size_t& start) |
2482 | { |
2483 | if (!m_isBufferFull) |
2484 | return 0; |
2485 | if (!m_isCharacterStartBuffer[m_cursor]) |
2486 | return 0; |
2487 | |
2488 | size_t tailSpace = m_target.length() - m_cursor; |
2489 | if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0) |
2490 | return 0; |
2491 | if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0) |
2492 | return 0; |
2493 | |
2494 | start = length(); |
2495 | |
2496 | // Now that we've found a match once, we don't want to find it again, because those |
2497 | // are the SearchBuffer semantics, allowing for a buffer where you append more than one |
2498 | // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if |
2499 | // we want to get rid of that in the future we could track this with a separate boolean |
2500 | // or even move the characters to the start of the buffer and set m_isBufferFull to false. |
2501 | m_isCharacterStartBuffer[m_cursor] = false; |
2502 | |
2503 | return start; |
2504 | } |
2505 | |
2506 | // Returns the number of characters that were appended to the buffer (what we are searching in). |
2507 | // That's not necessarily the same length as the passed-in target string, because case folding |
2508 | // can make two strings match even though they're not the same length. |
2509 | size_t SearchBuffer::length() const |
2510 | { |
2511 | size_t bufferSize = m_target.length(); |
2512 | size_t length = 0; |
2513 | for (size_t i = 0; i < bufferSize; ++i) |
2514 | length += m_isCharacterStartBuffer[i]; |
2515 | return length; |
2516 | } |
2517 | |
2518 | #endif |
2519 | |
2520 | // -------- |
2521 | |
2522 | int TextIterator::rangeLength(const Range* range, bool forSelectionPreservation) |
2523 | { |
2524 | unsigned length = 0; |
2525 | for (TextIterator it(range, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance()) |
2526 | length += it.text().length(); |
2527 | return length; |
2528 | } |
2529 | |
2530 | Ref<Range> TextIterator::subrange(Range& entireRange, int characterOffset, int characterCount) |
2531 | { |
2532 | CharacterIterator entireRangeIterator(entireRange); |
2533 | return characterSubrange(entireRange.ownerDocument(), entireRangeIterator, characterOffset, characterCount); |
2534 | } |
2535 | |
2536 | static inline bool isInsideReplacedElement(TextIterator& iterator) |
2537 | { |
2538 | ASSERT(!iterator.atEnd()); |
2539 | ASSERT(iterator.text().length() == 1); |
2540 | Node* node = iterator.node(); |
2541 | return node && isRendererReplacedElement(node->renderer()); |
2542 | } |
2543 | |
2544 | RefPtr<Range> TextIterator::rangeFromLocationAndLength(ContainerNode* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation) |
2545 | { |
2546 | Ref<Range> resultRange = scope->document().createRange(); |
2547 | |
2548 | int docTextPosition = 0; |
2549 | int rangeEnd = rangeLocation + rangeLength; |
2550 | bool startRangeFound = false; |
2551 | |
2552 | Ref<Range> textRunRange = rangeOfContents(*scope); |
2553 | |
2554 | TextIterator it(textRunRange.ptr(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); |
2555 | |
2556 | // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>. |
2557 | if (!rangeLocation && !rangeLength && it.atEnd()) { |
2558 | resultRange->setStart(textRunRange->startContainer(), 0); |
2559 | resultRange->setEnd(textRunRange->startContainer(), 0); |
2560 | return resultRange; |
2561 | } |
2562 | |
2563 | for (; !it.atEnd(); it.advance()) { |
2564 | int length = it.text().length(); |
2565 | textRunRange = it.range(); |
2566 | |
2567 | bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + length; |
2568 | bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + length; |
2569 | |
2570 | if (foundEnd) { |
2571 | // FIXME: This is a workaround for the fact that the end of a run is often at the wrong |
2572 | // position for emitted '\n's or if the renderer of the current node is a replaced element. |
2573 | if (length == 1 && (it.text()[0] == '\n' || isInsideReplacedElement(it))) { |
2574 | it.advance(); |
2575 | if (!it.atEnd()) { |
2576 | Ref<Range> range = it.range(); |
2577 | textRunRange->setEnd(range->startContainer(), range->startOffset()); |
2578 | } else { |
2579 | Position runStart = textRunRange->startPosition(); |
2580 | Position runEnd = VisiblePosition(runStart).next().deepEquivalent(); |
2581 | if (runEnd.isNotNull()) |
2582 | textRunRange->setEnd(*runEnd.containerNode(), runEnd.computeOffsetInContainerNode()); |
2583 | } |
2584 | } |
2585 | } |
2586 | |
2587 | if (foundStart) { |
2588 | startRangeFound = true; |
2589 | if (textRunRange->startContainer().isTextNode()) { |
2590 | int offset = rangeLocation - docTextPosition; |
2591 | resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset()); |
2592 | } else { |
2593 | if (rangeLocation == docTextPosition) |
2594 | resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset()); |
2595 | else |
2596 | resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset()); |
2597 | } |
2598 | } |
2599 | |
2600 | if (foundEnd) { |
2601 | if (textRunRange->startContainer().isTextNode()) { |
2602 | int offset = rangeEnd - docTextPosition; |
2603 | resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset()); |
2604 | } else { |
2605 | if (rangeEnd == docTextPosition) |
2606 | resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset()); |
2607 | else |
2608 | resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset()); |
2609 | } |
2610 | docTextPosition += length; |
2611 | break; |
2612 | } |
2613 | |
2614 | docTextPosition += length; |
2615 | } |
2616 | |
2617 | if (!startRangeFound) |
2618 | return nullptr; |
2619 | |
2620 | if (rangeLength && rangeEnd > docTextPosition) // rangeEnd is out of bounds |
2621 | resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset()); |
2622 | |
2623 | return resultRange; |
2624 | } |
2625 | |
2626 | bool TextIterator::getLocationAndLengthFromRange(Node* scope, const Range* range, size_t& location, size_t& length) |
2627 | { |
2628 | location = notFound; |
2629 | length = 0; |
2630 | |
2631 | // The critical assumption is that this only gets called with ranges that |
2632 | // concentrate on a given area containing the selection root. This is done |
2633 | // because of text fields and textareas. The DOM for those is not |
2634 | // directly in the document DOM, so ensure that the range does not cross a |
2635 | // boundary of one of those. |
2636 | if (&range->startContainer() != scope && !range->startContainer().isDescendantOf(scope)) |
2637 | return false; |
2638 | if (&range->endContainer() != scope && !range->endContainer().isDescendantOf(scope)) |
2639 | return false; |
2640 | |
2641 | Ref<Range> testRange = Range::create(scope->document(), scope, 0, &range->startContainer(), range->startOffset()); |
2642 | ASSERT(&testRange->startContainer() == scope); |
2643 | location = TextIterator::rangeLength(testRange.ptr()); |
2644 | |
2645 | testRange->setEnd(range->endContainer(), range->endOffset()); |
2646 | ASSERT(&testRange->startContainer() == scope); |
2647 | length = TextIterator::rangeLength(testRange.ptr()) - location; |
2648 | return true; |
2649 | } |
2650 | |
2651 | // -------- |
2652 | |
2653 | bool hasAnyPlainText(const Range& range, TextIteratorBehavior behavior) |
2654 | { |
2655 | for (TextIterator iterator { &range, behavior }; !iterator.atEnd(); iterator.advance()) { |
2656 | if (!iterator.text().isEmpty()) |
2657 | return true; |
2658 | } |
2659 | return false; |
2660 | } |
2661 | |
2662 | String plainText(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
2663 | { |
2664 | // The initial buffer size can be critical for performance: https://bugs.webkit.org/show_bug.cgi?id=81192 |
2665 | static const unsigned initialCapacity = 1 << 15; |
2666 | |
2667 | if (!start.document()) |
2668 | return { }; |
2669 | auto document = makeRef(*start.document()); |
2670 | |
2671 | unsigned bufferLength = 0; |
2672 | StringBuilder builder; |
2673 | builder.reserveCapacity(initialCapacity); |
2674 | TextIteratorBehavior behavior = defaultBehavior; |
2675 | if (!isDisplayString) |
2676 | behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding); |
2677 | |
2678 | for (TextIterator it(start, end, behavior); !it.atEnd(); it.advance()) { |
2679 | it.appendTextToStringBuilder(builder); |
2680 | bufferLength += it.text().length(); |
2681 | } |
2682 | |
2683 | if (!bufferLength) |
2684 | return emptyString(); |
2685 | |
2686 | String result = builder.toString(); |
2687 | |
2688 | if (isDisplayString) |
2689 | document->displayStringModifiedByEncoding(result); |
2690 | |
2691 | return result; |
2692 | } |
2693 | |
2694 | String plainTextReplacingNoBreakSpace(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
2695 | { |
2696 | return plainText(start, end, defaultBehavior, isDisplayString).replace(noBreakSpace, ' '); |
2697 | } |
2698 | |
2699 | String plainText(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
2700 | { |
2701 | if (!range) |
2702 | return emptyString(); |
2703 | return plainText(range->startPosition(), range->endPosition(), defaultBehavior, isDisplayString); |
2704 | } |
2705 | |
2706 | String plainTextUsingBackwardsTextIteratorForTesting(const Range& range) |
2707 | { |
2708 | String result; |
2709 | for (SimplifiedBackwardsTextIterator backwardsIterator(range); !backwardsIterator.atEnd(); backwardsIterator.advance()) |
2710 | result.insert(backwardsIterator.text().toString(), 0); |
2711 | return result; |
2712 | } |
2713 | |
2714 | String plainTextReplacingNoBreakSpace(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
2715 | { |
2716 | return plainText(range, defaultBehavior, isDisplayString).replace(noBreakSpace, ' '); |
2717 | } |
2718 | |
2719 | static Ref<Range> collapsedToBoundary(const Range& range, bool forward) |
2720 | { |
2721 | Ref<Range> result = range.cloneRange(); |
2722 | result->collapse(!forward); |
2723 | return result; |
2724 | } |
2725 | |
2726 | static TextIteratorBehavior findIteratorOptions(FindOptions options) |
2727 | { |
2728 | TextIteratorBehavior iteratorOptions = TextIteratorEntersTextControls | TextIteratorClipsToFrameAncestors; |
2729 | if (!options.contains(DoNotTraverseFlatTree)) |
2730 | iteratorOptions |= TextIteratorTraversesFlatTree; |
2731 | return iteratorOptions; |
2732 | } |
2733 | |
2734 | static void findPlainTextMatches(const Range& range, const String& target, FindOptions options, const WTF::Function<bool(size_t, size_t)>& match) |
2735 | { |
2736 | SearchBuffer buffer(target, options); |
2737 | if (buffer.needsMoreContext()) { |
2738 | Ref<Range> beforeStartRange = range.ownerDocument().createRange(); |
2739 | beforeStartRange->setEnd(range.startContainer(), range.startOffset()); |
2740 | for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) { |
2741 | buffer.prependContext(backwardsIterator.text()); |
2742 | if (!buffer.needsMoreContext()) |
2743 | break; |
2744 | } |
2745 | } |
2746 | |
2747 | CharacterIterator findIterator(range, findIteratorOptions(options)); |
2748 | while (!findIterator.atEnd()) { |
2749 | findIterator.advance(buffer.append(findIterator.text())); |
2750 | while (1) { |
2751 | size_t matchStartOffset; |
2752 | size_t newMatchLength = buffer.search(matchStartOffset); |
2753 | if (!newMatchLength) { |
2754 | if (findIterator.atBreak() && !buffer.atBreak()) { |
2755 | buffer.reachedBreak(); |
2756 | continue; |
2757 | } |
2758 | break; |
2759 | } |
2760 | size_t lastCharacterInBufferOffset = findIterator.characterOffset(); |
2761 | ASSERT(lastCharacterInBufferOffset >= matchStartOffset); |
2762 | if (match(lastCharacterInBufferOffset - matchStartOffset, newMatchLength)) |
2763 | return; |
2764 | } |
2765 | } |
2766 | } |
2767 | |
2768 | static Ref<Range> rangeForMatch(const Range& range, FindOptions options, size_t matchStart, size_t matchLength, bool searchForward) |
2769 | { |
2770 | if (!matchLength) |
2771 | return collapsedToBoundary(range, searchForward); |
2772 | CharacterIterator rangeComputeIterator(range, findIteratorOptions(options)); |
2773 | return characterSubrange(range.ownerDocument(), rangeComputeIterator, matchStart, matchLength); |
2774 | } |
2775 | |
2776 | Ref<Range> findClosestPlainText(const Range& range, const String& target, FindOptions options, unsigned targetOffset) |
2777 | { |
2778 | size_t matchStart = 0; |
2779 | size_t matchLength = 0; |
2780 | size_t distance = std::numeric_limits<size_t>::max(); |
2781 | auto match = [targetOffset, &distance, &matchStart, &matchLength] (size_t start, size_t length) { |
2782 | size_t newDistance = std::min(abs(static_cast<signed>(start - targetOffset)), abs(static_cast<signed>(start + length - targetOffset))); |
2783 | if (newDistance < distance) { |
2784 | matchStart = start; |
2785 | matchLength = length; |
2786 | distance = newDistance; |
2787 | } |
2788 | return false; |
2789 | }; |
2790 | |
2791 | findPlainTextMatches(range, target, options, WTFMove(match)); |
2792 | return rangeForMatch(range, options, matchStart, matchLength, !options.contains(Backwards)); |
2793 | } |
2794 | |
2795 | Ref<Range> findPlainText(const Range& range, const String& target, FindOptions options) |
2796 | { |
2797 | bool searchForward = !options.contains(Backwards); |
2798 | size_t matchStart = 0; |
2799 | size_t matchLength = 0; |
2800 | auto match = [searchForward, &matchStart, &matchLength] (size_t start, size_t length) { |
2801 | matchStart = start; |
2802 | matchLength = length; |
2803 | // Look for the last match when searching backwards instead. |
2804 | return searchForward; |
2805 | }; |
2806 | |
2807 | findPlainTextMatches(range, target, options, WTFMove(match)); |
2808 | return rangeForMatch(range, options, matchStart, matchLength, searchForward); |
2809 | } |
2810 | |
2811 | bool findPlainText(const String& document, const String& target, FindOptions options) |
2812 | { |
2813 | SearchBuffer buffer { target, options }; |
2814 | StringView remainingText { document }; |
2815 | while (!remainingText.isEmpty()) { |
2816 | size_t charactersAppended = buffer.append(document); |
2817 | remainingText = remainingText.substring(charactersAppended); |
2818 | if (remainingText.isEmpty()) |
2819 | buffer.reachedBreak(); |
2820 | size_t matchStartOffset; |
2821 | if (buffer.search(matchStartOffset)) |
2822 | return true; |
2823 | } |
2824 | return false; |
2825 | } |
2826 | |
2827 | } |
2828 | |