1/*
2 * Copyright (C) 2004-2019 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "TextIterator.h"
29
30#include "ComposedTreeIterator.h"
31#include "Document.h"
32#include "Editing.h"
33#include "FontCascade.h"
34#include "Frame.h"
35#include "HTMLBodyElement.h"
36#include "HTMLElement.h"
37#include "HTMLFrameOwnerElement.h"
38#include "HTMLInputElement.h"
39#include "HTMLLegendElement.h"
40#include "HTMLMeterElement.h"
41#include "HTMLNames.h"
42#include "HTMLParagraphElement.h"
43#include "HTMLProgressElement.h"
44#include "HTMLSlotElement.h"
45#include "HTMLTextAreaElement.h"
46#include "HTMLTextFormControlElement.h"
47#include "InlineTextBox.h"
48#include "NodeTraversal.h"
49#include "RenderImage.h"
50#include "RenderIterator.h"
51#include "RenderTableCell.h"
52#include "RenderTableRow.h"
53#include "RenderTextControl.h"
54#include "RenderTextFragment.h"
55#include "ShadowRoot.h"
56#include "SimpleLineLayout.h"
57#include "SimpleLineLayoutFunctions.h"
58#include "SimpleLineLayoutResolver.h"
59#include "TextBoundaries.h"
60#include "TextControlInnerElements.h"
61#include "VisiblePosition.h"
62#include "VisibleUnits.h"
63#include <unicode/unorm2.h>
64#include <wtf/Function.h>
65#include <wtf/text/CString.h>
66#include <wtf/text/StringBuilder.h>
67#include <wtf/text/TextBreakIterator.h>
68#include <wtf/unicode/CharacterNames.h>
69
70#if !UCONFIG_NO_COLLATION
71#include <unicode/usearch.h>
72#include <wtf/text/TextBreakIteratorInternalICU.h>
73#endif
74
75namespace WebCore {
76
77using namespace WTF::Unicode;
78using namespace HTMLNames;
79
80// Buffer that knows how to compare with a search target.
81// Keeps enough of the previous text to be able to search in the future, but no more.
82// Non-breaking spaces are always equal to normal spaces.
83// Case folding is also done if the CaseInsensitive option is specified.
84// Matches are further filtered if the AtWordStarts option is specified, although some
85// matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
86class SearchBuffer {
87 WTF_MAKE_NONCOPYABLE(SearchBuffer);
88public:
89 SearchBuffer(const String& target, FindOptions);
90 ~SearchBuffer();
91
92 // Returns number of characters appended; guaranteed to be in the range [1, length].
93 size_t append(StringView);
94 bool needsMoreContext() const;
95 void prependContext(StringView);
96 void reachedBreak();
97
98 // Result is the size in characters of what was found.
99 // And <startOffset> is the number of characters back to the start of what was found.
100 size_t search(size_t& startOffset);
101 bool atBreak() const;
102
103#if !UCONFIG_NO_COLLATION
104
105private:
106 bool isBadMatch(const UChar*, size_t length) const;
107 bool isWordStartMatch(size_t start, size_t length) const;
108 bool isWordEndMatch(size_t start, size_t length) const;
109
110 const String m_target;
111 const StringView::UpconvertedCharacters m_targetCharacters;
112 FindOptions m_options;
113
114 Vector<UChar> m_buffer;
115 size_t m_overlap;
116 size_t m_prefixLength;
117 bool m_atBreak;
118 bool m_needsMoreContext;
119
120 const bool m_targetRequiresKanaWorkaround;
121 Vector<UChar> m_normalizedTarget;
122 mutable Vector<UChar> m_normalizedMatch;
123
124#else
125
126private:
127 void append(UChar, bool isCharacterStart);
128 size_t length() const;
129
130 String m_target;
131 FindOptions m_options;
132
133 Vector<UChar> m_buffer;
134 Vector<bool> m_isCharacterStartBuffer;
135 bool m_isBufferFull;
136 size_t m_cursor;
137
138#endif
139};
140
141// --------
142
143static const unsigned bitsInWord = sizeof(unsigned) * 8;
144static const unsigned bitInWordMask = bitsInWord - 1;
145
146BitStack::BitStack()
147 : m_size(0)
148{
149}
150
151BitStack::~BitStack() = default;
152
153void BitStack::push(bool bit)
154{
155 unsigned index = m_size / bitsInWord;
156 unsigned shift = m_size & bitInWordMask;
157 if (!shift && index == m_words.size()) {
158 m_words.grow(index + 1);
159 m_words[index] = 0;
160 }
161 unsigned& word = m_words[index];
162 unsigned mask = 1U << shift;
163 if (bit)
164 word |= mask;
165 else
166 word &= ~mask;
167 ++m_size;
168}
169
170void BitStack::pop()
171{
172 if (m_size)
173 --m_size;
174}
175
176bool BitStack::top() const
177{
178 if (!m_size)
179 return false;
180 unsigned shift = (m_size - 1) & bitInWordMask;
181 return m_words.last() & (1U << shift);
182}
183
184unsigned BitStack::size() const
185{
186 return m_size;
187}
188
189// --------
190
191// This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
192static Node* nextInPreOrderCrossingShadowBoundaries(Node& rangeEndContainer, int rangeEndOffset)
193{
194 if (rangeEndOffset >= 0 && !rangeEndContainer.isCharacterDataNode()) {
195 if (Node* next = rangeEndContainer.traverseToChildAt(rangeEndOffset))
196 return next;
197 }
198 for (Node* node = &rangeEndContainer; node; node = node->parentOrShadowHostNode()) {
199 if (Node* next = node->nextSibling())
200 return next;
201 }
202 return nullptr;
203}
204
205static inline bool fullyClipsContents(Node& node)
206{
207 auto* renderer = node.renderer();
208 if (!renderer) {
209 if (!is<Element>(node))
210 return false;
211 return !downcast<Element>(node).hasDisplayContents();
212 }
213 if (!is<RenderBox>(*renderer))
214 return false;
215 auto& box = downcast<RenderBox>(*renderer);
216 if (!box.hasOverflowClip())
217 return false;
218
219 // Quirk to keep copy/paste in the CodeMirror editor version used in Jenkins working.
220 if (is<HTMLTextAreaElement>(node))
221 return box.size().isEmpty();
222
223 return box.contentSize().isEmpty();
224}
225
226static inline bool ignoresContainerClip(Node& node)
227{
228 auto* renderer = node.renderer();
229 if (!renderer || renderer->isTextOrLineBreak())
230 return false;
231 return renderer->style().hasOutOfFlowPosition();
232}
233
234static void pushFullyClippedState(BitStack& stack, Node& node)
235{
236 // Push true if this node full clips its contents, or if a parent already has fully
237 // clipped and this is not a node that ignores its container's clip.
238 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
239}
240
241static void setUpFullyClippedStack(BitStack& stack, Node& node)
242{
243 // Put the nodes in a vector so we can iterate in reverse order.
244 // FIXME: This (and TextIterator in general) should use ComposedTreeIterator.
245 Vector<Node*, 100> ancestry;
246 for (Node* parent = node.parentOrShadowHostNode(); parent; parent = parent->parentOrShadowHostNode())
247 ancestry.append(parent);
248
249 // Call pushFullyClippedState on each node starting with the earliest ancestor.
250 size_t size = ancestry.size();
251 for (size_t i = 0; i < size; ++i)
252 pushFullyClippedState(stack, *ancestry[size - i - 1]);
253 pushFullyClippedState(stack, node);
254}
255
256static bool isClippedByFrameAncestor(const Document& document, TextIteratorBehavior behavior)
257{
258 if (!(behavior & TextIteratorClipsToFrameAncestors))
259 return false;
260
261 for (auto* owner = document.ownerElement(); owner; owner = owner->document().ownerElement()) {
262 BitStack ownerClipStack;
263 setUpFullyClippedStack(ownerClipStack, *owner);
264 if (ownerClipStack.top())
265 return true;
266 }
267 return false;
268}
269
270// FIXME: editingIgnoresContent and isRendererReplacedElement try to do the same job.
271// It's not good to have both of them.
272bool isRendererReplacedElement(RenderObject* renderer)
273{
274 if (!renderer)
275 return false;
276
277 bool isAttachment = false;
278#if ENABLE(ATTACHMENT_ELEMENT)
279 isAttachment = renderer->isAttachment();
280#endif
281 if (renderer->isImage() || renderer->isWidget() || renderer->isMedia() || isAttachment)
282 return true;
283
284 if (is<Element>(renderer->node())) {
285 Element& element = downcast<Element>(*renderer->node());
286 if (is<HTMLFormControlElement>(element) || is<HTMLLegendElement>(element) || is<HTMLProgressElement>(element) || element.hasTagName(meterTag))
287 return true;
288 if (equalLettersIgnoringASCIICase(element.attributeWithoutSynchronization(roleAttr), "img"))
289 return true;
290 }
291
292 return false;
293}
294
295// --------
296
297inline void TextIteratorCopyableText::reset()
298{
299 m_singleCharacter = 0;
300 m_string = String();
301 m_offset = 0;
302 m_length = 0;
303}
304
305inline void TextIteratorCopyableText::set(String&& string)
306{
307 m_singleCharacter = 0;
308 m_string = WTFMove(string);
309 m_offset = 0;
310 m_length = m_string.length();
311}
312
313inline void TextIteratorCopyableText::set(String&& string, unsigned offset, unsigned length)
314{
315 ASSERT(offset < string.length());
316 ASSERT(length);
317 ASSERT(length <= string.length() - offset);
318
319 m_singleCharacter = 0;
320 m_string = WTFMove(string);
321 m_offset = offset;
322 m_length = length;
323}
324
325inline void TextIteratorCopyableText::set(UChar singleCharacter)
326{
327 m_singleCharacter = singleCharacter;
328 m_string = String();
329 m_offset = 0;
330 m_length = 0;
331}
332
333void TextIteratorCopyableText::appendToStringBuilder(StringBuilder& builder) const
334{
335 if (m_singleCharacter)
336 builder.append(m_singleCharacter);
337 else
338 builder.append(m_string, m_offset, m_length);
339}
340
341// --------
342
343
344TextIterator::TextIterator(Position start, Position end, TextIteratorBehavior behavior)
345 : m_behavior(behavior)
346{
347 if (start.isNull() || end.isNull())
348 return;
349 ASSERT(comparePositions(start, end) <= 0);
350
351 RELEASE_ASSERT(behavior & TextIteratorTraversesFlatTree || start.treeScope() == end.treeScope());
352
353 start.document()->updateLayoutIgnorePendingStylesheets();
354
355 // FIXME: Use Position / PositionIterator instead to avoid offset computation.
356 m_startContainer = start.containerNode();
357 m_startOffset = start.computeOffsetInContainerNode();
358
359 m_endContainer = end.containerNode();
360 m_endOffset = end.computeOffsetInContainerNode();
361
362 m_node = start.firstNode().get();
363 if (!m_node)
364 return;
365
366 init();
367}
368
369TextIterator::TextIterator(const Range* range, TextIteratorBehavior behavior)
370 : m_behavior(behavior)
371{
372 if (!range)
373 return;
374
375 range->ownerDocument().updateLayoutIgnorePendingStylesheets();
376
377 m_startContainer = &range->startContainer();
378
379 // Callers should be handing us well-formed ranges. If we discover that this isn't
380 // the case, we could consider changing this assertion to an early return.
381 ASSERT(range->boundaryPointsValid());
382
383 m_startOffset = range->startOffset();
384 m_endContainer = &range->endContainer();
385 m_endOffset = range->endOffset();
386
387 m_node = range->firstNode();
388 if (!m_node)
389 return;
390
391 init();
392}
393
394void TextIterator::init()
395{
396 if (isClippedByFrameAncestor(m_node->document(), m_behavior))
397 return;
398
399 setUpFullyClippedStack(m_fullyClippedStack, *m_node);
400
401 m_offset = m_node == m_startContainer ? m_startOffset : 0;
402
403 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(*m_endContainer, m_endOffset);
404
405 m_positionNode = m_node;
406
407 advance();
408}
409
410TextIterator::~TextIterator() = default;
411
412// FIXME: Use ComposedTreeIterator instead. These functions are more expensive because they might do O(n) work.
413static inline Node* firstChild(TextIteratorBehavior options, Node& node)
414{
415 if (UNLIKELY(options & TextIteratorTraversesFlatTree))
416 return firstChildInComposedTreeIgnoringUserAgentShadow(node);
417 return node.firstChild();
418}
419
420static inline Node* nextSibling(TextIteratorBehavior options, Node& node)
421{
422 if (UNLIKELY(options & TextIteratorTraversesFlatTree))
423 return nextSiblingInComposedTreeIgnoringUserAgentShadow(node);
424 return node.nextSibling();
425}
426
427static inline Node* nextNode(TextIteratorBehavior options, Node& node)
428{
429 if (UNLIKELY(options & TextIteratorTraversesFlatTree))
430 return nextInComposedTreeIgnoringUserAgentShadow(node);
431 return NodeTraversal::next(node);
432}
433
434static inline bool isDescendantOf(TextIteratorBehavior options, Node& node, Node& possibleAncestor)
435{
436 if (UNLIKELY(options & TextIteratorTraversesFlatTree))
437 return node.isDescendantOrShadowDescendantOf(&possibleAncestor);
438 return node.isDescendantOf(&possibleAncestor);
439}
440
441static inline Node* parentNodeOrShadowHost(TextIteratorBehavior options, Node& node)
442{
443 if (UNLIKELY(options & TextIteratorTraversesFlatTree))
444 return node.parentInComposedTree();
445 return node.parentOrShadowHostNode();
446}
447
448static inline bool hasDisplayContents(Node& node)
449{
450 return is<Element>(node) && downcast<Element>(node).hasDisplayContents();
451}
452
453void TextIterator::advance()
454{
455 ASSERT(!atEnd());
456
457 // reset the run information
458 m_positionNode = nullptr;
459 m_copyableText.reset();
460 m_text = StringView();
461
462 // handle remembered node that needed a newline after the text node's newline
463 if (m_nodeForAdditionalNewline) {
464 // Emit the extra newline, and position it *inside* m_node, after m_node's
465 // contents, in case it's a block, in the same way that we position the first
466 // newline. The range for the emitted newline should start where the line
467 // break begins.
468 // FIXME: It would be cleaner if we emitted two newlines during the last
469 // iteration, instead of using m_needsAnotherNewline.
470 emitCharacter('\n', *m_nodeForAdditionalNewline->parentNode(), m_nodeForAdditionalNewline, 1, 1);
471 m_nodeForAdditionalNewline = nullptr;
472 return;
473 }
474
475 if (!m_textBox && m_remainingTextBox) {
476 m_textBox = m_remainingTextBox;
477 m_remainingTextBox = nullptr;
478 m_firstLetterText = nullptr;
479 m_offset = 0;
480 }
481 // handle remembered text box
482 if (m_textBox) {
483 handleTextBox();
484 if (m_positionNode)
485 return;
486 }
487
488 while (m_node && m_node != m_pastEndNode) {
489 // if the range ends at offset 0 of an element, represent the
490 // position, but not the content, of that element e.g. if the
491 // node is a blockflow element, emit a newline that
492 // precedes the element
493 if (m_node == m_endContainer && !m_endOffset) {
494 representNodeOffsetZero();
495 m_node = nullptr;
496 return;
497 }
498
499 auto* renderer = m_node->renderer();
500 if (!m_handledNode) {
501 if (!renderer) {
502 m_handledNode = true;
503 m_handledChildren = !hasDisplayContents(*m_node);
504 } else {
505 // handle current node according to its type
506 if (renderer->isText() && m_node->isTextNode())
507 m_handledNode = handleTextNode();
508 else if (isRendererReplacedElement(renderer))
509 m_handledNode = handleReplacedElement();
510 else
511 m_handledNode = handleNonTextNode();
512 if (m_positionNode)
513 return;
514 }
515 }
516
517 // find a new current node to handle in depth-first manner,
518 // calling exitNode() as we come back thru a parent node
519 Node* next = m_handledChildren ? nullptr : firstChild(m_behavior, *m_node);
520 m_offset = 0;
521 if (!next) {
522 next = nextSibling(m_behavior, *m_node);
523 if (!next) {
524 bool pastEnd = nextNode(m_behavior, *m_node) == m_pastEndNode;
525 Node* parentNode = parentNodeOrShadowHost(m_behavior, *m_node);
526 while (!next && parentNode) {
527 if ((pastEnd && parentNode == m_endContainer) || isDescendantOf(m_behavior, *m_endContainer, *parentNode))
528 return;
529 bool haveRenderer = m_node->renderer();
530 Node* exitedNode = m_node;
531 m_node = parentNode;
532 m_fullyClippedStack.pop();
533 parentNode = parentNodeOrShadowHost(m_behavior, *m_node);
534 if (haveRenderer)
535 exitNode(exitedNode);
536 if (m_positionNode) {
537 m_handledNode = true;
538 m_handledChildren = true;
539 return;
540 }
541 next = nextSibling(m_behavior, *m_node);
542 if (next && m_node->renderer())
543 exitNode(m_node);
544 }
545 }
546 m_fullyClippedStack.pop();
547 }
548
549 // set the new current node
550 m_node = next;
551 if (m_node)
552 pushFullyClippedState(m_fullyClippedStack, *m_node);
553 m_handledNode = false;
554 m_handledChildren = false;
555 m_handledFirstLetter = false;
556 m_firstLetterText = nullptr;
557
558 // how would this ever be?
559 if (m_positionNode)
560 return;
561 }
562}
563
564static bool hasVisibleTextNode(RenderText& renderer)
565{
566 if (renderer.style().visibility() == Visibility::Visible)
567 return true;
568 if (is<RenderTextFragment>(renderer)) {
569 if (auto firstLetter = downcast<RenderTextFragment>(renderer).firstLetter()) {
570 if (firstLetter->style().visibility() == Visibility::Visible)
571 return true;
572 }
573 }
574 return false;
575}
576
577static unsigned textNodeOffsetInFlow(const Text& firstTextNodeInRange)
578{
579 // Calculate the text offset for simple lines.
580 RenderObject* renderer = firstTextNodeInRange.renderer();
581 if (!renderer)
582 return 0;
583 unsigned textOffset = 0;
584 for (renderer = renderer->previousSibling(); renderer; renderer = renderer->previousSibling()) {
585 if (is<RenderText>(renderer))
586 textOffset += downcast<RenderText>(renderer)->text().length();
587 }
588 return textOffset;
589}
590
591static bool isNewLineOrTabCharacter(UChar character)
592{
593 return character == '\n' || character == '\t';
594}
595
596bool TextIterator::handleTextNode()
597{
598 Text& textNode = downcast<Text>(*m_node);
599
600 if (m_fullyClippedStack.top() && !(m_behavior & TextIteratorIgnoresStyleVisibility))
601 return false;
602
603 auto& renderer = *textNode.renderer();
604 m_lastTextNode = &textNode;
605 String rendererText = renderer.text();
606
607 // handle pre-formatted text
608 if (!renderer.style().collapseWhiteSpace()) {
609 int runStart = m_offset;
610 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
611 emitCharacter(' ', textNode, nullptr, runStart, runStart);
612 return false;
613 }
614 if (!m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset) {
615 handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer));
616 if (m_firstLetterText) {
617 String firstLetter = m_firstLetterText->text();
618 emitText(textNode, *m_firstLetterText, m_offset, m_offset + firstLetter.length());
619 m_firstLetterText = nullptr;
620 m_textBox = nullptr;
621 return false;
622 }
623 }
624 if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
625 return false;
626 int rendererTextLength = rendererText.length();
627 int end = (&textNode == m_endContainer) ? m_endOffset : INT_MAX;
628 int runEnd = std::min(rendererTextLength, end);
629
630 if (runStart >= runEnd)
631 return true;
632
633 emitText(textNode, renderer, runStart, runEnd);
634 return true;
635 }
636
637 if (const auto* layout = renderer.simpleLineLayout()) {
638 if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
639 return true;
640 ASSERT(renderer.parent());
641 ASSERT(is<RenderBlockFlow>(*renderer.parent()));
642 const auto& blockFlow = downcast<RenderBlockFlow>(*renderer.parent());
643 // Use the simple layout runs to iterate over the text content.
644 bool isNewTextNode = m_previousSimpleTextNodeInFlow && m_previousSimpleTextNodeInFlow != &textNode;
645 // Simple line layout run positions are all absolute to the parent flow.
646 // Offsetting is required when multiple renderers are present.
647 m_accumulatedSimpleTextLengthInFlow += isNewTextNode ? m_previousSimpleTextNodeInFlow->renderer()->text().length() : 0;
648 m_previousSimpleTextNodeInFlow = &textNode;
649
650 unsigned endPosition = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : rendererText.length();
651 if (!m_flowRunResolverCache || &m_flowRunResolverCache->flow() != &blockFlow) {
652 m_accumulatedSimpleTextLengthInFlow = m_flowRunResolverCache ? 0 : textNodeOffsetInFlow(textNode);
653 m_flowRunResolverCache = std::make_unique<SimpleLineLayout::RunResolver>(blockFlow, *layout);
654 }
655 // Skip to m_offset position.
656 auto range = m_flowRunResolverCache->rangeForRenderer(renderer);
657 auto it = range.begin();
658 auto end = range.end();
659 auto startPosition = static_cast<unsigned>(m_offset) + m_accumulatedSimpleTextLengthInFlow;
660 while (it != end && (*it).end() <= startPosition)
661 ++it;
662 if (m_nextRunNeedsWhitespace && rendererText[m_offset - 1] == '\n') {
663 emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1);
664 return it == end;
665 }
666 if (it == end) {
667 // Collapsed trailing whitespace.
668 m_offset = endPosition;
669 m_lastTextNodeEndedWithCollapsedSpace = true;
670 return true;
671 }
672 if (m_nextRunNeedsWhitespace) {
673 emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1);
674 return false;
675 }
676 // If the position we are looking for is to the left of the renderer's first run, it could mean that
677 // the runs and the renderers are out of sync (e.g. we skipped a renderer in between).
678 // Better bail out at this point.
679 auto run = *it;
680 if (run.start() > startPosition) {
681 ASSERT(m_flowRunResolverCache);
682 if (&(rendererForPosition(m_flowRunResolverCache->flowContents(), startPosition)) != &renderer) {
683 ASSERT_NOT_REACHED();
684 return true;
685 }
686 }
687 ASSERT(run.end() - run.start() <= rendererText.length());
688 // contentStart skips leading whitespace.
689 unsigned contentStart = std::max<unsigned>(m_offset, run.start() - m_accumulatedSimpleTextLengthInFlow);
690 unsigned contentEnd = std::min(endPosition, run.end() - m_accumulatedSimpleTextLengthInFlow);
691 ASSERT_WITH_SECURITY_IMPLICATION(contentStart <= contentEnd);
692 // Check if whitespace adjustment is needed when crossing renderer boundary.
693 if (isNewTextNode) {
694 bool lastCharacterIsNotWhitespace = m_lastCharacter && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter);
695 bool addTrailingWhitespaceForPrevious = m_lastTextNodeEndedWithCollapsedSpace && lastCharacterIsNotWhitespace;
696 bool leadingWhitespaceIsNeededForCurrent = contentStart > static_cast<unsigned>(m_offset) && lastCharacterIsNotWhitespace;
697 if (addTrailingWhitespaceForPrevious || leadingWhitespaceIsNeededForCurrent) {
698 emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1);
699 return false;
700 }
701 }
702 // \n \t single whitespace characters need replacing so that the new line/tab characters don't show up.
703 unsigned stopPosition = contentStart;
704 while (stopPosition < contentEnd && !isNewLineOrTabCharacter(rendererText[stopPosition]))
705 ++stopPosition;
706 // Emit the text up to the new line/tab character.
707 if (stopPosition < contentEnd) {
708 if (stopPosition == contentStart) {
709 emitCharacter(' ', textNode, nullptr, contentStart, contentStart + 1);
710 m_offset = contentStart + 1;
711 return false;
712 }
713 emitText(textNode, renderer, contentStart, stopPosition);
714 m_offset = stopPosition + 1;
715 m_nextRunNeedsWhitespace = true;
716 return false;
717 }
718 emitText(textNode, renderer, contentStart, contentEnd);
719 // When line ending with collapsed whitespace is present, we need to carry over one whitespace: foo(end of line)bar -> foo bar (otherwise we would end up with foobar).
720 m_nextRunNeedsWhitespace = run.isEndOfLine() && contentEnd < endPosition && renderer.style().isCollapsibleWhiteSpace(rendererText[contentEnd]);
721 m_offset = contentEnd;
722 return static_cast<unsigned>(m_offset) == endPosition;
723 }
724
725 if (renderer.firstTextBox())
726 m_textBox = renderer.firstTextBox();
727
728 bool shouldHandleFirstLetter = !m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset;
729 if (shouldHandleFirstLetter)
730 handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer));
731
732 if (!renderer.firstTextBox() && rendererText.length() && !shouldHandleFirstLetter) {
733 if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
734 return false;
735 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
736 return true;
737 }
738
739 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
740 auto& boxesRenderer = m_firstLetterText ? *m_firstLetterText : renderer;
741 if (boxesRenderer.containsReversedText()) {
742 m_sortedTextBoxes.clear();
743 for (InlineTextBox* textBox = boxesRenderer.firstTextBox(); textBox; textBox = textBox->nextTextBox())
744 m_sortedTextBoxes.append(textBox);
745 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
746 m_sortedTextBoxesPosition = 0;
747 m_textBox = m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0];
748 }
749
750 handleTextBox();
751 return true;
752}
753
754void TextIterator::handleTextBox()
755{
756 Text& textNode = downcast<Text>(*m_node);
757
758 auto& renderer = m_firstLetterText ? *m_firstLetterText : *textNode.renderer();
759 if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) {
760 m_textBox = nullptr;
761 return;
762 }
763 String rendererText = renderer.text();
764 unsigned start = m_offset;
765 unsigned end = (&textNode == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
766 while (m_textBox) {
767 unsigned textBoxStart = m_textBox->start();
768 unsigned runStart = std::max(textBoxStart, start);
769
770 // Check for collapsed space at the start of this run.
771 InlineTextBox* firstTextBox = renderer.containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0]) : renderer.firstTextBox();
772 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace || (m_textBox == firstTextBox && textBoxStart == runStart && runStart);
773 if (needSpace && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter) && m_lastCharacter) {
774 if (m_lastTextNode == &textNode && runStart && rendererText[runStart - 1] == ' ') {
775 unsigned spaceRunStart = runStart - 1;
776 while (spaceRunStart && rendererText[spaceRunStart - 1] == ' ')
777 --spaceRunStart;
778 emitText(textNode, renderer, spaceRunStart, spaceRunStart + 1);
779 } else
780 emitCharacter(' ', textNode, nullptr, runStart, runStart);
781 return;
782 }
783 unsigned textBoxEnd = textBoxStart + m_textBox->len();
784 unsigned runEnd = std::min(textBoxEnd, end);
785
786 // Determine what the next text box will be, but don't advance yet
787 InlineTextBox* nextTextBox = nullptr;
788 if (renderer.containsReversedText()) {
789 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
790 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
791 } else
792 nextTextBox = m_textBox->nextTextBox();
793 ASSERT(!nextTextBox || &nextTextBox->renderer() == &renderer);
794
795 if (runStart < runEnd) {
796 // Handle either a single newline character (which becomes a space),
797 // or a run of characters that does not include a newline.
798 // This effectively translates newlines to spaces without copying the text.
799 if (rendererText[runStart] == '\n') {
800 emitCharacter(' ', textNode, nullptr, runStart, runStart + 1);
801 m_offset = runStart + 1;
802 } else {
803 size_t subrunEnd = rendererText.find('\n', runStart);
804 if (subrunEnd == notFound || subrunEnd > runEnd) {
805 subrunEnd = runEnd;
806 bool lastSpaceCollapsedByNextNonTextBox = !nextTextBox && (m_behavior & TextIteratorBehavesAsIfNodesFollowing) && rendererText.length() > runEnd;
807 if (lastSpaceCollapsedByNextNonTextBox)
808 ++subrunEnd; // runEnd stopped before last space. Increment by one to restore the space.
809 }
810 m_offset = subrunEnd;
811 emitText(textNode, renderer, runStart, subrunEnd);
812 }
813
814 // If we are doing a subrun that doesn't go to the end of the text box,
815 // come back again to finish handling this text box; don't advance to the next one.
816 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
817 return;
818
819 // Advance and return
820 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : rendererText.length();
821 if (nextRunStart > runEnd)
822 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
823 m_textBox = nextTextBox;
824 if (renderer.containsReversedText())
825 ++m_sortedTextBoxesPosition;
826 return;
827 }
828 // Advance and continue
829 m_textBox = nextTextBox;
830 if (renderer.containsReversedText())
831 ++m_sortedTextBoxesPosition;
832 }
833 if (!m_textBox && m_remainingTextBox) {
834 m_textBox = m_remainingTextBox;
835 m_remainingTextBox = nullptr;
836 m_firstLetterText = nullptr;
837 m_offset = 0;
838 handleTextBox();
839 }
840}
841
842static inline RenderText* firstRenderTextInFirstLetter(RenderBoxModelObject* firstLetter)
843{
844 if (!firstLetter)
845 return nullptr;
846
847 // FIXME: Should this check descendent objects?
848 return childrenOfType<RenderText>(*firstLetter).first();
849}
850
851void TextIterator::handleTextNodeFirstLetter(RenderTextFragment& renderer)
852{
853 if (auto* firstLetter = renderer.firstLetter()) {
854 if (firstLetter->style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
855 return;
856 if (auto* firstLetterText = firstRenderTextInFirstLetter(firstLetter)) {
857 m_handledFirstLetter = true;
858 m_remainingTextBox = m_textBox;
859 m_textBox = firstLetterText->firstTextBox();
860 m_sortedTextBoxes.clear();
861 m_firstLetterText = firstLetterText;
862 }
863 }
864 m_handledFirstLetter = true;
865}
866
867bool TextIterator::handleReplacedElement()
868{
869 if (m_fullyClippedStack.top())
870 return false;
871
872 auto& renderer = *m_node->renderer();
873 if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
874 return false;
875
876 if (m_lastTextNodeEndedWithCollapsedSpace) {
877 emitCharacter(' ', *m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
878 return false;
879 }
880
881 if ((m_behavior & TextIteratorEntersTextControls) && is<RenderTextControl>(renderer)) {
882 if (auto innerTextElement = downcast<RenderTextControl>(renderer).textFormControlElement().innerTextElement()) {
883 m_node = innerTextElement->containingShadowRoot();
884 pushFullyClippedState(m_fullyClippedStack, *m_node);
885 m_offset = 0;
886 return false;
887 }
888 }
889
890 m_hasEmitted = true;
891
892 if ((m_behavior & TextIteratorEmitsObjectReplacementCharacters) && renderer.isReplaced()) {
893 emitCharacter(objectReplacementCharacter, *m_node->parentNode(), m_node, 0, 1);
894 // Don't process subtrees for embedded objects. If the text there is required,
895 // it must be explicitly asked by specifying a range falling inside its boundaries.
896 m_handledChildren = true;
897 return true;
898 }
899
900 if (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) {
901 // We want replaced elements to behave like punctuation for boundary
902 // finding, and to simply take up space for the selection preservation
903 // code in moveParagraphs, so we use a comma.
904 emitCharacter(',', *m_node->parentNode(), m_node, 0, 1);
905 return true;
906 }
907
908 m_positionNode = m_node->parentNode();
909 m_positionOffsetBaseNode = m_node;
910 m_positionStartOffset = 0;
911 m_positionEndOffset = 1;
912
913 if ((m_behavior & TextIteratorEmitsImageAltText) && is<RenderImage>(renderer)) {
914 String altText = downcast<RenderImage>(renderer).altText();
915 if (unsigned length = altText.length()) {
916 m_lastCharacter = altText[length - 1];
917 m_copyableText.set(WTFMove(altText));
918 m_text = m_copyableText.text();
919 return true;
920 }
921 }
922
923 m_copyableText.reset();
924 m_text = StringView();
925 m_lastCharacter = 0;
926 return true;
927}
928
929static bool shouldEmitTabBeforeNode(Node& node)
930{
931 auto* renderer = node.renderer();
932
933 // Table cells are delimited by tabs.
934 if (!renderer || !isTableCell(&node))
935 return false;
936
937 // Want a tab before every cell other than the first one.
938 RenderTableCell& cell = downcast<RenderTableCell>(*renderer);
939 RenderTable* table = cell.table();
940 return table && (table->cellBefore(&cell) || table->cellAbove(&cell));
941}
942
943static bool shouldEmitNewlineForNode(Node* node, bool emitsOriginalText)
944{
945 auto* renderer = node->renderer();
946 if (!(renderer ? renderer->isBR() : node->hasTagName(brTag)))
947 return false;
948 return emitsOriginalText || !(node->isInShadowTree() && is<HTMLInputElement>(*node->shadowHost()));
949}
950
951static bool hasHeaderTag(HTMLElement& element)
952{
953 return element.hasTagName(h1Tag)
954 || element.hasTagName(h2Tag)
955 || element.hasTagName(h3Tag)
956 || element.hasTagName(h4Tag)
957 || element.hasTagName(h5Tag)
958 || element.hasTagName(h6Tag);
959}
960
961static bool shouldEmitNewlinesBeforeAndAfterNode(Node& node)
962{
963 // Block flow (versus inline flow) is represented by having
964 // a newline both before and after the element.
965 auto* renderer = node.renderer();
966 if (!renderer) {
967 if (!is<HTMLElement>(node))
968 return false;
969 auto& element = downcast<HTMLElement>(node);
970 return hasHeaderTag(element)
971 || element.hasTagName(blockquoteTag)
972 || element.hasTagName(ddTag)
973 || element.hasTagName(divTag)
974 || element.hasTagName(dlTag)
975 || element.hasTagName(dtTag)
976 || element.hasTagName(hrTag)
977 || element.hasTagName(liTag)
978 || element.hasTagName(listingTag)
979 || element.hasTagName(olTag)
980 || element.hasTagName(pTag)
981 || element.hasTagName(preTag)
982 || element.hasTagName(trTag)
983 || element.hasTagName(ulTag);
984 }
985
986 // Need to make an exception for table cells, because they are blocks, but we
987 // want them tab-delimited rather than having newlines before and after.
988 if (isTableCell(&node))
989 return false;
990
991 // Need to make an exception for table row elements, because they are neither
992 // "inline" or "RenderBlock", but we want newlines for them.
993 if (is<RenderTableRow>(*renderer)) {
994 RenderTable* table = downcast<RenderTableRow>(*renderer).table();
995 if (table && !table->isInline())
996 return true;
997 }
998
999 return !renderer->isInline()
1000 && is<RenderBlock>(*renderer)
1001 && !renderer->isFloatingOrOutOfFlowPositioned()
1002 && !renderer->isBody()
1003 && !renderer->isRubyText();
1004}
1005
1006static bool shouldEmitNewlineAfterNode(Node& node, bool emitsCharactersBetweenAllVisiblePositions = false)
1007{
1008 // FIXME: It should be better but slower to create a VisiblePosition here.
1009 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
1010 return false;
1011
1012 // Don't emit a new line at the end of the document unless we're matching the behavior of VisiblePosition.
1013 if (emitsCharactersBetweenAllVisiblePositions)
1014 return true;
1015 Node* subsequentNode = &node;
1016 while ((subsequentNode = NodeTraversal::nextSkippingChildren(*subsequentNode))) {
1017 if (subsequentNode->renderer())
1018 return true;
1019 }
1020 return false;
1021}
1022
1023static bool shouldEmitNewlineBeforeNode(Node& node)
1024{
1025 return shouldEmitNewlinesBeforeAndAfterNode(node);
1026}
1027
1028static bool shouldEmitExtraNewlineForNode(Node& node)
1029{
1030 // When there is a significant collapsed bottom margin, emit an extra
1031 // newline for a more realistic result. We end up getting the right
1032 // result even without margin collapsing. For example: <div><p>text</p></div>
1033 // will work right even if both the <div> and the <p> have bottom margins.
1034
1035 auto* renderer = node.renderer();
1036 if (!is<RenderBox>(renderer))
1037 return false;
1038
1039 // NOTE: We only do this for a select set of nodes, and WinIE appears not to do this at all.
1040 if (!is<HTMLElement>(node))
1041 return false;
1042
1043 HTMLElement& element = downcast<HTMLElement>(node);
1044 if (!hasHeaderTag(element) && !is<HTMLParagraphElement>(element))
1045 return false;
1046
1047 auto& renderBox = downcast<RenderBox>(*renderer);
1048 if (!renderBox.height())
1049 return false;
1050
1051 int bottomMargin = renderBox.collapsedMarginAfter();
1052 int fontSize = renderBox.style().fontDescription().computedPixelSize();
1053 return bottomMargin * 2 >= fontSize;
1054}
1055
1056static int collapsedSpaceLength(RenderText& renderer, int textEnd)
1057{
1058 StringImpl& text = renderer.text();
1059 unsigned length = text.length();
1060 for (unsigned i = textEnd; i < length; ++i) {
1061 if (!renderer.style().isCollapsibleWhiteSpace(text[i]))
1062 return i - textEnd;
1063 }
1064 return length - textEnd;
1065}
1066
1067static int maxOffsetIncludingCollapsedSpaces(Node& node)
1068{
1069 int offset = caretMaxOffset(node);
1070 if (auto* renderer = node.renderer()) {
1071 if (is<RenderText>(*renderer))
1072 offset += collapsedSpaceLength(downcast<RenderText>(*renderer), offset);
1073 }
1074 return offset;
1075}
1076
1077// Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
1078bool TextIterator::shouldRepresentNodeOffsetZero()
1079{
1080 if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isTable())
1081 return true;
1082
1083 // Leave element positioned flush with start of a paragraph
1084 // (e.g. do not insert tab before a table cell at the start of a paragraph)
1085 if (m_lastCharacter == '\n')
1086 return false;
1087
1088 // Otherwise, show the position if we have emitted any characters
1089 if (m_hasEmitted)
1090 return true;
1091
1092 // We've not emitted anything yet. Generally, there is no need for any positioning then.
1093 // The only exception is when the element is visually not in the same line as
1094 // the start of the range (e.g. the range starts at the end of the previous paragraph).
1095 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
1096 // make quicker checks to possibly avoid that. Another check that we could make is
1097 // is whether the inline vs block flow changed since the previous visible element.
1098 // I think we're already in a special enough case that that won't be needed, tho.
1099
1100 // No character needed if this is the first node in the range.
1101 if (m_node == m_startContainer)
1102 return false;
1103
1104 // If we are outside the start container's subtree, assume we need to emit.
1105 // FIXME: m_startContainer could be an inline block
1106 if (!m_node->isDescendantOf(m_startContainer))
1107 return true;
1108
1109 // If we started as m_startContainer offset 0 and the current node is a descendant of
1110 // the start container, we already had enough context to correctly decide whether to
1111 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
1112 // so don't second guess that now.
1113 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
1114 // immaterial since we likely would have already emitted something by now.
1115 if (m_startOffset == 0)
1116 return false;
1117
1118 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
1119 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
1120 // we would create VisiblePositions on every call to this function without this check.
1121 if (!m_node->renderer() || m_node->renderer()->style().visibility() != Visibility::Visible
1122 || (is<RenderBlockFlow>(*m_node->renderer()) && !downcast<RenderBlockFlow>(*m_node->renderer()).height() && !is<HTMLBodyElement>(*m_node)))
1123 return false;
1124
1125 // The startPos.isNotNull() check is needed because the start could be before the body,
1126 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
1127 // The currPos.isNotNull() check is needed because positions in non-HTML content
1128 // (like SVG) do not have visible positions, and we don't want to emit for them either.
1129 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
1130 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
1131 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
1132}
1133
1134bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node& node)
1135{
1136 return node.renderer() && node.renderer()->isTable() && (node.renderer()->isInline() || (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions));
1137}
1138
1139void TextIterator::representNodeOffsetZero()
1140{
1141 // Emit a character to show the positioning of m_node.
1142
1143 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
1144 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
1145 // on m_node to see if it necessitates emitting a character first and will early return
1146 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
1147 if (shouldEmitTabBeforeNode(*m_node)) {
1148 if (shouldRepresentNodeOffsetZero())
1149 emitCharacter('\t', *m_node->parentNode(), m_node, 0, 0);
1150 } else if (shouldEmitNewlineBeforeNode(*m_node)) {
1151 if (shouldRepresentNodeOffsetZero())
1152 emitCharacter('\n', *m_node->parentNode(), m_node, 0, 0);
1153 } else if (shouldEmitSpaceBeforeAndAfterNode(*m_node)) {
1154 if (shouldRepresentNodeOffsetZero())
1155 emitCharacter(' ', *m_node->parentNode(), m_node, 0, 0);
1156 }
1157}
1158
1159bool TextIterator::handleNonTextNode()
1160{
1161 if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText))
1162 emitCharacter('\n', *m_node->parentNode(), m_node, 0, 1);
1163 else if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isHR())
1164 emitCharacter(' ', *m_node->parentNode(), m_node, 0, 1);
1165 else
1166 representNodeOffsetZero();
1167
1168 return true;
1169}
1170
1171void TextIterator::exitNode(Node* exitedNode)
1172{
1173 // prevent emitting a newline when exiting a collapsed block at beginning of the range
1174 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
1175 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
1176 // therefore look like a blank line.
1177 if (!m_hasEmitted)
1178 return;
1179
1180 // Emit with a position *inside* m_node, after m_node's contents, in
1181 // case it is a block, because the run should start where the
1182 // emitted character is positioned visually.
1183 Node* baseNode = exitedNode;
1184 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
1185 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
1186 // TextIterator in _web_attributedStringFromRange.
1187 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
1188 if (m_lastTextNode && shouldEmitNewlineAfterNode(*m_node, m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)) {
1189 // use extra newline to represent margin bottom, as needed
1190 bool addNewline = shouldEmitExtraNewlineForNode(*m_node);
1191
1192 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
1193 // contain a VisiblePosition when doing selection preservation.
1194 if (m_lastCharacter != '\n') {
1195 // insert a newline with a position following this block's contents.
1196 emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1);
1197 // remember whether to later add a newline for the current node
1198 ASSERT(!m_nodeForAdditionalNewline);
1199 if (addNewline)
1200 m_nodeForAdditionalNewline = baseNode;
1201 } else if (addNewline)
1202 // insert a newline with a position following this block's contents.
1203 emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1);
1204 }
1205
1206 // If nothing was emitted, see if we need to emit a space.
1207 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(*m_node))
1208 emitCharacter(' ', *baseNode->parentNode(), baseNode, 1, 1);
1209}
1210
1211void TextIterator::emitCharacter(UChar character, Node& characterNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
1212{
1213 m_hasEmitted = true;
1214
1215 // remember information with which to construct the TextIterator::range()
1216 m_positionNode = &characterNode;
1217 m_positionOffsetBaseNode = offsetBaseNode;
1218 m_positionStartOffset = textStartOffset;
1219 m_positionEndOffset = textEndOffset;
1220
1221 m_copyableText.set(character);
1222 m_text = m_copyableText.text();
1223 m_lastCharacter = character;
1224 m_lastTextNodeEndedWithCollapsedSpace = false;
1225 m_nextRunNeedsWhitespace = false;
1226}
1227
1228void TextIterator::emitText(Text& textNode, RenderText& renderer, int textStartOffset, int textEndOffset)
1229{
1230 ASSERT(textStartOffset >= 0);
1231 ASSERT(textEndOffset >= 0);
1232 ASSERT(textStartOffset <= textEndOffset);
1233
1234 // FIXME: This probably yields the wrong offsets when text-transform: lowercase turns a single character into two characters.
1235 String string = (m_behavior & TextIteratorEmitsOriginalText) ? renderer.originalText()
1236 : ((m_behavior & TextIteratorEmitsTextsWithoutTranscoding) ? renderer.textWithoutConvertingBackslashToYenSymbol() : renderer.text());
1237
1238 ASSERT(string.length() >= static_cast<unsigned>(textEndOffset));
1239
1240 m_positionNode = &textNode;
1241 m_positionOffsetBaseNode = nullptr;
1242 m_positionStartOffset = textStartOffset;
1243 m_positionEndOffset = textEndOffset;
1244
1245 m_lastCharacter = string[textEndOffset - 1];
1246 m_copyableText.set(WTFMove(string), textStartOffset, textEndOffset - textStartOffset);
1247 m_text = m_copyableText.text();
1248
1249 m_lastTextNodeEndedWithCollapsedSpace = false;
1250 m_nextRunNeedsWhitespace = false;
1251 m_hasEmitted = true;
1252}
1253
1254Ref<Range> TextIterator::range() const
1255{
1256 ASSERT(!atEnd());
1257
1258 // use the current run information, if we have it
1259 if (m_positionOffsetBaseNode) {
1260 unsigned index = m_positionOffsetBaseNode->computeNodeIndex();
1261 m_positionStartOffset += index;
1262 m_positionEndOffset += index;
1263 m_positionOffsetBaseNode = nullptr;
1264 }
1265 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1266}
1267
1268Node* TextIterator::node() const
1269{
1270 Ref<Range> textRange = range();
1271
1272 Node& node = textRange->startContainer();
1273 if (node.isCharacterDataNode())
1274 return &node;
1275
1276 return node.traverseToChildAt(textRange->startOffset());
1277}
1278
1279// --------
1280
1281SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range& range)
1282{
1283 range.ownerDocument().updateLayoutIgnorePendingStylesheets();
1284
1285 Node* startNode = &range.startContainer();
1286 Node* endNode = &range.endContainer();
1287 int startOffset = range.startOffset();
1288 int endOffset = range.endOffset();
1289
1290 if (!startNode->isCharacterDataNode()) {
1291 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->countChildNodes())) {
1292 startNode = startNode->traverseToChildAt(startOffset);
1293 startOffset = 0;
1294 }
1295 }
1296 if (!endNode->isCharacterDataNode()) {
1297 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->countChildNodes())) {
1298 endNode = endNode->traverseToChildAt(endOffset - 1);
1299 endOffset = lastOffsetInNode(endNode);
1300 }
1301 }
1302
1303 m_node = endNode;
1304 setUpFullyClippedStack(m_fullyClippedStack, *m_node);
1305 m_offset = endOffset;
1306 m_handledNode = false;
1307 m_handledChildren = endOffset == 0;
1308
1309 m_startContainer = startNode;
1310 m_startOffset = startOffset;
1311 m_endContainer = endNode;
1312 m_endOffset = endOffset;
1313
1314 m_positionNode = endNode;
1315
1316 m_lastTextNode = nullptr;
1317 m_lastCharacter = '\n';
1318
1319 m_havePassedStartContainer = false;
1320
1321 advance();
1322}
1323
1324void SimplifiedBackwardsTextIterator::advance()
1325{
1326 ASSERT(!atEnd());
1327
1328 m_positionNode = nullptr;
1329 m_copyableText.reset();
1330 m_text = StringView();
1331
1332 while (m_node && !m_havePassedStartContainer) {
1333 // Don't handle node if we start iterating at [node, 0].
1334 if (!m_handledNode && !(m_node == m_endContainer && !m_endOffset)) {
1335 auto* renderer = m_node->renderer();
1336 if (renderer && renderer->isText() && m_node->isTextNode()) {
1337 if (renderer->style().visibility() == Visibility::Visible && m_offset > 0)
1338 m_handledNode = handleTextNode();
1339 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1340 if (renderer->style().visibility() == Visibility::Visible && m_offset > 0)
1341 m_handledNode = handleReplacedElement();
1342 } else
1343 m_handledNode = handleNonTextNode();
1344 if (m_positionNode)
1345 return;
1346 }
1347
1348 if (!m_handledChildren && m_node->hasChildNodes()) {
1349 m_node = m_node->lastChild();
1350 pushFullyClippedState(m_fullyClippedStack, *m_node);
1351 } else {
1352 // Exit empty containers as we pass over them or containers
1353 // where [container, 0] is where we started iterating.
1354 if (!m_handledNode && canHaveChildrenForEditing(*m_node) && m_node->parentNode() && (!m_node->lastChild() || (m_node == m_endContainer && !m_endOffset))) {
1355 exitNode();
1356 if (m_positionNode) {
1357 m_handledNode = true;
1358 m_handledChildren = true;
1359 return;
1360 }
1361 }
1362
1363 // Exit all other containers.
1364 while (!m_node->previousSibling()) {
1365 if (!advanceRespectingRange(m_node->parentOrShadowHostNode()))
1366 break;
1367 m_fullyClippedStack.pop();
1368 exitNode();
1369 if (m_positionNode) {
1370 m_handledNode = true;
1371 m_handledChildren = true;
1372 return;
1373 }
1374 }
1375
1376 m_fullyClippedStack.pop();
1377 if (advanceRespectingRange(m_node->previousSibling()))
1378 pushFullyClippedState(m_fullyClippedStack, *m_node);
1379 else
1380 m_node = nullptr;
1381 }
1382
1383 // For the purpose of word boundary detection,
1384 // we should iterate all visible text and trailing (collapsed) whitespaces.
1385 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(*m_node) : 0;
1386 m_handledNode = false;
1387 m_handledChildren = false;
1388
1389 if (m_positionNode)
1390 return;
1391 }
1392}
1393
1394bool SimplifiedBackwardsTextIterator::handleTextNode()
1395{
1396 Text& textNode = downcast<Text>(*m_node);
1397
1398 m_lastTextNode = &textNode;
1399
1400 int startOffset;
1401 int offsetInNode;
1402 RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1403 if (!renderer)
1404 return true;
1405
1406 String text = renderer->text();
1407 if (!renderer->hasRenderedText() && text.length())
1408 return true;
1409
1410 if (startOffset + offsetInNode == m_offset) {
1411 ASSERT(!m_shouldHandleFirstLetter);
1412 return true;
1413 }
1414
1415 m_positionEndOffset = m_offset;
1416 m_offset = startOffset + offsetInNode;
1417 m_positionNode = m_node;
1418 m_positionStartOffset = m_offset;
1419
1420 ASSERT(m_positionStartOffset < m_positionEndOffset);
1421 ASSERT(m_positionStartOffset - offsetInNode >= 0);
1422 ASSERT(m_positionEndOffset - offsetInNode > 0);
1423 ASSERT(m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1424
1425 m_lastCharacter = text[m_positionEndOffset - offsetInNode - 1];
1426 m_copyableText.set(WTFMove(text), m_positionStartOffset - offsetInNode, m_positionEndOffset - m_positionStartOffset);
1427 m_text = m_copyableText.text();
1428
1429 return !m_shouldHandleFirstLetter;
1430}
1431
1432RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1433{
1434 RenderText& renderer = downcast<RenderText>(*m_node->renderer());
1435 startOffset = (m_node == m_startContainer) ? m_startOffset : 0;
1436
1437 if (!is<RenderTextFragment>(renderer)) {
1438 offsetInNode = 0;
1439 return &renderer;
1440 }
1441
1442 RenderTextFragment& fragment = downcast<RenderTextFragment>(renderer);
1443 int offsetAfterFirstLetter = fragment.start();
1444 if (startOffset >= offsetAfterFirstLetter) {
1445 ASSERT(!m_shouldHandleFirstLetter);
1446 offsetInNode = offsetAfterFirstLetter;
1447 return &renderer;
1448 }
1449
1450 if (!m_shouldHandleFirstLetter && startOffset + offsetAfterFirstLetter < m_offset) {
1451 m_shouldHandleFirstLetter = true;
1452 offsetInNode = offsetAfterFirstLetter;
1453 return &renderer;
1454 }
1455
1456 m_shouldHandleFirstLetter = false;
1457 offsetInNode = 0;
1458 RenderText* firstLetterRenderer = firstRenderTextInFirstLetter(fragment.firstLetter());
1459
1460 m_offset = firstLetterRenderer->caretMaxOffset();
1461 m_offset += collapsedSpaceLength(*firstLetterRenderer, m_offset);
1462
1463 return firstLetterRenderer;
1464}
1465
1466bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1467{
1468 unsigned index = m_node->computeNodeIndex();
1469 // We want replaced elements to behave like punctuation for boundary
1470 // finding, and to simply take up space for the selection preservation
1471 // code in moveParagraphs, so we use a comma. Unconditionally emit
1472 // here because this iterator is only used for boundary finding.
1473 emitCharacter(',', *m_node->parentNode(), index, index + 1);
1474 return true;
1475}
1476
1477bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1478{
1479 // We can use a linefeed in place of a tab because this simple iterator is only used to
1480 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1481 if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineAfterNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) {
1482 if (m_lastCharacter != '\n') {
1483 // Corresponds to the same check in TextIterator::exitNode.
1484 unsigned index = m_node->computeNodeIndex();
1485 // The start of this emitted range is wrong. Ensuring correctness would require
1486 // VisiblePositions and so would be slow. previousBoundary expects this.
1487 emitCharacter('\n', *m_node->parentNode(), index + 1, index + 1);
1488 }
1489 }
1490 return true;
1491}
1492
1493void SimplifiedBackwardsTextIterator::exitNode()
1494{
1495 if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineBeforeNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) {
1496 // The start of this emitted range is wrong. Ensuring correctness would require
1497 // VisiblePositions and so would be slow. previousBoundary expects this.
1498 emitCharacter('\n', *m_node, 0, 0);
1499 }
1500}
1501
1502void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node& node, int startOffset, int endOffset)
1503{
1504 m_positionNode = &node;
1505 m_positionStartOffset = startOffset;
1506 m_positionEndOffset = endOffset;
1507 m_copyableText.set(c);
1508 m_text = m_copyableText.text();
1509 m_lastCharacter = c;
1510}
1511
1512bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1513{
1514 if (!next)
1515 return false;
1516 m_havePassedStartContainer |= m_node == m_startContainer;
1517 if (m_havePassedStartContainer)
1518 return false;
1519 m_node = next;
1520 return true;
1521}
1522
1523Ref<Range> SimplifiedBackwardsTextIterator::range() const
1524{
1525 ASSERT(!atEnd());
1526
1527 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1528}
1529
1530// --------
1531
1532CharacterIterator::CharacterIterator(const Range& range, TextIteratorBehavior behavior)
1533 : m_underlyingIterator(&range, behavior)
1534{
1535 while (!atEnd() && !m_underlyingIterator.text().length())
1536 m_underlyingIterator.advance();
1537}
1538
1539CharacterIterator::CharacterIterator(Position start, Position end, TextIteratorBehavior behavior)
1540 : m_underlyingIterator(start, end, behavior)
1541{
1542 while (!atEnd() && !m_underlyingIterator.text().length())
1543 m_underlyingIterator.advance();
1544}
1545
1546Ref<Range> CharacterIterator::range() const
1547{
1548 Ref<Range> range = m_underlyingIterator.range();
1549 if (!m_underlyingIterator.atEnd()) {
1550 if (m_underlyingIterator.text().length() <= 1) {
1551 ASSERT(m_runOffset == 0);
1552 } else {
1553 Node& node = range->startContainer();
1554 ASSERT(&node == &range->endContainer());
1555 int offset = range->startOffset() + m_runOffset;
1556 range->setStart(node, offset);
1557 range->setEnd(node, offset + 1);
1558 }
1559 }
1560 return range;
1561}
1562
1563void CharacterIterator::advance(int count)
1564{
1565 if (count <= 0) {
1566 ASSERT(count == 0);
1567 return;
1568 }
1569
1570 m_atBreak = false;
1571
1572 // easy if there is enough left in the current m_underlyingIterator run
1573 int remaining = m_underlyingIterator.text().length() - m_runOffset;
1574 if (count < remaining) {
1575 m_runOffset += count;
1576 m_offset += count;
1577 return;
1578 }
1579
1580 // exhaust the current m_underlyingIterator run
1581 count -= remaining;
1582 m_offset += remaining;
1583
1584 // move to a subsequent m_underlyingIterator run
1585 for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) {
1586 int runLength = m_underlyingIterator.text().length();
1587 if (!runLength)
1588 m_atBreak = true;
1589 else {
1590 // see whether this is m_underlyingIterator to use
1591 if (count < runLength) {
1592 m_runOffset = count;
1593 m_offset += count;
1594 return;
1595 }
1596
1597 // exhaust this m_underlyingIterator run
1598 count -= runLength;
1599 m_offset += runLength;
1600 }
1601 }
1602
1603 // ran to the end of the m_underlyingIterator... no more runs left
1604 m_atBreak = true;
1605 m_runOffset = 0;
1606}
1607
1608static Ref<Range> characterSubrange(Document& document, CharacterIterator& it, int offset, int length)
1609{
1610 it.advance(offset);
1611 if (it.atEnd())
1612 return Range::create(document);
1613
1614 Ref<Range> start = it.range();
1615
1616 if (length > 1)
1617 it.advance(length - 1);
1618 if (it.atEnd())
1619 return Range::create(document);
1620
1621 Ref<Range> end = it.range();
1622
1623 return Range::create(document, &start->startContainer(), start->startOffset(), &end->endContainer(), end->endOffset());
1624}
1625
1626BackwardsCharacterIterator::BackwardsCharacterIterator(const Range& range)
1627 : m_underlyingIterator(range)
1628 , m_offset(0)
1629 , m_runOffset(0)
1630 , m_atBreak(true)
1631{
1632 while (!atEnd() && !m_underlyingIterator.text().length())
1633 m_underlyingIterator.advance();
1634}
1635
1636Ref<Range> BackwardsCharacterIterator::range() const
1637{
1638 Ref<Range> r = m_underlyingIterator.range();
1639 if (!m_underlyingIterator.atEnd()) {
1640 if (m_underlyingIterator.text().length() <= 1)
1641 ASSERT(m_runOffset == 0);
1642 else {
1643 Node& node = r->startContainer();
1644 ASSERT(&node == &r->endContainer());
1645 int offset = r->endOffset() - m_runOffset;
1646 r->setStart(node, offset - 1);
1647 r->setEnd(node, offset);
1648 }
1649 }
1650 return r;
1651}
1652
1653void BackwardsCharacterIterator::advance(int count)
1654{
1655 if (count <= 0) {
1656 ASSERT(!count);
1657 return;
1658 }
1659
1660 m_atBreak = false;
1661
1662 int remaining = m_underlyingIterator.text().length() - m_runOffset;
1663 if (count < remaining) {
1664 m_runOffset += count;
1665 m_offset += count;
1666 return;
1667 }
1668
1669 count -= remaining;
1670 m_offset += remaining;
1671
1672 for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) {
1673 int runLength = m_underlyingIterator.text().length();
1674 if (runLength == 0)
1675 m_atBreak = true;
1676 else {
1677 if (count < runLength) {
1678 m_runOffset = count;
1679 m_offset += count;
1680 return;
1681 }
1682
1683 count -= runLength;
1684 m_offset += runLength;
1685 }
1686 }
1687
1688 m_atBreak = true;
1689 m_runOffset = 0;
1690}
1691
1692// --------
1693
1694WordAwareIterator::WordAwareIterator(const Range& range)
1695 : m_underlyingIterator(&range)
1696 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1697{
1698 advance(); // get in position over the first chunk of text
1699}
1700
1701// We're always in one of these modes:
1702// - The current chunk in the text iterator is our current chunk
1703// (typically its a piece of whitespace, or text that ended with whitespace)
1704// - The previous chunk in the text iterator is our current chunk
1705// (we looked ahead to the next chunk and found a word boundary)
1706// - We built up our own chunk of text from many chunks from the text iterator
1707
1708// FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1709
1710void WordAwareIterator::advance()
1711{
1712 m_previousText.reset();
1713 m_buffer.clear();
1714
1715 // If last time we did a look-ahead, start with that looked-ahead chunk now
1716 if (!m_didLookAhead) {
1717 ASSERT(!m_underlyingIterator.atEnd());
1718 m_underlyingIterator.advance();
1719 }
1720 m_didLookAhead = false;
1721
1722 // Go to next non-empty chunk
1723 while (!m_underlyingIterator.atEnd() && !m_underlyingIterator.text().length())
1724 m_underlyingIterator.advance();
1725 if (m_underlyingIterator.atEnd())
1726 return;
1727
1728 while (1) {
1729 // If this chunk ends in whitespace we can just use it as our chunk.
1730 if (isSpaceOrNewline(m_underlyingIterator.text()[m_underlyingIterator.text().length() - 1]))
1731 return;
1732
1733 // If this is the first chunk that failed, save it in previousText before look ahead
1734 if (m_buffer.isEmpty())
1735 m_previousText = m_underlyingIterator.copyableText();
1736
1737 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1738 m_underlyingIterator.advance();
1739 if (m_underlyingIterator.atEnd() || !m_underlyingIterator.text().length() || isSpaceOrNewline(m_underlyingIterator.text()[0])) {
1740 m_didLookAhead = true;
1741 return;
1742 }
1743
1744 if (m_buffer.isEmpty()) {
1745 // Start gobbling chunks until we get to a suitable stopping point
1746 append(m_buffer, m_previousText.text());
1747 m_previousText.reset();
1748 }
1749 append(m_buffer, m_underlyingIterator.text());
1750 }
1751}
1752
1753StringView WordAwareIterator::text() const
1754{
1755 if (!m_buffer.isEmpty())
1756 return StringView(m_buffer.data(), m_buffer.size());
1757 if (m_previousText.text().length())
1758 return m_previousText.text();
1759 return m_underlyingIterator.text();
1760}
1761
1762// --------
1763
1764static inline UChar foldQuoteMark(UChar c)
1765{
1766 switch (c) {
1767 case hebrewPunctuationGershayim:
1768 case leftDoubleQuotationMark:
1769 case leftLowDoubleQuotationMark:
1770 case rightDoubleQuotationMark:
1771 return '"';
1772 case hebrewPunctuationGeresh:
1773 case leftSingleQuotationMark:
1774 case leftLowSingleQuotationMark:
1775 case rightSingleQuotationMark:
1776 return '\'';
1777 default:
1778 return c;
1779 }
1780}
1781
1782// FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1783// of doing it in a separate replacement pass here, but ICU doesn't offer a way
1784// to add tailoring on top of the locale-specific tailoring as of this writing.
1785static inline String foldQuoteMarks(String string)
1786{
1787 string.replace(hebrewPunctuationGeresh, '\'');
1788 string.replace(hebrewPunctuationGershayim, '"');
1789 string.replace(leftDoubleQuotationMark, '"');
1790 string.replace(leftLowDoubleQuotationMark, '"');
1791 string.replace(leftSingleQuotationMark, '\'');
1792 string.replace(leftLowSingleQuotationMark, '\'');
1793 string.replace(rightDoubleQuotationMark, '"');
1794 string.replace(rightSingleQuotationMark, '\'');
1795
1796 return string;
1797}
1798
1799#if !UCONFIG_NO_COLLATION
1800
1801const size_t minimumSearchBufferSize = 8192;
1802
1803#ifndef NDEBUG
1804static bool searcherInUse;
1805#endif
1806
1807static UStringSearch* createSearcher()
1808{
1809 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1810 // but it doesn't matter exactly what it is, since we don't perform any searches
1811 // without setting both the pattern and the text.
1812 UErrorCode status = U_ZERO_ERROR;
1813 String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search");
1814 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status);
1815 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1816 return searcher;
1817}
1818
1819static UStringSearch* searcher()
1820{
1821 static UStringSearch* searcher = createSearcher();
1822 return searcher;
1823}
1824
1825static inline void lockSearcher()
1826{
1827#ifndef NDEBUG
1828 ASSERT(!searcherInUse);
1829 searcherInUse = true;
1830#endif
1831}
1832
1833static inline void unlockSearcher()
1834{
1835#ifndef NDEBUG
1836 ASSERT(searcherInUse);
1837 searcherInUse = false;
1838#endif
1839}
1840
1841// ICU's search ignores the distinction between small kana letters and ones
1842// that are not small, and also characters that differ only in the voicing
1843// marks when considering only primary collation strength differences.
1844// This is not helpful for end users, since these differences make words
1845// distinct, so for our purposes we need these to be considered.
1846// The Unicode folks do not think the collation algorithm should be
1847// changed. To work around this, we would like to tailor the ICU searcher,
1848// but we can't get that to work yet. So instead, we check for cases where
1849// these differences occur, and skip those matches.
1850
1851// We refer to the above technique as the "kana workaround". The next few
1852// functions are helper functinos for the kana workaround.
1853
1854static inline bool isKanaLetter(UChar character)
1855{
1856 // Hiragana letters.
1857 if (character >= 0x3041 && character <= 0x3096)
1858 return true;
1859
1860 // Katakana letters.
1861 if (character >= 0x30A1 && character <= 0x30FA)
1862 return true;
1863 if (character >= 0x31F0 && character <= 0x31FF)
1864 return true;
1865
1866 // Halfwidth katakana letters.
1867 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1868 return true;
1869
1870 return false;
1871}
1872
1873static inline bool isSmallKanaLetter(UChar character)
1874{
1875 ASSERT(isKanaLetter(character));
1876
1877 switch (character) {
1878 case 0x3041: // HIRAGANA LETTER SMALL A
1879 case 0x3043: // HIRAGANA LETTER SMALL I
1880 case 0x3045: // HIRAGANA LETTER SMALL U
1881 case 0x3047: // HIRAGANA LETTER SMALL E
1882 case 0x3049: // HIRAGANA LETTER SMALL O
1883 case 0x3063: // HIRAGANA LETTER SMALL TU
1884 case 0x3083: // HIRAGANA LETTER SMALL YA
1885 case 0x3085: // HIRAGANA LETTER SMALL YU
1886 case 0x3087: // HIRAGANA LETTER SMALL YO
1887 case 0x308E: // HIRAGANA LETTER SMALL WA
1888 case 0x3095: // HIRAGANA LETTER SMALL KA
1889 case 0x3096: // HIRAGANA LETTER SMALL KE
1890 case 0x30A1: // KATAKANA LETTER SMALL A
1891 case 0x30A3: // KATAKANA LETTER SMALL I
1892 case 0x30A5: // KATAKANA LETTER SMALL U
1893 case 0x30A7: // KATAKANA LETTER SMALL E
1894 case 0x30A9: // KATAKANA LETTER SMALL O
1895 case 0x30C3: // KATAKANA LETTER SMALL TU
1896 case 0x30E3: // KATAKANA LETTER SMALL YA
1897 case 0x30E5: // KATAKANA LETTER SMALL YU
1898 case 0x30E7: // KATAKANA LETTER SMALL YO
1899 case 0x30EE: // KATAKANA LETTER SMALL WA
1900 case 0x30F5: // KATAKANA LETTER SMALL KA
1901 case 0x30F6: // KATAKANA LETTER SMALL KE
1902 case 0x31F0: // KATAKANA LETTER SMALL KU
1903 case 0x31F1: // KATAKANA LETTER SMALL SI
1904 case 0x31F2: // KATAKANA LETTER SMALL SU
1905 case 0x31F3: // KATAKANA LETTER SMALL TO
1906 case 0x31F4: // KATAKANA LETTER SMALL NU
1907 case 0x31F5: // KATAKANA LETTER SMALL HA
1908 case 0x31F6: // KATAKANA LETTER SMALL HI
1909 case 0x31F7: // KATAKANA LETTER SMALL HU
1910 case 0x31F8: // KATAKANA LETTER SMALL HE
1911 case 0x31F9: // KATAKANA LETTER SMALL HO
1912 case 0x31FA: // KATAKANA LETTER SMALL MU
1913 case 0x31FB: // KATAKANA LETTER SMALL RA
1914 case 0x31FC: // KATAKANA LETTER SMALL RI
1915 case 0x31FD: // KATAKANA LETTER SMALL RU
1916 case 0x31FE: // KATAKANA LETTER SMALL RE
1917 case 0x31FF: // KATAKANA LETTER SMALL RO
1918 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1919 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1920 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1921 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1922 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1923 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1924 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1925 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1926 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1927 return true;
1928 }
1929 return false;
1930}
1931
1932enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1933
1934static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1935{
1936 ASSERT(isKanaLetter(character));
1937
1938 switch (character) {
1939 case 0x304C: // HIRAGANA LETTER GA
1940 case 0x304E: // HIRAGANA LETTER GI
1941 case 0x3050: // HIRAGANA LETTER GU
1942 case 0x3052: // HIRAGANA LETTER GE
1943 case 0x3054: // HIRAGANA LETTER GO
1944 case 0x3056: // HIRAGANA LETTER ZA
1945 case 0x3058: // HIRAGANA LETTER ZI
1946 case 0x305A: // HIRAGANA LETTER ZU
1947 case 0x305C: // HIRAGANA LETTER ZE
1948 case 0x305E: // HIRAGANA LETTER ZO
1949 case 0x3060: // HIRAGANA LETTER DA
1950 case 0x3062: // HIRAGANA LETTER DI
1951 case 0x3065: // HIRAGANA LETTER DU
1952 case 0x3067: // HIRAGANA LETTER DE
1953 case 0x3069: // HIRAGANA LETTER DO
1954 case 0x3070: // HIRAGANA LETTER BA
1955 case 0x3073: // HIRAGANA LETTER BI
1956 case 0x3076: // HIRAGANA LETTER BU
1957 case 0x3079: // HIRAGANA LETTER BE
1958 case 0x307C: // HIRAGANA LETTER BO
1959 case 0x3094: // HIRAGANA LETTER VU
1960 case 0x30AC: // KATAKANA LETTER GA
1961 case 0x30AE: // KATAKANA LETTER GI
1962 case 0x30B0: // KATAKANA LETTER GU
1963 case 0x30B2: // KATAKANA LETTER GE
1964 case 0x30B4: // KATAKANA LETTER GO
1965 case 0x30B6: // KATAKANA LETTER ZA
1966 case 0x30B8: // KATAKANA LETTER ZI
1967 case 0x30BA: // KATAKANA LETTER ZU
1968 case 0x30BC: // KATAKANA LETTER ZE
1969 case 0x30BE: // KATAKANA LETTER ZO
1970 case 0x30C0: // KATAKANA LETTER DA
1971 case 0x30C2: // KATAKANA LETTER DI
1972 case 0x30C5: // KATAKANA LETTER DU
1973 case 0x30C7: // KATAKANA LETTER DE
1974 case 0x30C9: // KATAKANA LETTER DO
1975 case 0x30D0: // KATAKANA LETTER BA
1976 case 0x30D3: // KATAKANA LETTER BI
1977 case 0x30D6: // KATAKANA LETTER BU
1978 case 0x30D9: // KATAKANA LETTER BE
1979 case 0x30DC: // KATAKANA LETTER BO
1980 case 0x30F4: // KATAKANA LETTER VU
1981 case 0x30F7: // KATAKANA LETTER VA
1982 case 0x30F8: // KATAKANA LETTER VI
1983 case 0x30F9: // KATAKANA LETTER VE
1984 case 0x30FA: // KATAKANA LETTER VO
1985 return VoicedSoundMark;
1986 case 0x3071: // HIRAGANA LETTER PA
1987 case 0x3074: // HIRAGANA LETTER PI
1988 case 0x3077: // HIRAGANA LETTER PU
1989 case 0x307A: // HIRAGANA LETTER PE
1990 case 0x307D: // HIRAGANA LETTER PO
1991 case 0x30D1: // KATAKANA LETTER PA
1992 case 0x30D4: // KATAKANA LETTER PI
1993 case 0x30D7: // KATAKANA LETTER PU
1994 case 0x30DA: // KATAKANA LETTER PE
1995 case 0x30DD: // KATAKANA LETTER PO
1996 return SemiVoicedSoundMark;
1997 }
1998 return NoVoicedSoundMark;
1999}
2000
2001static inline bool isCombiningVoicedSoundMark(UChar character)
2002{
2003 switch (character) {
2004 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
2005 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
2006 return true;
2007 }
2008 return false;
2009}
2010
2011static inline bool containsKanaLetters(const String& pattern)
2012{
2013 if (pattern.is8Bit())
2014 return false;
2015 const UChar* characters = pattern.characters16();
2016 unsigned length = pattern.length();
2017 for (unsigned i = 0; i < length; ++i) {
2018 if (isKanaLetter(characters[i]))
2019 return true;
2020 }
2021 return false;
2022}
2023
2024static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
2025{
2026 UErrorCode status = U_ZERO_ERROR;
2027 const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
2028 ASSERT(U_SUCCESS(status));
2029
2030 buffer.resize(length);
2031
2032 auto normalizedLength = unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status);
2033 ASSERT(U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR);
2034
2035 buffer.resize(normalizedLength);
2036
2037 if (U_SUCCESS(status))
2038 return;
2039
2040 status = U_ZERO_ERROR;
2041 unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status);
2042 ASSERT(U_SUCCESS(status));
2043}
2044
2045static bool isNonLatin1Separator(UChar32 character)
2046{
2047 ASSERT_ARG(character, character >= 256);
2048
2049 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
2050}
2051
2052static inline bool isSeparator(UChar32 character)
2053{
2054 static const bool latin1SeparatorTable[256] = {
2055 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2056 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2057 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
2058 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
2059 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
2060 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
2061 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
2062 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
2063 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2064 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2065 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
2066 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
2067 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2068 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
2069 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2070 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
2071 };
2072
2073 if (character < 256)
2074 return latin1SeparatorTable[character];
2075
2076 return isNonLatin1Separator(character);
2077}
2078
2079inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2080 : m_target(foldQuoteMarks(target))
2081 , m_targetCharacters(StringView(m_target).upconvertedCharacters())
2082 , m_options(options)
2083 , m_prefixLength(0)
2084 , m_atBreak(true)
2085 , m_needsMoreContext(options.contains(AtWordStarts))
2086 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
2087{
2088 ASSERT(!m_target.isEmpty());
2089
2090 size_t targetLength = m_target.length();
2091 m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize));
2092 m_overlap = m_buffer.capacity() / 4;
2093
2094 if (m_options.contains(AtWordStarts) && targetLength) {
2095 UChar32 targetFirstCharacter;
2096 U16_GET(m_target, 0, 0u, targetLength, targetFirstCharacter);
2097 // Characters in the separator category never really occur at the beginning of a word,
2098 // so if the target begins with such a character, we just ignore the AtWordStart option.
2099 if (isSeparator(targetFirstCharacter)) {
2100 m_options.remove(AtWordStarts);
2101 m_needsMoreContext = false;
2102 }
2103 }
2104
2105 // Grab the single global searcher.
2106 // If we ever have a reason to do more than once search buffer at once, we'll have
2107 // to move to multiple searchers.
2108 lockSearcher();
2109
2110 UStringSearch* searcher = WebCore::searcher();
2111 UCollator* collator = usearch_getCollator(searcher);
2112
2113 UCollationStrength strength;
2114 USearchAttributeValue comparator;
2115 if (m_options.contains(CaseInsensitive)) {
2116 // Without loss of generality, have 'e' match {'e', 'E', 'é', 'É'} and 'é' match {'é', 'É'}.
2117 strength = UCOL_SECONDARY;
2118 comparator = USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD;
2119 } else {
2120 // Without loss of generality, have 'e' match {'e'} and 'é' match {'é'}.
2121 strength = UCOL_TERTIARY;
2122 comparator = USEARCH_STANDARD_ELEMENT_COMPARISON;
2123 }
2124 if (ucol_getStrength(collator) != strength) {
2125 ucol_setStrength(collator, strength);
2126 usearch_reset(searcher);
2127 }
2128
2129 UErrorCode status = U_ZERO_ERROR;
2130 usearch_setAttribute(searcher, USEARCH_ELEMENT_COMPARISON, comparator, &status);
2131 ASSERT(status == U_ZERO_ERROR);
2132
2133 usearch_setPattern(searcher, m_targetCharacters, targetLength, &status);
2134 ASSERT(status == U_ZERO_ERROR);
2135
2136 // The kana workaround requires a normalized copy of the target string.
2137 if (m_targetRequiresKanaWorkaround)
2138 normalizeCharacters(m_targetCharacters, targetLength, m_normalizedTarget);
2139}
2140
2141inline SearchBuffer::~SearchBuffer()
2142{
2143 // Leave the static object pointing to a valid string.
2144 UErrorCode status = U_ZERO_ERROR;
2145 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
2146 ASSERT(status == U_ZERO_ERROR);
2147 usearch_setText(WebCore::searcher(), &newlineCharacter, 1, &status);
2148 ASSERT(status == U_ZERO_ERROR);
2149
2150 unlockSearcher();
2151}
2152
2153inline size_t SearchBuffer::append(StringView text)
2154{
2155 ASSERT(text.length());
2156
2157 if (m_atBreak) {
2158 m_buffer.shrink(0);
2159 m_prefixLength = 0;
2160 m_atBreak = false;
2161 } else if (m_buffer.size() == m_buffer.capacity()) {
2162 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2163 m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap);
2164 m_buffer.shrink(m_overlap);
2165 }
2166
2167 size_t oldLength = m_buffer.size();
2168 size_t usableLength = std::min<size_t>(m_buffer.capacity() - oldLength, text.length());
2169 ASSERT(usableLength);
2170 m_buffer.grow(oldLength + usableLength);
2171 for (unsigned i = 0; i < usableLength; ++i)
2172 m_buffer[oldLength + i] = foldQuoteMark(text[i]);
2173 return usableLength;
2174}
2175
2176inline bool SearchBuffer::needsMoreContext() const
2177{
2178 return m_needsMoreContext;
2179}
2180
2181inline void SearchBuffer::prependContext(StringView text)
2182{
2183 ASSERT(m_needsMoreContext);
2184 ASSERT(m_prefixLength == m_buffer.size());
2185
2186 if (!text.length())
2187 return;
2188
2189 m_atBreak = false;
2190
2191 size_t wordBoundaryContextStart = text.length();
2192 if (wordBoundaryContextStart) {
2193 U16_BACK_1(text, 0, wordBoundaryContextStart);
2194 wordBoundaryContextStart = startOfLastWordBoundaryContext(text.substring(0, wordBoundaryContextStart));
2195 }
2196
2197 size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength, text.length() - wordBoundaryContextStart);
2198 WTF::append(m_buffer, text.substring(text.length() - usableLength, usableLength));
2199 m_prefixLength += usableLength;
2200
2201 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2202 m_needsMoreContext = false;
2203}
2204
2205inline bool SearchBuffer::atBreak() const
2206{
2207 return m_atBreak;
2208}
2209
2210inline void SearchBuffer::reachedBreak()
2211{
2212 m_atBreak = true;
2213}
2214
2215inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2216{
2217 // This function implements the kana workaround. If usearch treats
2218 // it as a match, but we do not want to, then it's a "bad match".
2219 if (!m_targetRequiresKanaWorkaround)
2220 return false;
2221
2222 // Normalize into a match buffer. We reuse a single buffer rather than
2223 // creating a new one each time.
2224 normalizeCharacters(match, matchLength, m_normalizedMatch);
2225
2226 const UChar* a = m_normalizedTarget.begin();
2227 const UChar* aEnd = m_normalizedTarget.end();
2228
2229 const UChar* b = m_normalizedMatch.begin();
2230 const UChar* bEnd = m_normalizedMatch.end();
2231
2232 while (true) {
2233 // Skip runs of non-kana-letter characters. This is necessary so we can
2234 // correctly handle strings where the target and match have different-length
2235 // runs of characters that match, while still double checking the correctness
2236 // of matches of kana letters with other kana letters.
2237 while (a != aEnd && !isKanaLetter(*a))
2238 ++a;
2239 while (b != bEnd && !isKanaLetter(*b))
2240 ++b;
2241
2242 // If we reached the end of either the target or the match, we should have
2243 // reached the end of both; both should have the same number of kana letters.
2244 if (a == aEnd || b == bEnd) {
2245 ASSERT(a == aEnd);
2246 ASSERT(b == bEnd);
2247 return false;
2248 }
2249
2250 // Check for differences in the kana letter character itself.
2251 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2252 return true;
2253 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2254 return true;
2255 ++a;
2256 ++b;
2257
2258 // Check for differences in combining voiced sound marks found after the letter.
2259 while (1) {
2260 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2261 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2262 return true;
2263 break;
2264 }
2265 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2266 return true;
2267 if (*a != *b)
2268 return true;
2269 ++a;
2270 ++b;
2271 }
2272 }
2273}
2274
2275inline bool SearchBuffer::isWordEndMatch(size_t start, size_t length) const
2276{
2277 ASSERT(length);
2278 ASSERT(m_options.contains(AtWordEnds));
2279
2280 int endWord;
2281 // Start searching at the end of matched search, so that multiple word matches succeed.
2282 findEndWordBoundary(StringView(m_buffer.data(), m_buffer.size()), start + length - 1, &endWord);
2283 return static_cast<size_t>(endWord) == (start + length);
2284}
2285
2286inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2287{
2288 ASSERT(m_options.contains(AtWordStarts));
2289
2290 if (!start)
2291 return true;
2292
2293 int size = m_buffer.size();
2294 int offset = start;
2295 UChar32 firstCharacter;
2296 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2297
2298 if (m_options.contains(TreatMedialCapitalAsWordStart)) {
2299 UChar32 previousCharacter;
2300 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2301
2302 if (isSeparator(firstCharacter)) {
2303 // The start of a separator run is a word start (".org" in "webkit.org").
2304 if (!isSeparator(previousCharacter))
2305 return true;
2306 } else if (isASCIIUpper(firstCharacter)) {
2307 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2308 if (!isASCIIUpper(previousCharacter))
2309 return true;
2310 // The last character of an uppercase run followed by a non-separator, non-digit
2311 // is a word start ("Request" in "XMLHTTPRequest").
2312 offset = start;
2313 U16_FWD_1(m_buffer.data(), offset, size);
2314 UChar32 nextCharacter = 0;
2315 if (offset < size)
2316 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2317 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2318 return true;
2319 } else if (isASCIIDigit(firstCharacter)) {
2320 // The start of a digit run is a word start ("2" in "WebKit2").
2321 if (!isASCIIDigit(previousCharacter))
2322 return true;
2323 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2324 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2325 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2326 return true;
2327 }
2328 }
2329
2330 // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
2331 // a word, so treat the position before any CJK character as a word start.
2332 if (FontCascade::isCJKIdeographOrSymbol(firstCharacter))
2333 return true;
2334
2335 size_t wordBreakSearchStart = start + length;
2336 while (wordBreakSearchStart > start)
2337 wordBreakSearchStart = findNextWordFromIndex(StringView(m_buffer.data(), m_buffer.size()), wordBreakSearchStart, false /* backwards */);
2338 return wordBreakSearchStart == start;
2339}
2340
2341inline size_t SearchBuffer::search(size_t& start)
2342{
2343 size_t size = m_buffer.size();
2344 if (m_atBreak) {
2345 if (!size)
2346 return 0;
2347 } else {
2348 if (size != m_buffer.capacity())
2349 return 0;
2350 }
2351
2352 UStringSearch* searcher = WebCore::searcher();
2353
2354 UErrorCode status = U_ZERO_ERROR;
2355 usearch_setText(searcher, m_buffer.data(), size, &status);
2356 ASSERT(status == U_ZERO_ERROR);
2357
2358 usearch_setOffset(searcher, m_prefixLength, &status);
2359 ASSERT(status == U_ZERO_ERROR);
2360
2361 int matchStart = usearch_next(searcher, &status);
2362 ASSERT(status == U_ZERO_ERROR);
2363
2364nextMatch:
2365 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2366 ASSERT(matchStart == USEARCH_DONE);
2367 return 0;
2368 }
2369
2370 // Matches that start in the overlap area are only tentative.
2371 // The same match may appear later, matching more characters,
2372 // possibly including a combining character that's not yet in the buffer.
2373 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2374 size_t overlap = m_overlap;
2375 if (m_options.contains(AtWordStarts)) {
2376 // Ensure that there is sufficient context before matchStart the next time around for
2377 // determining if it is at a word boundary.
2378 unsigned wordBoundaryContextStart = matchStart;
2379 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2380 wordBoundaryContextStart = startOfLastWordBoundaryContext(StringView(m_buffer.data(), wordBoundaryContextStart));
2381 overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart));
2382 }
2383 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2384 m_prefixLength -= std::min(m_prefixLength, size - overlap);
2385 m_buffer.shrink(overlap);
2386 return 0;
2387 }
2388
2389 size_t matchedLength = usearch_getMatchedLength(searcher);
2390 ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size);
2391
2392 // If this match is "bad", move on to the next match.
2393 if (isBadMatch(m_buffer.data() + matchStart, matchedLength)
2394 || (m_options.contains(AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))
2395 || (m_options.contains(AtWordEnds) && !isWordEndMatch(matchStart, matchedLength))) {
2396 matchStart = usearch_next(searcher, &status);
2397 ASSERT(status == U_ZERO_ERROR);
2398 goto nextMatch;
2399 }
2400
2401 size_t newSize = size - (matchStart + 1);
2402 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2403 m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1);
2404 m_buffer.shrink(newSize);
2405
2406 start = size - matchStart;
2407 return matchedLength;
2408}
2409
2410#else
2411
2412inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2413 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2414 , m_options(options)
2415 , m_buffer(m_target.length())
2416 , m_isCharacterStartBuffer(m_target.length())
2417 , m_isBufferFull(false)
2418 , m_cursor(0)
2419{
2420 ASSERT(!m_target.isEmpty());
2421 m_target.replace(noBreakSpace, ' ');
2422 foldQuoteMarks(m_target);
2423}
2424
2425inline SearchBuffer::~SearchBuffer() = default;
2426
2427inline void SearchBuffer::reachedBreak()
2428{
2429 m_cursor = 0;
2430 m_isBufferFull = false;
2431}
2432
2433inline bool SearchBuffer::atBreak() const
2434{
2435 return !m_cursor && !m_isBufferFull;
2436}
2437
2438inline void SearchBuffer::append(UChar c, bool isStart)
2439{
2440 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMark(c);
2441 m_isCharacterStartBuffer[m_cursor] = isStart;
2442 if (++m_cursor == m_target.length()) {
2443 m_cursor = 0;
2444 m_isBufferFull = true;
2445 }
2446}
2447
2448inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2449{
2450 ASSERT(length);
2451 if (!(m_options & CaseInsensitive)) {
2452 append(characters[0], true);
2453 return 1;
2454 }
2455 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2456 UChar foldedCharacters[maxFoldedCharacters];
2457 UErrorCode status = U_ZERO_ERROR;
2458 int numFoldedCharacters = u_strFoldCase(foldedCharacters, maxFoldedCharacters, characters, 1, U_FOLD_CASE_DEFAULT, &status);
2459 ASSERT(U_SUCCESS(status));
2460 ASSERT(numFoldedCharacters);
2461 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2462 if (U_SUCCESS(status) && numFoldedCharacters) {
2463 numFoldedCharacters = std::min(numFoldedCharacters, maxFoldedCharacters);
2464 append(foldedCharacters[0], true);
2465 for (int i = 1; i < numFoldedCharacters; ++i)
2466 append(foldedCharacters[i], false);
2467 }
2468 return 1;
2469}
2470
2471inline bool SearchBuffer::needsMoreContext() const
2472{
2473 return false;
2474}
2475
2476void SearchBuffer::prependContext(const UChar*, size_t)
2477{
2478 ASSERT_NOT_REACHED();
2479}
2480
2481inline size_t SearchBuffer::search(size_t& start)
2482{
2483 if (!m_isBufferFull)
2484 return 0;
2485 if (!m_isCharacterStartBuffer[m_cursor])
2486 return 0;
2487
2488 size_t tailSpace = m_target.length() - m_cursor;
2489 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2490 return 0;
2491 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2492 return 0;
2493
2494 start = length();
2495
2496 // Now that we've found a match once, we don't want to find it again, because those
2497 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2498 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2499 // we want to get rid of that in the future we could track this with a separate boolean
2500 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2501 m_isCharacterStartBuffer[m_cursor] = false;
2502
2503 return start;
2504}
2505
2506// Returns the number of characters that were appended to the buffer (what we are searching in).
2507// That's not necessarily the same length as the passed-in target string, because case folding
2508// can make two strings match even though they're not the same length.
2509size_t SearchBuffer::length() const
2510{
2511 size_t bufferSize = m_target.length();
2512 size_t length = 0;
2513 for (size_t i = 0; i < bufferSize; ++i)
2514 length += m_isCharacterStartBuffer[i];
2515 return length;
2516}
2517
2518#endif
2519
2520// --------
2521
2522int TextIterator::rangeLength(const Range* range, bool forSelectionPreservation)
2523{
2524 unsigned length = 0;
2525 for (TextIterator it(range, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2526 length += it.text().length();
2527 return length;
2528}
2529
2530Ref<Range> TextIterator::subrange(Range& entireRange, int characterOffset, int characterCount)
2531{
2532 CharacterIterator entireRangeIterator(entireRange);
2533 return characterSubrange(entireRange.ownerDocument(), entireRangeIterator, characterOffset, characterCount);
2534}
2535
2536static inline bool isInsideReplacedElement(TextIterator& iterator)
2537{
2538 ASSERT(!iterator.atEnd());
2539 ASSERT(iterator.text().length() == 1);
2540 Node* node = iterator.node();
2541 return node && isRendererReplacedElement(node->renderer());
2542}
2543
2544RefPtr<Range> TextIterator::rangeFromLocationAndLength(ContainerNode* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2545{
2546 Ref<Range> resultRange = scope->document().createRange();
2547
2548 int docTextPosition = 0;
2549 int rangeEnd = rangeLocation + rangeLength;
2550 bool startRangeFound = false;
2551
2552 Ref<Range> textRunRange = rangeOfContents(*scope);
2553
2554 TextIterator it(textRunRange.ptr(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2555
2556 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2557 if (!rangeLocation && !rangeLength && it.atEnd()) {
2558 resultRange->setStart(textRunRange->startContainer(), 0);
2559 resultRange->setEnd(textRunRange->startContainer(), 0);
2560 return resultRange;
2561 }
2562
2563 for (; !it.atEnd(); it.advance()) {
2564 int length = it.text().length();
2565 textRunRange = it.range();
2566
2567 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + length;
2568 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + length;
2569
2570 if (foundEnd) {
2571 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2572 // position for emitted '\n's or if the renderer of the current node is a replaced element.
2573 if (length == 1 && (it.text()[0] == '\n' || isInsideReplacedElement(it))) {
2574 it.advance();
2575 if (!it.atEnd()) {
2576 Ref<Range> range = it.range();
2577 textRunRange->setEnd(range->startContainer(), range->startOffset());
2578 } else {
2579 Position runStart = textRunRange->startPosition();
2580 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2581 if (runEnd.isNotNull())
2582 textRunRange->setEnd(*runEnd.containerNode(), runEnd.computeOffsetInContainerNode());
2583 }
2584 }
2585 }
2586
2587 if (foundStart) {
2588 startRangeFound = true;
2589 if (textRunRange->startContainer().isTextNode()) {
2590 int offset = rangeLocation - docTextPosition;
2591 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset());
2592 } else {
2593 if (rangeLocation == docTextPosition)
2594 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset());
2595 else
2596 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset());
2597 }
2598 }
2599
2600 if (foundEnd) {
2601 if (textRunRange->startContainer().isTextNode()) {
2602 int offset = rangeEnd - docTextPosition;
2603 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset());
2604 } else {
2605 if (rangeEnd == docTextPosition)
2606 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset());
2607 else
2608 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset());
2609 }
2610 docTextPosition += length;
2611 break;
2612 }
2613
2614 docTextPosition += length;
2615 }
2616
2617 if (!startRangeFound)
2618 return nullptr;
2619
2620 if (rangeLength && rangeEnd > docTextPosition) // rangeEnd is out of bounds
2621 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset());
2622
2623 return resultRange;
2624}
2625
2626bool TextIterator::getLocationAndLengthFromRange(Node* scope, const Range* range, size_t& location, size_t& length)
2627{
2628 location = notFound;
2629 length = 0;
2630
2631 // The critical assumption is that this only gets called with ranges that
2632 // concentrate on a given area containing the selection root. This is done
2633 // because of text fields and textareas. The DOM for those is not
2634 // directly in the document DOM, so ensure that the range does not cross a
2635 // boundary of one of those.
2636 if (&range->startContainer() != scope && !range->startContainer().isDescendantOf(scope))
2637 return false;
2638 if (&range->endContainer() != scope && !range->endContainer().isDescendantOf(scope))
2639 return false;
2640
2641 Ref<Range> testRange = Range::create(scope->document(), scope, 0, &range->startContainer(), range->startOffset());
2642 ASSERT(&testRange->startContainer() == scope);
2643 location = TextIterator::rangeLength(testRange.ptr());
2644
2645 testRange->setEnd(range->endContainer(), range->endOffset());
2646 ASSERT(&testRange->startContainer() == scope);
2647 length = TextIterator::rangeLength(testRange.ptr()) - location;
2648 return true;
2649}
2650
2651// --------
2652
2653bool hasAnyPlainText(const Range& range, TextIteratorBehavior behavior)
2654{
2655 for (TextIterator iterator { &range, behavior }; !iterator.atEnd(); iterator.advance()) {
2656 if (!iterator.text().isEmpty())
2657 return true;
2658 }
2659 return false;
2660}
2661
2662String plainText(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2663{
2664 // The initial buffer size can be critical for performance: https://bugs.webkit.org/show_bug.cgi?id=81192
2665 static const unsigned initialCapacity = 1 << 15;
2666
2667 if (!start.document())
2668 return { };
2669 auto document = makeRef(*start.document());
2670
2671 unsigned bufferLength = 0;
2672 StringBuilder builder;
2673 builder.reserveCapacity(initialCapacity);
2674 TextIteratorBehavior behavior = defaultBehavior;
2675 if (!isDisplayString)
2676 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2677
2678 for (TextIterator it(start, end, behavior); !it.atEnd(); it.advance()) {
2679 it.appendTextToStringBuilder(builder);
2680 bufferLength += it.text().length();
2681 }
2682
2683 if (!bufferLength)
2684 return emptyString();
2685
2686 String result = builder.toString();
2687
2688 if (isDisplayString)
2689 document->displayStringModifiedByEncoding(result);
2690
2691 return result;
2692}
2693
2694String plainTextReplacingNoBreakSpace(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2695{
2696 return plainText(start, end, defaultBehavior, isDisplayString).replace(noBreakSpace, ' ');
2697}
2698
2699String plainText(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2700{
2701 if (!range)
2702 return emptyString();
2703 return plainText(range->startPosition(), range->endPosition(), defaultBehavior, isDisplayString);
2704}
2705
2706String plainTextUsingBackwardsTextIteratorForTesting(const Range& range)
2707{
2708 String result;
2709 for (SimplifiedBackwardsTextIterator backwardsIterator(range); !backwardsIterator.atEnd(); backwardsIterator.advance())
2710 result.insert(backwardsIterator.text().toString(), 0);
2711 return result;
2712}
2713
2714String plainTextReplacingNoBreakSpace(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2715{
2716 return plainText(range, defaultBehavior, isDisplayString).replace(noBreakSpace, ' ');
2717}
2718
2719static Ref<Range> collapsedToBoundary(const Range& range, bool forward)
2720{
2721 Ref<Range> result = range.cloneRange();
2722 result->collapse(!forward);
2723 return result;
2724}
2725
2726static TextIteratorBehavior findIteratorOptions(FindOptions options)
2727{
2728 TextIteratorBehavior iteratorOptions = TextIteratorEntersTextControls | TextIteratorClipsToFrameAncestors;
2729 if (!options.contains(DoNotTraverseFlatTree))
2730 iteratorOptions |= TextIteratorTraversesFlatTree;
2731 return iteratorOptions;
2732}
2733
2734static void findPlainTextMatches(const Range& range, const String& target, FindOptions options, const WTF::Function<bool(size_t, size_t)>& match)
2735{
2736 SearchBuffer buffer(target, options);
2737 if (buffer.needsMoreContext()) {
2738 Ref<Range> beforeStartRange = range.ownerDocument().createRange();
2739 beforeStartRange->setEnd(range.startContainer(), range.startOffset());
2740 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2741 buffer.prependContext(backwardsIterator.text());
2742 if (!buffer.needsMoreContext())
2743 break;
2744 }
2745 }
2746
2747 CharacterIterator findIterator(range, findIteratorOptions(options));
2748 while (!findIterator.atEnd()) {
2749 findIterator.advance(buffer.append(findIterator.text()));
2750 while (1) {
2751 size_t matchStartOffset;
2752 size_t newMatchLength = buffer.search(matchStartOffset);
2753 if (!newMatchLength) {
2754 if (findIterator.atBreak() && !buffer.atBreak()) {
2755 buffer.reachedBreak();
2756 continue;
2757 }
2758 break;
2759 }
2760 size_t lastCharacterInBufferOffset = findIterator.characterOffset();
2761 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2762 if (match(lastCharacterInBufferOffset - matchStartOffset, newMatchLength))
2763 return;
2764 }
2765 }
2766}
2767
2768static Ref<Range> rangeForMatch(const Range& range, FindOptions options, size_t matchStart, size_t matchLength, bool searchForward)
2769{
2770 if (!matchLength)
2771 return collapsedToBoundary(range, searchForward);
2772 CharacterIterator rangeComputeIterator(range, findIteratorOptions(options));
2773 return characterSubrange(range.ownerDocument(), rangeComputeIterator, matchStart, matchLength);
2774}
2775
2776Ref<Range> findClosestPlainText(const Range& range, const String& target, FindOptions options, unsigned targetOffset)
2777{
2778 size_t matchStart = 0;
2779 size_t matchLength = 0;
2780 size_t distance = std::numeric_limits<size_t>::max();
2781 auto match = [targetOffset, &distance, &matchStart, &matchLength] (size_t start, size_t length) {
2782 size_t newDistance = std::min(abs(static_cast<signed>(start - targetOffset)), abs(static_cast<signed>(start + length - targetOffset)));
2783 if (newDistance < distance) {
2784 matchStart = start;
2785 matchLength = length;
2786 distance = newDistance;
2787 }
2788 return false;
2789 };
2790
2791 findPlainTextMatches(range, target, options, WTFMove(match));
2792 return rangeForMatch(range, options, matchStart, matchLength, !options.contains(Backwards));
2793}
2794
2795Ref<Range> findPlainText(const Range& range, const String& target, FindOptions options)
2796{
2797 bool searchForward = !options.contains(Backwards);
2798 size_t matchStart = 0;
2799 size_t matchLength = 0;
2800 auto match = [searchForward, &matchStart, &matchLength] (size_t start, size_t length) {
2801 matchStart = start;
2802 matchLength = length;
2803 // Look for the last match when searching backwards instead.
2804 return searchForward;
2805 };
2806
2807 findPlainTextMatches(range, target, options, WTFMove(match));
2808 return rangeForMatch(range, options, matchStart, matchLength, searchForward);
2809}
2810
2811bool findPlainText(const String& document, const String& target, FindOptions options)
2812{
2813 SearchBuffer buffer { target, options };
2814 StringView remainingText { document };
2815 while (!remainingText.isEmpty()) {
2816 size_t charactersAppended = buffer.append(document);
2817 remainingText = remainingText.substring(charactersAppended);
2818 if (remainingText.isEmpty())
2819 buffer.reachedBreak();
2820 size_t matchStartOffset;
2821 if (buffer.search(matchStartOffset))
2822 return true;
2823 }
2824 return false;
2825}
2826
2827}
2828