TextIterator.cpp source code [webkit/Source/WebCore/editing/TextIterator.cpp]

1	/*
2	* Copyright (C) 2004-2019 Apple Inc. All rights reserved.
3	* Copyright (C) 2005 Alexey Proskuryakov.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include "TextIterator.h"
29
30	#include "ComposedTreeIterator.h"
31	#include "Document.h"
32	#include "Editing.h"
33	#include "FontCascade.h"
34	#include "Frame.h"
35	#include "HTMLBodyElement.h"
36	#include "HTMLElement.h"
37	#include "HTMLFrameOwnerElement.h"
38	#include "HTMLInputElement.h"
39	#include "HTMLLegendElement.h"
40	#include "HTMLMeterElement.h"
41	#include "HTMLNames.h"
42	#include "HTMLParagraphElement.h"
43	#include "HTMLProgressElement.h"
44	#include "HTMLSlotElement.h"
45	#include "HTMLTextAreaElement.h"
46	#include "HTMLTextFormControlElement.h"
47	#include "InlineTextBox.h"
48	#include "NodeTraversal.h"
49	#include "RenderImage.h"
50	#include "RenderIterator.h"
51	#include "RenderTableCell.h"
52	#include "RenderTableRow.h"
53	#include "RenderTextControl.h"
54	#include "RenderTextFragment.h"
55	#include "ShadowRoot.h"
56	#include "SimpleLineLayout.h"
57	#include "SimpleLineLayoutFunctions.h"
58	#include "SimpleLineLayoutResolver.h"
59	#include "TextBoundaries.h"
60	#include "TextControlInnerElements.h"
61	#include "VisiblePosition.h"
62	#include "VisibleUnits.h"
63	#include <unicode/unorm2.h>
64	#include <wtf/Function.h>
65	#include <wtf/text/CString.h>
66	#include <wtf/text/StringBuilder.h>
67	#include <wtf/text/TextBreakIterator.h>
68	#include <wtf/unicode/CharacterNames.h>
69
70	#if !UCONFIG_NO_COLLATION
71	#include <unicode/usearch.h>
72	#include <wtf/text/TextBreakIteratorInternalICU.h>
73	#endif
74
75	namespace WebCore {
76
77	using namespace WTF::Unicode;
78	using namespace HTMLNames;
79
80	// Buffer that knows how to compare with a search target.
81	// Keeps enough of the previous text to be able to search in the future, but no more.
82	// Non-breaking spaces are always equal to normal spaces.
83	// Case folding is also done if the CaseInsensitive option is specified.
84	// Matches are further filtered if the AtWordStarts option is specified, although some
85	// matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
86	class SearchBuffer {
87	WTF_MAKE_NONCOPYABLE(SearchBuffer);
88	public:
89	SearchBuffer(const String& target, FindOptions);
90	~SearchBuffer();
91
92	// Returns number of characters appended; guaranteed to be in the range [1, length].
93	size_t append(StringView);
94	bool needsMoreContext() const;
95	void prependContext(StringView);
96	void reachedBreak();
97
98	// Result is the size in characters of what was found.
99	// And <startOffset> is the number of characters back to the start of what was found.
100	size_t search(size_t& startOffset);
101	bool atBreak() const;
102
103	#if !UCONFIG_NO_COLLATION
104
105	private:
106	bool isBadMatch(const UChar, size_t length) const*;
107	bool isWordStartMatch(size_t start, size_t length) const;
108	bool isWordEndMatch(size_t start, size_t length) const;
109
110	const String m_target;
111	const StringView::UpconvertedCharacters m_targetCharacters;
112	FindOptions m_options;
113
114	Vector<UChar> m_buffer;
115	size_t m_overlap;
116	size_t m_prefixLength;
117	bool m_atBreak;
118	bool m_needsMoreContext;
119
120	const bool m_targetRequiresKanaWorkaround;
121	Vector<UChar> m_normalizedTarget;
122	mutable Vector<UChar> m_normalizedMatch;
123
124	#else
125
126	private:
127	void append(UChar, bool isCharacterStart);
128	size_t length() const;
129
130	String m_target;
131	FindOptions m_options;
132
133	Vector<UChar> m_buffer;
134	Vector<bool> m_isCharacterStartBuffer;
135	bool m_isBufferFull;
136	size_t m_cursor;
137
138	#endif
139	};
140
141	// --------
142
143	static const unsigned bitsInWord = sizeof(unsigned) * `8`;
144	static const unsigned bitInWordMask = bitsInWord - `1`;
145
146	BitStack::BitStack()
147	: m_size(`0`)
148	{
149	}
150
151	BitStack::~BitStack() = default;
152
153	void BitStack::push(bool bit)
154	{
155	unsigned index = m_size / bitsInWord;
156	unsigned shift = m_size & bitInWordMask;
157	if (!shift && index == m_words.size()) {
158	m_words.grow(index + `1`);
159	m_words [index] = `0`;
160	}
161	unsigned& word = m_words [index];
162	unsigned mask = `1U` << shift;
163	if (bit)
164	word \|= mask;
165	else
166	word &= ~mask;
167	++m_size;
168	}
169
170	void BitStack::pop()
171	{
172	if (m_size)
173	--m_size;
174	}
175
176	bool BitStack::top() const
177	{
178	if (!m_size)
179	return false;
180	unsigned shift = (m_size - `1`) & bitInWordMask;
181	return m_words.last() & (`1U` << shift);
182	}
183
184	unsigned BitStack::size() const
185	{
186	return m_size;
187	}
188
189	// --------
190
191	// This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
192	static Node* nextInPreOrderCrossingShadowBoundaries(Node& rangeEndContainer, int rangeEndOffset)
193	{
194	if (rangeEndOffset >= `0` && !rangeEndContainer.isCharacterDataNode()) {
195	if (Node* next = rangeEndContainer.traverseToChildAt(rangeEndOffset))
196	return next;
197	}
198	for (Node* node = &rangeEndContainer; node; node = node->parentOrShadowHostNode()) {
199	if (Node* next = node->nextSibling())
200	return next;
201	}
202	return nullptr;
203	}
204
205	static inline bool fullyClipsContents(Node& node)
206	{
207	auto* renderer = node.renderer();
208	if (!renderer) {
209	if (!is<Element>(node))
210	return false;
211	return !downcast<Element>(node).hasDisplayContents();
212	}
213	if (!is<RenderBox>(*renderer))
214	return false;
215	auto& box = downcast<RenderBox>(*renderer);
216	if (!box.hasOverflowClip())
217	return false;
218
219	// Quirk to keep copy/paste in the CodeMirror editor version used in Jenkins working.
220	if (is<HTMLTextAreaElement>(node))
221	return box.size().isEmpty();
222
223	return box.contentSize().isEmpty();
224	}
225
226	static inline bool ignoresContainerClip(Node& node)
227	{
228	auto* renderer = node.renderer();
229	if (!renderer \|\| renderer->isTextOrLineBreak())
230	return false;
231	return renderer->style().hasOutOfFlowPosition();
232	}
233
234	static void pushFullyClippedState(BitStack& stack, Node& node)
235	{
236	// Push true if this node full clips its contents, or if a parent already has fully
237	// clipped and this is not a node that ignores its container's clip.
238	stack.push(fullyClipsContents(node) \|\| (stack.top() && !ignoresContainerClip(node)));
239	}
240
241	static void setUpFullyClippedStack(BitStack& stack, Node& node)
242	{
243	// Put the nodes in a vector so we can iterate in reverse order.
244	// FIXME: This (and TextIterator in general) should use ComposedTreeIterator.
245	Vector<Node*, `100`> ancestry;
246	for (Node* parent = node.parentOrShadowHostNode(); parent; parent = parent->parentOrShadowHostNode())
247	ancestry.append(parent);
248
249	// Call pushFullyClippedState on each node starting with the earliest ancestor.
250	size_t size = ancestry.size();
251	for (size_t i = `0`; i < size; ++i)
252	pushFullyClippedState(stack, *ancestry [size - i - `1`]);
253	pushFullyClippedState(stack, node);
254	}
255
256	static bool isClippedByFrameAncestor(const Document& document, TextIteratorBehavior behavior)
257	{
258	if (!(behavior & TextIteratorClipsToFrameAncestors))
259	return false;
260
261	for (auto* owner = document.ownerElement(); owner; owner = owner->document().ownerElement()) {
262	BitStack ownerClipStack;
263	setUpFullyClippedStack(ownerClipStack, *owner);
264	if (ownerClipStack.top())
265	return true;
266	}
267	return false;
268	}
269
270	// FIXME: editingIgnoresContent and isRendererReplacedElement try to do the same job.
271	// It's not good to have both of them.
272	bool isRendererReplacedElement(RenderObject* renderer)
273	{
274	if (!renderer)
275	return false;
276
277	bool isAttachment = false;
278	#if ENABLE(ATTACHMENT_ELEMENT)
279	isAttachment = renderer->isAttachment();
280	#endif
281	if (renderer->isImage() \|\| renderer->isWidget() \|\| renderer->isMedia() \|\| isAttachment)
282	return true;
283
284	if (is<Element>(renderer->node())) {
285	Element& element = downcast<Element>(*renderer->node());
286	if (is<HTMLFormControlElement>(element) \|\| is<HTMLLegendElement>(element) \|\| is<HTMLProgressElement>(element) \|\| element.hasTagName(meterTag))
287	return true;
288	if (equalLettersIgnoringASCIICase(element.attributeWithoutSynchronization(roleAttr), "img"))
289	return true;
290	}
291
292	return false;
293	}
294
295	// --------
296
297	inline void TextIteratorCopyableText::reset()
298	{
299	m_singleCharacter = `0`;
300	m_string = String ();
301	m_offset = `0`;
302	m_length = `0`;
303	}
304
305	inline void TextIteratorCopyableText::set(String&& string)
306	{
307	m_singleCharacter = `0`;
308	m_string = WTFMove(string);
309	m_offset = `0`;
310	m_length = m_string.length();
311	}
312
313	inline void TextIteratorCopyableText::set(String&& string, unsigned offset, unsigned length)
314	{
315	ASSERT(offset < string.length());
316	ASSERT(length);
317	ASSERT(length <= string.length() - offset);
318
319	m_singleCharacter = `0`;
320	m_string = WTFMove(string);
321	m_offset = offset;
322	m_length = length;
323	}
324
325	inline void TextIteratorCopyableText::set(UChar singleCharacter)
326	{
327	m_singleCharacter = singleCharacter;
328	m_string = String ();
329	m_offset = `0`;
330	m_length = `0`;
331	}
332
333	void TextIteratorCopyableText::appendToStringBuilder(StringBuilder& builder) const
334	{
335	if (m_singleCharacter)
336	builder.append(m_singleCharacter);
337	else
338	builder.append(m_string, m_offset, m_length);
339	}
340
341	// --------
342
343
344	TextIterator::TextIterator(Position start, Position end, TextIteratorBehavior behavior)
345	: m_behavior(behavior)
346	{
347	if (start.isNull() \|\| end.isNull())
348	return;
349	ASSERT(comparePositions(start, end) <= `0`);
350
351	RELEASE_ASSERT(behavior & TextIteratorTraversesFlatTree \|\| start.treeScope() == end.treeScope());
352
353	start.document()->updateLayoutIgnorePendingStylesheets();
354
355	// FIXME: Use Position / PositionIterator instead to avoid offset computation.
356	m_startContainer = start.containerNode();
357	m_startOffset = start.computeOffsetInContainerNode();
358
359	m_endContainer = end.containerNode();
360	m_endOffset = end.computeOffsetInContainerNode();
361
362	m_node = start.firstNode().get();
363	if (!m_node)
364	return;
365
366	init();
367	}
368
369	TextIterator::TextIterator(const Range* range, TextIteratorBehavior behavior)
370	: m_behavior(behavior)
371	{
372	if (!range)
373	return;
374
375	range->ownerDocument().updateLayoutIgnorePendingStylesheets();
376
377	m_startContainer = &range->startContainer();
378
379	// Callers should be handing us well-formed ranges. If we discover that this isn't
380	// the case, we could consider changing this assertion to an early return.
381	ASSERT(range->boundaryPointsValid());
382
383	m_startOffset = range->startOffset();
384	m_endContainer = &range->endContainer();
385	m_endOffset = range->endOffset();
386
387	m_node = range->firstNode();
388	if (!m_node)
389	return;
390
391	init();
392	}
393
394	void TextIterator::init()
395	{
396	if (isClippedByFrameAncestor(m_node->document(), m_behavior))
397	return;
398
399	setUpFullyClippedStack(m_fullyClippedStack, *m_node);
400
401	m_offset = m_node == m_startContainer ? m_startOffset : `0`;
402
403	m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(*m_endContainer, m_endOffset);
404
405	m_positionNode = m_node;
406
407	advance();
408	}
409
410	TextIterator::~TextIterator() = default;
411
412	// FIXME: Use ComposedTreeIterator instead. These functions are more expensive because they might do O(n) work.
413	static inline Node* firstChild(TextIteratorBehavior options, Node& node)
414	{
415	if (UNLIKELY(options & TextIteratorTraversesFlatTree))
416	return firstChildInComposedTreeIgnoringUserAgentShadow(node);
417	return node.firstChild();
418	}
419
420	static inline Node* nextSibling(TextIteratorBehavior options, Node& node)
421	{
422	if (UNLIKELY(options & TextIteratorTraversesFlatTree))
423	return nextSiblingInComposedTreeIgnoringUserAgentShadow(node);
424	return node.nextSibling();
425	}
426
427	static inline Node* nextNode(TextIteratorBehavior options, Node& node)
428	{
429	if (UNLIKELY(options & TextIteratorTraversesFlatTree))
430	return nextInComposedTreeIgnoringUserAgentShadow(node);
431	return NodeTraversal::next(node);
432	}
433
434	static inline bool isDescendantOf(TextIteratorBehavior options, Node& node, Node& possibleAncestor)
435	{
436	if (UNLIKELY(options & TextIteratorTraversesFlatTree))
437	return node.isDescendantOrShadowDescendantOf(&possibleAncestor);
438	return node.isDescendantOf(&possibleAncestor);
439	}
440
441	static inline Node* parentNodeOrShadowHost(TextIteratorBehavior options, Node& node)
442	{
443	if (UNLIKELY(options & TextIteratorTraversesFlatTree))
444	return node.parentInComposedTree();
445	return node.parentOrShadowHostNode();
446	}
447
448	static inline bool hasDisplayContents(Node& node)
449	{
450	return is<Element>(node) && downcast<Element>(node).hasDisplayContents();
451	}
452
453	void TextIterator::advance()
454	{
455	ASSERT(!atEnd());
456
457	// reset the run information
458	m_positionNode = nullptr;
459	m_copyableText.reset();
460	m_text = StringView ();
461
462	// handle remembered node that needed a newline after the text node's newline
463	if (m_nodeForAdditionalNewline) {
464	// Emit the extra newline, and position it inside* m_node, after m_node's*
465	// contents, in case it's a block, in the same way that we position the first
466	// newline. The range for the emitted newline should start where the line
467	// break begins.
468	// FIXME: It would be cleaner if we emitted two newlines during the last
469	// iteration, instead of using m_needsAnotherNewline.
470	emitCharacter(`'\n'`, *m_nodeForAdditionalNewline->parentNode(), m_nodeForAdditionalNewline, `1`, `1`);
471	m_nodeForAdditionalNewline = nullptr;
472	return;
473	}
474
475	if (!m_textBox && m_remainingTextBox) {
476	m_textBox = m_remainingTextBox;
477	m_remainingTextBox = nullptr;
478	m_firstLetterText = nullptr;
479	m_offset = `0`;
480	}
481	// handle remembered text box
482	if (m_textBox) {
483	handleTextBox();
484	if (m_positionNode)
485	return;
486	}
487
488	while (m_node && m_node != m_pastEndNode) {
489	// if the range ends at offset 0 of an element, represent the
490	// position, but not the content, of that element e.g. if the
491	// node is a blockflow element, emit a newline that
492	// precedes the element
493	if (m_node == m_endContainer && !m_endOffset) {
494	representNodeOffsetZero();
495	m_node = nullptr;
496	return;
497	}
498
499	auto* renderer = m_node->renderer();
500	if (!m_handledNode) {
501	if (!renderer) {
502	m_handledNode = true;
503	m_handledChildren = !hasDisplayContents(*m_node);
504	} else {
505	// handle current node according to its type
506	if (renderer->isText() && m_node->isTextNode())
507	m_handledNode = handleTextNode();
508	else if (isRendererReplacedElement(renderer))
509	m_handledNode = handleReplacedElement();
510	else
511	m_handledNode = handleNonTextNode();
512	if (m_positionNode)
513	return;
514	}
515	}
516
517	// find a new current node to handle in depth-first manner,
518	// calling exitNode() as we come back thru a parent node
519	Node* next = m_handledChildren ? nullptr : firstChild(m_behavior, *m_node);
520	m_offset = `0`;
521	if (!next) {
522	next = nextSibling(m_behavior, *m_node);
523	if (!next) {
524	bool pastEnd = nextNode(m_behavior, *m_node) == m_pastEndNode;
525	Node* parentNode = parentNodeOrShadowHost(m_behavior, *m_node);
526	while (!next && parentNode) {
527	if ((pastEnd && parentNode == m_endContainer) \|\| isDescendantOf(m_behavior, m_endContainer, parentNode))
528	return;
529	bool haveRenderer = m_node->renderer();
530	Node* exitedNode = m_node;
531	m_node = parentNode;
532	m_fullyClippedStack.pop();
533	parentNode = parentNodeOrShadowHost(m_behavior, *m_node);
534	if (haveRenderer)
535	exitNode(exitedNode);
536	if (m_positionNode) {
537	m_handledNode = true;
538	m_handledChildren = true;
539	return;
540	}
541	next = nextSibling(m_behavior, *m_node);
542	if (next && m_node->renderer())
543	exitNode(m_node);
544	}
545	}
546	m_fullyClippedStack.pop();
547	}
548
549	// set the new current node
550	m_node = next;
551	if (m_node)
552	pushFullyClippedState(m_fullyClippedStack, *m_node);
553	m_handledNode = false;
554	m_handledChildren = false;
555	m_handledFirstLetter = false;
556	m_firstLetterText = nullptr;
557
558	// how would this ever be?
559	if (m_positionNode)
560	return;
561	}
562	}
563
564	static bool hasVisibleTextNode(RenderText& renderer)
565	{
566	if (renderer.style().visibility() == Visibility::Visible)
567	return true;
568	if (is<RenderTextFragment>(renderer)) {
569	if (auto firstLetter = downcast<RenderTextFragment>(renderer).firstLetter()) {
570	if (firstLetter->style().visibility() == Visibility::Visible)
571	return true;
572	}
573	}
574	return false;
575	}
576
577	static unsigned textNodeOffsetInFlow(const Text& firstTextNodeInRange)
578	{
579	// Calculate the text offset for simple lines.
580	RenderObject* renderer = firstTextNodeInRange.renderer();
581	if (!renderer)
582	return `0`;
583	unsigned textOffset = `0`;
584	for (renderer = renderer->previousSibling(); renderer; renderer = renderer->previousSibling()) {
585	if (is<RenderText>(renderer))
586	textOffset += downcast<RenderText>(renderer)->text().length();
587	}
588	return textOffset;
589	}
590
591	static bool isNewLineOrTabCharacter(UChar character)
592	{
593	return character == `'\n'` \|\| character == `'\t'`;
594	}
595
596	bool TextIterator::handleTextNode()
597	{
598	Text& textNode = downcast<Text>(*m_node);
599
600	if (m_fullyClippedStack.top() && !(m_behavior & TextIteratorIgnoresStyleVisibility))
601	return false;
602
603	auto& renderer = *textNode.renderer();
604	m_lastTextNode = &textNode;
605	String rendererText = renderer.text();
606
607	// handle pre-formatted text
608	if (!renderer.style().collapseWhiteSpace()) {
609	int runStart = m_offset;
610	if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
611	emitCharacter(`' '`, textNode, nullptr, runStart, runStart);
612	return false;
613	}
614	if (!m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset) {
615	handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer));
616	if (m_firstLetterText) {
617	String firstLetter = m_firstLetterText->text();
618	emitText(textNode, *m_firstLetterText, m_offset, m_offset + firstLetter.length());
619	m_firstLetterText = nullptr;
620	m_textBox = nullptr;
621	return false;
622	}
623	}
624	if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
625	return false;
626	int rendererTextLength = rendererText.length();
627	int end = (&textNode == m_endContainer) ? m_endOffset : INT_MAX;
628	int runEnd = std::min(rendererTextLength, end);
629
630	if (runStart >= runEnd)
631	return true;
632
633	emitText(textNode, renderer, runStart, runEnd);
634	return true;
635	}
636
637	if (const auto* layout = renderer.simpleLineLayout()) {
638	if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
639	return true;
640	ASSERT(renderer.parent());
641	ASSERT(is<RenderBlockFlow>(*renderer.parent()));
642	const auto& blockFlow = downcast<RenderBlockFlow>(*renderer.parent());
643	// Use the simple layout runs to iterate over the text content.
644	bool isNewTextNode = m_previousSimpleTextNodeInFlow && m_previousSimpleTextNodeInFlow != &textNode;
645	// Simple line layout run positions are all absolute to the parent flow.
646	// Offsetting is required when multiple renderers are present.
647	m_accumulatedSimpleTextLengthInFlow += isNewTextNode ? m_previousSimpleTextNodeInFlow->renderer()->text().length() : `0`;
648	m_previousSimpleTextNodeInFlow = &textNode;
649
650	unsigned endPosition = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : rendererText.length();
651	if (!m_flowRunResolverCache \|\| &m_flowRunResolverCache ->flow() != &blockFlow) {
652	m_accumulatedSimpleTextLengthInFlow = m_flowRunResolverCache ? `0` : textNodeOffsetInFlow(textNode);
653	m_flowRunResolverCache = std::make_unique<SimpleLineLayout::RunResolver>(blockFlow, *layout);
654	}
655	// Skip to m_offset position.
656	auto range = m_flowRunResolverCache ->rangeForRenderer(renderer);
657	auto it = range.begin();
658	auto end = range.end();
659	auto startPosition = static_cast<unsigned>(m_offset) + m_accumulatedSimpleTextLengthInFlow;
660	while (it != end && (*it).end() <= startPosition)
661	++it;
662	if (m_nextRunNeedsWhitespace && rendererText [m_offset - `1`] == `'\n'`) {
663	emitCharacter(`' '`, textNode, nullptr, m_offset, m_offset + `1`);
664	return it == end;
665	}
666	if (it == end) {
667	// Collapsed trailing whitespace.
668	m_offset = endPosition;
669	m_lastTextNodeEndedWithCollapsedSpace = true;
670	return true;
671	}
672	if (m_nextRunNeedsWhitespace) {
673	emitCharacter(`' '`, textNode, nullptr, m_offset, m_offset + `1`);
674	return false;
675	}
676	// If the position we are looking for is to the left of the renderer's first run, it could mean that
677	// the runs and the renderers are out of sync (e.g. we skipped a renderer in between).
678	// Better bail out at this point.
679	auto run = *it;
680	if (run.start() > startPosition) {
681	ASSERT(m_flowRunResolverCache);
682	if (&(rendererForPosition(m_flowRunResolverCache ->flowContents(), startPosition)) != &renderer) {
683	ASSERT_NOT_REACHED();
684	return true;
685	}
686	}
687	ASSERT(run.end() - run.start() <= rendererText.length());
688	// contentStart skips leading whitespace.
689	unsigned contentStart = std::max<unsigned>(m_offset, run.start() - m_accumulatedSimpleTextLengthInFlow);
690	unsigned contentEnd = std::min(endPosition, run.end() - m_accumulatedSimpleTextLengthInFlow);
691	ASSERT_WITH_SECURITY_IMPLICATION(contentStart <= contentEnd);
692	// Check if whitespace adjustment is needed when crossing renderer boundary.
693	if (isNewTextNode) {
694	bool lastCharacterIsNotWhitespace = m_lastCharacter && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter);
695	bool addTrailingWhitespaceForPrevious = m_lastTextNodeEndedWithCollapsedSpace && lastCharacterIsNotWhitespace;
696	bool leadingWhitespaceIsNeededForCurrent = contentStart > static_cast<unsigned>(m_offset) && lastCharacterIsNotWhitespace;
697	if (addTrailingWhitespaceForPrevious \|\| leadingWhitespaceIsNeededForCurrent) {
698	emitCharacter(`' '`, textNode, nullptr, m_offset, m_offset + `1`);
699	return false;
700	}
701	}
702	// \n \t single whitespace characters need replacing so that the new line/tab characters don't show up.
703	unsigned stopPosition = contentStart;
704	while (stopPosition < contentEnd && !isNewLineOrTabCharacter(rendererText [stopPosition]))
705	++stopPosition;
706	// Emit the text up to the new line/tab character.
707	if (stopPosition < contentEnd) {
708	if (stopPosition == contentStart) {
709	emitCharacter(`' '`, textNode, nullptr, contentStart, contentStart + `1`);
710	m_offset = contentStart + `1`;
711	return false;
712	}
713	emitText(textNode, renderer, contentStart, stopPosition);
714	m_offset = stopPosition + `1`;
715	m_nextRunNeedsWhitespace = true;
716	return false;
717	}
718	emitText(textNode, renderer, contentStart, contentEnd);
719	// When line ending with collapsed whitespace is present, we need to carry over one whitespace: foo(end of line)bar -> foo bar (otherwise we would end up with foobar).
720	m_nextRunNeedsWhitespace = run.isEndOfLine() && contentEnd < endPosition && renderer.style().isCollapsibleWhiteSpace(rendererText [contentEnd]);
721	m_offset = contentEnd;
722	return static_cast<unsigned>(m_offset) == endPosition;
723	}
724
725	if (renderer.firstTextBox())
726	m_textBox = renderer.firstTextBox();
727
728	bool shouldHandleFirstLetter = !m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset;
729	if (shouldHandleFirstLetter)
730	handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer));
731
732	if (!renderer.firstTextBox() && rendererText.length() && !shouldHandleFirstLetter) {
733	if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
734	return false;
735	m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
736	return true;
737	}
738
739	// Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
740	auto& boxesRenderer = m_firstLetterText ? *m_firstLetterText : renderer;
741	if (boxesRenderer.containsReversedText()) {
742	m_sortedTextBoxes.clear();
743	for (InlineTextBox* textBox = boxesRenderer.firstTextBox(); textBox; textBox = textBox->nextTextBox())
744	m_sortedTextBoxes.append(textBox);
745	std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
746	m_sortedTextBoxesPosition = `0`;
747	m_textBox = m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes [`0`];
748	}
749
750	handleTextBox();
751	return true;
752	}
753
754	void TextIterator::handleTextBox()
755	{
756	Text& textNode = downcast<Text>(*m_node);
757
758	auto& renderer = m_firstLetterText ? m_firstLetterText : textNode.renderer();
759	if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) {
760	m_textBox = nullptr;
761	return;
762	}
763	String rendererText = renderer.text();
764	unsigned start = m_offset;
765	unsigned end = (&textNode == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
766	while (m_textBox) {
767	unsigned textBoxStart = m_textBox->start();
768	unsigned runStart = std::max(textBoxStart, start);
769
770	// Check for collapsed space at the start of this run.
771	InlineTextBox* firstTextBox = renderer.containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes [`0`]) : renderer.firstTextBox();
772	bool needSpace = m_lastTextNodeEndedWithCollapsedSpace \|\| (m_textBox == firstTextBox && textBoxStart == runStart && runStart);
773	if (needSpace && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter) && m_lastCharacter) {
774	if (m_lastTextNode == &textNode && runStart && rendererText [runStart - `1`] == `' '`) {
775	unsigned spaceRunStart = runStart - `1`;
776	while (spaceRunStart && rendererText [spaceRunStart - `1`] == `' '`)
777	--spaceRunStart;
778	emitText(textNode, renderer, spaceRunStart, spaceRunStart + `1`);
779	} else
780	emitCharacter(`' '`, textNode, nullptr, runStart, runStart);
781	return;
782	}
783	unsigned textBoxEnd = textBoxStart + m_textBox->len();
784	unsigned runEnd = std::min(textBoxEnd, end);
785
786	// Determine what the next text box will be, but don't advance yet
787	InlineTextBox* nextTextBox = nullptr;
788	if (renderer.containsReversedText()) {
789	if (m_sortedTextBoxesPosition + `1` < m_sortedTextBoxes.size())
790	nextTextBox = m_sortedTextBoxes [m_sortedTextBoxesPosition + `1`];
791	} else
792	nextTextBox = m_textBox->nextTextBox();
793	ASSERT(!nextTextBox \|\| &nextTextBox->renderer() == &renderer);
794
795	if (runStart < runEnd) {
796	// Handle either a single newline character (which becomes a space),
797	// or a run of characters that does not include a newline.
798	// This effectively translates newlines to spaces without copying the text.
799	if (rendererText [runStart] == `'\n'`) {
800	emitCharacter(`' '`, textNode, nullptr, runStart, runStart + `1`);
801	m_offset = runStart + `1`;
802	} else {
803	size_t subrunEnd = rendererText.find(`'\n'`, runStart);
804	if (subrunEnd == notFound \|\| subrunEnd > runEnd) {
805	subrunEnd = runEnd;
806	bool lastSpaceCollapsedByNextNonTextBox = !nextTextBox && (m_behavior & TextIteratorBehavesAsIfNodesFollowing) && rendererText.length() > runEnd;
807	if (lastSpaceCollapsedByNextNonTextBox)
808	++subrunEnd; // runEnd stopped before last space. Increment by one to restore the space.
809	}
810	m_offset = subrunEnd;
811	emitText(textNode, renderer, runStart, subrunEnd);
812	}
813
814	// If we are doing a subrun that doesn't go to the end of the text box,
815	// come back again to finish handling this text box; don't advance to the next one.
816	if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
817	return;
818
819	// Advance and return
820	unsigned nextRunStart = nextTextBox ? nextTextBox->start() : rendererText.length();
821	if (nextRunStart > runEnd)
822	m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
823	m_textBox = nextTextBox;
824	if (renderer.containsReversedText())
825	++m_sortedTextBoxesPosition;
826	return;
827	}
828	// Advance and continue
829	m_textBox = nextTextBox;
830	if (renderer.containsReversedText())
831	++m_sortedTextBoxesPosition;
832	}
833	if (!m_textBox && m_remainingTextBox) {
834	m_textBox = m_remainingTextBox;
835	m_remainingTextBox = nullptr;
836	m_firstLetterText = nullptr;
837	m_offset = `0`;
838	handleTextBox();
839	}
840	}
841
842	static inline RenderText* firstRenderTextInFirstLetter(RenderBoxModelObject* firstLetter)
843	{
844	if (!firstLetter)
845	return nullptr;
846
847	// FIXME: Should this check descendent objects?
848	return childrenOfType<RenderText>(*firstLetter).first();
849	}
850
851	void TextIterator::handleTextNodeFirstLetter(RenderTextFragment& renderer)
852	{
853	if (auto* firstLetter = renderer.firstLetter()) {
854	if (firstLetter->style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
855	return;
856	if (auto* firstLetterText = firstRenderTextInFirstLetter(firstLetter)) {
857	m_handledFirstLetter = true;
858	m_remainingTextBox = m_textBox;
859	m_textBox = firstLetterText->firstTextBox();
860	m_sortedTextBoxes.clear();
861	m_firstLetterText = firstLetterText;
862	}
863	}
864	m_handledFirstLetter = true;
865	}
866
867	bool TextIterator::handleReplacedElement()
868	{
869	if (m_fullyClippedStack.top())
870	return false;
871
872	auto& renderer = *m_node->renderer();
873	if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility))
874	return false;
875
876	if (m_lastTextNodeEndedWithCollapsedSpace) {
877	emitCharacter(`' '`, *m_lastTextNode->parentNode(), m_lastTextNode, `1`, `1`);
878	return false;
879	}
880
881	if ((m_behavior & TextIteratorEntersTextControls) && is<RenderTextControl>(renderer)) {
882	if (auto innerTextElement = downcast<RenderTextControl>(renderer).textFormControlElement().innerTextElement()) {
883	m_node = innerTextElement ->containingShadowRoot();
884	pushFullyClippedState(m_fullyClippedStack, *m_node);
885	m_offset = `0`;
886	return false;
887	}
888	}
889
890	m_hasEmitted = true;
891
892	if ((m_behavior & TextIteratorEmitsObjectReplacementCharacters) && renderer.isReplaced()) {
893	emitCharacter(objectReplacementCharacter, *m_node->parentNode(), m_node, `0`, `1`);
894	// Don't process subtrees for embedded objects. If the text there is required,
895	// it must be explicitly asked by specifying a range falling inside its boundaries.
896	m_handledChildren = true;
897	return true;
898	}
899
900	if (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) {
901	// We want replaced elements to behave like punctuation for boundary
902	// finding, and to simply take up space for the selection preservation
903	// code in moveParagraphs, so we use a comma.
904	emitCharacter(`','`, *m_node->parentNode(), m_node, `0`, `1`);
905	return true;
906	}
907
908	m_positionNode = m_node->parentNode();
909	m_positionOffsetBaseNode = m_node;
910	m_positionStartOffset = `0`;
911	m_positionEndOffset = `1`;
912
913	if ((m_behavior & TextIteratorEmitsImageAltText) && is<RenderImage>(renderer)) {
914	String altText = downcast<RenderImage>(renderer).altText();
915	if (unsigned length = altText.length()) {
916	m_lastCharacter = altText [length - `1`];
917	m_copyableText.set(WTFMove(altText));
918	m_text = m_copyableText.text();
919	return true;
920	}
921	}
922
923	m_copyableText.reset();
924	m_text = StringView ();
925	m_lastCharacter = `0`;
926	return true;
927	}
928
929	static bool shouldEmitTabBeforeNode(Node& node)
930	{
931	auto* renderer = node.renderer();
932
933	// Table cells are delimited by tabs.
934	if (!renderer \|\| !isTableCell(&node))
935	return false;
936
937	// Want a tab before every cell other than the first one.
938	RenderTableCell& cell = downcast<RenderTableCell>(*renderer);
939	RenderTable* table = cell.table();
940	return table && (table->cellBefore(&cell) \|\| table->cellAbove(&cell));
941	}
942
943	static bool shouldEmitNewlineForNode(Node* node, bool emitsOriginalText)
944	{
945	auto* renderer = node->renderer();
946	if (!(renderer ? renderer->isBR() : node->hasTagName(brTag)))
947	return false;
948	return emitsOriginalText \|\| !(node->isInShadowTree() && is<HTMLInputElement>(*node->shadowHost()));
949	}
950
951	static bool hasHeaderTag(HTMLElement& element)
952	{
953	return element.hasTagName(h1Tag)
954	\|\| element.hasTagName(h2Tag)
955	\|\| element.hasTagName(h3Tag)
956	\|\| element.hasTagName(h4Tag)
957	\|\| element.hasTagName(h5Tag)
958	\|\| element.hasTagName(h6Tag);
959	}
960
961	static bool shouldEmitNewlinesBeforeAndAfterNode(Node& node)
962	{
963	// Block flow (versus inline flow) is represented by having
964	// a newline both before and after the element.
965	auto* renderer = node.renderer();
966	if (!renderer) {
967	if (!is<HTMLElement>(node))
968	return false;
969	auto& element = downcast<HTMLElement>(node);
970	return hasHeaderTag(element)
971	\|\| element.hasTagName(blockquoteTag)
972	\|\| element.hasTagName(ddTag)
973	\|\| element.hasTagName(divTag)
974	\|\| element.hasTagName(dlTag)
975	\|\| element.hasTagName(dtTag)
976	\|\| element.hasTagName(hrTag)
977	\|\| element.hasTagName(liTag)
978	\|\| element.hasTagName(listingTag)
979	\|\| element.hasTagName(olTag)
980	\|\| element.hasTagName(pTag)
981	\|\| element.hasTagName(preTag)
982	\|\| element.hasTagName(trTag)
983	\|\| element.hasTagName(ulTag);
984	}
985
986	// Need to make an exception for table cells, because they are blocks, but we
987	// want them tab-delimited rather than having newlines before and after.
988	if (isTableCell(&node))
989	return false;
990
991	// Need to make an exception for table row elements, because they are neither
992	// "inline" or "RenderBlock", but we want newlines for them.
993	if (is<RenderTableRow>(*renderer)) {
994	RenderTable* table = downcast<RenderTableRow>(*renderer).table();
995	if (table && !table->isInline())
996	return true;
997	}
998
999	return !renderer->isInline()
1000	&& is<RenderBlock>(*renderer)
1001	&& !renderer->isFloatingOrOutOfFlowPositioned()
1002	&& !renderer->isBody()
1003	&& !renderer->isRubyText();
1004	}
1005
1006	static bool shouldEmitNewlineAfterNode(Node& node, bool emitsCharactersBetweenAllVisiblePositions = false)
1007	{
1008	// FIXME: It should be better but slower to create a VisiblePosition here.
1009	if (!shouldEmitNewlinesBeforeAndAfterNode(node))
1010	return false;
1011
1012	// Don't emit a new line at the end of the document unless we're matching the behavior of VisiblePosition.
1013	if (emitsCharactersBetweenAllVisiblePositions)
1014	return true;
1015	Node* subsequentNode = &node;
1016	while ((subsequentNode = NodeTraversal::nextSkippingChildren(*subsequentNode))) {
1017	if (subsequentNode->renderer())
1018	return true;
1019	}
1020	return false;
1021	}
1022
1023	static bool shouldEmitNewlineBeforeNode(Node& node)
1024	{
1025	return shouldEmitNewlinesBeforeAndAfterNode(node);
1026	}
1027
1028	static bool shouldEmitExtraNewlineForNode(Node& node)
1029	{
1030	// When there is a significant collapsed bottom margin, emit an extra
1031	// newline for a more realistic result. We end up getting the right
1032	// result even without margin collapsing. For example: <div><p>text</p></div>
1033	// will work right even if both the <div> and the <p> have bottom margins.
1034
1035	auto* renderer = node.renderer();
1036	if (!is<RenderBox>(renderer))
1037	return false;
1038
1039	// NOTE: We only do this for a select set of nodes, and WinIE appears not to do this at all.
1040	if (!is<HTMLElement>(node))
1041	return false;
1042
1043	HTMLElement& element = downcast<HTMLElement>(node);
1044	if (!hasHeaderTag(element) && !is<HTMLParagraphElement>(element))
1045	return false;
1046
1047	auto& renderBox = downcast<RenderBox>(*renderer);
1048	if (!renderBox.height())
1049	return false;
1050
1051	int bottomMargin = renderBox.collapsedMarginAfter();
1052	int fontSize = renderBox.style().fontDescription().computedPixelSize();
1053	return bottomMargin * `2` >= fontSize;
1054	}
1055
1056	static int collapsedSpaceLength(RenderText& renderer, int textEnd)
1057	{
1058	StringImpl& text = renderer.text();
1059	unsigned length = text.length();
1060	for (unsigned i = textEnd; i < length; ++i) {
1061	if (!renderer.style().isCollapsibleWhiteSpace(text [i]))
1062	return i - textEnd;
1063	}
1064	return length - textEnd;
1065	}
1066
1067	static int maxOffsetIncludingCollapsedSpaces(Node& node)
1068	{
1069	int offset = caretMaxOffset(node);
1070	if (auto* renderer = node.renderer()) {
1071	if (is<RenderText>(*renderer))
1072	offset += collapsedSpaceLength(downcast<RenderText>(*renderer), offset);
1073	}
1074	return offset;
1075	}
1076
1077	// Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
1078	bool TextIterator::shouldRepresentNodeOffsetZero()
1079	{
1080	if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isTable())
1081	return true;
1082
1083	// Leave element positioned flush with start of a paragraph
1084	// (e.g. do not insert tab before a table cell at the start of a paragraph)
1085	if (m_lastCharacter == `'\n'`)
1086	return false;
1087
1088	// Otherwise, show the position if we have emitted any characters
1089	if (m_hasEmitted)
1090	return true;
1091
1092	// We've not emitted anything yet. Generally, there is no need for any positioning then.
1093	// The only exception is when the element is visually not in the same line as
1094	// the start of the range (e.g. the range starts at the end of the previous paragraph).
1095	// NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
1096	// make quicker checks to possibly avoid that. Another check that we could make is
1097	// is whether the inline vs block flow changed since the previous visible element.
1098	// I think we're already in a special enough case that that won't be needed, tho.
1099
1100	// No character needed if this is the first node in the range.
1101	if (m_node == m_startContainer)
1102	return false;
1103
1104	// If we are outside the start container's subtree, assume we need to emit.
1105	// FIXME: m_startContainer could be an inline block
1106	if (!m_node->isDescendantOf(m_startContainer))
1107	return true;
1108
1109	// If we started as m_startContainer offset 0 and the current node is a descendant of
1110	// the start container, we already had enough context to correctly decide whether to
1111	// emit after a preceding block. We chose not to emit (m_hasEmitted is false),
1112	// so don't second guess that now.
1113	// NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
1114	// immaterial since we likely would have already emitted something by now.
1115	if (m_startOffset == `0`)
1116	return false;
1117
1118	// If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
1119	// Additionally, if the range we are iterating over contains huge sections of unrendered content,
1120	// we would create VisiblePositions on every call to this function without this check.
1121	if (!m_node->renderer() \|\| m_node->renderer()->style().visibility() != Visibility::Visible
1122	\|\| (is<RenderBlockFlow>(m_node->renderer()) && !downcast<RenderBlockFlow>(m_node->renderer()).height() && !is<HTMLBodyElement>(*m_node)))
1123	return false;
1124
1125	// The startPos.isNotNull() check is needed because the start could be before the body,
1126	// and in that case we'll get null. We don't want to put in newlines at the start in that case.
1127	// The currPos.isNotNull() check is needed because positions in non-HTML content
1128	// (like SVG) do not have visible positions, and we don't want to emit for them either.
1129	VisiblePosition startPos = VisiblePosition (Position (m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
1130	VisiblePosition currPos = VisiblePosition (positionBeforeNode(m_node), DOWNSTREAM);
1131	return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
1132	}
1133
1134	bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node& node)
1135	{
1136	return node.renderer() && node.renderer()->isTable() && (node.renderer()->isInline() \|\| (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions));
1137	}
1138
1139	void TextIterator::representNodeOffsetZero()
1140	{
1141	// Emit a character to show the positioning of m_node.
1142
1143	// When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
1144	// create VisiblePositions, which is expensive. So, we perform the inexpensive checks
1145	// on m_node to see if it necessitates emitting a character first and will early return
1146	// before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
1147	if (shouldEmitTabBeforeNode(*m_node)) {
1148	if (shouldRepresentNodeOffsetZero())
1149	emitCharacter(`'\t'`, *m_node->parentNode(), m_node, `0`, `0`);
1150	} else if (shouldEmitNewlineBeforeNode(*m_node)) {
1151	if (shouldRepresentNodeOffsetZero())
1152	emitCharacter(`'\n'`, *m_node->parentNode(), m_node, `0`, `0`);
1153	} else if (shouldEmitSpaceBeforeAndAfterNode(*m_node)) {
1154	if (shouldRepresentNodeOffsetZero())
1155	emitCharacter(`' '`, *m_node->parentNode(), m_node, `0`, `0`);
1156	}
1157	}
1158
1159	bool TextIterator::handleNonTextNode()
1160	{
1161	if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText))
1162	emitCharacter(`'\n'`, *m_node->parentNode(), m_node, `0`, `1`);
1163	else if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isHR())
1164	emitCharacter(`' '`, *m_node->parentNode(), m_node, `0`, `1`);
1165	else
1166	representNodeOffsetZero();
1167
1168	return true;
1169	}
1170
1171	void TextIterator::exitNode(Node* exitedNode)
1172	{
1173	// prevent emitting a newline when exiting a collapsed block at beginning of the range
1174	// FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
1175	// have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
1176	// therefore look like a blank line.
1177	if (!m_hasEmitted)
1178	return;
1179
1180	// Emit with a position inside* m_node, after m_node's contents, in*
1181	// case it is a block, because the run should start where the
1182	// emitted character is positioned visually.
1183	Node* baseNode = exitedNode;
1184	// FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
1185	// the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
1186	// TextIterator in _web_attributedStringFromRange.
1187	// See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
1188	if (m_lastTextNode && shouldEmitNewlineAfterNode(*m_node, m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)) {
1189	// use extra newline to represent margin bottom, as needed
1190	bool addNewline = shouldEmitExtraNewlineForNode(*m_node);
1191
1192	// FIXME: We need to emit a '\n' as we leave an empty block(s) that
1193	// contain a VisiblePosition when doing selection preservation.
1194	if (m_lastCharacter != `'\n'`) {
1195	// insert a newline with a position following this block's contents.
1196	emitCharacter(`'\n'`, *baseNode->parentNode(), baseNode, `1`, `1`);
1197	// remember whether to later add a newline for the current node
1198	ASSERT(!m_nodeForAdditionalNewline);
1199	if (addNewline)
1200	m_nodeForAdditionalNewline = baseNode;
1201	} else if (addNewline)
1202	// insert a newline with a position following this block's contents.
1203	emitCharacter(`'\n'`, *baseNode->parentNode(), baseNode, `1`, `1`);
1204	}
1205
1206	// If nothing was emitted, see if we need to emit a space.
1207	if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(*m_node))
1208	emitCharacter(`' '`, *baseNode->parentNode(), baseNode, `1`, `1`);
1209	}
1210
1211	void TextIterator::emitCharacter(UChar character, Node& characterNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
1212	{
1213	m_hasEmitted = true;
1214
1215	// remember information with which to construct the TextIterator::range()
1216	m_positionNode = &characterNode;
1217	m_positionOffsetBaseNode = offsetBaseNode;
1218	m_positionStartOffset = textStartOffset;
1219	m_positionEndOffset = textEndOffset;
1220
1221	m_copyableText.set(character);
1222	m_text = m_copyableText.text();
1223	m_lastCharacter = character;
1224	m_lastTextNodeEndedWithCollapsedSpace = false;
1225	m_nextRunNeedsWhitespace = false;
1226	}
1227
1228	void TextIterator::emitText(Text& textNode, RenderText& renderer, int textStartOffset, int textEndOffset)
1229	{
1230	ASSERT(textStartOffset >= `0`);
1231	ASSERT(textEndOffset >= `0`);
1232	ASSERT(textStartOffset <= textEndOffset);
1233
1234	// FIXME: This probably yields the wrong offsets when text-transform: lowercase turns a single character into two characters.
1235	String string = (m_behavior & TextIteratorEmitsOriginalText) ? renderer.originalText()
1236	: ((m_behavior & TextIteratorEmitsTextsWithoutTranscoding) ? renderer.textWithoutConvertingBackslashToYenSymbol() : renderer.text());
1237
1238	ASSERT(string.length() >= static_cast<unsigned>(textEndOffset));
1239
1240	m_positionNode = &textNode;
1241	m_positionOffsetBaseNode = nullptr;
1242	m_positionStartOffset = textStartOffset;
1243	m_positionEndOffset = textEndOffset;
1244
1245	m_lastCharacter = string [textEndOffset - `1`];
1246	m_copyableText.set(WTFMove(string), textStartOffset, textEndOffset - textStartOffset);
1247	m_text = m_copyableText.text();
1248
1249	m_lastTextNodeEndedWithCollapsedSpace = false;
1250	m_nextRunNeedsWhitespace = false;
1251	m_hasEmitted = true;
1252	}
1253
1254	Ref<Range> TextIterator::range() const
1255	{
1256	ASSERT(!atEnd());
1257
1258	// use the current run information, if we have it
1259	if (m_positionOffsetBaseNode) {
1260	unsigned index = m_positionOffsetBaseNode->computeNodeIndex();
1261	m_positionStartOffset += index;
1262	m_positionEndOffset += index;
1263	m_positionOffsetBaseNode = nullptr;
1264	}
1265	return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1266	}
1267
1268	Node* TextIterator::node() const
1269	{
1270	Ref<Range> textRange = range();
1271
1272	Node& node = textRange ->startContainer();
1273	if (node.isCharacterDataNode())
1274	return &node;
1275
1276	return node.traverseToChildAt(textRange ->startOffset());
1277	}
1278
1279	// --------
1280
1281	SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range& range)
1282	{
1283	range.ownerDocument().updateLayoutIgnorePendingStylesheets();
1284
1285	Node* startNode = &range.startContainer();
1286	Node* endNode = &range.endContainer();
1287	int startOffset = range.startOffset();
1288	int endOffset = range.endOffset();
1289
1290	if (!startNode->isCharacterDataNode()) {
1291	if (startOffset >= `0` && startOffset < static_cast<int>(startNode->countChildNodes())) {
1292	startNode = startNode->traverseToChildAt(startOffset);
1293	startOffset = `0`;
1294	}
1295	}
1296	if (!endNode->isCharacterDataNode()) {
1297	if (endOffset > `0` && endOffset <= static_cast<int>(endNode->countChildNodes())) {
1298	endNode = endNode->traverseToChildAt(endOffset - `1`);
1299	endOffset = lastOffsetInNode(endNode);
1300	}
1301	}
1302
1303	m_node = endNode;
1304	setUpFullyClippedStack(m_fullyClippedStack, *m_node);
1305	m_offset = endOffset;
1306	m_handledNode = false;
1307	m_handledChildren = endOffset == `0`;
1308
1309	m_startContainer = startNode;
1310	m_startOffset = startOffset;
1311	m_endContainer = endNode;
1312	m_endOffset = endOffset;
1313
1314	m_positionNode = endNode;
1315
1316	m_lastTextNode = nullptr;
1317	m_lastCharacter = `'\n'`;
1318
1319	m_havePassedStartContainer = false;
1320
1321	advance();
1322	}
1323
1324	void SimplifiedBackwardsTextIterator::advance()
1325	{
1326	ASSERT(!atEnd());
1327
1328	m_positionNode = nullptr;
1329	m_copyableText.reset();
1330	m_text = StringView ();
1331
1332	while (m_node && !m_havePassedStartContainer) {
1333	// Don't handle node if we start iterating at [node, 0].
1334	if (!m_handledNode && !(m_node == m_endContainer && !m_endOffset)) {
1335	auto* renderer = m_node->renderer();
1336	if (renderer && renderer->isText() && m_node->isTextNode()) {
1337	if (renderer->style().visibility() == Visibility::Visible && m_offset > `0`)
1338	m_handledNode = handleTextNode();
1339	} else if (renderer && (renderer->isImage() \|\| renderer->isWidget())) {
1340	if (renderer->style().visibility() == Visibility::Visible && m_offset > `0`)
1341	m_handledNode = handleReplacedElement();
1342	} else
1343	m_handledNode = handleNonTextNode();
1344	if (m_positionNode)
1345	return;
1346	}
1347
1348	if (!m_handledChildren && m_node->hasChildNodes()) {
1349	m_node = m_node->lastChild();
1350	pushFullyClippedState(m_fullyClippedStack, *m_node);
1351	} else {
1352	// Exit empty containers as we pass over them or containers
1353	// where [container, 0] is where we started iterating.
1354	if (!m_handledNode && canHaveChildrenForEditing(*m_node) && m_node->parentNode() && (!m_node->lastChild() \|\| (m_node == m_endContainer && !m_endOffset))) {
1355	exitNode();
1356	if (m_positionNode) {
1357	m_handledNode = true;
1358	m_handledChildren = true;
1359	return;
1360	}
1361	}
1362
1363	// Exit all other containers.
1364	while (!m_node->previousSibling()) {
1365	if (!advanceRespectingRange(m_node->parentOrShadowHostNode()))
1366	break;
1367	m_fullyClippedStack.pop();
1368	exitNode();
1369	if (m_positionNode) {
1370	m_handledNode = true;
1371	m_handledChildren = true;
1372	return;
1373	}
1374	}
1375
1376	m_fullyClippedStack.pop();
1377	if (advanceRespectingRange(m_node->previousSibling()))
1378	pushFullyClippedState(m_fullyClippedStack, *m_node);
1379	else
1380	m_node = nullptr;
1381	}
1382
1383	// For the purpose of word boundary detection,
1384	// we should iterate all visible text and trailing (collapsed) whitespaces.
1385	m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(*m_node) : `0`;
1386	m_handledNode = false;
1387	m_handledChildren = false;
1388
1389	if (m_positionNode)
1390	return;
1391	}
1392	}
1393
1394	bool SimplifiedBackwardsTextIterator::handleTextNode()
1395	{
1396	Text& textNode = downcast<Text>(*m_node);
1397
1398	m_lastTextNode = &textNode;
1399
1400	int startOffset;
1401	int offsetInNode;
1402	RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1403	if (!renderer)
1404	return true;
1405
1406	String text = renderer->text();
1407	if (!renderer->hasRenderedText() && text.length())
1408	return true;
1409
1410	if (startOffset + offsetInNode == m_offset) {
1411	ASSERT(!m_shouldHandleFirstLetter);
1412	return true;
1413	}
1414
1415	m_positionEndOffset = m_offset;
1416	m_offset = startOffset + offsetInNode;
1417	m_positionNode = m_node;
1418	m_positionStartOffset = m_offset;
1419
1420	ASSERT(m_positionStartOffset < m_positionEndOffset);
1421	ASSERT(m_positionStartOffset - offsetInNode >= `0`);
1422	ASSERT(m_positionEndOffset - offsetInNode > `0`);
1423	ASSERT(m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1424
1425	m_lastCharacter = text [m_positionEndOffset - offsetInNode - `1`];
1426	m_copyableText.set(WTFMove(text), m_positionStartOffset - offsetInNode, m_positionEndOffset - m_positionStartOffset);
1427	m_text = m_copyableText.text();
1428
1429	return !m_shouldHandleFirstLetter;
1430	}
1431
1432	RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1433	{
1434	RenderText& renderer = downcast<RenderText>(*m_node->renderer());
1435	startOffset = (m_node == m_startContainer) ? m_startOffset : `0`;
1436
1437	if (!is<RenderTextFragment>(renderer)) {
1438	offsetInNode = `0`;
1439	return &renderer;
1440	}
1441
1442	RenderTextFragment& fragment = downcast<RenderTextFragment>(renderer);
1443	int offsetAfterFirstLetter = fragment.start();
1444	if (startOffset >= offsetAfterFirstLetter) {
1445	ASSERT(!m_shouldHandleFirstLetter);
1446	offsetInNode = offsetAfterFirstLetter;
1447	return &renderer;
1448	}
1449
1450	if (!m_shouldHandleFirstLetter && startOffset + offsetAfterFirstLetter < m_offset) {
1451	m_shouldHandleFirstLetter = true;
1452	offsetInNode = offsetAfterFirstLetter;
1453	return &renderer;
1454	}
1455
1456	m_shouldHandleFirstLetter = false;
1457	offsetInNode = `0`;
1458	RenderText* firstLetterRenderer = firstRenderTextInFirstLetter(fragment.firstLetter());
1459
1460	m_offset = firstLetterRenderer->caretMaxOffset();
1461	m_offset += collapsedSpaceLength(*firstLetterRenderer, m_offset);
1462
1463	return firstLetterRenderer;
1464	}
1465
1466	bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1467	{
1468	unsigned index = m_node->computeNodeIndex();
1469	// We want replaced elements to behave like punctuation for boundary
1470	// finding, and to simply take up space for the selection preservation
1471	// code in moveParagraphs, so we use a comma. Unconditionally emit
1472	// here because this iterator is only used for boundary finding.
1473	emitCharacter(`','`, *m_node->parentNode(), index, index + `1`);
1474	return true;
1475	}
1476
1477	bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1478	{
1479	// We can use a linefeed in place of a tab because this simple iterator is only used to
1480	// find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1481	if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) \|\| shouldEmitNewlineAfterNode(m_node) \|\| shouldEmitTabBeforeNode(m_node)) {
1482	if (m_lastCharacter != `'\n'`) {
1483	// Corresponds to the same check in TextIterator::exitNode.
1484	unsigned index = m_node->computeNodeIndex();
1485	// The start of this emitted range is wrong. Ensuring correctness would require
1486	// VisiblePositions and so would be slow. previousBoundary expects this.
1487	emitCharacter(`'\n'`, *m_node->parentNode(), index + `1`, index + `1`);
1488	}
1489	}
1490	return true;
1491	}
1492
1493	void SimplifiedBackwardsTextIterator::exitNode()
1494	{
1495	if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) \|\| shouldEmitNewlineBeforeNode(m_node) \|\| shouldEmitTabBeforeNode(m_node)) {
1496	// The start of this emitted range is wrong. Ensuring correctness would require
1497	// VisiblePositions and so would be slow. previousBoundary expects this.
1498	emitCharacter(`'\n'`, *m_node, `0`, `0`);
1499	}
1500	}
1501
1502	void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node& node, int startOffset, int endOffset)
1503	{
1504	m_positionNode = &node;
1505	m_positionStartOffset = startOffset;
1506	m_positionEndOffset = endOffset;
1507	m_copyableText.set(c);
1508	m_text = m_copyableText.text();
1509	m_lastCharacter = c;
1510	}
1511
1512	bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1513	{
1514	if (!next)
1515	return false;
1516	m_havePassedStartContainer \|= m_node == m_startContainer;
1517	if (m_havePassedStartContainer)
1518	return false;
1519	m_node = next;
1520	return true;
1521	}
1522
1523	Ref<Range> SimplifiedBackwardsTextIterator::range() const
1524	{
1525	ASSERT(!atEnd());
1526
1527	return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1528	}
1529
1530	// --------
1531
1532	CharacterIterator::CharacterIterator(const Range& range, TextIteratorBehavior behavior)
1533	: m_underlyingIterator (&range, behavior)
1534	{
1535	while (!atEnd() && !m_underlyingIterator.text().length())
1536	m_underlyingIterator.advance();
1537	}
1538
1539	CharacterIterator::CharacterIterator(Position start, Position end, TextIteratorBehavior behavior)
1540	: m_underlyingIterator (start, end, behavior)
1541	{
1542	while (!atEnd() && !m_underlyingIterator.text().length())
1543	m_underlyingIterator.advance();
1544	}
1545
1546	Ref<Range> CharacterIterator::range() const
1547	{
1548	Ref<Range> range = m_underlyingIterator.range();
1549	if (!m_underlyingIterator.atEnd()) {
1550	if (m_underlyingIterator.text().length() <= `1`) {
1551	ASSERT(m_runOffset == `0`);
1552	} else {
1553	Node& node = range ->startContainer();
1554	ASSERT(&node == &range ->endContainer());
1555	int offset = range ->startOffset() + m_runOffset;
1556	range ->setStart(node, offset);
1557	range ->setEnd(node, offset + `1`);
1558	}
1559	}
1560	return range;
1561	}
1562
1563	void CharacterIterator::advance(int count)
1564	{
1565	if (count <= `0`) {
1566	ASSERT(count == `0`);
1567	return;
1568	}
1569
1570	m_atBreak = false;
1571
1572	// easy if there is enough left in the current m_underlyingIterator run
1573	int remaining = m_underlyingIterator.text().length() - m_runOffset;
1574	if (count < remaining) {
1575	m_runOffset += count;
1576	m_offset += count;
1577	return;
1578	}
1579
1580	// exhaust the current m_underlyingIterator run
1581	count -= remaining;
1582	m_offset += remaining;
1583
1584	// move to a subsequent m_underlyingIterator run
1585	for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) {
1586	int runLength = m_underlyingIterator.text().length();
1587	if (!runLength)
1588	m_atBreak = true;
1589	else {
1590	// see whether this is m_underlyingIterator to use
1591	if (count < runLength) {
1592	m_runOffset = count;
1593	m_offset += count;
1594	return;
1595	}
1596
1597	// exhaust this m_underlyingIterator run
1598	count -= runLength;
1599	m_offset += runLength;
1600	}
1601	}
1602
1603	// ran to the end of the m_underlyingIterator... no more runs left
1604	m_atBreak = true;
1605	m_runOffset = `0`;
1606	}
1607
1608	static Ref<Range> characterSubrange(Document& document, CharacterIterator& it, int offset, int length)
1609	{
1610	it.advance(offset);
1611	if (it.atEnd())
1612	return Range::create(document);
1613
1614	Ref<Range> start = it.range();
1615
1616	if (length > `1`)
1617	it.advance(length - `1`);
1618	if (it.atEnd())
1619	return Range::create(document);
1620
1621	Ref<Range> end = it.range();
1622
1623	return Range::create(document, &start ->startContainer(), start ->startOffset(), &end ->endContainer(), end ->endOffset());
1624	}
1625
1626	BackwardsCharacterIterator::BackwardsCharacterIterator(const Range& range)
1627	: m_underlyingIterator (range)
1628	, m_offset(`0`)
1629	, m_runOffset(`0`)
1630	, m_atBreak(true)
1631	{
1632	while (!atEnd() && !m_underlyingIterator.text().length())
1633	m_underlyingIterator.advance();
1634	}
1635
1636	Ref<Range> BackwardsCharacterIterator::range() const
1637	{
1638	Ref<Range> r = m_underlyingIterator.range();
1639	if (!m_underlyingIterator.atEnd()) {
1640	if (m_underlyingIterator.text().length() <= `1`)
1641	ASSERT(m_runOffset == `0`);
1642	else {
1643	Node& node = r ->startContainer();
1644	ASSERT(&node == &r ->endContainer());
1645	int offset = r ->endOffset() - m_runOffset;
1646	r ->setStart(node, offset - `1`);
1647	r ->setEnd(node, offset);
1648	}
1649	}
1650	return r;
1651	}
1652
1653	void BackwardsCharacterIterator::advance(int count)
1654	{
1655	if (count <= `0`) {
1656	ASSERT(!count);
1657	return;
1658	}
1659
1660	m_atBreak = false;
1661
1662	int remaining = m_underlyingIterator.text().length() - m_runOffset;
1663	if (count < remaining) {
1664	m_runOffset += count;
1665	m_offset += count;
1666	return;
1667	}
1668
1669	count -= remaining;
1670	m_offset += remaining;
1671
1672	for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) {
1673	int runLength = m_underlyingIterator.text().length();
1674	if (runLength == `0`)
1675	m_atBreak = true;
1676	else {
1677	if (count < runLength) {
1678	m_runOffset = count;
1679	m_offset += count;
1680	return;
1681	}
1682
1683	count -= runLength;
1684	m_offset += runLength;
1685	}
1686	}
1687
1688	m_atBreak = true;
1689	m_runOffset = `0`;
1690	}
1691
1692	// --------
1693
1694	WordAwareIterator::WordAwareIterator(const Range& range)
1695	: m_underlyingIterator (&range)
1696	, m_didLookAhead(true) // so we consider the first chunk from the text iterator
1697	{
1698	advance(); // get in position over the first chunk of text
1699	}
1700
1701	// We're always in one of these modes:
1702	// - The current chunk in the text iterator is our current chunk
1703	// (typically its a piece of whitespace, or text that ended with whitespace)
1704	// - The previous chunk in the text iterator is our current chunk
1705	// (we looked ahead to the next chunk and found a word boundary)
1706	// - We built up our own chunk of text from many chunks from the text iterator
1707
1708	// FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1709
1710	void WordAwareIterator::advance()
1711	{
1712	m_previousText.reset();
1713	m_buffer.clear();
1714
1715	// If last time we did a look-ahead, start with that looked-ahead chunk now
1716	if (!m_didLookAhead) {
1717	ASSERT(!m_underlyingIterator.atEnd());
1718	m_underlyingIterator.advance();
1719	}
1720	m_didLookAhead = false;
1721
1722	// Go to next non-empty chunk
1723	while (!m_underlyingIterator.atEnd() && !m_underlyingIterator.text().length())
1724	m_underlyingIterator.advance();
1725	if (m_underlyingIterator.atEnd())
1726	return;
1727
1728	while (`1`) {
1729	// If this chunk ends in whitespace we can just use it as our chunk.
1730	if (isSpaceOrNewline(m_underlyingIterator.text()[m_underlyingIterator.text().length() - `1`]))
1731	return;
1732
1733	// If this is the first chunk that failed, save it in previousText before look ahead
1734	if (m_buffer.isEmpty())
1735	m_previousText = m_underlyingIterator.copyableText();
1736
1737	// Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1738	m_underlyingIterator.advance();
1739	if (m_underlyingIterator.atEnd() \|\| !m_underlyingIterator.text().length() \|\| isSpaceOrNewline(m_underlyingIterator.text()[`0`])) {
1740	m_didLookAhead = true;
1741	return;
1742	}
1743
1744	if (m_buffer.isEmpty()) {
1745	// Start gobbling chunks until we get to a suitable stopping point
1746	append(m_buffer, m_previousText.text());
1747	m_previousText.reset();
1748	}
1749	append(m_buffer, m_underlyingIterator.text());
1750	}
1751	}
1752
1753	StringView WordAwareIterator::text() const
1754	{
1755	if (!m_buffer.isEmpty())
1756	return StringView (m_buffer.data(), m_buffer.size());
1757	if (m_previousText.text().length())
1758	return m_previousText.text();
1759	return m_underlyingIterator.text();
1760	}
1761
1762	// --------
1763
1764	static inline UChar foldQuoteMark(UChar c)
1765	{
1766	switch (c) {
1767	case hebrewPunctuationGershayim:
1768	case leftDoubleQuotationMark:
1769	case leftLowDoubleQuotationMark:
1770	case rightDoubleQuotationMark:
1771	return `'"'`;
1772	case hebrewPunctuationGeresh:
1773	case leftSingleQuotationMark:
1774	case leftLowSingleQuotationMark:
1775	case rightSingleQuotationMark:
1776	return `'\''`;
1777	default:
1778	return c;
1779	}
1780	}
1781
1782	// FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1783	// of doing it in a separate replacement pass here, but ICU doesn't offer a way
1784	// to add tailoring on top of the locale-specific tailoring as of this writing.
1785	static inline String foldQuoteMarks(String string)
1786	{
1787	string.replace(hebrewPunctuationGeresh, `'\''`);
1788	string.replace(hebrewPunctuationGershayim, `'"'`);
1789	string.replace(leftDoubleQuotationMark, `'"'`);
1790	string.replace(leftLowDoubleQuotationMark, `'"'`);
1791	string.replace(leftSingleQuotationMark, `'\''`);
1792	string.replace(leftLowSingleQuotationMark, `'\''`);
1793	string.replace(rightDoubleQuotationMark, `'"'`);
1794	string.replace(rightSingleQuotationMark, `'\''`);
1795
1796	return string;
1797	}
1798
1799	#if !UCONFIG_NO_COLLATION
1800
1801	const size_t minimumSearchBufferSize = `8192`;
1802
1803	#ifndef NDEBUG
1804	static bool searcherInUse;
1805	#endif
1806
1807	static UStringSearch* createSearcher()
1808	{
1809	// Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1810	// but it doesn't matter exactly what it is, since we don't perform any searches
1811	// without setting both the pattern and the text.
1812	UErrorCode status = U_ZERO_ERROR;
1813	String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search");
1814	UStringSearch* searcher = usearch_open(&newlineCharacter, `1`, &newlineCharacter, `1`, searchCollatorName.utf8().data(), `0`, &status);
1815	ASSERT(status == U_ZERO_ERROR \|\| status == U_USING_FALLBACK_WARNING \|\| status == U_USING_DEFAULT_WARNING);
1816	return searcher;
1817	}
1818
1819	static UStringSearch* searcher()
1820	{
1821	static UStringSearch* searcher = createSearcher();
1822	return searcher;
1823	}
1824
1825	static inline void lockSearcher()
1826	{
1827	#ifndef NDEBUG
1828	ASSERT(!searcherInUse);
1829	searcherInUse = true;
1830	#endif
1831	}
1832
1833	static inline void unlockSearcher()
1834	{
1835	#ifndef NDEBUG
1836	ASSERT(searcherInUse);
1837	searcherInUse = false;
1838	#endif
1839	}
1840
1841	// ICU's search ignores the distinction between small kana letters and ones
1842	// that are not small, and also characters that differ only in the voicing
1843	// marks when considering only primary collation strength differences.
1844	// This is not helpful for end users, since these differences make words
1845	// distinct, so for our purposes we need these to be considered.
1846	// The Unicode folks do not think the collation algorithm should be
1847	// changed. To work around this, we would like to tailor the ICU searcher,
1848	// but we can't get that to work yet. So instead, we check for cases where
1849	// these differences occur, and skip those matches.
1850
1851	// We refer to the above technique as the "kana workaround". The next few
1852	// functions are helper functinos for the kana workaround.
1853
1854	static inline bool isKanaLetter(UChar character)
1855	{
1856	// Hiragana letters.
1857	if (character >= `0x3041` && character <= `0x3096`)
1858	return true;
1859
1860	// Katakana letters.
1861	if (character >= `0x30A1` && character <= `0x30FA`)
1862	return true;
1863	if (character >= `0x31F0` && character <= `0x31FF`)
1864	return true;
1865
1866	// Halfwidth katakana letters.
1867	if (character >= `0xFF66` && character <= `0xFF9D` && character != `0xFF70`)
1868	return true;
1869
1870	return false;
1871	}
1872
1873	static inline bool isSmallKanaLetter(UChar character)
1874	{
1875	ASSERT(isKanaLetter(character));
1876
1877	switch (character) {
1878	case `0x3041`: // HIRAGANA LETTER SMALL A
1879	case `0x3043`: // HIRAGANA LETTER SMALL I
1880	case `0x3045`: // HIRAGANA LETTER SMALL U
1881	case `0x3047`: // HIRAGANA LETTER SMALL E
1882	case `0x3049`: // HIRAGANA LETTER SMALL O
1883	case `0x3063`: // HIRAGANA LETTER SMALL TU
1884	case `0x3083`: // HIRAGANA LETTER SMALL YA
1885	case `0x3085`: // HIRAGANA LETTER SMALL YU
1886	case `0x3087`: // HIRAGANA LETTER SMALL YO
1887	case `0x308E`: // HIRAGANA LETTER SMALL WA
1888	case `0x3095`: // HIRAGANA LETTER SMALL KA
1889	case `0x3096`: // HIRAGANA LETTER SMALL KE
1890	case `0x30A1`: // KATAKANA LETTER SMALL A
1891	case `0x30A3`: // KATAKANA LETTER SMALL I
1892	case `0x30A5`: // KATAKANA LETTER SMALL U
1893	case `0x30A7`: // KATAKANA LETTER SMALL E
1894	case `0x30A9`: // KATAKANA LETTER SMALL O
1895	case `0x30C3`: // KATAKANA LETTER SMALL TU
1896	case `0x30E3`: // KATAKANA LETTER SMALL YA
1897	case `0x30E5`: // KATAKANA LETTER SMALL YU
1898	case `0x30E7`: // KATAKANA LETTER SMALL YO
1899	case `0x30EE`: // KATAKANA LETTER SMALL WA
1900	case `0x30F5`: // KATAKANA LETTER SMALL KA
1901	case `0x30F6`: // KATAKANA LETTER SMALL KE
1902	case `0x31F0`: // KATAKANA LETTER SMALL KU
1903	case `0x31F1`: // KATAKANA LETTER SMALL SI
1904	case `0x31F2`: // KATAKANA LETTER SMALL SU
1905	case `0x31F3`: // KATAKANA LETTER SMALL TO
1906	case `0x31F4`: // KATAKANA LETTER SMALL NU
1907	case `0x31F5`: // KATAKANA LETTER SMALL HA
1908	case `0x31F6`: // KATAKANA LETTER SMALL HI
1909	case `0x31F7`: // KATAKANA LETTER SMALL HU
1910	case `0x31F8`: // KATAKANA LETTER SMALL HE
1911	case `0x31F9`: // KATAKANA LETTER SMALL HO
1912	case `0x31FA`: // KATAKANA LETTER SMALL MU
1913	case `0x31FB`: // KATAKANA LETTER SMALL RA
1914	case `0x31FC`: // KATAKANA LETTER SMALL RI
1915	case `0x31FD`: // KATAKANA LETTER SMALL RU
1916	case `0x31FE`: // KATAKANA LETTER SMALL RE
1917	case `0x31FF`: // KATAKANA LETTER SMALL RO
1918	case `0xFF67`: // HALFWIDTH KATAKANA LETTER SMALL A
1919	case `0xFF68`: // HALFWIDTH KATAKANA LETTER SMALL I
1920	case `0xFF69`: // HALFWIDTH KATAKANA LETTER SMALL U
1921	case `0xFF6A`: // HALFWIDTH KATAKANA LETTER SMALL E
1922	case `0xFF6B`: // HALFWIDTH KATAKANA LETTER SMALL O
1923	case `0xFF6C`: // HALFWIDTH KATAKANA LETTER SMALL YA
1924	case `0xFF6D`: // HALFWIDTH KATAKANA LETTER SMALL YU
1925	case `0xFF6E`: // HALFWIDTH KATAKANA LETTER SMALL YO
1926	case `0xFF6F`: // HALFWIDTH KATAKANA LETTER SMALL TU
1927	return true;
1928	}
1929	return false;
1930	}
1931
1932	enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1933
1934	static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1935	{
1936	ASSERT(isKanaLetter(character));
1937
1938	switch (character) {
1939	case `0x304C`: // HIRAGANA LETTER GA
1940	case `0x304E`: // HIRAGANA LETTER GI
1941	case `0x3050`: // HIRAGANA LETTER GU
1942	case `0x3052`: // HIRAGANA LETTER GE
1943	case `0x3054`: // HIRAGANA LETTER GO
1944	case `0x3056`: // HIRAGANA LETTER ZA
1945	case `0x3058`: // HIRAGANA LETTER ZI
1946	case `0x305A`: // HIRAGANA LETTER ZU
1947	case `0x305C`: // HIRAGANA LETTER ZE
1948	case `0x305E`: // HIRAGANA LETTER ZO
1949	case `0x3060`: // HIRAGANA LETTER DA
1950	case `0x3062`: // HIRAGANA LETTER DI
1951	case `0x3065`: // HIRAGANA LETTER DU
1952	case `0x3067`: // HIRAGANA LETTER DE
1953	case `0x3069`: // HIRAGANA LETTER DO
1954	case `0x3070`: // HIRAGANA LETTER BA
1955	case `0x3073`: // HIRAGANA LETTER BI
1956	case `0x3076`: // HIRAGANA LETTER BU
1957	case `0x3079`: // HIRAGANA LETTER BE
1958	case `0x307C`: // HIRAGANA LETTER BO
1959	case `0x3094`: // HIRAGANA LETTER VU
1960	case `0x30AC`: // KATAKANA LETTER GA
1961	case `0x30AE`: // KATAKANA LETTER GI
1962	case `0x30B0`: // KATAKANA LETTER GU
1963	case `0x30B2`: // KATAKANA LETTER GE
1964	case `0x30B4`: // KATAKANA LETTER GO
1965	case `0x30B6`: // KATAKANA LETTER ZA
1966	case `0x30B8`: // KATAKANA LETTER ZI
1967	case `0x30BA`: // KATAKANA LETTER ZU
1968	case `0x30BC`: // KATAKANA LETTER ZE
1969	case `0x30BE`: // KATAKANA LETTER ZO
1970	case `0x30C0`: // KATAKANA LETTER DA
1971	case `0x30C2`: // KATAKANA LETTER DI
1972	case `0x30C5`: // KATAKANA LETTER DU
1973	case `0x30C7`: // KATAKANA LETTER DE
1974	case `0x30C9`: // KATAKANA LETTER DO
1975	case `0x30D0`: // KATAKANA LETTER BA
1976	case `0x30D3`: // KATAKANA LETTER BI
1977	case `0x30D6`: // KATAKANA LETTER BU
1978	case `0x30D9`: // KATAKANA LETTER BE
1979	case `0x30DC`: // KATAKANA LETTER BO
1980	case `0x30F4`: // KATAKANA LETTER VU
1981	case `0x30F7`: // KATAKANA LETTER VA
1982	case `0x30F8`: // KATAKANA LETTER VI
1983	case `0x30F9`: // KATAKANA LETTER VE
1984	case `0x30FA`: // KATAKANA LETTER VO
1985	return VoicedSoundMark;
1986	case `0x3071`: // HIRAGANA LETTER PA
1987	case `0x3074`: // HIRAGANA LETTER PI
1988	case `0x3077`: // HIRAGANA LETTER PU
1989	case `0x307A`: // HIRAGANA LETTER PE
1990	case `0x307D`: // HIRAGANA LETTER PO
1991	case `0x30D1`: // KATAKANA LETTER PA
1992	case `0x30D4`: // KATAKANA LETTER PI
1993	case `0x30D7`: // KATAKANA LETTER PU
1994	case `0x30DA`: // KATAKANA LETTER PE
1995	case `0x30DD`: // KATAKANA LETTER PO
1996	return SemiVoicedSoundMark;
1997	}
1998	return NoVoicedSoundMark;
1999	}
2000
2001	static inline bool isCombiningVoicedSoundMark(UChar character)
2002	{
2003	switch (character) {
2004	case `0x3099`: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
2005	case `0x309A`: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
2006	return true;
2007	}
2008	return false;
2009	}
2010
2011	static inline bool containsKanaLetters(const String& pattern)
2012	{
2013	if (pattern.is8Bit())
2014	return false;
2015	const UChar* characters = pattern.characters16();
2016	unsigned length = pattern.length();
2017	for (unsigned i = `0`; i < length; ++i) {
2018	if (isKanaLetter(characters[i]))
2019	return true;
2020	}
2021	return false;
2022	}
2023
2024	static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
2025	{
2026	UErrorCode status = U_ZERO_ERROR;
2027	const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
2028	ASSERT(U_SUCCESS(status));
2029
2030	buffer.resize(length);
2031
2032	auto normalizedLength = unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status);
2033	ASSERT(U_SUCCESS(status) \|\| status == U_BUFFER_OVERFLOW_ERROR);
2034
2035	buffer.resize(normalizedLength);
2036
2037	if (U_SUCCESS(status))
2038	return;
2039
2040	status = U_ZERO_ERROR;
2041	unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status);
2042	ASSERT(U_SUCCESS(status));
2043	}
2044
2045	static bool isNonLatin1Separator(UChar32 character)
2046	{
2047	ASSERT_ARG(character, character >= `256`);
2048
2049	return U_GET_GC_MASK(character) & (U_GC_S_MASK \| U_GC_P_MASK \| U_GC_Z_MASK \| U_GC_CF_MASK);
2050	}
2051
2052	static inline bool isSeparator(UChar32 character)
2053	{
2054	static const bool latin1SeparatorTable[`256`] = {
2055	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2056	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2057	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // space ! " # $ % & ' ( ) + , - . /*
2058	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, // : ; < = > ?
2059	`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // @
2060	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, // [ \ ] ^ _
2061	`1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // `
2062	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `0`, // { \| } ~
2063	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2064	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2065	`0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `1`, `1`, `1`, `1`, `1`,
2066	`1`, `1`, `1`, `1`, `1`, `0`, `1`, `1`, `1`, `1`, `0`, `1`, `1`, `1`, `1`, `1`,
2067	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2068	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2069	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
2070	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`
2071	};
2072
2073	if (character < `256`)
2074	return latin1SeparatorTable[character];
2075
2076	return isNonLatin1Separator(character);
2077	}
2078
2079	inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2080	: m_target(foldQuoteMarks(target))
2081	, m_targetCharacters(StringView (m_target).upconvertedCharacters())
2082	, m_options (options)
2083	, m_prefixLength(`0`)
2084	, m_atBreak(true)
2085	, m_needsMoreContext(options.contains(AtWordStarts))
2086	, m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
2087	{
2088	ASSERT(!m_target.isEmpty());
2089
2090	size_t targetLength = m_target.length();
2091	m_buffer.reserveInitialCapacity(std::max(targetLength * `8`, minimumSearchBufferSize));
2092	m_overlap = m_buffer.capacity() / `4`;
2093
2094	if (m_options.contains(AtWordStarts) && targetLength) {
2095	UChar32 targetFirstCharacter;
2096	U16_GET(m_target, `0`, `0u`, targetLength, targetFirstCharacter);
2097	// Characters in the separator category never really occur at the beginning of a word,
2098	// so if the target begins with such a character, we just ignore the AtWordStart option.
2099	if (isSeparator(targetFirstCharacter)) {
2100	m_options.remove(AtWordStarts);
2101	m_needsMoreContext = false;
2102	}
2103	}
2104
2105	// Grab the single global searcher.
2106	// If we ever have a reason to do more than once search buffer at once, we'll have
2107	// to move to multiple searchers.
2108	lockSearcher();
2109
2110	UStringSearch* searcher = WebCore::searcher();
2111	UCollator* collator = usearch_getCollator(searcher);
2112
2113	UCollationStrength strength;
2114	USearchAttributeValue comparator;
2115	if (m_options.contains(CaseInsensitive)) {
2116	// Without loss of generality, have 'e' match {'e', 'E', 'é', 'É'} and 'é' match {'é', 'É'}.
2117	strength = UCOL_SECONDARY;
2118	comparator = USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD;
2119	} else {
2120	// Without loss of generality, have 'e' match {'e'} and 'é' match {'é'}.
2121	strength = UCOL_TERTIARY;
2122	comparator = USEARCH_STANDARD_ELEMENT_COMPARISON;
2123	}
2124	if (ucol_getStrength(collator) != strength) {
2125	ucol_setStrength(collator, strength);
2126	usearch_reset(searcher);
2127	}
2128
2129	UErrorCode status = U_ZERO_ERROR;
2130	usearch_setAttribute(searcher, USEARCH_ELEMENT_COMPARISON, comparator, &status);
2131	ASSERT(status == U_ZERO_ERROR);
2132
2133	usearch_setPattern(searcher, m_targetCharacters, targetLength, &status);
2134	ASSERT(status == U_ZERO_ERROR);
2135
2136	// The kana workaround requires a normalized copy of the target string.
2137	if (m_targetRequiresKanaWorkaround)
2138	normalizeCharacters(m_targetCharacters, targetLength, m_normalizedTarget);
2139	}
2140
2141	inline SearchBuffer::~SearchBuffer()
2142	{
2143	// Leave the static object pointing to a valid string.
2144	UErrorCode status = U_ZERO_ERROR;
2145	usearch_setPattern(WebCore::searcher(), &newlineCharacter, `1`, &status);
2146	ASSERT(status == U_ZERO_ERROR);
2147	usearch_setText(WebCore::searcher(), &newlineCharacter, `1`, &status);
2148	ASSERT(status == U_ZERO_ERROR);
2149
2150	unlockSearcher();
2151	}
2152
2153	inline size_t SearchBuffer::append(StringView text)
2154	{
2155	ASSERT(text.length());
2156
2157	if (m_atBreak) {
2158	m_buffer.shrink(`0`);
2159	m_prefixLength = `0`;
2160	m_atBreak = false;
2161	} else if (m_buffer.size() == m_buffer.capacity()) {
2162	memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2163	m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap);
2164	m_buffer.shrink(m_overlap);
2165	}
2166
2167	size_t oldLength = m_buffer.size();
2168	size_t usableLength = std::min<size_t>(m_buffer.capacity() - oldLength, text.length());
2169	ASSERT(usableLength);
2170	m_buffer.grow(oldLength + usableLength);
2171	for (unsigned i = `0`; i < usableLength; ++i)
2172	m_buffer [oldLength + i] = foldQuoteMark(text [i]);
2173	return usableLength;
2174	}
2175
2176	inline bool SearchBuffer::needsMoreContext() const
2177	{
2178	return m_needsMoreContext;
2179	}
2180
2181	inline void SearchBuffer::prependContext(StringView text)
2182	{
2183	ASSERT(m_needsMoreContext);
2184	ASSERT(m_prefixLength == m_buffer.size());
2185
2186	if (!text.length())
2187	return;
2188
2189	m_atBreak = false;
2190
2191	size_t wordBoundaryContextStart = text.length();
2192	if (wordBoundaryContextStart) {
2193	U16_BACK_1(text, `0`, wordBoundaryContextStart);
2194	wordBoundaryContextStart = startOfLastWordBoundaryContext(text.substring(`0`, wordBoundaryContextStart));
2195	}
2196
2197	size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength, text.length() - wordBoundaryContextStart);
2198	WTF::append(m_buffer, text.substring(text.length() - usableLength, usableLength));
2199	m_prefixLength += usableLength;
2200
2201	if (wordBoundaryContextStart \|\| m_prefixLength == m_buffer.capacity())
2202	m_needsMoreContext = false;
2203	}
2204
2205	inline bool SearchBuffer::atBreak() const
2206	{
2207	return m_atBreak;
2208	}
2209
2210	inline void SearchBuffer::reachedBreak()
2211	{
2212	m_atBreak = true;
2213	}
2214
2215	inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2216	{
2217	// This function implements the kana workaround. If usearch treats
2218	// it as a match, but we do not want to, then it's a "bad match".
2219	if (!m_targetRequiresKanaWorkaround)
2220	return false;
2221
2222	// Normalize into a match buffer. We reuse a single buffer rather than
2223	// creating a new one each time.
2224	normalizeCharacters(match, matchLength, m_normalizedMatch);
2225
2226	const UChar* a = m_normalizedTarget.begin();
2227	const UChar* aEnd = m_normalizedTarget.end();
2228
2229	const UChar* b = m_normalizedMatch.begin();
2230	const UChar* bEnd = m_normalizedMatch.end();
2231
2232	while (true) {
2233	// Skip runs of non-kana-letter characters. This is necessary so we can
2234	// correctly handle strings where the target and match have different-length
2235	// runs of characters that match, while still double checking the correctness
2236	// of matches of kana letters with other kana letters.
2237	while (a != aEnd && !isKanaLetter(*a))
2238	++a;
2239	while (b != bEnd && !isKanaLetter(*b))
2240	++b;
2241
2242	// If we reached the end of either the target or the match, we should have
2243	// reached the end of both; both should have the same number of kana letters.
2244	if (a == aEnd \|\| b == bEnd) {
2245	ASSERT(a == aEnd);
2246	ASSERT(b == bEnd);
2247	return false;
2248	}
2249
2250	// Check for differences in the kana letter character itself.
2251	if (isSmallKanaLetter(a) != isSmallKanaLetter(b))
2252	return true;
2253	if (composedVoicedSoundMark(a) != composedVoicedSoundMark(b))
2254	return true;
2255	++a;
2256	++b;
2257
2258	// Check for differences in combining voiced sound marks found after the letter.
2259	while (`1`) {
2260	if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2261	if (b != bEnd && isCombiningVoicedSoundMark(*b))
2262	return true;
2263	break;
2264	}
2265	if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2266	return true;
2267	if (a != b)
2268	return true;
2269	++a;
2270	++b;
2271	}
2272	}
2273	}
2274
2275	inline bool SearchBuffer::isWordEndMatch(size_t start, size_t length) const
2276	{
2277	ASSERT(length);
2278	ASSERT(m_options.contains(AtWordEnds));
2279
2280	int endWord;
2281	// Start searching at the end of matched search, so that multiple word matches succeed.
2282	findEndWordBoundary(StringView (m_buffer.data(), m_buffer.size()), start + length - `1`, &endWord);
2283	return static_cast<size_t>(endWord) == (start + length);
2284	}
2285
2286	inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2287	{
2288	ASSERT(m_options.contains(AtWordStarts));
2289
2290	if (!start)
2291	return true;
2292
2293	int size = m_buffer.size();
2294	int offset = start;
2295	UChar32 firstCharacter;
2296	U16_GET(m_buffer.data(), `0`, offset, size, firstCharacter);
2297
2298	if (m_options.contains(TreatMedialCapitalAsWordStart)) {
2299	UChar32 previousCharacter;
2300	U16_PREV(m_buffer.data(), `0`, offset, previousCharacter);
2301
2302	if (isSeparator(firstCharacter)) {
2303	// The start of a separator run is a word start (".org" in "webkit.org").
2304	if (!isSeparator(previousCharacter))
2305	return true;
2306	} else if (isASCIIUpper(firstCharacter)) {
2307	// The start of an uppercase run is a word start ("Kit" in "WebKit").
2308	if (!isASCIIUpper(previousCharacter))
2309	return true;
2310	// The last character of an uppercase run followed by a non-separator, non-digit
2311	// is a word start ("Request" in "XMLHTTPRequest").
2312	offset = start;
2313	U16_FWD_1(m_buffer.data(), offset, size);
2314	UChar32 nextCharacter = `0`;
2315	if (offset < size)
2316	U16_GET(m_buffer.data(), `0`, offset, size, nextCharacter);
2317	if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2318	return true;
2319	} else if (isASCIIDigit(firstCharacter)) {
2320	// The start of a digit run is a word start ("2" in "WebKit2").
2321	if (!isASCIIDigit(previousCharacter))
2322	return true;
2323	} else if (isSeparator(previousCharacter) \|\| isASCIIDigit(previousCharacter)) {
2324	// The start of a non-separator, non-uppercase, non-digit run is a word start,
2325	// except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2326	return true;
2327	}
2328	}
2329
2330	// Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
2331	// a word, so treat the position before any CJK character as a word start.
2332	if (FontCascade::isCJKIdeographOrSymbol(firstCharacter))
2333	return true;
2334
2335	size_t wordBreakSearchStart = start + length;
2336	while (wordBreakSearchStart > start)
2337	wordBreakSearchStart = findNextWordFromIndex(StringView (m_buffer.data(), m_buffer.size()), wordBreakSearchStart, false / backwards /);
2338	return wordBreakSearchStart == start;
2339	}
2340
2341	inline size_t SearchBuffer::search(size_t& start)
2342	{
2343	size_t size = m_buffer.size();
2344	if (m_atBreak) {
2345	if (!size)
2346	return `0`;
2347	} else {
2348	if (size != m_buffer.capacity())
2349	return `0`;
2350	}
2351
2352	UStringSearch* searcher = WebCore::searcher();
2353
2354	UErrorCode status = U_ZERO_ERROR;
2355	usearch_setText(searcher, m_buffer.data(), size, &status);
2356	ASSERT(status == U_ZERO_ERROR);
2357
2358	usearch_setOffset(searcher, m_prefixLength, &status);
2359	ASSERT(status == U_ZERO_ERROR);
2360
2361	int matchStart = usearch_next(searcher, &status);
2362	ASSERT(status == U_ZERO_ERROR);
2363
2364	nextMatch:
2365	if (!(matchStart >= `0` && static_cast<size_t>(matchStart) < size)) {
2366	ASSERT(matchStart == USEARCH_DONE);
2367	return `0`;
2368	}
2369
2370	// Matches that start in the overlap area are only tentative.
2371	// The same match may appear later, matching more characters,
2372	// possibly including a combining character that's not yet in the buffer.
2373	if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2374	size_t overlap = m_overlap;
2375	if (m_options.contains(AtWordStarts)) {
2376	// Ensure that there is sufficient context before matchStart the next time around for
2377	// determining if it is at a word boundary.
2378	unsigned wordBoundaryContextStart = matchStart;
2379	U16_BACK_1(m_buffer.data(), `0`, wordBoundaryContextStart);
2380	wordBoundaryContextStart = startOfLastWordBoundaryContext(StringView (m_buffer.data(), wordBoundaryContextStart));
2381	overlap = std::min(size - `1`, std::max(overlap, size - wordBoundaryContextStart));
2382	}
2383	memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2384	m_prefixLength -= std::min(m_prefixLength, size - overlap);
2385	m_buffer.shrink(overlap);
2386	return `0`;
2387	}
2388
2389	size_t matchedLength = usearch_getMatchedLength(searcher);
2390	ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size);
2391
2392	// If this match is "bad", move on to the next match.
2393	if (isBadMatch(m_buffer.data() + matchStart, matchedLength)
2394	\|\| (m_options.contains(AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))
2395	\|\| (m_options.contains(AtWordEnds) && !isWordEndMatch(matchStart, matchedLength))) {
2396	matchStart = usearch_next(searcher, &status);
2397	ASSERT(status == U_ZERO_ERROR);
2398	goto nextMatch;
2399	}
2400
2401	size_t newSize = size - (matchStart + `1`);
2402	memmove(m_buffer.data(), m_buffer.data() + matchStart + `1`, newSize * sizeof(UChar));
2403	m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + `1`);
2404	m_buffer.shrink(newSize);
2405
2406	start = size - matchStart;
2407	return matchedLength;
2408	}
2409
2410	#else
2411
2412	inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2413	: m_target(options & CaseInsensitive ? target.foldCase() : target)
2414	, m_options(options)
2415	, m_buffer(m_target.length())
2416	, m_isCharacterStartBuffer(m_target.length())
2417	, m_isBufferFull(false)
2418	, m_cursor(`0`)
2419	{
2420	ASSERT(!m_target.isEmpty());
2421	m_target.replace(noBreakSpace, `' '`);
2422	foldQuoteMarks(m_target);
2423	}
2424
2425	inline SearchBuffer::~SearchBuffer() = default;
2426
2427	inline void SearchBuffer::reachedBreak()
2428	{
2429	m_cursor = `0`;
2430	m_isBufferFull = false;
2431	}
2432
2433	inline bool SearchBuffer::atBreak() const
2434	{
2435	return !m_cursor && !m_isBufferFull;
2436	}
2437
2438	inline void SearchBuffer::append(UChar c, bool isStart)
2439	{
2440	m_buffer[m_cursor] = c == noBreakSpace ? `' '` : foldQuoteMark(c);
2441	m_isCharacterStartBuffer[m_cursor] = isStart;
2442	if (++m_cursor == m_target.length()) {
2443	m_cursor = `0`;
2444	m_isBufferFull = true;
2445	}
2446	}
2447
2448	inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2449	{
2450	ASSERT(length);
2451	if (!(m_options & CaseInsensitive)) {
2452	append(characters[`0`], true);
2453	return `1`;
2454	}
2455	const int maxFoldedCharacters = `16`; // sensible maximum is 3, this should be more than enough
2456	UChar foldedCharacters[maxFoldedCharacters];
2457	UErrorCode status = U_ZERO_ERROR;
2458	int numFoldedCharacters = u_strFoldCase(foldedCharacters, maxFoldedCharacters, characters, `1`, U_FOLD_CASE_DEFAULT, &status);
2459	ASSERT(U_SUCCESS(status));
2460	ASSERT(numFoldedCharacters);
2461	ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2462	if (U_SUCCESS(status) && numFoldedCharacters) {
2463	numFoldedCharacters = std::min(numFoldedCharacters, maxFoldedCharacters);
2464	append(foldedCharacters[`0`], true);
2465	for (int i = `1`; i < numFoldedCharacters; ++i)
2466	append(foldedCharacters[i], false);
2467	}
2468	return `1`;
2469	}
2470
2471	inline bool SearchBuffer::needsMoreContext() const
2472	{
2473	return false;
2474	}
2475
2476	void SearchBuffer::prependContext(const UChar*, size_t)
2477	{
2478	ASSERT_NOT_REACHED();
2479	}
2480
2481	inline size_t SearchBuffer::search(size_t& start)
2482	{
2483	if (!m_isBufferFull)
2484	return `0`;
2485	if (!m_isCharacterStartBuffer[m_cursor])
2486	return `0`;
2487
2488	size_t tailSpace = m_target.length() - m_cursor;
2489	if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != `0`)
2490	return `0`;
2491	if (memcmp(&m_buffer[`0`], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != `0`)
2492	return `0`;
2493
2494	start = length();
2495
2496	// Now that we've found a match once, we don't want to find it again, because those
2497	// are the SearchBuffer semantics, allowing for a buffer where you append more than one
2498	// character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2499	// we want to get rid of that in the future we could track this with a separate boolean
2500	// or even move the characters to the start of the buffer and set m_isBufferFull to false.
2501	m_isCharacterStartBuffer[m_cursor] = false;
2502
2503	return start;
2504	}
2505
2506	// Returns the number of characters that were appended to the buffer (what we are searching in).
2507	// That's not necessarily the same length as the passed-in target string, because case folding
2508	// can make two strings match even though they're not the same length.
2509	size_t SearchBuffer::length() const
2510	{
2511	size_t bufferSize = m_target.length();
2512	size_t length = `0`;
2513	for (size_t i = `0`; i < bufferSize; ++i)
2514	length += m_isCharacterStartBuffer[i];
2515	return length;
2516	}
2517
2518	#endif
2519
2520	// --------
2521
2522	int TextIterator::rangeLength(const Range* range, bool forSelectionPreservation)
2523	{
2524	unsigned length = `0`;
2525	for (TextIterator it(range, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2526	length += it.text().length();
2527	return length;
2528	}
2529
2530	Ref<Range> TextIterator::subrange(Range& entireRange, int characterOffset, int characterCount)
2531	{
2532	CharacterIterator entireRangeIterator(entireRange);
2533	return characterSubrange(entireRange.ownerDocument(), entireRangeIterator, characterOffset, characterCount);
2534	}
2535
2536	static inline bool isInsideReplacedElement(TextIterator& iterator)
2537	{
2538	ASSERT(!iterator.atEnd());
2539	ASSERT(iterator.text().length() == `1`);
2540	Node* node = iterator.node();
2541	return node && isRendererReplacedElement(node->renderer());
2542	}
2543
2544	RefPtr<Range> TextIterator::rangeFromLocationAndLength(ContainerNode* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2545	{
2546	Ref<Range> resultRange = scope->document().createRange();
2547
2548	int docTextPosition = `0`;
2549	int rangeEnd = rangeLocation + rangeLength;
2550	bool startRangeFound = false;
2551
2552	Ref<Range> textRunRange = rangeOfContents(*scope);
2553
2554	TextIterator it(textRunRange.ptr(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2555
2556	// FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2557	if (!rangeLocation && !rangeLength && it.atEnd()) {
2558	resultRange ->setStart(textRunRange ->startContainer(), `0`);
2559	resultRange ->setEnd(textRunRange ->startContainer(), `0`);
2560	return resultRange;
2561	}
2562
2563	for (; !it.atEnd(); it.advance()) {
2564	int length = it.text().length();
2565	textRunRange = it.range();
2566
2567	bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + length;
2568	bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + length;
2569
2570	if (foundEnd) {
2571	// FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2572	// position for emitted '\n's or if the renderer of the current node is a replaced element.
2573	if (length == `1` && (it.text()[`0`] == `'\n'` \|\| isInsideReplacedElement(it))) {
2574	it.advance();
2575	if (!it.atEnd()) {
2576	Ref<Range> range = it.range();
2577	textRunRange ->setEnd(range ->startContainer(), range ->startOffset());
2578	} else {
2579	Position runStart = textRunRange ->startPosition();
2580	Position runEnd = VisiblePosition (runStart).next().deepEquivalent();
2581	if (runEnd.isNotNull())
2582	textRunRange ->setEnd(*runEnd.containerNode(), runEnd.computeOffsetInContainerNode());
2583	}
2584	}
2585	}
2586
2587	if (foundStart) {
2588	startRangeFound = true;
2589	if (textRunRange ->startContainer().isTextNode()) {
2590	int offset = rangeLocation - docTextPosition;
2591	resultRange ->setStart(textRunRange ->startContainer(), offset + textRunRange ->startOffset());
2592	} else {
2593	if (rangeLocation == docTextPosition)
2594	resultRange ->setStart(textRunRange ->startContainer(), textRunRange ->startOffset());
2595	else
2596	resultRange ->setStart(textRunRange ->endContainer(), textRunRange ->endOffset());
2597	}
2598	}
2599
2600	if (foundEnd) {
2601	if (textRunRange ->startContainer().isTextNode()) {
2602	int offset = rangeEnd - docTextPosition;
2603	resultRange ->setEnd(textRunRange ->startContainer(), offset + textRunRange ->startOffset());
2604	} else {
2605	if (rangeEnd == docTextPosition)
2606	resultRange ->setEnd(textRunRange ->startContainer(), textRunRange ->startOffset());
2607	else
2608	resultRange ->setEnd(textRunRange ->endContainer(), textRunRange ->endOffset());
2609	}
2610	docTextPosition += length;
2611	break;
2612	}
2613
2614	docTextPosition += length;
2615	}
2616
2617	if (!startRangeFound)
2618	return nullptr;
2619
2620	if (rangeLength && rangeEnd > docTextPosition) // rangeEnd is out of bounds
2621	resultRange ->setEnd(textRunRange ->endContainer(), textRunRange ->endOffset());
2622
2623	return resultRange;
2624	}
2625
2626	bool TextIterator::getLocationAndLengthFromRange(Node* scope, const Range* range, size_t& location, size_t& length)
2627	{
2628	location = notFound;
2629	length = `0`;
2630
2631	// The critical assumption is that this only gets called with ranges that
2632	// concentrate on a given area containing the selection root. This is done
2633	// because of text fields and textareas. The DOM for those is not
2634	// directly in the document DOM, so ensure that the range does not cross a
2635	// boundary of one of those.
2636	if (&range->startContainer() != scope && !range->startContainer().isDescendantOf(scope))
2637	return false;
2638	if (&range->endContainer() != scope && !range->endContainer().isDescendantOf(scope))
2639	return false;
2640
2641	Ref<Range> testRange = Range::create(scope->document(), scope, `0`, &range->startContainer(), range->startOffset());
2642	ASSERT(&testRange ->startContainer() == scope);
2643	location = TextIterator::rangeLength(testRange.ptr());
2644
2645	testRange ->setEnd(range->endContainer(), range->endOffset());
2646	ASSERT(&testRange ->startContainer() == scope);
2647	length = TextIterator::rangeLength(testRange.ptr()) - location;
2648	return true;
2649	}
2650
2651	// --------
2652
2653	bool hasAnyPlainText(const Range& range, TextIteratorBehavior behavior)
2654	{
2655	for (TextIterator iterator { &range, behavior }; !iterator.atEnd(); iterator.advance()) {
2656	if (!iterator.text().isEmpty())
2657	return true;
2658	}
2659	return false;
2660	}
2661
2662	String plainText(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2663	{
2664	// The initial buffer size can be critical for performance: https://bugs.webkit.org/show_bug.cgi?id=81192
2665	static const unsigned initialCapacity = `1` << `15`;
2666
2667	if (!start.document())
2668	return { };
2669	auto document = makeRef(*start.document());
2670
2671	unsigned bufferLength = `0`;
2672	StringBuilder builder;
2673	builder.reserveCapacity(initialCapacity);
2674	TextIteratorBehavior behavior = defaultBehavior;
2675	if (!isDisplayString)
2676	behavior = static_cast<TextIteratorBehavior>(behavior \| TextIteratorEmitsTextsWithoutTranscoding);
2677
2678	for (TextIterator it(start, end, behavior); !it.atEnd(); it.advance()) {
2679	it.appendTextToStringBuilder(builder);
2680	bufferLength += it.text().length();
2681	}
2682
2683	if (!bufferLength)
2684	return emptyString();
2685
2686	String result = builder.toString();
2687
2688	if (isDisplayString)
2689	document ->displayStringModifiedByEncoding(result);
2690
2691	return result;
2692	}
2693
2694	String plainTextReplacingNoBreakSpace(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2695	{
2696	return plainText(start, end, defaultBehavior, isDisplayString).replace(noBreakSpace, `' '`);
2697	}
2698
2699	String plainText(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2700	{
2701	if (!range)
2702	return emptyString();
2703	return plainText(range->startPosition(), range->endPosition(), defaultBehavior, isDisplayString);
2704	}
2705
2706	String plainTextUsingBackwardsTextIteratorForTesting(const Range& range)
2707	{
2708	String result;
2709	for (SimplifiedBackwardsTextIterator backwardsIterator(range); !backwardsIterator.atEnd(); backwardsIterator.advance())
2710	result.insert(backwardsIterator.text().toString(), `0`);
2711	return result;
2712	}
2713
2714	String plainTextReplacingNoBreakSpace(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2715	{
2716	return plainText(range, defaultBehavior, isDisplayString).replace(noBreakSpace, `' '`);
2717	}
2718
2719	static Ref<Range> collapsedToBoundary(const Range& range, bool forward)
2720	{
2721	Ref<Range> result = range.cloneRange();
2722	result ->collapse(!forward);
2723	return result;
2724	}
2725
2726	static TextIteratorBehavior findIteratorOptions(FindOptions options)
2727	{
2728	TextIteratorBehavior iteratorOptions = TextIteratorEntersTextControls \| TextIteratorClipsToFrameAncestors;
2729	if (!options.contains(DoNotTraverseFlatTree))
2730	iteratorOptions \|= TextIteratorTraversesFlatTree;
2731	return iteratorOptions;
2732	}
2733
2734	static void findPlainTextMatches(const Range& range, const String& target, FindOptions options, const WTF::Function<bool(size_t, size_t)>& match)
2735	{
2736	SearchBuffer buffer(target, options);
2737	if (buffer.needsMoreContext()) {
2738	Ref<Range> beforeStartRange = range.ownerDocument().createRange();
2739	beforeStartRange ->setEnd(range.startContainer(), range.startOffset());
2740	for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2741	buffer.prependContext(backwardsIterator.text());
2742	if (!buffer.needsMoreContext())
2743	break;
2744	}
2745	}
2746
2747	CharacterIterator findIterator(range, findIteratorOptions(options));
2748	while (!findIterator.atEnd()) {
2749	findIterator.advance(buffer.append(findIterator.text()));
2750	while (`1`) {
2751	size_t matchStartOffset;
2752	size_t newMatchLength = buffer.search(matchStartOffset);
2753	if (!newMatchLength) {
2754	if (findIterator.atBreak() && !buffer.atBreak()) {
2755	buffer.reachedBreak();
2756	continue;
2757	}
2758	break;
2759	}
2760	size_t lastCharacterInBufferOffset = findIterator.characterOffset();
2761	ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2762	if (match (lastCharacterInBufferOffset - matchStartOffset, newMatchLength))
2763	return;
2764	}
2765	}
2766	}
2767
2768	static Ref<Range> rangeForMatch(const Range& range, FindOptions options, size_t matchStart, size_t matchLength, bool searchForward)
2769	{
2770	if (!matchLength)
2771	return collapsedToBoundary(range, searchForward);
2772	CharacterIterator rangeComputeIterator(range, findIteratorOptions(options));
2773	return characterSubrange(range.ownerDocument(), rangeComputeIterator, matchStart, matchLength);
2774	}
2775
2776	Ref<Range> findClosestPlainText(const Range& range, const String& target, FindOptions options, unsigned targetOffset)
2777	{
2778	size_t matchStart = `0`;
2779	size_t matchLength = `0`;
2780	size_t distance = std::numeric_limits<size_t>::max();
2781	auto match = [targetOffset, &distance, &matchStart, &matchLength] (size_t start, size_t length) {
2782	size_t newDistance = std::min(abs(static_cast<signed>(start - targetOffset)), abs(static_cast<signed>(start + length - targetOffset)));
2783	if (newDistance < distance) {
2784	matchStart = start;
2785	matchLength = length;
2786	distance = newDistance;
2787	}
2788	return false;
2789	};
2790
2791	findPlainTextMatches(range, target, options, WTFMove(match));
2792	return rangeForMatch(range, options, matchStart, matchLength, !options.contains(Backwards));
2793	}
2794
2795	Ref<Range> findPlainText(const Range& range, const String& target, FindOptions options)
2796	{
2797	bool searchForward = !options.contains(Backwards);
2798	size_t matchStart = `0`;
2799	size_t matchLength = `0`;
2800	auto match = [searchForward, &matchStart, &matchLength] (size_t start, size_t length) {
2801	matchStart = start;
2802	matchLength = length;
2803	// Look for the last match when searching backwards instead.
2804	return searchForward;
2805	};
2806
2807	findPlainTextMatches(range, target, options, WTFMove(match));
2808	return rangeForMatch(range, options, matchStart, matchLength, searchForward);
2809	}
2810
2811	bool findPlainText(const String& document, const String& target, FindOptions options)
2812	{
2813	SearchBuffer buffer { target, options };
2814	StringView remainingText { document };
2815	while (!remainingText.isEmpty()) {
2816	size_t charactersAppended = buffer.append(document);
2817	remainingText = remainingText.substring(charactersAppended);
2818	if (remainingText.isEmpty())
2819	buffer.reachedBreak();
2820	size_t matchStartOffset;
2821	if (buffer.search(matchStartOffset))
2822	return true;
2823	}
2824	return false;
2825	}
2826
2827	}
2828

Browse the source code of webkit/Source/WebCore/editing/TextIterator.cpp