| 1 | /* |
| 2 | * Copyright (C) 2004-2019 Apple Inc. All rights reserved. |
| 3 | * Copyright (C) 2005 Alexey Proskuryakov. |
| 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions |
| 7 | * are met: |
| 8 | * 1. Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer in the |
| 12 | * documentation and/or other materials provided with the distribution. |
| 13 | * |
| 14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| 15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| 18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| 22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 25 | */ |
| 26 | |
| 27 | #include "config.h" |
| 28 | #include "TextIterator.h" |
| 29 | |
| 30 | #include "ComposedTreeIterator.h" |
| 31 | #include "Document.h" |
| 32 | #include "Editing.h" |
| 33 | #include "FontCascade.h" |
| 34 | #include "Frame.h" |
| 35 | #include "HTMLBodyElement.h" |
| 36 | #include "HTMLElement.h" |
| 37 | #include "HTMLFrameOwnerElement.h" |
| 38 | #include "HTMLInputElement.h" |
| 39 | #include "HTMLLegendElement.h" |
| 40 | #include "HTMLMeterElement.h" |
| 41 | #include "HTMLNames.h" |
| 42 | #include "HTMLParagraphElement.h" |
| 43 | #include "HTMLProgressElement.h" |
| 44 | #include "HTMLSlotElement.h" |
| 45 | #include "HTMLTextAreaElement.h" |
| 46 | #include "HTMLTextFormControlElement.h" |
| 47 | #include "InlineTextBox.h" |
| 48 | #include "NodeTraversal.h" |
| 49 | #include "RenderImage.h" |
| 50 | #include "RenderIterator.h" |
| 51 | #include "RenderTableCell.h" |
| 52 | #include "RenderTableRow.h" |
| 53 | #include "RenderTextControl.h" |
| 54 | #include "RenderTextFragment.h" |
| 55 | #include "ShadowRoot.h" |
| 56 | #include "SimpleLineLayout.h" |
| 57 | #include "SimpleLineLayoutFunctions.h" |
| 58 | #include "SimpleLineLayoutResolver.h" |
| 59 | #include "TextBoundaries.h" |
| 60 | #include "TextControlInnerElements.h" |
| 61 | #include "VisiblePosition.h" |
| 62 | #include "VisibleUnits.h" |
| 63 | #include <unicode/unorm2.h> |
| 64 | #include <wtf/Function.h> |
| 65 | #include <wtf/text/CString.h> |
| 66 | #include <wtf/text/StringBuilder.h> |
| 67 | #include <wtf/text/TextBreakIterator.h> |
| 68 | #include <wtf/unicode/CharacterNames.h> |
| 69 | |
| 70 | #if !UCONFIG_NO_COLLATION |
| 71 | #include <unicode/usearch.h> |
| 72 | #include <wtf/text/TextBreakIteratorInternalICU.h> |
| 73 | #endif |
| 74 | |
| 75 | namespace WebCore { |
| 76 | |
| 77 | using namespace WTF::Unicode; |
| 78 | using namespace HTMLNames; |
| 79 | |
| 80 | // Buffer that knows how to compare with a search target. |
| 81 | // Keeps enough of the previous text to be able to search in the future, but no more. |
| 82 | // Non-breaking spaces are always equal to normal spaces. |
| 83 | // Case folding is also done if the CaseInsensitive option is specified. |
| 84 | // Matches are further filtered if the AtWordStarts option is specified, although some |
| 85 | // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well. |
| 86 | class SearchBuffer { |
| 87 | WTF_MAKE_NONCOPYABLE(SearchBuffer); |
| 88 | public: |
| 89 | SearchBuffer(const String& target, FindOptions); |
| 90 | ~SearchBuffer(); |
| 91 | |
| 92 | // Returns number of characters appended; guaranteed to be in the range [1, length]. |
| 93 | size_t append(StringView); |
| 94 | bool needsMoreContext() const; |
| 95 | void prependContext(StringView); |
| 96 | void reachedBreak(); |
| 97 | |
| 98 | // Result is the size in characters of what was found. |
| 99 | // And <startOffset> is the number of characters back to the start of what was found. |
| 100 | size_t search(size_t& startOffset); |
| 101 | bool atBreak() const; |
| 102 | |
| 103 | #if !UCONFIG_NO_COLLATION |
| 104 | |
| 105 | private: |
| 106 | bool isBadMatch(const UChar*, size_t length) const; |
| 107 | bool isWordStartMatch(size_t start, size_t length) const; |
| 108 | bool isWordEndMatch(size_t start, size_t length) const; |
| 109 | |
| 110 | const String m_target; |
| 111 | const StringView::UpconvertedCharacters m_targetCharacters; |
| 112 | FindOptions m_options; |
| 113 | |
| 114 | Vector<UChar> m_buffer; |
| 115 | size_t m_overlap; |
| 116 | size_t m_prefixLength; |
| 117 | bool m_atBreak; |
| 118 | bool m_needsMoreContext; |
| 119 | |
| 120 | const bool m_targetRequiresKanaWorkaround; |
| 121 | Vector<UChar> m_normalizedTarget; |
| 122 | mutable Vector<UChar> m_normalizedMatch; |
| 123 | |
| 124 | #else |
| 125 | |
| 126 | private: |
| 127 | void append(UChar, bool isCharacterStart); |
| 128 | size_t length() const; |
| 129 | |
| 130 | String m_target; |
| 131 | FindOptions m_options; |
| 132 | |
| 133 | Vector<UChar> m_buffer; |
| 134 | Vector<bool> m_isCharacterStartBuffer; |
| 135 | bool m_isBufferFull; |
| 136 | size_t m_cursor; |
| 137 | |
| 138 | #endif |
| 139 | }; |
| 140 | |
| 141 | // -------- |
| 142 | |
| 143 | static const unsigned bitsInWord = sizeof(unsigned) * 8; |
| 144 | static const unsigned bitInWordMask = bitsInWord - 1; |
| 145 | |
| 146 | BitStack::BitStack() |
| 147 | : m_size(0) |
| 148 | { |
| 149 | } |
| 150 | |
| 151 | BitStack::~BitStack() = default; |
| 152 | |
| 153 | void BitStack::push(bool bit) |
| 154 | { |
| 155 | unsigned index = m_size / bitsInWord; |
| 156 | unsigned shift = m_size & bitInWordMask; |
| 157 | if (!shift && index == m_words.size()) { |
| 158 | m_words.grow(index + 1); |
| 159 | m_words[index] = 0; |
| 160 | } |
| 161 | unsigned& word = m_words[index]; |
| 162 | unsigned mask = 1U << shift; |
| 163 | if (bit) |
| 164 | word |= mask; |
| 165 | else |
| 166 | word &= ~mask; |
| 167 | ++m_size; |
| 168 | } |
| 169 | |
| 170 | void BitStack::pop() |
| 171 | { |
| 172 | if (m_size) |
| 173 | --m_size; |
| 174 | } |
| 175 | |
| 176 | bool BitStack::top() const |
| 177 | { |
| 178 | if (!m_size) |
| 179 | return false; |
| 180 | unsigned shift = (m_size - 1) & bitInWordMask; |
| 181 | return m_words.last() & (1U << shift); |
| 182 | } |
| 183 | |
| 184 | unsigned BitStack::size() const |
| 185 | { |
| 186 | return m_size; |
| 187 | } |
| 188 | |
| 189 | // -------- |
| 190 | |
| 191 | // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees. |
| 192 | static Node* nextInPreOrderCrossingShadowBoundaries(Node& rangeEndContainer, int rangeEndOffset) |
| 193 | { |
| 194 | if (rangeEndOffset >= 0 && !rangeEndContainer.isCharacterDataNode()) { |
| 195 | if (Node* next = rangeEndContainer.traverseToChildAt(rangeEndOffset)) |
| 196 | return next; |
| 197 | } |
| 198 | for (Node* node = &rangeEndContainer; node; node = node->parentOrShadowHostNode()) { |
| 199 | if (Node* next = node->nextSibling()) |
| 200 | return next; |
| 201 | } |
| 202 | return nullptr; |
| 203 | } |
| 204 | |
| 205 | static inline bool fullyClipsContents(Node& node) |
| 206 | { |
| 207 | auto* renderer = node.renderer(); |
| 208 | if (!renderer) { |
| 209 | if (!is<Element>(node)) |
| 210 | return false; |
| 211 | return !downcast<Element>(node).hasDisplayContents(); |
| 212 | } |
| 213 | if (!is<RenderBox>(*renderer)) |
| 214 | return false; |
| 215 | auto& box = downcast<RenderBox>(*renderer); |
| 216 | if (!box.hasOverflowClip()) |
| 217 | return false; |
| 218 | |
| 219 | // Quirk to keep copy/paste in the CodeMirror editor version used in Jenkins working. |
| 220 | if (is<HTMLTextAreaElement>(node)) |
| 221 | return box.size().isEmpty(); |
| 222 | |
| 223 | return box.contentSize().isEmpty(); |
| 224 | } |
| 225 | |
| 226 | static inline bool ignoresContainerClip(Node& node) |
| 227 | { |
| 228 | auto* renderer = node.renderer(); |
| 229 | if (!renderer || renderer->isTextOrLineBreak()) |
| 230 | return false; |
| 231 | return renderer->style().hasOutOfFlowPosition(); |
| 232 | } |
| 233 | |
| 234 | static void pushFullyClippedState(BitStack& stack, Node& node) |
| 235 | { |
| 236 | // Push true if this node full clips its contents, or if a parent already has fully |
| 237 | // clipped and this is not a node that ignores its container's clip. |
| 238 | stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node))); |
| 239 | } |
| 240 | |
| 241 | static void setUpFullyClippedStack(BitStack& stack, Node& node) |
| 242 | { |
| 243 | // Put the nodes in a vector so we can iterate in reverse order. |
| 244 | // FIXME: This (and TextIterator in general) should use ComposedTreeIterator. |
| 245 | Vector<Node*, 100> ancestry; |
| 246 | for (Node* parent = node.parentOrShadowHostNode(); parent; parent = parent->parentOrShadowHostNode()) |
| 247 | ancestry.append(parent); |
| 248 | |
| 249 | // Call pushFullyClippedState on each node starting with the earliest ancestor. |
| 250 | size_t size = ancestry.size(); |
| 251 | for (size_t i = 0; i < size; ++i) |
| 252 | pushFullyClippedState(stack, *ancestry[size - i - 1]); |
| 253 | pushFullyClippedState(stack, node); |
| 254 | } |
| 255 | |
| 256 | static bool isClippedByFrameAncestor(const Document& document, TextIteratorBehavior behavior) |
| 257 | { |
| 258 | if (!(behavior & TextIteratorClipsToFrameAncestors)) |
| 259 | return false; |
| 260 | |
| 261 | for (auto* owner = document.ownerElement(); owner; owner = owner->document().ownerElement()) { |
| 262 | BitStack ownerClipStack; |
| 263 | setUpFullyClippedStack(ownerClipStack, *owner); |
| 264 | if (ownerClipStack.top()) |
| 265 | return true; |
| 266 | } |
| 267 | return false; |
| 268 | } |
| 269 | |
| 270 | // FIXME: editingIgnoresContent and isRendererReplacedElement try to do the same job. |
| 271 | // It's not good to have both of them. |
| 272 | bool isRendererReplacedElement(RenderObject* renderer) |
| 273 | { |
| 274 | if (!renderer) |
| 275 | return false; |
| 276 | |
| 277 | bool isAttachment = false; |
| 278 | #if ENABLE(ATTACHMENT_ELEMENT) |
| 279 | isAttachment = renderer->isAttachment(); |
| 280 | #endif |
| 281 | if (renderer->isImage() || renderer->isWidget() || renderer->isMedia() || isAttachment) |
| 282 | return true; |
| 283 | |
| 284 | if (is<Element>(renderer->node())) { |
| 285 | Element& element = downcast<Element>(*renderer->node()); |
| 286 | if (is<HTMLFormControlElement>(element) || is<HTMLLegendElement>(element) || is<HTMLProgressElement>(element) || element.hasTagName(meterTag)) |
| 287 | return true; |
| 288 | if (equalLettersIgnoringASCIICase(element.attributeWithoutSynchronization(roleAttr), "img" )) |
| 289 | return true; |
| 290 | } |
| 291 | |
| 292 | return false; |
| 293 | } |
| 294 | |
| 295 | // -------- |
| 296 | |
| 297 | inline void TextIteratorCopyableText::reset() |
| 298 | { |
| 299 | m_singleCharacter = 0; |
| 300 | m_string = String(); |
| 301 | m_offset = 0; |
| 302 | m_length = 0; |
| 303 | } |
| 304 | |
| 305 | inline void TextIteratorCopyableText::set(String&& string) |
| 306 | { |
| 307 | m_singleCharacter = 0; |
| 308 | m_string = WTFMove(string); |
| 309 | m_offset = 0; |
| 310 | m_length = m_string.length(); |
| 311 | } |
| 312 | |
| 313 | inline void TextIteratorCopyableText::set(String&& string, unsigned offset, unsigned length) |
| 314 | { |
| 315 | ASSERT(offset < string.length()); |
| 316 | ASSERT(length); |
| 317 | ASSERT(length <= string.length() - offset); |
| 318 | |
| 319 | m_singleCharacter = 0; |
| 320 | m_string = WTFMove(string); |
| 321 | m_offset = offset; |
| 322 | m_length = length; |
| 323 | } |
| 324 | |
| 325 | inline void TextIteratorCopyableText::set(UChar singleCharacter) |
| 326 | { |
| 327 | m_singleCharacter = singleCharacter; |
| 328 | m_string = String(); |
| 329 | m_offset = 0; |
| 330 | m_length = 0; |
| 331 | } |
| 332 | |
| 333 | void TextIteratorCopyableText::appendToStringBuilder(StringBuilder& builder) const |
| 334 | { |
| 335 | if (m_singleCharacter) |
| 336 | builder.append(m_singleCharacter); |
| 337 | else |
| 338 | builder.append(m_string, m_offset, m_length); |
| 339 | } |
| 340 | |
| 341 | // -------- |
| 342 | |
| 343 | |
| 344 | TextIterator::TextIterator(Position start, Position end, TextIteratorBehavior behavior) |
| 345 | : m_behavior(behavior) |
| 346 | { |
| 347 | if (start.isNull() || end.isNull()) |
| 348 | return; |
| 349 | ASSERT(comparePositions(start, end) <= 0); |
| 350 | |
| 351 | RELEASE_ASSERT(behavior & TextIteratorTraversesFlatTree || start.treeScope() == end.treeScope()); |
| 352 | |
| 353 | start.document()->updateLayoutIgnorePendingStylesheets(); |
| 354 | |
| 355 | // FIXME: Use Position / PositionIterator instead to avoid offset computation. |
| 356 | m_startContainer = start.containerNode(); |
| 357 | m_startOffset = start.computeOffsetInContainerNode(); |
| 358 | |
| 359 | m_endContainer = end.containerNode(); |
| 360 | m_endOffset = end.computeOffsetInContainerNode(); |
| 361 | |
| 362 | m_node = start.firstNode().get(); |
| 363 | if (!m_node) |
| 364 | return; |
| 365 | |
| 366 | init(); |
| 367 | } |
| 368 | |
| 369 | TextIterator::TextIterator(const Range* range, TextIteratorBehavior behavior) |
| 370 | : m_behavior(behavior) |
| 371 | { |
| 372 | if (!range) |
| 373 | return; |
| 374 | |
| 375 | range->ownerDocument().updateLayoutIgnorePendingStylesheets(); |
| 376 | |
| 377 | m_startContainer = &range->startContainer(); |
| 378 | |
| 379 | // Callers should be handing us well-formed ranges. If we discover that this isn't |
| 380 | // the case, we could consider changing this assertion to an early return. |
| 381 | ASSERT(range->boundaryPointsValid()); |
| 382 | |
| 383 | m_startOffset = range->startOffset(); |
| 384 | m_endContainer = &range->endContainer(); |
| 385 | m_endOffset = range->endOffset(); |
| 386 | |
| 387 | m_node = range->firstNode(); |
| 388 | if (!m_node) |
| 389 | return; |
| 390 | |
| 391 | init(); |
| 392 | } |
| 393 | |
| 394 | void TextIterator::init() |
| 395 | { |
| 396 | if (isClippedByFrameAncestor(m_node->document(), m_behavior)) |
| 397 | return; |
| 398 | |
| 399 | setUpFullyClippedStack(m_fullyClippedStack, *m_node); |
| 400 | |
| 401 | m_offset = m_node == m_startContainer ? m_startOffset : 0; |
| 402 | |
| 403 | m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(*m_endContainer, m_endOffset); |
| 404 | |
| 405 | m_positionNode = m_node; |
| 406 | |
| 407 | advance(); |
| 408 | } |
| 409 | |
| 410 | TextIterator::~TextIterator() = default; |
| 411 | |
| 412 | // FIXME: Use ComposedTreeIterator instead. These functions are more expensive because they might do O(n) work. |
| 413 | static inline Node* firstChild(TextIteratorBehavior options, Node& node) |
| 414 | { |
| 415 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
| 416 | return firstChildInComposedTreeIgnoringUserAgentShadow(node); |
| 417 | return node.firstChild(); |
| 418 | } |
| 419 | |
| 420 | static inline Node* nextSibling(TextIteratorBehavior options, Node& node) |
| 421 | { |
| 422 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
| 423 | return nextSiblingInComposedTreeIgnoringUserAgentShadow(node); |
| 424 | return node.nextSibling(); |
| 425 | } |
| 426 | |
| 427 | static inline Node* nextNode(TextIteratorBehavior options, Node& node) |
| 428 | { |
| 429 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
| 430 | return nextInComposedTreeIgnoringUserAgentShadow(node); |
| 431 | return NodeTraversal::next(node); |
| 432 | } |
| 433 | |
| 434 | static inline bool isDescendantOf(TextIteratorBehavior options, Node& node, Node& possibleAncestor) |
| 435 | { |
| 436 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
| 437 | return node.isDescendantOrShadowDescendantOf(&possibleAncestor); |
| 438 | return node.isDescendantOf(&possibleAncestor); |
| 439 | } |
| 440 | |
| 441 | static inline Node* parentNodeOrShadowHost(TextIteratorBehavior options, Node& node) |
| 442 | { |
| 443 | if (UNLIKELY(options & TextIteratorTraversesFlatTree)) |
| 444 | return node.parentInComposedTree(); |
| 445 | return node.parentOrShadowHostNode(); |
| 446 | } |
| 447 | |
| 448 | static inline bool hasDisplayContents(Node& node) |
| 449 | { |
| 450 | return is<Element>(node) && downcast<Element>(node).hasDisplayContents(); |
| 451 | } |
| 452 | |
| 453 | void TextIterator::advance() |
| 454 | { |
| 455 | ASSERT(!atEnd()); |
| 456 | |
| 457 | // reset the run information |
| 458 | m_positionNode = nullptr; |
| 459 | m_copyableText.reset(); |
| 460 | m_text = StringView(); |
| 461 | |
| 462 | // handle remembered node that needed a newline after the text node's newline |
| 463 | if (m_nodeForAdditionalNewline) { |
| 464 | // Emit the extra newline, and position it *inside* m_node, after m_node's |
| 465 | // contents, in case it's a block, in the same way that we position the first |
| 466 | // newline. The range for the emitted newline should start where the line |
| 467 | // break begins. |
| 468 | // FIXME: It would be cleaner if we emitted two newlines during the last |
| 469 | // iteration, instead of using m_needsAnotherNewline. |
| 470 | emitCharacter('\n', *m_nodeForAdditionalNewline->parentNode(), m_nodeForAdditionalNewline, 1, 1); |
| 471 | m_nodeForAdditionalNewline = nullptr; |
| 472 | return; |
| 473 | } |
| 474 | |
| 475 | if (!m_textBox && m_remainingTextBox) { |
| 476 | m_textBox = m_remainingTextBox; |
| 477 | m_remainingTextBox = nullptr; |
| 478 | m_firstLetterText = nullptr; |
| 479 | m_offset = 0; |
| 480 | } |
| 481 | // handle remembered text box |
| 482 | if (m_textBox) { |
| 483 | handleTextBox(); |
| 484 | if (m_positionNode) |
| 485 | return; |
| 486 | } |
| 487 | |
| 488 | while (m_node && m_node != m_pastEndNode) { |
| 489 | // if the range ends at offset 0 of an element, represent the |
| 490 | // position, but not the content, of that element e.g. if the |
| 491 | // node is a blockflow element, emit a newline that |
| 492 | // precedes the element |
| 493 | if (m_node == m_endContainer && !m_endOffset) { |
| 494 | representNodeOffsetZero(); |
| 495 | m_node = nullptr; |
| 496 | return; |
| 497 | } |
| 498 | |
| 499 | auto* renderer = m_node->renderer(); |
| 500 | if (!m_handledNode) { |
| 501 | if (!renderer) { |
| 502 | m_handledNode = true; |
| 503 | m_handledChildren = !hasDisplayContents(*m_node); |
| 504 | } else { |
| 505 | // handle current node according to its type |
| 506 | if (renderer->isText() && m_node->isTextNode()) |
| 507 | m_handledNode = handleTextNode(); |
| 508 | else if (isRendererReplacedElement(renderer)) |
| 509 | m_handledNode = handleReplacedElement(); |
| 510 | else |
| 511 | m_handledNode = handleNonTextNode(); |
| 512 | if (m_positionNode) |
| 513 | return; |
| 514 | } |
| 515 | } |
| 516 | |
| 517 | // find a new current node to handle in depth-first manner, |
| 518 | // calling exitNode() as we come back thru a parent node |
| 519 | Node* next = m_handledChildren ? nullptr : firstChild(m_behavior, *m_node); |
| 520 | m_offset = 0; |
| 521 | if (!next) { |
| 522 | next = nextSibling(m_behavior, *m_node); |
| 523 | if (!next) { |
| 524 | bool pastEnd = nextNode(m_behavior, *m_node) == m_pastEndNode; |
| 525 | Node* parentNode = parentNodeOrShadowHost(m_behavior, *m_node); |
| 526 | while (!next && parentNode) { |
| 527 | if ((pastEnd && parentNode == m_endContainer) || isDescendantOf(m_behavior, *m_endContainer, *parentNode)) |
| 528 | return; |
| 529 | bool haveRenderer = m_node->renderer(); |
| 530 | Node* exitedNode = m_node; |
| 531 | m_node = parentNode; |
| 532 | m_fullyClippedStack.pop(); |
| 533 | parentNode = parentNodeOrShadowHost(m_behavior, *m_node); |
| 534 | if (haveRenderer) |
| 535 | exitNode(exitedNode); |
| 536 | if (m_positionNode) { |
| 537 | m_handledNode = true; |
| 538 | m_handledChildren = true; |
| 539 | return; |
| 540 | } |
| 541 | next = nextSibling(m_behavior, *m_node); |
| 542 | if (next && m_node->renderer()) |
| 543 | exitNode(m_node); |
| 544 | } |
| 545 | } |
| 546 | m_fullyClippedStack.pop(); |
| 547 | } |
| 548 | |
| 549 | // set the new current node |
| 550 | m_node = next; |
| 551 | if (m_node) |
| 552 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
| 553 | m_handledNode = false; |
| 554 | m_handledChildren = false; |
| 555 | m_handledFirstLetter = false; |
| 556 | m_firstLetterText = nullptr; |
| 557 | |
| 558 | // how would this ever be? |
| 559 | if (m_positionNode) |
| 560 | return; |
| 561 | } |
| 562 | } |
| 563 | |
| 564 | static bool hasVisibleTextNode(RenderText& renderer) |
| 565 | { |
| 566 | if (renderer.style().visibility() == Visibility::Visible) |
| 567 | return true; |
| 568 | if (is<RenderTextFragment>(renderer)) { |
| 569 | if (auto firstLetter = downcast<RenderTextFragment>(renderer).firstLetter()) { |
| 570 | if (firstLetter->style().visibility() == Visibility::Visible) |
| 571 | return true; |
| 572 | } |
| 573 | } |
| 574 | return false; |
| 575 | } |
| 576 | |
| 577 | static unsigned textNodeOffsetInFlow(const Text& firstTextNodeInRange) |
| 578 | { |
| 579 | // Calculate the text offset for simple lines. |
| 580 | RenderObject* renderer = firstTextNodeInRange.renderer(); |
| 581 | if (!renderer) |
| 582 | return 0; |
| 583 | unsigned textOffset = 0; |
| 584 | for (renderer = renderer->previousSibling(); renderer; renderer = renderer->previousSibling()) { |
| 585 | if (is<RenderText>(renderer)) |
| 586 | textOffset += downcast<RenderText>(renderer)->text().length(); |
| 587 | } |
| 588 | return textOffset; |
| 589 | } |
| 590 | |
| 591 | static bool isNewLineOrTabCharacter(UChar character) |
| 592 | { |
| 593 | return character == '\n' || character == '\t'; |
| 594 | } |
| 595 | |
| 596 | bool TextIterator::handleTextNode() |
| 597 | { |
| 598 | Text& textNode = downcast<Text>(*m_node); |
| 599 | |
| 600 | if (m_fullyClippedStack.top() && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
| 601 | return false; |
| 602 | |
| 603 | auto& renderer = *textNode.renderer(); |
| 604 | m_lastTextNode = &textNode; |
| 605 | String rendererText = renderer.text(); |
| 606 | |
| 607 | // handle pre-formatted text |
| 608 | if (!renderer.style().collapseWhiteSpace()) { |
| 609 | int runStart = m_offset; |
| 610 | if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) { |
| 611 | emitCharacter(' ', textNode, nullptr, runStart, runStart); |
| 612 | return false; |
| 613 | } |
| 614 | if (!m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset) { |
| 615 | handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer)); |
| 616 | if (m_firstLetterText) { |
| 617 | String firstLetter = m_firstLetterText->text(); |
| 618 | emitText(textNode, *m_firstLetterText, m_offset, m_offset + firstLetter.length()); |
| 619 | m_firstLetterText = nullptr; |
| 620 | m_textBox = nullptr; |
| 621 | return false; |
| 622 | } |
| 623 | } |
| 624 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
| 625 | return false; |
| 626 | int rendererTextLength = rendererText.length(); |
| 627 | int end = (&textNode == m_endContainer) ? m_endOffset : INT_MAX; |
| 628 | int runEnd = std::min(rendererTextLength, end); |
| 629 | |
| 630 | if (runStart >= runEnd) |
| 631 | return true; |
| 632 | |
| 633 | emitText(textNode, renderer, runStart, runEnd); |
| 634 | return true; |
| 635 | } |
| 636 | |
| 637 | if (const auto* layout = renderer.simpleLineLayout()) { |
| 638 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
| 639 | return true; |
| 640 | ASSERT(renderer.parent()); |
| 641 | ASSERT(is<RenderBlockFlow>(*renderer.parent())); |
| 642 | const auto& blockFlow = downcast<RenderBlockFlow>(*renderer.parent()); |
| 643 | // Use the simple layout runs to iterate over the text content. |
| 644 | bool isNewTextNode = m_previousSimpleTextNodeInFlow && m_previousSimpleTextNodeInFlow != &textNode; |
| 645 | // Simple line layout run positions are all absolute to the parent flow. |
| 646 | // Offsetting is required when multiple renderers are present. |
| 647 | m_accumulatedSimpleTextLengthInFlow += isNewTextNode ? m_previousSimpleTextNodeInFlow->renderer()->text().length() : 0; |
| 648 | m_previousSimpleTextNodeInFlow = &textNode; |
| 649 | |
| 650 | unsigned endPosition = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : rendererText.length(); |
| 651 | if (!m_flowRunResolverCache || &m_flowRunResolverCache->flow() != &blockFlow) { |
| 652 | m_accumulatedSimpleTextLengthInFlow = m_flowRunResolverCache ? 0 : textNodeOffsetInFlow(textNode); |
| 653 | m_flowRunResolverCache = std::make_unique<SimpleLineLayout::RunResolver>(blockFlow, *layout); |
| 654 | } |
| 655 | // Skip to m_offset position. |
| 656 | auto range = m_flowRunResolverCache->rangeForRenderer(renderer); |
| 657 | auto it = range.begin(); |
| 658 | auto end = range.end(); |
| 659 | auto startPosition = static_cast<unsigned>(m_offset) + m_accumulatedSimpleTextLengthInFlow; |
| 660 | while (it != end && (*it).end() <= startPosition) |
| 661 | ++it; |
| 662 | if (m_nextRunNeedsWhitespace && rendererText[m_offset - 1] == '\n') { |
| 663 | emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1); |
| 664 | return it == end; |
| 665 | } |
| 666 | if (it == end) { |
| 667 | // Collapsed trailing whitespace. |
| 668 | m_offset = endPosition; |
| 669 | m_lastTextNodeEndedWithCollapsedSpace = true; |
| 670 | return true; |
| 671 | } |
| 672 | if (m_nextRunNeedsWhitespace) { |
| 673 | emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1); |
| 674 | return false; |
| 675 | } |
| 676 | // If the position we are looking for is to the left of the renderer's first run, it could mean that |
| 677 | // the runs and the renderers are out of sync (e.g. we skipped a renderer in between). |
| 678 | // Better bail out at this point. |
| 679 | auto run = *it; |
| 680 | if (run.start() > startPosition) { |
| 681 | ASSERT(m_flowRunResolverCache); |
| 682 | if (&(rendererForPosition(m_flowRunResolverCache->flowContents(), startPosition)) != &renderer) { |
| 683 | ASSERT_NOT_REACHED(); |
| 684 | return true; |
| 685 | } |
| 686 | } |
| 687 | ASSERT(run.end() - run.start() <= rendererText.length()); |
| 688 | // contentStart skips leading whitespace. |
| 689 | unsigned contentStart = std::max<unsigned>(m_offset, run.start() - m_accumulatedSimpleTextLengthInFlow); |
| 690 | unsigned contentEnd = std::min(endPosition, run.end() - m_accumulatedSimpleTextLengthInFlow); |
| 691 | ASSERT_WITH_SECURITY_IMPLICATION(contentStart <= contentEnd); |
| 692 | // Check if whitespace adjustment is needed when crossing renderer boundary. |
| 693 | if (isNewTextNode) { |
| 694 | bool lastCharacterIsNotWhitespace = m_lastCharacter && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter); |
| 695 | bool addTrailingWhitespaceForPrevious = m_lastTextNodeEndedWithCollapsedSpace && lastCharacterIsNotWhitespace; |
| 696 | bool leadingWhitespaceIsNeededForCurrent = contentStart > static_cast<unsigned>(m_offset) && lastCharacterIsNotWhitespace; |
| 697 | if (addTrailingWhitespaceForPrevious || leadingWhitespaceIsNeededForCurrent) { |
| 698 | emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1); |
| 699 | return false; |
| 700 | } |
| 701 | } |
| 702 | // \n \t single whitespace characters need replacing so that the new line/tab characters don't show up. |
| 703 | unsigned stopPosition = contentStart; |
| 704 | while (stopPosition < contentEnd && !isNewLineOrTabCharacter(rendererText[stopPosition])) |
| 705 | ++stopPosition; |
| 706 | // Emit the text up to the new line/tab character. |
| 707 | if (stopPosition < contentEnd) { |
| 708 | if (stopPosition == contentStart) { |
| 709 | emitCharacter(' ', textNode, nullptr, contentStart, contentStart + 1); |
| 710 | m_offset = contentStart + 1; |
| 711 | return false; |
| 712 | } |
| 713 | emitText(textNode, renderer, contentStart, stopPosition); |
| 714 | m_offset = stopPosition + 1; |
| 715 | m_nextRunNeedsWhitespace = true; |
| 716 | return false; |
| 717 | } |
| 718 | emitText(textNode, renderer, contentStart, contentEnd); |
| 719 | // When line ending with collapsed whitespace is present, we need to carry over one whitespace: foo(end of line)bar -> foo bar (otherwise we would end up with foobar). |
| 720 | m_nextRunNeedsWhitespace = run.isEndOfLine() && contentEnd < endPosition && renderer.style().isCollapsibleWhiteSpace(rendererText[contentEnd]); |
| 721 | m_offset = contentEnd; |
| 722 | return static_cast<unsigned>(m_offset) == endPosition; |
| 723 | } |
| 724 | |
| 725 | if (renderer.firstTextBox()) |
| 726 | m_textBox = renderer.firstTextBox(); |
| 727 | |
| 728 | bool shouldHandleFirstLetter = !m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset; |
| 729 | if (shouldHandleFirstLetter) |
| 730 | handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer)); |
| 731 | |
| 732 | if (!renderer.firstTextBox() && rendererText.length() && !shouldHandleFirstLetter) { |
| 733 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
| 734 | return false; |
| 735 | m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space |
| 736 | return true; |
| 737 | } |
| 738 | |
| 739 | // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text) |
| 740 | auto& boxesRenderer = m_firstLetterText ? *m_firstLetterText : renderer; |
| 741 | if (boxesRenderer.containsReversedText()) { |
| 742 | m_sortedTextBoxes.clear(); |
| 743 | for (InlineTextBox* textBox = boxesRenderer.firstTextBox(); textBox; textBox = textBox->nextTextBox()) |
| 744 | m_sortedTextBoxes.append(textBox); |
| 745 | std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart); |
| 746 | m_sortedTextBoxesPosition = 0; |
| 747 | m_textBox = m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0]; |
| 748 | } |
| 749 | |
| 750 | handleTextBox(); |
| 751 | return true; |
| 752 | } |
| 753 | |
| 754 | void TextIterator::handleTextBox() |
| 755 | { |
| 756 | Text& textNode = downcast<Text>(*m_node); |
| 757 | |
| 758 | auto& renderer = m_firstLetterText ? *m_firstLetterText : *textNode.renderer(); |
| 759 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) { |
| 760 | m_textBox = nullptr; |
| 761 | return; |
| 762 | } |
| 763 | String rendererText = renderer.text(); |
| 764 | unsigned start = m_offset; |
| 765 | unsigned end = (&textNode == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX; |
| 766 | while (m_textBox) { |
| 767 | unsigned textBoxStart = m_textBox->start(); |
| 768 | unsigned runStart = std::max(textBoxStart, start); |
| 769 | |
| 770 | // Check for collapsed space at the start of this run. |
| 771 | InlineTextBox* firstTextBox = renderer.containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0]) : renderer.firstTextBox(); |
| 772 | bool needSpace = m_lastTextNodeEndedWithCollapsedSpace || (m_textBox == firstTextBox && textBoxStart == runStart && runStart); |
| 773 | if (needSpace && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter) && m_lastCharacter) { |
| 774 | if (m_lastTextNode == &textNode && runStart && rendererText[runStart - 1] == ' ') { |
| 775 | unsigned spaceRunStart = runStart - 1; |
| 776 | while (spaceRunStart && rendererText[spaceRunStart - 1] == ' ') |
| 777 | --spaceRunStart; |
| 778 | emitText(textNode, renderer, spaceRunStart, spaceRunStart + 1); |
| 779 | } else |
| 780 | emitCharacter(' ', textNode, nullptr, runStart, runStart); |
| 781 | return; |
| 782 | } |
| 783 | unsigned textBoxEnd = textBoxStart + m_textBox->len(); |
| 784 | unsigned runEnd = std::min(textBoxEnd, end); |
| 785 | |
| 786 | // Determine what the next text box will be, but don't advance yet |
| 787 | InlineTextBox* nextTextBox = nullptr; |
| 788 | if (renderer.containsReversedText()) { |
| 789 | if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size()) |
| 790 | nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1]; |
| 791 | } else |
| 792 | nextTextBox = m_textBox->nextTextBox(); |
| 793 | ASSERT(!nextTextBox || &nextTextBox->renderer() == &renderer); |
| 794 | |
| 795 | if (runStart < runEnd) { |
| 796 | // Handle either a single newline character (which becomes a space), |
| 797 | // or a run of characters that does not include a newline. |
| 798 | // This effectively translates newlines to spaces without copying the text. |
| 799 | if (rendererText[runStart] == '\n') { |
| 800 | emitCharacter(' ', textNode, nullptr, runStart, runStart + 1); |
| 801 | m_offset = runStart + 1; |
| 802 | } else { |
| 803 | size_t subrunEnd = rendererText.find('\n', runStart); |
| 804 | if (subrunEnd == notFound || subrunEnd > runEnd) { |
| 805 | subrunEnd = runEnd; |
| 806 | bool lastSpaceCollapsedByNextNonTextBox = !nextTextBox && (m_behavior & TextIteratorBehavesAsIfNodesFollowing) && rendererText.length() > runEnd; |
| 807 | if (lastSpaceCollapsedByNextNonTextBox) |
| 808 | ++subrunEnd; // runEnd stopped before last space. Increment by one to restore the space. |
| 809 | } |
| 810 | m_offset = subrunEnd; |
| 811 | emitText(textNode, renderer, runStart, subrunEnd); |
| 812 | } |
| 813 | |
| 814 | // If we are doing a subrun that doesn't go to the end of the text box, |
| 815 | // come back again to finish handling this text box; don't advance to the next one. |
| 816 | if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd) |
| 817 | return; |
| 818 | |
| 819 | // Advance and return |
| 820 | unsigned nextRunStart = nextTextBox ? nextTextBox->start() : rendererText.length(); |
| 821 | if (nextRunStart > runEnd) |
| 822 | m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end |
| 823 | m_textBox = nextTextBox; |
| 824 | if (renderer.containsReversedText()) |
| 825 | ++m_sortedTextBoxesPosition; |
| 826 | return; |
| 827 | } |
| 828 | // Advance and continue |
| 829 | m_textBox = nextTextBox; |
| 830 | if (renderer.containsReversedText()) |
| 831 | ++m_sortedTextBoxesPosition; |
| 832 | } |
| 833 | if (!m_textBox && m_remainingTextBox) { |
| 834 | m_textBox = m_remainingTextBox; |
| 835 | m_remainingTextBox = nullptr; |
| 836 | m_firstLetterText = nullptr; |
| 837 | m_offset = 0; |
| 838 | handleTextBox(); |
| 839 | } |
| 840 | } |
| 841 | |
| 842 | static inline RenderText* firstRenderTextInFirstLetter(RenderBoxModelObject* firstLetter) |
| 843 | { |
| 844 | if (!firstLetter) |
| 845 | return nullptr; |
| 846 | |
| 847 | // FIXME: Should this check descendent objects? |
| 848 | return childrenOfType<RenderText>(*firstLetter).first(); |
| 849 | } |
| 850 | |
| 851 | void TextIterator::handleTextNodeFirstLetter(RenderTextFragment& renderer) |
| 852 | { |
| 853 | if (auto* firstLetter = renderer.firstLetter()) { |
| 854 | if (firstLetter->style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
| 855 | return; |
| 856 | if (auto* firstLetterText = firstRenderTextInFirstLetter(firstLetter)) { |
| 857 | m_handledFirstLetter = true; |
| 858 | m_remainingTextBox = m_textBox; |
| 859 | m_textBox = firstLetterText->firstTextBox(); |
| 860 | m_sortedTextBoxes.clear(); |
| 861 | m_firstLetterText = firstLetterText; |
| 862 | } |
| 863 | } |
| 864 | m_handledFirstLetter = true; |
| 865 | } |
| 866 | |
| 867 | bool TextIterator::handleReplacedElement() |
| 868 | { |
| 869 | if (m_fullyClippedStack.top()) |
| 870 | return false; |
| 871 | |
| 872 | auto& renderer = *m_node->renderer(); |
| 873 | if (renderer.style().visibility() != Visibility::Visible && !(m_behavior & TextIteratorIgnoresStyleVisibility)) |
| 874 | return false; |
| 875 | |
| 876 | if (m_lastTextNodeEndedWithCollapsedSpace) { |
| 877 | emitCharacter(' ', *m_lastTextNode->parentNode(), m_lastTextNode, 1, 1); |
| 878 | return false; |
| 879 | } |
| 880 | |
| 881 | if ((m_behavior & TextIteratorEntersTextControls) && is<RenderTextControl>(renderer)) { |
| 882 | if (auto innerTextElement = downcast<RenderTextControl>(renderer).textFormControlElement().innerTextElement()) { |
| 883 | m_node = innerTextElement->containingShadowRoot(); |
| 884 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
| 885 | m_offset = 0; |
| 886 | return false; |
| 887 | } |
| 888 | } |
| 889 | |
| 890 | m_hasEmitted = true; |
| 891 | |
| 892 | if ((m_behavior & TextIteratorEmitsObjectReplacementCharacters) && renderer.isReplaced()) { |
| 893 | emitCharacter(objectReplacementCharacter, *m_node->parentNode(), m_node, 0, 1); |
| 894 | // Don't process subtrees for embedded objects. If the text there is required, |
| 895 | // it must be explicitly asked by specifying a range falling inside its boundaries. |
| 896 | m_handledChildren = true; |
| 897 | return true; |
| 898 | } |
| 899 | |
| 900 | if (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) { |
| 901 | // We want replaced elements to behave like punctuation for boundary |
| 902 | // finding, and to simply take up space for the selection preservation |
| 903 | // code in moveParagraphs, so we use a comma. |
| 904 | emitCharacter(',', *m_node->parentNode(), m_node, 0, 1); |
| 905 | return true; |
| 906 | } |
| 907 | |
| 908 | m_positionNode = m_node->parentNode(); |
| 909 | m_positionOffsetBaseNode = m_node; |
| 910 | m_positionStartOffset = 0; |
| 911 | m_positionEndOffset = 1; |
| 912 | |
| 913 | if ((m_behavior & TextIteratorEmitsImageAltText) && is<RenderImage>(renderer)) { |
| 914 | String altText = downcast<RenderImage>(renderer).altText(); |
| 915 | if (unsigned length = altText.length()) { |
| 916 | m_lastCharacter = altText[length - 1]; |
| 917 | m_copyableText.set(WTFMove(altText)); |
| 918 | m_text = m_copyableText.text(); |
| 919 | return true; |
| 920 | } |
| 921 | } |
| 922 | |
| 923 | m_copyableText.reset(); |
| 924 | m_text = StringView(); |
| 925 | m_lastCharacter = 0; |
| 926 | return true; |
| 927 | } |
| 928 | |
| 929 | static bool shouldEmitTabBeforeNode(Node& node) |
| 930 | { |
| 931 | auto* renderer = node.renderer(); |
| 932 | |
| 933 | // Table cells are delimited by tabs. |
| 934 | if (!renderer || !isTableCell(&node)) |
| 935 | return false; |
| 936 | |
| 937 | // Want a tab before every cell other than the first one. |
| 938 | RenderTableCell& cell = downcast<RenderTableCell>(*renderer); |
| 939 | RenderTable* table = cell.table(); |
| 940 | return table && (table->cellBefore(&cell) || table->cellAbove(&cell)); |
| 941 | } |
| 942 | |
| 943 | static bool shouldEmitNewlineForNode(Node* node, bool emitsOriginalText) |
| 944 | { |
| 945 | auto* renderer = node->renderer(); |
| 946 | if (!(renderer ? renderer->isBR() : node->hasTagName(brTag))) |
| 947 | return false; |
| 948 | return emitsOriginalText || !(node->isInShadowTree() && is<HTMLInputElement>(*node->shadowHost())); |
| 949 | } |
| 950 | |
| 951 | static bool (HTMLElement& element) |
| 952 | { |
| 953 | return element.hasTagName(h1Tag) |
| 954 | || element.hasTagName(h2Tag) |
| 955 | || element.hasTagName(h3Tag) |
| 956 | || element.hasTagName(h4Tag) |
| 957 | || element.hasTagName(h5Tag) |
| 958 | || element.hasTagName(h6Tag); |
| 959 | } |
| 960 | |
| 961 | static bool shouldEmitNewlinesBeforeAndAfterNode(Node& node) |
| 962 | { |
| 963 | // Block flow (versus inline flow) is represented by having |
| 964 | // a newline both before and after the element. |
| 965 | auto* renderer = node.renderer(); |
| 966 | if (!renderer) { |
| 967 | if (!is<HTMLElement>(node)) |
| 968 | return false; |
| 969 | auto& element = downcast<HTMLElement>(node); |
| 970 | return hasHeaderTag(element) |
| 971 | || element.hasTagName(blockquoteTag) |
| 972 | || element.hasTagName(ddTag) |
| 973 | || element.hasTagName(divTag) |
| 974 | || element.hasTagName(dlTag) |
| 975 | || element.hasTagName(dtTag) |
| 976 | || element.hasTagName(hrTag) |
| 977 | || element.hasTagName(liTag) |
| 978 | || element.hasTagName(listingTag) |
| 979 | || element.hasTagName(olTag) |
| 980 | || element.hasTagName(pTag) |
| 981 | || element.hasTagName(preTag) |
| 982 | || element.hasTagName(trTag) |
| 983 | || element.hasTagName(ulTag); |
| 984 | } |
| 985 | |
| 986 | // Need to make an exception for table cells, because they are blocks, but we |
| 987 | // want them tab-delimited rather than having newlines before and after. |
| 988 | if (isTableCell(&node)) |
| 989 | return false; |
| 990 | |
| 991 | // Need to make an exception for table row elements, because they are neither |
| 992 | // "inline" or "RenderBlock", but we want newlines for them. |
| 993 | if (is<RenderTableRow>(*renderer)) { |
| 994 | RenderTable* table = downcast<RenderTableRow>(*renderer).table(); |
| 995 | if (table && !table->isInline()) |
| 996 | return true; |
| 997 | } |
| 998 | |
| 999 | return !renderer->isInline() |
| 1000 | && is<RenderBlock>(*renderer) |
| 1001 | && !renderer->isFloatingOrOutOfFlowPositioned() |
| 1002 | && !renderer->isBody() |
| 1003 | && !renderer->isRubyText(); |
| 1004 | } |
| 1005 | |
| 1006 | static bool shouldEmitNewlineAfterNode(Node& node, bool emitsCharactersBetweenAllVisiblePositions = false) |
| 1007 | { |
| 1008 | // FIXME: It should be better but slower to create a VisiblePosition here. |
| 1009 | if (!shouldEmitNewlinesBeforeAndAfterNode(node)) |
| 1010 | return false; |
| 1011 | |
| 1012 | // Don't emit a new line at the end of the document unless we're matching the behavior of VisiblePosition. |
| 1013 | if (emitsCharactersBetweenAllVisiblePositions) |
| 1014 | return true; |
| 1015 | Node* subsequentNode = &node; |
| 1016 | while ((subsequentNode = NodeTraversal::nextSkippingChildren(*subsequentNode))) { |
| 1017 | if (subsequentNode->renderer()) |
| 1018 | return true; |
| 1019 | } |
| 1020 | return false; |
| 1021 | } |
| 1022 | |
| 1023 | static bool shouldEmitNewlineBeforeNode(Node& node) |
| 1024 | { |
| 1025 | return shouldEmitNewlinesBeforeAndAfterNode(node); |
| 1026 | } |
| 1027 | |
| 1028 | static bool (Node& node) |
| 1029 | { |
| 1030 | // When there is a significant collapsed bottom margin, emit an extra |
| 1031 | // newline for a more realistic result. We end up getting the right |
| 1032 | // result even without margin collapsing. For example: <div><p>text</p></div> |
| 1033 | // will work right even if both the <div> and the <p> have bottom margins. |
| 1034 | |
| 1035 | auto* renderer = node.renderer(); |
| 1036 | if (!is<RenderBox>(renderer)) |
| 1037 | return false; |
| 1038 | |
| 1039 | // NOTE: We only do this for a select set of nodes, and WinIE appears not to do this at all. |
| 1040 | if (!is<HTMLElement>(node)) |
| 1041 | return false; |
| 1042 | |
| 1043 | HTMLElement& element = downcast<HTMLElement>(node); |
| 1044 | if (!hasHeaderTag(element) && !is<HTMLParagraphElement>(element)) |
| 1045 | return false; |
| 1046 | |
| 1047 | auto& renderBox = downcast<RenderBox>(*renderer); |
| 1048 | if (!renderBox.height()) |
| 1049 | return false; |
| 1050 | |
| 1051 | int bottomMargin = renderBox.collapsedMarginAfter(); |
| 1052 | int fontSize = renderBox.style().fontDescription().computedPixelSize(); |
| 1053 | return bottomMargin * 2 >= fontSize; |
| 1054 | } |
| 1055 | |
| 1056 | static int collapsedSpaceLength(RenderText& renderer, int textEnd) |
| 1057 | { |
| 1058 | StringImpl& text = renderer.text(); |
| 1059 | unsigned length = text.length(); |
| 1060 | for (unsigned i = textEnd; i < length; ++i) { |
| 1061 | if (!renderer.style().isCollapsibleWhiteSpace(text[i])) |
| 1062 | return i - textEnd; |
| 1063 | } |
| 1064 | return length - textEnd; |
| 1065 | } |
| 1066 | |
| 1067 | static int maxOffsetIncludingCollapsedSpaces(Node& node) |
| 1068 | { |
| 1069 | int offset = caretMaxOffset(node); |
| 1070 | if (auto* renderer = node.renderer()) { |
| 1071 | if (is<RenderText>(*renderer)) |
| 1072 | offset += collapsedSpaceLength(downcast<RenderText>(*renderer), offset); |
| 1073 | } |
| 1074 | return offset; |
| 1075 | } |
| 1076 | |
| 1077 | // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic). |
| 1078 | bool TextIterator::shouldRepresentNodeOffsetZero() |
| 1079 | { |
| 1080 | if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isTable()) |
| 1081 | return true; |
| 1082 | |
| 1083 | // Leave element positioned flush with start of a paragraph |
| 1084 | // (e.g. do not insert tab before a table cell at the start of a paragraph) |
| 1085 | if (m_lastCharacter == '\n') |
| 1086 | return false; |
| 1087 | |
| 1088 | // Otherwise, show the position if we have emitted any characters |
| 1089 | if (m_hasEmitted) |
| 1090 | return true; |
| 1091 | |
| 1092 | // We've not emitted anything yet. Generally, there is no need for any positioning then. |
| 1093 | // The only exception is when the element is visually not in the same line as |
| 1094 | // the start of the range (e.g. the range starts at the end of the previous paragraph). |
| 1095 | // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we |
| 1096 | // make quicker checks to possibly avoid that. Another check that we could make is |
| 1097 | // is whether the inline vs block flow changed since the previous visible element. |
| 1098 | // I think we're already in a special enough case that that won't be needed, tho. |
| 1099 | |
| 1100 | // No character needed if this is the first node in the range. |
| 1101 | if (m_node == m_startContainer) |
| 1102 | return false; |
| 1103 | |
| 1104 | // If we are outside the start container's subtree, assume we need to emit. |
| 1105 | // FIXME: m_startContainer could be an inline block |
| 1106 | if (!m_node->isDescendantOf(m_startContainer)) |
| 1107 | return true; |
| 1108 | |
| 1109 | // If we started as m_startContainer offset 0 and the current node is a descendant of |
| 1110 | // the start container, we already had enough context to correctly decide whether to |
| 1111 | // emit after a preceding block. We chose not to emit (m_hasEmitted is false), |
| 1112 | // so don't second guess that now. |
| 1113 | // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably |
| 1114 | // immaterial since we likely would have already emitted something by now. |
| 1115 | if (m_startOffset == 0) |
| 1116 | return false; |
| 1117 | |
| 1118 | // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning. |
| 1119 | // Additionally, if the range we are iterating over contains huge sections of unrendered content, |
| 1120 | // we would create VisiblePositions on every call to this function without this check. |
| 1121 | if (!m_node->renderer() || m_node->renderer()->style().visibility() != Visibility::Visible |
| 1122 | || (is<RenderBlockFlow>(*m_node->renderer()) && !downcast<RenderBlockFlow>(*m_node->renderer()).height() && !is<HTMLBodyElement>(*m_node))) |
| 1123 | return false; |
| 1124 | |
| 1125 | // The startPos.isNotNull() check is needed because the start could be before the body, |
| 1126 | // and in that case we'll get null. We don't want to put in newlines at the start in that case. |
| 1127 | // The currPos.isNotNull() check is needed because positions in non-HTML content |
| 1128 | // (like SVG) do not have visible positions, and we don't want to emit for them either. |
| 1129 | VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM); |
| 1130 | VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM); |
| 1131 | return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos); |
| 1132 | } |
| 1133 | |
| 1134 | bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node& node) |
| 1135 | { |
| 1136 | return node.renderer() && node.renderer()->isTable() && (node.renderer()->isInline() || (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)); |
| 1137 | } |
| 1138 | |
| 1139 | void TextIterator::representNodeOffsetZero() |
| 1140 | { |
| 1141 | // Emit a character to show the positioning of m_node. |
| 1142 | |
| 1143 | // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can |
| 1144 | // create VisiblePositions, which is expensive. So, we perform the inexpensive checks |
| 1145 | // on m_node to see if it necessitates emitting a character first and will early return |
| 1146 | // before encountering shouldRepresentNodeOffsetZero()s worse case behavior. |
| 1147 | if (shouldEmitTabBeforeNode(*m_node)) { |
| 1148 | if (shouldRepresentNodeOffsetZero()) |
| 1149 | emitCharacter('\t', *m_node->parentNode(), m_node, 0, 0); |
| 1150 | } else if (shouldEmitNewlineBeforeNode(*m_node)) { |
| 1151 | if (shouldRepresentNodeOffsetZero()) |
| 1152 | emitCharacter('\n', *m_node->parentNode(), m_node, 0, 0); |
| 1153 | } else if (shouldEmitSpaceBeforeAndAfterNode(*m_node)) { |
| 1154 | if (shouldRepresentNodeOffsetZero()) |
| 1155 | emitCharacter(' ', *m_node->parentNode(), m_node, 0, 0); |
| 1156 | } |
| 1157 | } |
| 1158 | |
| 1159 | bool TextIterator::handleNonTextNode() |
| 1160 | { |
| 1161 | if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText)) |
| 1162 | emitCharacter('\n', *m_node->parentNode(), m_node, 0, 1); |
| 1163 | else if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isHR()) |
| 1164 | emitCharacter(' ', *m_node->parentNode(), m_node, 0, 1); |
| 1165 | else |
| 1166 | representNodeOffsetZero(); |
| 1167 | |
| 1168 | return true; |
| 1169 | } |
| 1170 | |
| 1171 | void TextIterator::exitNode(Node* exitedNode) |
| 1172 | { |
| 1173 | // prevent emitting a newline when exiting a collapsed block at beginning of the range |
| 1174 | // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could |
| 1175 | // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and |
| 1176 | // therefore look like a blank line. |
| 1177 | if (!m_hasEmitted) |
| 1178 | return; |
| 1179 | |
| 1180 | // Emit with a position *inside* m_node, after m_node's contents, in |
| 1181 | // case it is a block, because the run should start where the |
| 1182 | // emitted character is positioned visually. |
| 1183 | Node* baseNode = exitedNode; |
| 1184 | // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making |
| 1185 | // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use |
| 1186 | // TextIterator in _web_attributedStringFromRange. |
| 1187 | // See <rdar://problem/5428427> for an example of how this mismatch will cause problems. |
| 1188 | if (m_lastTextNode && shouldEmitNewlineAfterNode(*m_node, m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)) { |
| 1189 | // use extra newline to represent margin bottom, as needed |
| 1190 | bool addNewline = shouldEmitExtraNewlineForNode(*m_node); |
| 1191 | |
| 1192 | // FIXME: We need to emit a '\n' as we leave an empty block(s) that |
| 1193 | // contain a VisiblePosition when doing selection preservation. |
| 1194 | if (m_lastCharacter != '\n') { |
| 1195 | // insert a newline with a position following this block's contents. |
| 1196 | emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1); |
| 1197 | // remember whether to later add a newline for the current node |
| 1198 | ASSERT(!m_nodeForAdditionalNewline); |
| 1199 | if (addNewline) |
| 1200 | m_nodeForAdditionalNewline = baseNode; |
| 1201 | } else if (addNewline) |
| 1202 | // insert a newline with a position following this block's contents. |
| 1203 | emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1); |
| 1204 | } |
| 1205 | |
| 1206 | // If nothing was emitted, see if we need to emit a space. |
| 1207 | if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(*m_node)) |
| 1208 | emitCharacter(' ', *baseNode->parentNode(), baseNode, 1, 1); |
| 1209 | } |
| 1210 | |
| 1211 | void TextIterator::emitCharacter(UChar character, Node& characterNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset) |
| 1212 | { |
| 1213 | m_hasEmitted = true; |
| 1214 | |
| 1215 | // remember information with which to construct the TextIterator::range() |
| 1216 | m_positionNode = &characterNode; |
| 1217 | m_positionOffsetBaseNode = offsetBaseNode; |
| 1218 | m_positionStartOffset = textStartOffset; |
| 1219 | m_positionEndOffset = textEndOffset; |
| 1220 | |
| 1221 | m_copyableText.set(character); |
| 1222 | m_text = m_copyableText.text(); |
| 1223 | m_lastCharacter = character; |
| 1224 | m_lastTextNodeEndedWithCollapsedSpace = false; |
| 1225 | m_nextRunNeedsWhitespace = false; |
| 1226 | } |
| 1227 | |
| 1228 | void TextIterator::emitText(Text& textNode, RenderText& renderer, int textStartOffset, int textEndOffset) |
| 1229 | { |
| 1230 | ASSERT(textStartOffset >= 0); |
| 1231 | ASSERT(textEndOffset >= 0); |
| 1232 | ASSERT(textStartOffset <= textEndOffset); |
| 1233 | |
| 1234 | // FIXME: This probably yields the wrong offsets when text-transform: lowercase turns a single character into two characters. |
| 1235 | String string = (m_behavior & TextIteratorEmitsOriginalText) ? renderer.originalText() |
| 1236 | : ((m_behavior & TextIteratorEmitsTextsWithoutTranscoding) ? renderer.textWithoutConvertingBackslashToYenSymbol() : renderer.text()); |
| 1237 | |
| 1238 | ASSERT(string.length() >= static_cast<unsigned>(textEndOffset)); |
| 1239 | |
| 1240 | m_positionNode = &textNode; |
| 1241 | m_positionOffsetBaseNode = nullptr; |
| 1242 | m_positionStartOffset = textStartOffset; |
| 1243 | m_positionEndOffset = textEndOffset; |
| 1244 | |
| 1245 | m_lastCharacter = string[textEndOffset - 1]; |
| 1246 | m_copyableText.set(WTFMove(string), textStartOffset, textEndOffset - textStartOffset); |
| 1247 | m_text = m_copyableText.text(); |
| 1248 | |
| 1249 | m_lastTextNodeEndedWithCollapsedSpace = false; |
| 1250 | m_nextRunNeedsWhitespace = false; |
| 1251 | m_hasEmitted = true; |
| 1252 | } |
| 1253 | |
| 1254 | Ref<Range> TextIterator::range() const |
| 1255 | { |
| 1256 | ASSERT(!atEnd()); |
| 1257 | |
| 1258 | // use the current run information, if we have it |
| 1259 | if (m_positionOffsetBaseNode) { |
| 1260 | unsigned index = m_positionOffsetBaseNode->computeNodeIndex(); |
| 1261 | m_positionStartOffset += index; |
| 1262 | m_positionEndOffset += index; |
| 1263 | m_positionOffsetBaseNode = nullptr; |
| 1264 | } |
| 1265 | return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); |
| 1266 | } |
| 1267 | |
| 1268 | Node* TextIterator::node() const |
| 1269 | { |
| 1270 | Ref<Range> = range(); |
| 1271 | |
| 1272 | Node& node = textRange->startContainer(); |
| 1273 | if (node.isCharacterDataNode()) |
| 1274 | return &node; |
| 1275 | |
| 1276 | return node.traverseToChildAt(textRange->startOffset()); |
| 1277 | } |
| 1278 | |
| 1279 | // -------- |
| 1280 | |
| 1281 | SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range& range) |
| 1282 | { |
| 1283 | range.ownerDocument().updateLayoutIgnorePendingStylesheets(); |
| 1284 | |
| 1285 | Node* startNode = &range.startContainer(); |
| 1286 | Node* endNode = &range.endContainer(); |
| 1287 | int startOffset = range.startOffset(); |
| 1288 | int endOffset = range.endOffset(); |
| 1289 | |
| 1290 | if (!startNode->isCharacterDataNode()) { |
| 1291 | if (startOffset >= 0 && startOffset < static_cast<int>(startNode->countChildNodes())) { |
| 1292 | startNode = startNode->traverseToChildAt(startOffset); |
| 1293 | startOffset = 0; |
| 1294 | } |
| 1295 | } |
| 1296 | if (!endNode->isCharacterDataNode()) { |
| 1297 | if (endOffset > 0 && endOffset <= static_cast<int>(endNode->countChildNodes())) { |
| 1298 | endNode = endNode->traverseToChildAt(endOffset - 1); |
| 1299 | endOffset = lastOffsetInNode(endNode); |
| 1300 | } |
| 1301 | } |
| 1302 | |
| 1303 | m_node = endNode; |
| 1304 | setUpFullyClippedStack(m_fullyClippedStack, *m_node); |
| 1305 | m_offset = endOffset; |
| 1306 | m_handledNode = false; |
| 1307 | m_handledChildren = endOffset == 0; |
| 1308 | |
| 1309 | m_startContainer = startNode; |
| 1310 | m_startOffset = startOffset; |
| 1311 | m_endContainer = endNode; |
| 1312 | m_endOffset = endOffset; |
| 1313 | |
| 1314 | m_positionNode = endNode; |
| 1315 | |
| 1316 | m_lastTextNode = nullptr; |
| 1317 | m_lastCharacter = '\n'; |
| 1318 | |
| 1319 | m_havePassedStartContainer = false; |
| 1320 | |
| 1321 | advance(); |
| 1322 | } |
| 1323 | |
| 1324 | void SimplifiedBackwardsTextIterator::advance() |
| 1325 | { |
| 1326 | ASSERT(!atEnd()); |
| 1327 | |
| 1328 | m_positionNode = nullptr; |
| 1329 | m_copyableText.reset(); |
| 1330 | m_text = StringView(); |
| 1331 | |
| 1332 | while (m_node && !m_havePassedStartContainer) { |
| 1333 | // Don't handle node if we start iterating at [node, 0]. |
| 1334 | if (!m_handledNode && !(m_node == m_endContainer && !m_endOffset)) { |
| 1335 | auto* renderer = m_node->renderer(); |
| 1336 | if (renderer && renderer->isText() && m_node->isTextNode()) { |
| 1337 | if (renderer->style().visibility() == Visibility::Visible && m_offset > 0) |
| 1338 | m_handledNode = handleTextNode(); |
| 1339 | } else if (renderer && (renderer->isImage() || renderer->isWidget())) { |
| 1340 | if (renderer->style().visibility() == Visibility::Visible && m_offset > 0) |
| 1341 | m_handledNode = handleReplacedElement(); |
| 1342 | } else |
| 1343 | m_handledNode = handleNonTextNode(); |
| 1344 | if (m_positionNode) |
| 1345 | return; |
| 1346 | } |
| 1347 | |
| 1348 | if (!m_handledChildren && m_node->hasChildNodes()) { |
| 1349 | m_node = m_node->lastChild(); |
| 1350 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
| 1351 | } else { |
| 1352 | // Exit empty containers as we pass over them or containers |
| 1353 | // where [container, 0] is where we started iterating. |
| 1354 | if (!m_handledNode && canHaveChildrenForEditing(*m_node) && m_node->parentNode() && (!m_node->lastChild() || (m_node == m_endContainer && !m_endOffset))) { |
| 1355 | exitNode(); |
| 1356 | if (m_positionNode) { |
| 1357 | m_handledNode = true; |
| 1358 | m_handledChildren = true; |
| 1359 | return; |
| 1360 | } |
| 1361 | } |
| 1362 | |
| 1363 | // Exit all other containers. |
| 1364 | while (!m_node->previousSibling()) { |
| 1365 | if (!advanceRespectingRange(m_node->parentOrShadowHostNode())) |
| 1366 | break; |
| 1367 | m_fullyClippedStack.pop(); |
| 1368 | exitNode(); |
| 1369 | if (m_positionNode) { |
| 1370 | m_handledNode = true; |
| 1371 | m_handledChildren = true; |
| 1372 | return; |
| 1373 | } |
| 1374 | } |
| 1375 | |
| 1376 | m_fullyClippedStack.pop(); |
| 1377 | if (advanceRespectingRange(m_node->previousSibling())) |
| 1378 | pushFullyClippedState(m_fullyClippedStack, *m_node); |
| 1379 | else |
| 1380 | m_node = nullptr; |
| 1381 | } |
| 1382 | |
| 1383 | // For the purpose of word boundary detection, |
| 1384 | // we should iterate all visible text and trailing (collapsed) whitespaces. |
| 1385 | m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(*m_node) : 0; |
| 1386 | m_handledNode = false; |
| 1387 | m_handledChildren = false; |
| 1388 | |
| 1389 | if (m_positionNode) |
| 1390 | return; |
| 1391 | } |
| 1392 | } |
| 1393 | |
| 1394 | bool SimplifiedBackwardsTextIterator::handleTextNode() |
| 1395 | { |
| 1396 | Text& textNode = downcast<Text>(*m_node); |
| 1397 | |
| 1398 | m_lastTextNode = &textNode; |
| 1399 | |
| 1400 | int startOffset; |
| 1401 | int offsetInNode; |
| 1402 | RenderText* renderer = handleFirstLetter(startOffset, offsetInNode); |
| 1403 | if (!renderer) |
| 1404 | return true; |
| 1405 | |
| 1406 | String text = renderer->text(); |
| 1407 | if (!renderer->hasRenderedText() && text.length()) |
| 1408 | return true; |
| 1409 | |
| 1410 | if (startOffset + offsetInNode == m_offset) { |
| 1411 | ASSERT(!m_shouldHandleFirstLetter); |
| 1412 | return true; |
| 1413 | } |
| 1414 | |
| 1415 | m_positionEndOffset = m_offset; |
| 1416 | m_offset = startOffset + offsetInNode; |
| 1417 | m_positionNode = m_node; |
| 1418 | m_positionStartOffset = m_offset; |
| 1419 | |
| 1420 | ASSERT(m_positionStartOffset < m_positionEndOffset); |
| 1421 | ASSERT(m_positionStartOffset - offsetInNode >= 0); |
| 1422 | ASSERT(m_positionEndOffset - offsetInNode > 0); |
| 1423 | ASSERT(m_positionEndOffset - offsetInNode <= static_cast<int>(text.length())); |
| 1424 | |
| 1425 | m_lastCharacter = text[m_positionEndOffset - offsetInNode - 1]; |
| 1426 | m_copyableText.set(WTFMove(text), m_positionStartOffset - offsetInNode, m_positionEndOffset - m_positionStartOffset); |
| 1427 | m_text = m_copyableText.text(); |
| 1428 | |
| 1429 | return !m_shouldHandleFirstLetter; |
| 1430 | } |
| 1431 | |
| 1432 | RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode) |
| 1433 | { |
| 1434 | RenderText& renderer = downcast<RenderText>(*m_node->renderer()); |
| 1435 | startOffset = (m_node == m_startContainer) ? m_startOffset : 0; |
| 1436 | |
| 1437 | if (!is<RenderTextFragment>(renderer)) { |
| 1438 | offsetInNode = 0; |
| 1439 | return &renderer; |
| 1440 | } |
| 1441 | |
| 1442 | RenderTextFragment& fragment = downcast<RenderTextFragment>(renderer); |
| 1443 | int offsetAfterFirstLetter = fragment.start(); |
| 1444 | if (startOffset >= offsetAfterFirstLetter) { |
| 1445 | ASSERT(!m_shouldHandleFirstLetter); |
| 1446 | offsetInNode = offsetAfterFirstLetter; |
| 1447 | return &renderer; |
| 1448 | } |
| 1449 | |
| 1450 | if (!m_shouldHandleFirstLetter && startOffset + offsetAfterFirstLetter < m_offset) { |
| 1451 | m_shouldHandleFirstLetter = true; |
| 1452 | offsetInNode = offsetAfterFirstLetter; |
| 1453 | return &renderer; |
| 1454 | } |
| 1455 | |
| 1456 | m_shouldHandleFirstLetter = false; |
| 1457 | offsetInNode = 0; |
| 1458 | RenderText* firstLetterRenderer = firstRenderTextInFirstLetter(fragment.firstLetter()); |
| 1459 | |
| 1460 | m_offset = firstLetterRenderer->caretMaxOffset(); |
| 1461 | m_offset += collapsedSpaceLength(*firstLetterRenderer, m_offset); |
| 1462 | |
| 1463 | return firstLetterRenderer; |
| 1464 | } |
| 1465 | |
| 1466 | bool SimplifiedBackwardsTextIterator::handleReplacedElement() |
| 1467 | { |
| 1468 | unsigned index = m_node->computeNodeIndex(); |
| 1469 | // We want replaced elements to behave like punctuation for boundary |
| 1470 | // finding, and to simply take up space for the selection preservation |
| 1471 | // code in moveParagraphs, so we use a comma. Unconditionally emit |
| 1472 | // here because this iterator is only used for boundary finding. |
| 1473 | emitCharacter(',', *m_node->parentNode(), index, index + 1); |
| 1474 | return true; |
| 1475 | } |
| 1476 | |
| 1477 | bool SimplifiedBackwardsTextIterator::handleNonTextNode() |
| 1478 | { |
| 1479 | // We can use a linefeed in place of a tab because this simple iterator is only used to |
| 1480 | // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs. |
| 1481 | if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineAfterNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) { |
| 1482 | if (m_lastCharacter != '\n') { |
| 1483 | // Corresponds to the same check in TextIterator::exitNode. |
| 1484 | unsigned index = m_node->computeNodeIndex(); |
| 1485 | // The start of this emitted range is wrong. Ensuring correctness would require |
| 1486 | // VisiblePositions and so would be slow. previousBoundary expects this. |
| 1487 | emitCharacter('\n', *m_node->parentNode(), index + 1, index + 1); |
| 1488 | } |
| 1489 | } |
| 1490 | return true; |
| 1491 | } |
| 1492 | |
| 1493 | void SimplifiedBackwardsTextIterator::exitNode() |
| 1494 | { |
| 1495 | if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineBeforeNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) { |
| 1496 | // The start of this emitted range is wrong. Ensuring correctness would require |
| 1497 | // VisiblePositions and so would be slow. previousBoundary expects this. |
| 1498 | emitCharacter('\n', *m_node, 0, 0); |
| 1499 | } |
| 1500 | } |
| 1501 | |
| 1502 | void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node& node, int startOffset, int endOffset) |
| 1503 | { |
| 1504 | m_positionNode = &node; |
| 1505 | m_positionStartOffset = startOffset; |
| 1506 | m_positionEndOffset = endOffset; |
| 1507 | m_copyableText.set(c); |
| 1508 | m_text = m_copyableText.text(); |
| 1509 | m_lastCharacter = c; |
| 1510 | } |
| 1511 | |
| 1512 | bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next) |
| 1513 | { |
| 1514 | if (!next) |
| 1515 | return false; |
| 1516 | m_havePassedStartContainer |= m_node == m_startContainer; |
| 1517 | if (m_havePassedStartContainer) |
| 1518 | return false; |
| 1519 | m_node = next; |
| 1520 | return true; |
| 1521 | } |
| 1522 | |
| 1523 | Ref<Range> SimplifiedBackwardsTextIterator::range() const |
| 1524 | { |
| 1525 | ASSERT(!atEnd()); |
| 1526 | |
| 1527 | return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); |
| 1528 | } |
| 1529 | |
| 1530 | // -------- |
| 1531 | |
| 1532 | CharacterIterator::CharacterIterator(const Range& range, TextIteratorBehavior behavior) |
| 1533 | : m_underlyingIterator(&range, behavior) |
| 1534 | { |
| 1535 | while (!atEnd() && !m_underlyingIterator.text().length()) |
| 1536 | m_underlyingIterator.advance(); |
| 1537 | } |
| 1538 | |
| 1539 | CharacterIterator::CharacterIterator(Position start, Position end, TextIteratorBehavior behavior) |
| 1540 | : m_underlyingIterator(start, end, behavior) |
| 1541 | { |
| 1542 | while (!atEnd() && !m_underlyingIterator.text().length()) |
| 1543 | m_underlyingIterator.advance(); |
| 1544 | } |
| 1545 | |
| 1546 | Ref<Range> CharacterIterator::range() const |
| 1547 | { |
| 1548 | Ref<Range> range = m_underlyingIterator.range(); |
| 1549 | if (!m_underlyingIterator.atEnd()) { |
| 1550 | if (m_underlyingIterator.text().length() <= 1) { |
| 1551 | ASSERT(m_runOffset == 0); |
| 1552 | } else { |
| 1553 | Node& node = range->startContainer(); |
| 1554 | ASSERT(&node == &range->endContainer()); |
| 1555 | int offset = range->startOffset() + m_runOffset; |
| 1556 | range->setStart(node, offset); |
| 1557 | range->setEnd(node, offset + 1); |
| 1558 | } |
| 1559 | } |
| 1560 | return range; |
| 1561 | } |
| 1562 | |
| 1563 | void CharacterIterator::advance(int count) |
| 1564 | { |
| 1565 | if (count <= 0) { |
| 1566 | ASSERT(count == 0); |
| 1567 | return; |
| 1568 | } |
| 1569 | |
| 1570 | m_atBreak = false; |
| 1571 | |
| 1572 | // easy if there is enough left in the current m_underlyingIterator run |
| 1573 | int remaining = m_underlyingIterator.text().length() - m_runOffset; |
| 1574 | if (count < remaining) { |
| 1575 | m_runOffset += count; |
| 1576 | m_offset += count; |
| 1577 | return; |
| 1578 | } |
| 1579 | |
| 1580 | // exhaust the current m_underlyingIterator run |
| 1581 | count -= remaining; |
| 1582 | m_offset += remaining; |
| 1583 | |
| 1584 | // move to a subsequent m_underlyingIterator run |
| 1585 | for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) { |
| 1586 | int runLength = m_underlyingIterator.text().length(); |
| 1587 | if (!runLength) |
| 1588 | m_atBreak = true; |
| 1589 | else { |
| 1590 | // see whether this is m_underlyingIterator to use |
| 1591 | if (count < runLength) { |
| 1592 | m_runOffset = count; |
| 1593 | m_offset += count; |
| 1594 | return; |
| 1595 | } |
| 1596 | |
| 1597 | // exhaust this m_underlyingIterator run |
| 1598 | count -= runLength; |
| 1599 | m_offset += runLength; |
| 1600 | } |
| 1601 | } |
| 1602 | |
| 1603 | // ran to the end of the m_underlyingIterator... no more runs left |
| 1604 | m_atBreak = true; |
| 1605 | m_runOffset = 0; |
| 1606 | } |
| 1607 | |
| 1608 | static Ref<Range> characterSubrange(Document& document, CharacterIterator& it, int offset, int length) |
| 1609 | { |
| 1610 | it.advance(offset); |
| 1611 | if (it.atEnd()) |
| 1612 | return Range::create(document); |
| 1613 | |
| 1614 | Ref<Range> start = it.range(); |
| 1615 | |
| 1616 | if (length > 1) |
| 1617 | it.advance(length - 1); |
| 1618 | if (it.atEnd()) |
| 1619 | return Range::create(document); |
| 1620 | |
| 1621 | Ref<Range> end = it.range(); |
| 1622 | |
| 1623 | return Range::create(document, &start->startContainer(), start->startOffset(), &end->endContainer(), end->endOffset()); |
| 1624 | } |
| 1625 | |
| 1626 | BackwardsCharacterIterator::BackwardsCharacterIterator(const Range& range) |
| 1627 | : m_underlyingIterator(range) |
| 1628 | , m_offset(0) |
| 1629 | , m_runOffset(0) |
| 1630 | , m_atBreak(true) |
| 1631 | { |
| 1632 | while (!atEnd() && !m_underlyingIterator.text().length()) |
| 1633 | m_underlyingIterator.advance(); |
| 1634 | } |
| 1635 | |
| 1636 | Ref<Range> BackwardsCharacterIterator::range() const |
| 1637 | { |
| 1638 | Ref<Range> r = m_underlyingIterator.range(); |
| 1639 | if (!m_underlyingIterator.atEnd()) { |
| 1640 | if (m_underlyingIterator.text().length() <= 1) |
| 1641 | ASSERT(m_runOffset == 0); |
| 1642 | else { |
| 1643 | Node& node = r->startContainer(); |
| 1644 | ASSERT(&node == &r->endContainer()); |
| 1645 | int offset = r->endOffset() - m_runOffset; |
| 1646 | r->setStart(node, offset - 1); |
| 1647 | r->setEnd(node, offset); |
| 1648 | } |
| 1649 | } |
| 1650 | return r; |
| 1651 | } |
| 1652 | |
| 1653 | void BackwardsCharacterIterator::advance(int count) |
| 1654 | { |
| 1655 | if (count <= 0) { |
| 1656 | ASSERT(!count); |
| 1657 | return; |
| 1658 | } |
| 1659 | |
| 1660 | m_atBreak = false; |
| 1661 | |
| 1662 | int remaining = m_underlyingIterator.text().length() - m_runOffset; |
| 1663 | if (count < remaining) { |
| 1664 | m_runOffset += count; |
| 1665 | m_offset += count; |
| 1666 | return; |
| 1667 | } |
| 1668 | |
| 1669 | count -= remaining; |
| 1670 | m_offset += remaining; |
| 1671 | |
| 1672 | for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) { |
| 1673 | int runLength = m_underlyingIterator.text().length(); |
| 1674 | if (runLength == 0) |
| 1675 | m_atBreak = true; |
| 1676 | else { |
| 1677 | if (count < runLength) { |
| 1678 | m_runOffset = count; |
| 1679 | m_offset += count; |
| 1680 | return; |
| 1681 | } |
| 1682 | |
| 1683 | count -= runLength; |
| 1684 | m_offset += runLength; |
| 1685 | } |
| 1686 | } |
| 1687 | |
| 1688 | m_atBreak = true; |
| 1689 | m_runOffset = 0; |
| 1690 | } |
| 1691 | |
| 1692 | // -------- |
| 1693 | |
| 1694 | WordAwareIterator::WordAwareIterator(const Range& range) |
| 1695 | : m_underlyingIterator(&range) |
| 1696 | , m_didLookAhead(true) // so we consider the first chunk from the text iterator |
| 1697 | { |
| 1698 | advance(); // get in position over the first chunk of text |
| 1699 | } |
| 1700 | |
| 1701 | // We're always in one of these modes: |
| 1702 | // - The current chunk in the text iterator is our current chunk |
| 1703 | // (typically its a piece of whitespace, or text that ended with whitespace) |
| 1704 | // - The previous chunk in the text iterator is our current chunk |
| 1705 | // (we looked ahead to the next chunk and found a word boundary) |
| 1706 | // - We built up our own chunk of text from many chunks from the text iterator |
| 1707 | |
| 1708 | // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries. |
| 1709 | |
| 1710 | void WordAwareIterator::advance() |
| 1711 | { |
| 1712 | m_previousText.reset(); |
| 1713 | m_buffer.clear(); |
| 1714 | |
| 1715 | // If last time we did a look-ahead, start with that looked-ahead chunk now |
| 1716 | if (!m_didLookAhead) { |
| 1717 | ASSERT(!m_underlyingIterator.atEnd()); |
| 1718 | m_underlyingIterator.advance(); |
| 1719 | } |
| 1720 | m_didLookAhead = false; |
| 1721 | |
| 1722 | // Go to next non-empty chunk |
| 1723 | while (!m_underlyingIterator.atEnd() && !m_underlyingIterator.text().length()) |
| 1724 | m_underlyingIterator.advance(); |
| 1725 | if (m_underlyingIterator.atEnd()) |
| 1726 | return; |
| 1727 | |
| 1728 | while (1) { |
| 1729 | // If this chunk ends in whitespace we can just use it as our chunk. |
| 1730 | if (isSpaceOrNewline(m_underlyingIterator.text()[m_underlyingIterator.text().length() - 1])) |
| 1731 | return; |
| 1732 | |
| 1733 | // If this is the first chunk that failed, save it in previousText before look ahead |
| 1734 | if (m_buffer.isEmpty()) |
| 1735 | m_previousText = m_underlyingIterator.copyableText(); |
| 1736 | |
| 1737 | // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff |
| 1738 | m_underlyingIterator.advance(); |
| 1739 | if (m_underlyingIterator.atEnd() || !m_underlyingIterator.text().length() || isSpaceOrNewline(m_underlyingIterator.text()[0])) { |
| 1740 | m_didLookAhead = true; |
| 1741 | return; |
| 1742 | } |
| 1743 | |
| 1744 | if (m_buffer.isEmpty()) { |
| 1745 | // Start gobbling chunks until we get to a suitable stopping point |
| 1746 | append(m_buffer, m_previousText.text()); |
| 1747 | m_previousText.reset(); |
| 1748 | } |
| 1749 | append(m_buffer, m_underlyingIterator.text()); |
| 1750 | } |
| 1751 | } |
| 1752 | |
| 1753 | StringView WordAwareIterator::text() const |
| 1754 | { |
| 1755 | if (!m_buffer.isEmpty()) |
| 1756 | return StringView(m_buffer.data(), m_buffer.size()); |
| 1757 | if (m_previousText.text().length()) |
| 1758 | return m_previousText.text(); |
| 1759 | return m_underlyingIterator.text(); |
| 1760 | } |
| 1761 | |
| 1762 | // -------- |
| 1763 | |
| 1764 | static inline UChar foldQuoteMark(UChar c) |
| 1765 | { |
| 1766 | switch (c) { |
| 1767 | case hebrewPunctuationGershayim: |
| 1768 | case leftDoubleQuotationMark: |
| 1769 | case leftLowDoubleQuotationMark: |
| 1770 | case rightDoubleQuotationMark: |
| 1771 | return '"'; |
| 1772 | case hebrewPunctuationGeresh: |
| 1773 | case leftSingleQuotationMark: |
| 1774 | case leftLowSingleQuotationMark: |
| 1775 | case rightSingleQuotationMark: |
| 1776 | return '\''; |
| 1777 | default: |
| 1778 | return c; |
| 1779 | } |
| 1780 | } |
| 1781 | |
| 1782 | // FIXME: We'd like to tailor the searcher to fold quote marks for us instead |
| 1783 | // of doing it in a separate replacement pass here, but ICU doesn't offer a way |
| 1784 | // to add tailoring on top of the locale-specific tailoring as of this writing. |
| 1785 | static inline String foldQuoteMarks(String string) |
| 1786 | { |
| 1787 | string.replace(hebrewPunctuationGeresh, '\''); |
| 1788 | string.replace(hebrewPunctuationGershayim, '"'); |
| 1789 | string.replace(leftDoubleQuotationMark, '"'); |
| 1790 | string.replace(leftLowDoubleQuotationMark, '"'); |
| 1791 | string.replace(leftSingleQuotationMark, '\''); |
| 1792 | string.replace(leftLowSingleQuotationMark, '\''); |
| 1793 | string.replace(rightDoubleQuotationMark, '"'); |
| 1794 | string.replace(rightSingleQuotationMark, '\''); |
| 1795 | |
| 1796 | return string; |
| 1797 | } |
| 1798 | |
| 1799 | #if !UCONFIG_NO_COLLATION |
| 1800 | |
| 1801 | const size_t minimumSearchBufferSize = 8192; |
| 1802 | |
| 1803 | #ifndef NDEBUG |
| 1804 | static bool searcherInUse; |
| 1805 | #endif |
| 1806 | |
| 1807 | static UStringSearch* createSearcher() |
| 1808 | { |
| 1809 | // Provide a non-empty pattern and non-empty text so usearch_open will not fail, |
| 1810 | // but it doesn't matter exactly what it is, since we don't perform any searches |
| 1811 | // without setting both the pattern and the text. |
| 1812 | UErrorCode status = U_ZERO_ERROR; |
| 1813 | String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search" ); |
| 1814 | UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status); |
| 1815 | ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING); |
| 1816 | return searcher; |
| 1817 | } |
| 1818 | |
| 1819 | static UStringSearch* searcher() |
| 1820 | { |
| 1821 | static UStringSearch* searcher = createSearcher(); |
| 1822 | return searcher; |
| 1823 | } |
| 1824 | |
| 1825 | static inline void lockSearcher() |
| 1826 | { |
| 1827 | #ifndef NDEBUG |
| 1828 | ASSERT(!searcherInUse); |
| 1829 | searcherInUse = true; |
| 1830 | #endif |
| 1831 | } |
| 1832 | |
| 1833 | static inline void unlockSearcher() |
| 1834 | { |
| 1835 | #ifndef NDEBUG |
| 1836 | ASSERT(searcherInUse); |
| 1837 | searcherInUse = false; |
| 1838 | #endif |
| 1839 | } |
| 1840 | |
| 1841 | // ICU's search ignores the distinction between small kana letters and ones |
| 1842 | // that are not small, and also characters that differ only in the voicing |
| 1843 | // marks when considering only primary collation strength differences. |
| 1844 | // This is not helpful for end users, since these differences make words |
| 1845 | // distinct, so for our purposes we need these to be considered. |
| 1846 | // The Unicode folks do not think the collation algorithm should be |
| 1847 | // changed. To work around this, we would like to tailor the ICU searcher, |
| 1848 | // but we can't get that to work yet. So instead, we check for cases where |
| 1849 | // these differences occur, and skip those matches. |
| 1850 | |
| 1851 | // We refer to the above technique as the "kana workaround". The next few |
| 1852 | // functions are helper functinos for the kana workaround. |
| 1853 | |
| 1854 | static inline bool isKanaLetter(UChar character) |
| 1855 | { |
| 1856 | // Hiragana letters. |
| 1857 | if (character >= 0x3041 && character <= 0x3096) |
| 1858 | return true; |
| 1859 | |
| 1860 | // Katakana letters. |
| 1861 | if (character >= 0x30A1 && character <= 0x30FA) |
| 1862 | return true; |
| 1863 | if (character >= 0x31F0 && character <= 0x31FF) |
| 1864 | return true; |
| 1865 | |
| 1866 | // Halfwidth katakana letters. |
| 1867 | if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) |
| 1868 | return true; |
| 1869 | |
| 1870 | return false; |
| 1871 | } |
| 1872 | |
| 1873 | static inline bool isSmallKanaLetter(UChar character) |
| 1874 | { |
| 1875 | ASSERT(isKanaLetter(character)); |
| 1876 | |
| 1877 | switch (character) { |
| 1878 | case 0x3041: // HIRAGANA LETTER SMALL A |
| 1879 | case 0x3043: // HIRAGANA LETTER SMALL I |
| 1880 | case 0x3045: // HIRAGANA LETTER SMALL U |
| 1881 | case 0x3047: // HIRAGANA LETTER SMALL E |
| 1882 | case 0x3049: // HIRAGANA LETTER SMALL O |
| 1883 | case 0x3063: // HIRAGANA LETTER SMALL TU |
| 1884 | case 0x3083: // HIRAGANA LETTER SMALL YA |
| 1885 | case 0x3085: // HIRAGANA LETTER SMALL YU |
| 1886 | case 0x3087: // HIRAGANA LETTER SMALL YO |
| 1887 | case 0x308E: // HIRAGANA LETTER SMALL WA |
| 1888 | case 0x3095: // HIRAGANA LETTER SMALL KA |
| 1889 | case 0x3096: // HIRAGANA LETTER SMALL KE |
| 1890 | case 0x30A1: // KATAKANA LETTER SMALL A |
| 1891 | case 0x30A3: // KATAKANA LETTER SMALL I |
| 1892 | case 0x30A5: // KATAKANA LETTER SMALL U |
| 1893 | case 0x30A7: // KATAKANA LETTER SMALL E |
| 1894 | case 0x30A9: // KATAKANA LETTER SMALL O |
| 1895 | case 0x30C3: // KATAKANA LETTER SMALL TU |
| 1896 | case 0x30E3: // KATAKANA LETTER SMALL YA |
| 1897 | case 0x30E5: // KATAKANA LETTER SMALL YU |
| 1898 | case 0x30E7: // KATAKANA LETTER SMALL YO |
| 1899 | case 0x30EE: // KATAKANA LETTER SMALL WA |
| 1900 | case 0x30F5: // KATAKANA LETTER SMALL KA |
| 1901 | case 0x30F6: // KATAKANA LETTER SMALL KE |
| 1902 | case 0x31F0: // KATAKANA LETTER SMALL KU |
| 1903 | case 0x31F1: // KATAKANA LETTER SMALL SI |
| 1904 | case 0x31F2: // KATAKANA LETTER SMALL SU |
| 1905 | case 0x31F3: // KATAKANA LETTER SMALL TO |
| 1906 | case 0x31F4: // KATAKANA LETTER SMALL NU |
| 1907 | case 0x31F5: // KATAKANA LETTER SMALL HA |
| 1908 | case 0x31F6: // KATAKANA LETTER SMALL HI |
| 1909 | case 0x31F7: // KATAKANA LETTER SMALL HU |
| 1910 | case 0x31F8: // KATAKANA LETTER SMALL HE |
| 1911 | case 0x31F9: // KATAKANA LETTER SMALL HO |
| 1912 | case 0x31FA: // KATAKANA LETTER SMALL MU |
| 1913 | case 0x31FB: // KATAKANA LETTER SMALL RA |
| 1914 | case 0x31FC: // KATAKANA LETTER SMALL RI |
| 1915 | case 0x31FD: // KATAKANA LETTER SMALL RU |
| 1916 | case 0x31FE: // KATAKANA LETTER SMALL RE |
| 1917 | case 0x31FF: // KATAKANA LETTER SMALL RO |
| 1918 | case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A |
| 1919 | case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I |
| 1920 | case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U |
| 1921 | case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E |
| 1922 | case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O |
| 1923 | case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA |
| 1924 | case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU |
| 1925 | case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO |
| 1926 | case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU |
| 1927 | return true; |
| 1928 | } |
| 1929 | return false; |
| 1930 | } |
| 1931 | |
| 1932 | enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark }; |
| 1933 | |
| 1934 | static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) |
| 1935 | { |
| 1936 | ASSERT(isKanaLetter(character)); |
| 1937 | |
| 1938 | switch (character) { |
| 1939 | case 0x304C: // HIRAGANA LETTER GA |
| 1940 | case 0x304E: // HIRAGANA LETTER GI |
| 1941 | case 0x3050: // HIRAGANA LETTER GU |
| 1942 | case 0x3052: // HIRAGANA LETTER GE |
| 1943 | case 0x3054: // HIRAGANA LETTER GO |
| 1944 | case 0x3056: // HIRAGANA LETTER ZA |
| 1945 | case 0x3058: // HIRAGANA LETTER ZI |
| 1946 | case 0x305A: // HIRAGANA LETTER ZU |
| 1947 | case 0x305C: // HIRAGANA LETTER ZE |
| 1948 | case 0x305E: // HIRAGANA LETTER ZO |
| 1949 | case 0x3060: // HIRAGANA LETTER DA |
| 1950 | case 0x3062: // HIRAGANA LETTER DI |
| 1951 | case 0x3065: // HIRAGANA LETTER DU |
| 1952 | case 0x3067: // HIRAGANA LETTER DE |
| 1953 | case 0x3069: // HIRAGANA LETTER DO |
| 1954 | case 0x3070: // HIRAGANA LETTER BA |
| 1955 | case 0x3073: // HIRAGANA LETTER BI |
| 1956 | case 0x3076: // HIRAGANA LETTER BU |
| 1957 | case 0x3079: // HIRAGANA LETTER BE |
| 1958 | case 0x307C: // HIRAGANA LETTER BO |
| 1959 | case 0x3094: // HIRAGANA LETTER VU |
| 1960 | case 0x30AC: // KATAKANA LETTER GA |
| 1961 | case 0x30AE: // KATAKANA LETTER GI |
| 1962 | case 0x30B0: // KATAKANA LETTER GU |
| 1963 | case 0x30B2: // KATAKANA LETTER GE |
| 1964 | case 0x30B4: // KATAKANA LETTER GO |
| 1965 | case 0x30B6: // KATAKANA LETTER ZA |
| 1966 | case 0x30B8: // KATAKANA LETTER ZI |
| 1967 | case 0x30BA: // KATAKANA LETTER ZU |
| 1968 | case 0x30BC: // KATAKANA LETTER ZE |
| 1969 | case 0x30BE: // KATAKANA LETTER ZO |
| 1970 | case 0x30C0: // KATAKANA LETTER DA |
| 1971 | case 0x30C2: // KATAKANA LETTER DI |
| 1972 | case 0x30C5: // KATAKANA LETTER DU |
| 1973 | case 0x30C7: // KATAKANA LETTER DE |
| 1974 | case 0x30C9: // KATAKANA LETTER DO |
| 1975 | case 0x30D0: // KATAKANA LETTER BA |
| 1976 | case 0x30D3: // KATAKANA LETTER BI |
| 1977 | case 0x30D6: // KATAKANA LETTER BU |
| 1978 | case 0x30D9: // KATAKANA LETTER BE |
| 1979 | case 0x30DC: // KATAKANA LETTER BO |
| 1980 | case 0x30F4: // KATAKANA LETTER VU |
| 1981 | case 0x30F7: // KATAKANA LETTER VA |
| 1982 | case 0x30F8: // KATAKANA LETTER VI |
| 1983 | case 0x30F9: // KATAKANA LETTER VE |
| 1984 | case 0x30FA: // KATAKANA LETTER VO |
| 1985 | return VoicedSoundMark; |
| 1986 | case 0x3071: // HIRAGANA LETTER PA |
| 1987 | case 0x3074: // HIRAGANA LETTER PI |
| 1988 | case 0x3077: // HIRAGANA LETTER PU |
| 1989 | case 0x307A: // HIRAGANA LETTER PE |
| 1990 | case 0x307D: // HIRAGANA LETTER PO |
| 1991 | case 0x30D1: // KATAKANA LETTER PA |
| 1992 | case 0x30D4: // KATAKANA LETTER PI |
| 1993 | case 0x30D7: // KATAKANA LETTER PU |
| 1994 | case 0x30DA: // KATAKANA LETTER PE |
| 1995 | case 0x30DD: // KATAKANA LETTER PO |
| 1996 | return SemiVoicedSoundMark; |
| 1997 | } |
| 1998 | return NoVoicedSoundMark; |
| 1999 | } |
| 2000 | |
| 2001 | static inline bool isCombiningVoicedSoundMark(UChar character) |
| 2002 | { |
| 2003 | switch (character) { |
| 2004 | case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK |
| 2005 | case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK |
| 2006 | return true; |
| 2007 | } |
| 2008 | return false; |
| 2009 | } |
| 2010 | |
| 2011 | static inline bool containsKanaLetters(const String& pattern) |
| 2012 | { |
| 2013 | if (pattern.is8Bit()) |
| 2014 | return false; |
| 2015 | const UChar* characters = pattern.characters16(); |
| 2016 | unsigned length = pattern.length(); |
| 2017 | for (unsigned i = 0; i < length; ++i) { |
| 2018 | if (isKanaLetter(characters[i])) |
| 2019 | return true; |
| 2020 | } |
| 2021 | return false; |
| 2022 | } |
| 2023 | |
| 2024 | static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer) |
| 2025 | { |
| 2026 | UErrorCode status = U_ZERO_ERROR; |
| 2027 | const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); |
| 2028 | ASSERT(U_SUCCESS(status)); |
| 2029 | |
| 2030 | buffer.resize(length); |
| 2031 | |
| 2032 | auto normalizedLength = unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status); |
| 2033 | ASSERT(U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR); |
| 2034 | |
| 2035 | buffer.resize(normalizedLength); |
| 2036 | |
| 2037 | if (U_SUCCESS(status)) |
| 2038 | return; |
| 2039 | |
| 2040 | status = U_ZERO_ERROR; |
| 2041 | unorm2_normalize(normalizer, characters, length, buffer.data(), length, &status); |
| 2042 | ASSERT(U_SUCCESS(status)); |
| 2043 | } |
| 2044 | |
| 2045 | static bool isNonLatin1Separator(UChar32 character) |
| 2046 | { |
| 2047 | ASSERT_ARG(character, character >= 256); |
| 2048 | |
| 2049 | return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); |
| 2050 | } |
| 2051 | |
| 2052 | static inline bool isSeparator(UChar32 character) |
| 2053 | { |
| 2054 | static const bool latin1SeparatorTable[256] = { |
| 2055 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2056 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2057 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / |
| 2058 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? |
| 2059 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ |
| 2060 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ |
| 2061 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` |
| 2062 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ |
| 2063 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2064 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2065 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
| 2066 | 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
| 2067 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2068 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2069 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2070 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 |
| 2071 | }; |
| 2072 | |
| 2073 | if (character < 256) |
| 2074 | return latin1SeparatorTable[character]; |
| 2075 | |
| 2076 | return isNonLatin1Separator(character); |
| 2077 | } |
| 2078 | |
| 2079 | inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) |
| 2080 | : m_target(foldQuoteMarks(target)) |
| 2081 | , m_targetCharacters(StringView(m_target).upconvertedCharacters()) |
| 2082 | , m_options(options) |
| 2083 | , m_prefixLength(0) |
| 2084 | , m_atBreak(true) |
| 2085 | , m_needsMoreContext(options.contains(AtWordStarts)) |
| 2086 | , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target)) |
| 2087 | { |
| 2088 | ASSERT(!m_target.isEmpty()); |
| 2089 | |
| 2090 | size_t targetLength = m_target.length(); |
| 2091 | m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize)); |
| 2092 | m_overlap = m_buffer.capacity() / 4; |
| 2093 | |
| 2094 | if (m_options.contains(AtWordStarts) && targetLength) { |
| 2095 | UChar32 targetFirstCharacter; |
| 2096 | U16_GET(m_target, 0, 0u, targetLength, targetFirstCharacter); |
| 2097 | // Characters in the separator category never really occur at the beginning of a word, |
| 2098 | // so if the target begins with such a character, we just ignore the AtWordStart option. |
| 2099 | if (isSeparator(targetFirstCharacter)) { |
| 2100 | m_options.remove(AtWordStarts); |
| 2101 | m_needsMoreContext = false; |
| 2102 | } |
| 2103 | } |
| 2104 | |
| 2105 | // Grab the single global searcher. |
| 2106 | // If we ever have a reason to do more than once search buffer at once, we'll have |
| 2107 | // to move to multiple searchers. |
| 2108 | lockSearcher(); |
| 2109 | |
| 2110 | UStringSearch* searcher = WebCore::searcher(); |
| 2111 | UCollator* collator = usearch_getCollator(searcher); |
| 2112 | |
| 2113 | UCollationStrength strength; |
| 2114 | USearchAttributeValue comparator; |
| 2115 | if (m_options.contains(CaseInsensitive)) { |
| 2116 | // Without loss of generality, have 'e' match {'e', 'E', 'é', 'É'} and 'é' match {'é', 'É'}. |
| 2117 | strength = UCOL_SECONDARY; |
| 2118 | comparator = USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD; |
| 2119 | } else { |
| 2120 | // Without loss of generality, have 'e' match {'e'} and 'é' match {'é'}. |
| 2121 | strength = UCOL_TERTIARY; |
| 2122 | comparator = USEARCH_STANDARD_ELEMENT_COMPARISON; |
| 2123 | } |
| 2124 | if (ucol_getStrength(collator) != strength) { |
| 2125 | ucol_setStrength(collator, strength); |
| 2126 | usearch_reset(searcher); |
| 2127 | } |
| 2128 | |
| 2129 | UErrorCode status = U_ZERO_ERROR; |
| 2130 | usearch_setAttribute(searcher, USEARCH_ELEMENT_COMPARISON, comparator, &status); |
| 2131 | ASSERT(status == U_ZERO_ERROR); |
| 2132 | |
| 2133 | usearch_setPattern(searcher, m_targetCharacters, targetLength, &status); |
| 2134 | ASSERT(status == U_ZERO_ERROR); |
| 2135 | |
| 2136 | // The kana workaround requires a normalized copy of the target string. |
| 2137 | if (m_targetRequiresKanaWorkaround) |
| 2138 | normalizeCharacters(m_targetCharacters, targetLength, m_normalizedTarget); |
| 2139 | } |
| 2140 | |
| 2141 | inline SearchBuffer::~SearchBuffer() |
| 2142 | { |
| 2143 | // Leave the static object pointing to a valid string. |
| 2144 | UErrorCode status = U_ZERO_ERROR; |
| 2145 | usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status); |
| 2146 | ASSERT(status == U_ZERO_ERROR); |
| 2147 | usearch_setText(WebCore::searcher(), &newlineCharacter, 1, &status); |
| 2148 | ASSERT(status == U_ZERO_ERROR); |
| 2149 | |
| 2150 | unlockSearcher(); |
| 2151 | } |
| 2152 | |
| 2153 | inline size_t SearchBuffer::append(StringView text) |
| 2154 | { |
| 2155 | ASSERT(text.length()); |
| 2156 | |
| 2157 | if (m_atBreak) { |
| 2158 | m_buffer.shrink(0); |
| 2159 | m_prefixLength = 0; |
| 2160 | m_atBreak = false; |
| 2161 | } else if (m_buffer.size() == m_buffer.capacity()) { |
| 2162 | memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); |
| 2163 | m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap); |
| 2164 | m_buffer.shrink(m_overlap); |
| 2165 | } |
| 2166 | |
| 2167 | size_t oldLength = m_buffer.size(); |
| 2168 | size_t usableLength = std::min<size_t>(m_buffer.capacity() - oldLength, text.length()); |
| 2169 | ASSERT(usableLength); |
| 2170 | m_buffer.grow(oldLength + usableLength); |
| 2171 | for (unsigned i = 0; i < usableLength; ++i) |
| 2172 | m_buffer[oldLength + i] = foldQuoteMark(text[i]); |
| 2173 | return usableLength; |
| 2174 | } |
| 2175 | |
| 2176 | inline bool SearchBuffer::needsMoreContext() const |
| 2177 | { |
| 2178 | return m_needsMoreContext; |
| 2179 | } |
| 2180 | |
| 2181 | inline void SearchBuffer::prependContext(StringView text) |
| 2182 | { |
| 2183 | ASSERT(m_needsMoreContext); |
| 2184 | ASSERT(m_prefixLength == m_buffer.size()); |
| 2185 | |
| 2186 | if (!text.length()) |
| 2187 | return; |
| 2188 | |
| 2189 | m_atBreak = false; |
| 2190 | |
| 2191 | size_t wordBoundaryContextStart = text.length(); |
| 2192 | if (wordBoundaryContextStart) { |
| 2193 | U16_BACK_1(text, 0, wordBoundaryContextStart); |
| 2194 | wordBoundaryContextStart = startOfLastWordBoundaryContext(text.substring(0, wordBoundaryContextStart)); |
| 2195 | } |
| 2196 | |
| 2197 | size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength, text.length() - wordBoundaryContextStart); |
| 2198 | WTF::append(m_buffer, text.substring(text.length() - usableLength, usableLength)); |
| 2199 | m_prefixLength += usableLength; |
| 2200 | |
| 2201 | if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity()) |
| 2202 | m_needsMoreContext = false; |
| 2203 | } |
| 2204 | |
| 2205 | inline bool SearchBuffer::atBreak() const |
| 2206 | { |
| 2207 | return m_atBreak; |
| 2208 | } |
| 2209 | |
| 2210 | inline void SearchBuffer::reachedBreak() |
| 2211 | { |
| 2212 | m_atBreak = true; |
| 2213 | } |
| 2214 | |
| 2215 | inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const |
| 2216 | { |
| 2217 | // This function implements the kana workaround. If usearch treats |
| 2218 | // it as a match, but we do not want to, then it's a "bad match". |
| 2219 | if (!m_targetRequiresKanaWorkaround) |
| 2220 | return false; |
| 2221 | |
| 2222 | // Normalize into a match buffer. We reuse a single buffer rather than |
| 2223 | // creating a new one each time. |
| 2224 | normalizeCharacters(match, matchLength, m_normalizedMatch); |
| 2225 | |
| 2226 | const UChar* a = m_normalizedTarget.begin(); |
| 2227 | const UChar* aEnd = m_normalizedTarget.end(); |
| 2228 | |
| 2229 | const UChar* b = m_normalizedMatch.begin(); |
| 2230 | const UChar* bEnd = m_normalizedMatch.end(); |
| 2231 | |
| 2232 | while (true) { |
| 2233 | // Skip runs of non-kana-letter characters. This is necessary so we can |
| 2234 | // correctly handle strings where the target and match have different-length |
| 2235 | // runs of characters that match, while still double checking the correctness |
| 2236 | // of matches of kana letters with other kana letters. |
| 2237 | while (a != aEnd && !isKanaLetter(*a)) |
| 2238 | ++a; |
| 2239 | while (b != bEnd && !isKanaLetter(*b)) |
| 2240 | ++b; |
| 2241 | |
| 2242 | // If we reached the end of either the target or the match, we should have |
| 2243 | // reached the end of both; both should have the same number of kana letters. |
| 2244 | if (a == aEnd || b == bEnd) { |
| 2245 | ASSERT(a == aEnd); |
| 2246 | ASSERT(b == bEnd); |
| 2247 | return false; |
| 2248 | } |
| 2249 | |
| 2250 | // Check for differences in the kana letter character itself. |
| 2251 | if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b)) |
| 2252 | return true; |
| 2253 | if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b)) |
| 2254 | return true; |
| 2255 | ++a; |
| 2256 | ++b; |
| 2257 | |
| 2258 | // Check for differences in combining voiced sound marks found after the letter. |
| 2259 | while (1) { |
| 2260 | if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) { |
| 2261 | if (b != bEnd && isCombiningVoicedSoundMark(*b)) |
| 2262 | return true; |
| 2263 | break; |
| 2264 | } |
| 2265 | if (!(b != bEnd && isCombiningVoicedSoundMark(*b))) |
| 2266 | return true; |
| 2267 | if (*a != *b) |
| 2268 | return true; |
| 2269 | ++a; |
| 2270 | ++b; |
| 2271 | } |
| 2272 | } |
| 2273 | } |
| 2274 | |
| 2275 | inline bool SearchBuffer::isWordEndMatch(size_t start, size_t length) const |
| 2276 | { |
| 2277 | ASSERT(length); |
| 2278 | ASSERT(m_options.contains(AtWordEnds)); |
| 2279 | |
| 2280 | int endWord; |
| 2281 | // Start searching at the end of matched search, so that multiple word matches succeed. |
| 2282 | findEndWordBoundary(StringView(m_buffer.data(), m_buffer.size()), start + length - 1, &endWord); |
| 2283 | return static_cast<size_t>(endWord) == (start + length); |
| 2284 | } |
| 2285 | |
| 2286 | inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const |
| 2287 | { |
| 2288 | ASSERT(m_options.contains(AtWordStarts)); |
| 2289 | |
| 2290 | if (!start) |
| 2291 | return true; |
| 2292 | |
| 2293 | int size = m_buffer.size(); |
| 2294 | int offset = start; |
| 2295 | UChar32 firstCharacter; |
| 2296 | U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); |
| 2297 | |
| 2298 | if (m_options.contains(TreatMedialCapitalAsWordStart)) { |
| 2299 | UChar32 previousCharacter; |
| 2300 | U16_PREV(m_buffer.data(), 0, offset, previousCharacter); |
| 2301 | |
| 2302 | if (isSeparator(firstCharacter)) { |
| 2303 | // The start of a separator run is a word start (".org" in "webkit.org"). |
| 2304 | if (!isSeparator(previousCharacter)) |
| 2305 | return true; |
| 2306 | } else if (isASCIIUpper(firstCharacter)) { |
| 2307 | // The start of an uppercase run is a word start ("Kit" in "WebKit"). |
| 2308 | if (!isASCIIUpper(previousCharacter)) |
| 2309 | return true; |
| 2310 | // The last character of an uppercase run followed by a non-separator, non-digit |
| 2311 | // is a word start ("Request" in "XMLHTTPRequest"). |
| 2312 | offset = start; |
| 2313 | U16_FWD_1(m_buffer.data(), offset, size); |
| 2314 | UChar32 nextCharacter = 0; |
| 2315 | if (offset < size) |
| 2316 | U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); |
| 2317 | if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) |
| 2318 | return true; |
| 2319 | } else if (isASCIIDigit(firstCharacter)) { |
| 2320 | // The start of a digit run is a word start ("2" in "WebKit2"). |
| 2321 | if (!isASCIIDigit(previousCharacter)) |
| 2322 | return true; |
| 2323 | } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { |
| 2324 | // The start of a non-separator, non-uppercase, non-digit run is a word start, |
| 2325 | // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). |
| 2326 | return true; |
| 2327 | } |
| 2328 | } |
| 2329 | |
| 2330 | // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes |
| 2331 | // a word, so treat the position before any CJK character as a word start. |
| 2332 | if (FontCascade::isCJKIdeographOrSymbol(firstCharacter)) |
| 2333 | return true; |
| 2334 | |
| 2335 | size_t wordBreakSearchStart = start + length; |
| 2336 | while (wordBreakSearchStart > start) |
| 2337 | wordBreakSearchStart = findNextWordFromIndex(StringView(m_buffer.data(), m_buffer.size()), wordBreakSearchStart, false /* backwards */); |
| 2338 | return wordBreakSearchStart == start; |
| 2339 | } |
| 2340 | |
| 2341 | inline size_t SearchBuffer::search(size_t& start) |
| 2342 | { |
| 2343 | size_t size = m_buffer.size(); |
| 2344 | if (m_atBreak) { |
| 2345 | if (!size) |
| 2346 | return 0; |
| 2347 | } else { |
| 2348 | if (size != m_buffer.capacity()) |
| 2349 | return 0; |
| 2350 | } |
| 2351 | |
| 2352 | UStringSearch* searcher = WebCore::searcher(); |
| 2353 | |
| 2354 | UErrorCode status = U_ZERO_ERROR; |
| 2355 | usearch_setText(searcher, m_buffer.data(), size, &status); |
| 2356 | ASSERT(status == U_ZERO_ERROR); |
| 2357 | |
| 2358 | usearch_setOffset(searcher, m_prefixLength, &status); |
| 2359 | ASSERT(status == U_ZERO_ERROR); |
| 2360 | |
| 2361 | int matchStart = usearch_next(searcher, &status); |
| 2362 | ASSERT(status == U_ZERO_ERROR); |
| 2363 | |
| 2364 | nextMatch: |
| 2365 | if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { |
| 2366 | ASSERT(matchStart == USEARCH_DONE); |
| 2367 | return 0; |
| 2368 | } |
| 2369 | |
| 2370 | // Matches that start in the overlap area are only tentative. |
| 2371 | // The same match may appear later, matching more characters, |
| 2372 | // possibly including a combining character that's not yet in the buffer. |
| 2373 | if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { |
| 2374 | size_t overlap = m_overlap; |
| 2375 | if (m_options.contains(AtWordStarts)) { |
| 2376 | // Ensure that there is sufficient context before matchStart the next time around for |
| 2377 | // determining if it is at a word boundary. |
| 2378 | unsigned wordBoundaryContextStart = matchStart; |
| 2379 | U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); |
| 2380 | wordBoundaryContextStart = startOfLastWordBoundaryContext(StringView(m_buffer.data(), wordBoundaryContextStart)); |
| 2381 | overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart)); |
| 2382 | } |
| 2383 | memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); |
| 2384 | m_prefixLength -= std::min(m_prefixLength, size - overlap); |
| 2385 | m_buffer.shrink(overlap); |
| 2386 | return 0; |
| 2387 | } |
| 2388 | |
| 2389 | size_t matchedLength = usearch_getMatchedLength(searcher); |
| 2390 | ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size); |
| 2391 | |
| 2392 | // If this match is "bad", move on to the next match. |
| 2393 | if (isBadMatch(m_buffer.data() + matchStart, matchedLength) |
| 2394 | || (m_options.contains(AtWordStarts) && !isWordStartMatch(matchStart, matchedLength)) |
| 2395 | || (m_options.contains(AtWordEnds) && !isWordEndMatch(matchStart, matchedLength))) { |
| 2396 | matchStart = usearch_next(searcher, &status); |
| 2397 | ASSERT(status == U_ZERO_ERROR); |
| 2398 | goto nextMatch; |
| 2399 | } |
| 2400 | |
| 2401 | size_t newSize = size - (matchStart + 1); |
| 2402 | memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); |
| 2403 | m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1); |
| 2404 | m_buffer.shrink(newSize); |
| 2405 | |
| 2406 | start = size - matchStart; |
| 2407 | return matchedLength; |
| 2408 | } |
| 2409 | |
| 2410 | #else |
| 2411 | |
| 2412 | inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) |
| 2413 | : m_target(options & CaseInsensitive ? target.foldCase() : target) |
| 2414 | , m_options(options) |
| 2415 | , m_buffer(m_target.length()) |
| 2416 | , m_isCharacterStartBuffer(m_target.length()) |
| 2417 | , m_isBufferFull(false) |
| 2418 | , m_cursor(0) |
| 2419 | { |
| 2420 | ASSERT(!m_target.isEmpty()); |
| 2421 | m_target.replace(noBreakSpace, ' '); |
| 2422 | foldQuoteMarks(m_target); |
| 2423 | } |
| 2424 | |
| 2425 | inline SearchBuffer::~SearchBuffer() = default; |
| 2426 | |
| 2427 | inline void SearchBuffer::reachedBreak() |
| 2428 | { |
| 2429 | m_cursor = 0; |
| 2430 | m_isBufferFull = false; |
| 2431 | } |
| 2432 | |
| 2433 | inline bool SearchBuffer::atBreak() const |
| 2434 | { |
| 2435 | return !m_cursor && !m_isBufferFull; |
| 2436 | } |
| 2437 | |
| 2438 | inline void SearchBuffer::append(UChar c, bool isStart) |
| 2439 | { |
| 2440 | m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMark(c); |
| 2441 | m_isCharacterStartBuffer[m_cursor] = isStart; |
| 2442 | if (++m_cursor == m_target.length()) { |
| 2443 | m_cursor = 0; |
| 2444 | m_isBufferFull = true; |
| 2445 | } |
| 2446 | } |
| 2447 | |
| 2448 | inline size_t SearchBuffer::append(const UChar* characters, size_t length) |
| 2449 | { |
| 2450 | ASSERT(length); |
| 2451 | if (!(m_options & CaseInsensitive)) { |
| 2452 | append(characters[0], true); |
| 2453 | return 1; |
| 2454 | } |
| 2455 | const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough |
| 2456 | UChar foldedCharacters[maxFoldedCharacters]; |
| 2457 | UErrorCode status = U_ZERO_ERROR; |
| 2458 | int numFoldedCharacters = u_strFoldCase(foldedCharacters, maxFoldedCharacters, characters, 1, U_FOLD_CASE_DEFAULT, &status); |
| 2459 | ASSERT(U_SUCCESS(status)); |
| 2460 | ASSERT(numFoldedCharacters); |
| 2461 | ASSERT(numFoldedCharacters <= maxFoldedCharacters); |
| 2462 | if (U_SUCCESS(status) && numFoldedCharacters) { |
| 2463 | numFoldedCharacters = std::min(numFoldedCharacters, maxFoldedCharacters); |
| 2464 | append(foldedCharacters[0], true); |
| 2465 | for (int i = 1; i < numFoldedCharacters; ++i) |
| 2466 | append(foldedCharacters[i], false); |
| 2467 | } |
| 2468 | return 1; |
| 2469 | } |
| 2470 | |
| 2471 | inline bool SearchBuffer::needsMoreContext() const |
| 2472 | { |
| 2473 | return false; |
| 2474 | } |
| 2475 | |
| 2476 | void SearchBuffer::prependContext(const UChar*, size_t) |
| 2477 | { |
| 2478 | ASSERT_NOT_REACHED(); |
| 2479 | } |
| 2480 | |
| 2481 | inline size_t SearchBuffer::search(size_t& start) |
| 2482 | { |
| 2483 | if (!m_isBufferFull) |
| 2484 | return 0; |
| 2485 | if (!m_isCharacterStartBuffer[m_cursor]) |
| 2486 | return 0; |
| 2487 | |
| 2488 | size_t tailSpace = m_target.length() - m_cursor; |
| 2489 | if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0) |
| 2490 | return 0; |
| 2491 | if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0) |
| 2492 | return 0; |
| 2493 | |
| 2494 | start = length(); |
| 2495 | |
| 2496 | // Now that we've found a match once, we don't want to find it again, because those |
| 2497 | // are the SearchBuffer semantics, allowing for a buffer where you append more than one |
| 2498 | // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if |
| 2499 | // we want to get rid of that in the future we could track this with a separate boolean |
| 2500 | // or even move the characters to the start of the buffer and set m_isBufferFull to false. |
| 2501 | m_isCharacterStartBuffer[m_cursor] = false; |
| 2502 | |
| 2503 | return start; |
| 2504 | } |
| 2505 | |
| 2506 | // Returns the number of characters that were appended to the buffer (what we are searching in). |
| 2507 | // That's not necessarily the same length as the passed-in target string, because case folding |
| 2508 | // can make two strings match even though they're not the same length. |
| 2509 | size_t SearchBuffer::length() const |
| 2510 | { |
| 2511 | size_t bufferSize = m_target.length(); |
| 2512 | size_t length = 0; |
| 2513 | for (size_t i = 0; i < bufferSize; ++i) |
| 2514 | length += m_isCharacterStartBuffer[i]; |
| 2515 | return length; |
| 2516 | } |
| 2517 | |
| 2518 | #endif |
| 2519 | |
| 2520 | // -------- |
| 2521 | |
| 2522 | int TextIterator::rangeLength(const Range* range, bool forSelectionPreservation) |
| 2523 | { |
| 2524 | unsigned length = 0; |
| 2525 | for (TextIterator it(range, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance()) |
| 2526 | length += it.text().length(); |
| 2527 | return length; |
| 2528 | } |
| 2529 | |
| 2530 | Ref<Range> TextIterator::subrange(Range& entireRange, int characterOffset, int characterCount) |
| 2531 | { |
| 2532 | CharacterIterator entireRangeIterator(entireRange); |
| 2533 | return characterSubrange(entireRange.ownerDocument(), entireRangeIterator, characterOffset, characterCount); |
| 2534 | } |
| 2535 | |
| 2536 | static inline bool isInsideReplacedElement(TextIterator& iterator) |
| 2537 | { |
| 2538 | ASSERT(!iterator.atEnd()); |
| 2539 | ASSERT(iterator.text().length() == 1); |
| 2540 | Node* node = iterator.node(); |
| 2541 | return node && isRendererReplacedElement(node->renderer()); |
| 2542 | } |
| 2543 | |
| 2544 | RefPtr<Range> TextIterator::rangeFromLocationAndLength(ContainerNode* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation) |
| 2545 | { |
| 2546 | Ref<Range> resultRange = scope->document().createRange(); |
| 2547 | |
| 2548 | int docTextPosition = 0; |
| 2549 | int rangeEnd = rangeLocation + rangeLength; |
| 2550 | bool startRangeFound = false; |
| 2551 | |
| 2552 | Ref<Range> textRunRange = rangeOfContents(*scope); |
| 2553 | |
| 2554 | TextIterator it(textRunRange.ptr(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); |
| 2555 | |
| 2556 | // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>. |
| 2557 | if (!rangeLocation && !rangeLength && it.atEnd()) { |
| 2558 | resultRange->setStart(textRunRange->startContainer(), 0); |
| 2559 | resultRange->setEnd(textRunRange->startContainer(), 0); |
| 2560 | return resultRange; |
| 2561 | } |
| 2562 | |
| 2563 | for (; !it.atEnd(); it.advance()) { |
| 2564 | int length = it.text().length(); |
| 2565 | textRunRange = it.range(); |
| 2566 | |
| 2567 | bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + length; |
| 2568 | bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + length; |
| 2569 | |
| 2570 | if (foundEnd) { |
| 2571 | // FIXME: This is a workaround for the fact that the end of a run is often at the wrong |
| 2572 | // position for emitted '\n's or if the renderer of the current node is a replaced element. |
| 2573 | if (length == 1 && (it.text()[0] == '\n' || isInsideReplacedElement(it))) { |
| 2574 | it.advance(); |
| 2575 | if (!it.atEnd()) { |
| 2576 | Ref<Range> range = it.range(); |
| 2577 | textRunRange->setEnd(range->startContainer(), range->startOffset()); |
| 2578 | } else { |
| 2579 | Position runStart = textRunRange->startPosition(); |
| 2580 | Position runEnd = VisiblePosition(runStart).next().deepEquivalent(); |
| 2581 | if (runEnd.isNotNull()) |
| 2582 | textRunRange->setEnd(*runEnd.containerNode(), runEnd.computeOffsetInContainerNode()); |
| 2583 | } |
| 2584 | } |
| 2585 | } |
| 2586 | |
| 2587 | if (foundStart) { |
| 2588 | startRangeFound = true; |
| 2589 | if (textRunRange->startContainer().isTextNode()) { |
| 2590 | int offset = rangeLocation - docTextPosition; |
| 2591 | resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset()); |
| 2592 | } else { |
| 2593 | if (rangeLocation == docTextPosition) |
| 2594 | resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset()); |
| 2595 | else |
| 2596 | resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset()); |
| 2597 | } |
| 2598 | } |
| 2599 | |
| 2600 | if (foundEnd) { |
| 2601 | if (textRunRange->startContainer().isTextNode()) { |
| 2602 | int offset = rangeEnd - docTextPosition; |
| 2603 | resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset()); |
| 2604 | } else { |
| 2605 | if (rangeEnd == docTextPosition) |
| 2606 | resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset()); |
| 2607 | else |
| 2608 | resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset()); |
| 2609 | } |
| 2610 | docTextPosition += length; |
| 2611 | break; |
| 2612 | } |
| 2613 | |
| 2614 | docTextPosition += length; |
| 2615 | } |
| 2616 | |
| 2617 | if (!startRangeFound) |
| 2618 | return nullptr; |
| 2619 | |
| 2620 | if (rangeLength && rangeEnd > docTextPosition) // rangeEnd is out of bounds |
| 2621 | resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset()); |
| 2622 | |
| 2623 | return resultRange; |
| 2624 | } |
| 2625 | |
| 2626 | bool TextIterator::getLocationAndLengthFromRange(Node* scope, const Range* range, size_t& location, size_t& length) |
| 2627 | { |
| 2628 | location = notFound; |
| 2629 | length = 0; |
| 2630 | |
| 2631 | // The critical assumption is that this only gets called with ranges that |
| 2632 | // concentrate on a given area containing the selection root. This is done |
| 2633 | // because of text fields and textareas. The DOM for those is not |
| 2634 | // directly in the document DOM, so ensure that the range does not cross a |
| 2635 | // boundary of one of those. |
| 2636 | if (&range->startContainer() != scope && !range->startContainer().isDescendantOf(scope)) |
| 2637 | return false; |
| 2638 | if (&range->endContainer() != scope && !range->endContainer().isDescendantOf(scope)) |
| 2639 | return false; |
| 2640 | |
| 2641 | Ref<Range> testRange = Range::create(scope->document(), scope, 0, &range->startContainer(), range->startOffset()); |
| 2642 | ASSERT(&testRange->startContainer() == scope); |
| 2643 | location = TextIterator::rangeLength(testRange.ptr()); |
| 2644 | |
| 2645 | testRange->setEnd(range->endContainer(), range->endOffset()); |
| 2646 | ASSERT(&testRange->startContainer() == scope); |
| 2647 | length = TextIterator::rangeLength(testRange.ptr()) - location; |
| 2648 | return true; |
| 2649 | } |
| 2650 | |
| 2651 | // -------- |
| 2652 | |
| 2653 | bool hasAnyPlainText(const Range& range, TextIteratorBehavior behavior) |
| 2654 | { |
| 2655 | for (TextIterator iterator { &range, behavior }; !iterator.atEnd(); iterator.advance()) { |
| 2656 | if (!iterator.text().isEmpty()) |
| 2657 | return true; |
| 2658 | } |
| 2659 | return false; |
| 2660 | } |
| 2661 | |
| 2662 | String plainText(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
| 2663 | { |
| 2664 | // The initial buffer size can be critical for performance: https://bugs.webkit.org/show_bug.cgi?id=81192 |
| 2665 | static const unsigned initialCapacity = 1 << 15; |
| 2666 | |
| 2667 | if (!start.document()) |
| 2668 | return { }; |
| 2669 | auto document = makeRef(*start.document()); |
| 2670 | |
| 2671 | unsigned bufferLength = 0; |
| 2672 | StringBuilder builder; |
| 2673 | builder.reserveCapacity(initialCapacity); |
| 2674 | TextIteratorBehavior behavior = defaultBehavior; |
| 2675 | if (!isDisplayString) |
| 2676 | behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding); |
| 2677 | |
| 2678 | for (TextIterator it(start, end, behavior); !it.atEnd(); it.advance()) { |
| 2679 | it.appendTextToStringBuilder(builder); |
| 2680 | bufferLength += it.text().length(); |
| 2681 | } |
| 2682 | |
| 2683 | if (!bufferLength) |
| 2684 | return emptyString(); |
| 2685 | |
| 2686 | String result = builder.toString(); |
| 2687 | |
| 2688 | if (isDisplayString) |
| 2689 | document->displayStringModifiedByEncoding(result); |
| 2690 | |
| 2691 | return result; |
| 2692 | } |
| 2693 | |
| 2694 | String plainTextReplacingNoBreakSpace(Position start, Position end, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
| 2695 | { |
| 2696 | return plainText(start, end, defaultBehavior, isDisplayString).replace(noBreakSpace, ' '); |
| 2697 | } |
| 2698 | |
| 2699 | String plainText(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
| 2700 | { |
| 2701 | if (!range) |
| 2702 | return emptyString(); |
| 2703 | return plainText(range->startPosition(), range->endPosition(), defaultBehavior, isDisplayString); |
| 2704 | } |
| 2705 | |
| 2706 | String plainTextUsingBackwardsTextIteratorForTesting(const Range& range) |
| 2707 | { |
| 2708 | String result; |
| 2709 | for (SimplifiedBackwardsTextIterator backwardsIterator(range); !backwardsIterator.atEnd(); backwardsIterator.advance()) |
| 2710 | result.insert(backwardsIterator.text().toString(), 0); |
| 2711 | return result; |
| 2712 | } |
| 2713 | |
| 2714 | String plainTextReplacingNoBreakSpace(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString) |
| 2715 | { |
| 2716 | return plainText(range, defaultBehavior, isDisplayString).replace(noBreakSpace, ' '); |
| 2717 | } |
| 2718 | |
| 2719 | static Ref<Range> collapsedToBoundary(const Range& range, bool forward) |
| 2720 | { |
| 2721 | Ref<Range> result = range.cloneRange(); |
| 2722 | result->collapse(!forward); |
| 2723 | return result; |
| 2724 | } |
| 2725 | |
| 2726 | static TextIteratorBehavior findIteratorOptions(FindOptions options) |
| 2727 | { |
| 2728 | TextIteratorBehavior iteratorOptions = TextIteratorEntersTextControls | TextIteratorClipsToFrameAncestors; |
| 2729 | if (!options.contains(DoNotTraverseFlatTree)) |
| 2730 | iteratorOptions |= TextIteratorTraversesFlatTree; |
| 2731 | return iteratorOptions; |
| 2732 | } |
| 2733 | |
| 2734 | static void findPlainTextMatches(const Range& range, const String& target, FindOptions options, const WTF::Function<bool(size_t, size_t)>& match) |
| 2735 | { |
| 2736 | SearchBuffer buffer(target, options); |
| 2737 | if (buffer.needsMoreContext()) { |
| 2738 | Ref<Range> beforeStartRange = range.ownerDocument().createRange(); |
| 2739 | beforeStartRange->setEnd(range.startContainer(), range.startOffset()); |
| 2740 | for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) { |
| 2741 | buffer.prependContext(backwardsIterator.text()); |
| 2742 | if (!buffer.needsMoreContext()) |
| 2743 | break; |
| 2744 | } |
| 2745 | } |
| 2746 | |
| 2747 | CharacterIterator findIterator(range, findIteratorOptions(options)); |
| 2748 | while (!findIterator.atEnd()) { |
| 2749 | findIterator.advance(buffer.append(findIterator.text())); |
| 2750 | while (1) { |
| 2751 | size_t matchStartOffset; |
| 2752 | size_t newMatchLength = buffer.search(matchStartOffset); |
| 2753 | if (!newMatchLength) { |
| 2754 | if (findIterator.atBreak() && !buffer.atBreak()) { |
| 2755 | buffer.reachedBreak(); |
| 2756 | continue; |
| 2757 | } |
| 2758 | break; |
| 2759 | } |
| 2760 | size_t lastCharacterInBufferOffset = findIterator.characterOffset(); |
| 2761 | ASSERT(lastCharacterInBufferOffset >= matchStartOffset); |
| 2762 | if (match(lastCharacterInBufferOffset - matchStartOffset, newMatchLength)) |
| 2763 | return; |
| 2764 | } |
| 2765 | } |
| 2766 | } |
| 2767 | |
| 2768 | static Ref<Range> rangeForMatch(const Range& range, FindOptions options, size_t matchStart, size_t matchLength, bool searchForward) |
| 2769 | { |
| 2770 | if (!matchLength) |
| 2771 | return collapsedToBoundary(range, searchForward); |
| 2772 | CharacterIterator rangeComputeIterator(range, findIteratorOptions(options)); |
| 2773 | return characterSubrange(range.ownerDocument(), rangeComputeIterator, matchStart, matchLength); |
| 2774 | } |
| 2775 | |
| 2776 | Ref<Range> findClosestPlainText(const Range& range, const String& target, FindOptions options, unsigned targetOffset) |
| 2777 | { |
| 2778 | size_t matchStart = 0; |
| 2779 | size_t matchLength = 0; |
| 2780 | size_t distance = std::numeric_limits<size_t>::max(); |
| 2781 | auto match = [targetOffset, &distance, &matchStart, &matchLength] (size_t start, size_t length) { |
| 2782 | size_t newDistance = std::min(abs(static_cast<signed>(start - targetOffset)), abs(static_cast<signed>(start + length - targetOffset))); |
| 2783 | if (newDistance < distance) { |
| 2784 | matchStart = start; |
| 2785 | matchLength = length; |
| 2786 | distance = newDistance; |
| 2787 | } |
| 2788 | return false; |
| 2789 | }; |
| 2790 | |
| 2791 | findPlainTextMatches(range, target, options, WTFMove(match)); |
| 2792 | return rangeForMatch(range, options, matchStart, matchLength, !options.contains(Backwards)); |
| 2793 | } |
| 2794 | |
| 2795 | Ref<Range> findPlainText(const Range& range, const String& target, FindOptions options) |
| 2796 | { |
| 2797 | bool searchForward = !options.contains(Backwards); |
| 2798 | size_t matchStart = 0; |
| 2799 | size_t matchLength = 0; |
| 2800 | auto match = [searchForward, &matchStart, &matchLength] (size_t start, size_t length) { |
| 2801 | matchStart = start; |
| 2802 | matchLength = length; |
| 2803 | // Look for the last match when searching backwards instead. |
| 2804 | return searchForward; |
| 2805 | }; |
| 2806 | |
| 2807 | findPlainTextMatches(range, target, options, WTFMove(match)); |
| 2808 | return rangeForMatch(range, options, matchStart, matchLength, searchForward); |
| 2809 | } |
| 2810 | |
| 2811 | bool findPlainText(const String& document, const String& target, FindOptions options) |
| 2812 | { |
| 2813 | SearchBuffer buffer { target, options }; |
| 2814 | StringView remainingText { document }; |
| 2815 | while (!remainingText.isEmpty()) { |
| 2816 | size_t charactersAppended = buffer.append(document); |
| 2817 | remainingText = remainingText.substring(charactersAppended); |
| 2818 | if (remainingText.isEmpty()) |
| 2819 | buffer.reachedBreak(); |
| 2820 | size_t matchStartOffset; |
| 2821 | if (buffer.search(matchStartOffset)) |
| 2822 | return true; |
| 2823 | } |
| 2824 | return false; |
| 2825 | } |
| 2826 | |
| 2827 | } |
| 2828 | |