| 1 | /* |
| 2 | |
| 3 | Copyright (C) 2014-2019 Apple Inc. All rights reserved. |
| 4 | |
| 5 | Redistribution and use in source and binary forms, with or without |
| 6 | modification, are permitted provided that the following conditions |
| 7 | are met: |
| 8 | 1. Redistributions of source code must retain the above copyright |
| 9 | notice, this list of conditions and the following disclaimer. |
| 10 | 2. Redistributions in binary form must reproduce the above copyright |
| 11 | notice, this list of conditions and the following disclaimer in the |
| 12 | documentation and/or other materials provided with the distribution. |
| 13 | |
| 14 | THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY |
| 15 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 17 | DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| 18 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| 21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 24 | |
| 25 | */ |
| 26 | |
| 27 | #include "config.h" |
| 28 | #include <wtf/text/StringView.h> |
| 29 | |
| 30 | #include <mutex> |
| 31 | #include <unicode/ubrk.h> |
| 32 | #include <unicode/unorm2.h> |
| 33 | #include <wtf/HashMap.h> |
| 34 | #include <wtf/Lock.h> |
| 35 | #include <wtf/NeverDestroyed.h> |
| 36 | #include <wtf/Optional.h> |
| 37 | #include <wtf/text/TextBreakIterator.h> |
| 38 | |
| 39 | namespace WTF { |
| 40 | |
| 41 | bool StringView::containsIgnoringASCIICase(const StringView& matchString) const |
| 42 | { |
| 43 | return findIgnoringASCIICase(matchString) != notFound; |
| 44 | } |
| 45 | |
| 46 | bool StringView::containsIgnoringASCIICase(const StringView& matchString, unsigned startOffset) const |
| 47 | { |
| 48 | return findIgnoringASCIICase(matchString, startOffset) != notFound; |
| 49 | } |
| 50 | |
| 51 | size_t StringView::findIgnoringASCIICase(const StringView& matchString) const |
| 52 | { |
| 53 | return ::WTF::findIgnoringASCIICase(*this, matchString, 0); |
| 54 | } |
| 55 | |
| 56 | size_t StringView::findIgnoringASCIICase(const StringView& matchString, unsigned startOffset) const |
| 57 | { |
| 58 | return ::WTF::findIgnoringASCIICase(*this, matchString, startOffset); |
| 59 | } |
| 60 | |
| 61 | bool StringView::startsWith(const StringView& prefix) const |
| 62 | { |
| 63 | return ::WTF::startsWith(*this, prefix); |
| 64 | } |
| 65 | |
| 66 | bool StringView::startsWithIgnoringASCIICase(const StringView& prefix) const |
| 67 | { |
| 68 | return ::WTF::startsWithIgnoringASCIICase(*this, prefix); |
| 69 | } |
| 70 | |
| 71 | bool StringView::endsWith(const StringView& suffix) const |
| 72 | { |
| 73 | return ::WTF::endsWith(*this, suffix); |
| 74 | } |
| 75 | |
// Delegates to the free function WTF::endsWithIgnoringASCIICase with this view as the string being tested.
bool StringView::endsWithIgnoringASCIICase(const StringView& suffix) const
{
    const bool hasSuffix = ::WTF::endsWithIgnoringASCIICase(*this, suffix);
    return hasSuffix;
}
| 80 | |
| 81 | Expected<CString, UTF8ConversionError> StringView::tryGetUtf8(ConversionMode mode) const |
| 82 | { |
| 83 | if (isNull()) |
| 84 | return CString("" , 0); |
| 85 | if (is8Bit()) |
| 86 | return StringImpl::utf8ForCharacters(characters8(), length()); |
| 87 | return StringImpl::utf8ForCharacters(characters16(), length(), mode); |
| 88 | } |
| 89 | |
| 90 | CString StringView::utf8(ConversionMode mode) const |
| 91 | { |
| 92 | auto expectedString = tryGetUtf8(mode); |
| 93 | RELEASE_ASSERT(expectedString); |
| 94 | return expectedString.value(); |
| 95 | } |
| 96 | |
| 97 | size_t StringView::find(StringView matchString, unsigned start) const |
| 98 | { |
| 99 | return findCommon(*this, matchString, start); |
| 100 | } |
| 101 | |
| 102 | void StringView::SplitResult::Iterator::findNextSubstring() |
| 103 | { |
| 104 | for (size_t separatorPosition; (separatorPosition = m_result.m_string.find(m_result.m_separator, m_position)) != notFound; ++m_position) { |
| 105 | if (m_result.m_allowEmptyEntries || separatorPosition > m_position) { |
| 106 | m_length = separatorPosition - m_position; |
| 107 | return; |
| 108 | } |
| 109 | } |
| 110 | m_length = m_result.m_string.length() - m_position; |
| 111 | if (!m_length && !m_result.m_allowEmptyEntries) |
| 112 | m_isDone = true; |
| 113 | } |
| 114 | |
| 115 | auto StringView::SplitResult::Iterator::operator++() -> Iterator& |
| 116 | { |
| 117 | ASSERT(m_position <= m_result.m_string.length() && !m_isDone); |
| 118 | m_position += m_length; |
| 119 | if (m_position < m_result.m_string.length()) { |
| 120 | ++m_position; |
| 121 | findNextSubstring(); |
| 122 | } else if (!m_isDone) |
| 123 | m_isDone = true; |
| 124 | return *this; |
| 125 | } |
| 126 | |
| 127 | class StringView::GraphemeClusters::Iterator::Impl { |
| 128 | WTF_MAKE_FAST_ALLOCATED; |
| 129 | public: |
| 130 | Impl(const StringView& stringView, Optional<NonSharedCharacterBreakIterator>&& iterator, unsigned index) |
| 131 | : m_stringView(stringView) |
| 132 | , m_iterator(WTFMove(iterator)) |
| 133 | , m_index(index) |
| 134 | , m_indexEnd(computeIndexEnd()) |
| 135 | { |
| 136 | } |
| 137 | |
| 138 | void operator++() |
| 139 | { |
| 140 | ASSERT(m_indexEnd > m_index); |
| 141 | m_index = m_indexEnd; |
| 142 | m_indexEnd = computeIndexEnd(); |
| 143 | } |
| 144 | |
| 145 | StringView operator*() const |
| 146 | { |
| 147 | if (m_stringView.is8Bit()) |
| 148 | return StringView(m_stringView.characters8() + m_index, m_indexEnd - m_index); |
| 149 | return StringView(m_stringView.characters16() + m_index, m_indexEnd - m_index); |
| 150 | } |
| 151 | |
| 152 | bool operator==(const Impl& other) const |
| 153 | { |
| 154 | ASSERT(&m_stringView == &other.m_stringView); |
| 155 | auto result = m_index == other.m_index; |
| 156 | ASSERT(!result || m_indexEnd == other.m_indexEnd); |
| 157 | return result; |
| 158 | } |
| 159 | |
| 160 | unsigned computeIndexEnd() |
| 161 | { |
| 162 | if (!m_iterator) |
| 163 | return 0; |
| 164 | if (m_index == m_stringView.length()) |
| 165 | return m_index; |
| 166 | return ubrk_following(m_iterator.value(), m_index); |
| 167 | } |
| 168 | |
| 169 | private: |
| 170 | const StringView& m_stringView; |
| 171 | Optional<NonSharedCharacterBreakIterator> m_iterator; |
| 172 | unsigned m_index; |
| 173 | unsigned m_indexEnd; |
| 174 | }; |
| 175 | |
| 176 | StringView::GraphemeClusters::Iterator::Iterator(const StringView& stringView, unsigned index) |
| 177 | : m_impl(std::make_unique<Impl>(stringView, stringView.isNull() ? WTF::nullopt : Optional<NonSharedCharacterBreakIterator>(NonSharedCharacterBreakIterator(stringView)), index)) |
| 178 | { |
| 179 | } |
| 180 | |
| 181 | StringView::GraphemeClusters::Iterator::~Iterator() |
| 182 | { |
| 183 | } |
| 184 | |
| 185 | StringView::GraphemeClusters::Iterator::Iterator(Iterator&& other) |
| 186 | : m_impl(WTFMove(other.m_impl)) |
| 187 | { |
| 188 | } |
| 189 | |
| 190 | auto StringView::GraphemeClusters::Iterator::operator++() -> Iterator& |
| 191 | { |
| 192 | ++(*m_impl); |
| 193 | return *this; |
| 194 | } |
| 195 | |
| 196 | StringView StringView::GraphemeClusters::Iterator::operator*() const |
| 197 | { |
| 198 | return **m_impl; |
| 199 | } |
| 200 | |
| 201 | bool StringView::GraphemeClusters::Iterator::operator==(const Iterator& other) const |
| 202 | { |
| 203 | return *m_impl == *(other.m_impl); |
| 204 | } |
| 205 | |
| 206 | bool StringView::GraphemeClusters::Iterator::operator!=(const Iterator& other) const |
| 207 | { |
| 208 | return !(*this == other); |
| 209 | } |
| 210 | |
| 211 | enum class ASCIICase { Lower, Upper }; |
| 212 | |
| 213 | template<ASCIICase type, typename CharacterType> |
| 214 | String convertASCIICase(const CharacterType* input, unsigned length) |
| 215 | { |
| 216 | if (!input) |
| 217 | return { }; |
| 218 | |
| 219 | CharacterType* characters; |
| 220 | auto result = String::createUninitialized(length, characters); |
| 221 | for (unsigned i = 0; i < length; ++i) |
| 222 | characters[i] = type == ASCIICase::Lower ? toASCIILower(input[i]) : toASCIIUpper(input[i]); |
| 223 | return result; |
| 224 | } |
| 225 | |
| 226 | String StringView::convertToASCIILowercase() const |
| 227 | { |
| 228 | if (m_is8Bit) |
| 229 | return convertASCIICase<ASCIICase::Lower>(static_cast<const LChar*>(m_characters), m_length); |
| 230 | return convertASCIICase<ASCIICase::Lower>(static_cast<const UChar*>(m_characters), m_length); |
| 231 | } |
| 232 | |
| 233 | String StringView::convertToASCIIUppercase() const |
| 234 | { |
| 235 | if (m_is8Bit) |
| 236 | return convertASCIICase<ASCIICase::Upper>(static_cast<const LChar*>(m_characters), m_length); |
| 237 | return convertASCIICase<ASCIICase::Upper>(static_cast<const UChar*>(m_characters), m_length); |
| 238 | } |
| 239 | |
| 240 | StringViewWithUnderlyingString normalizedNFC(StringView string) |
| 241 | { |
| 242 | // Latin-1 characters are unaffected by normalization. |
| 243 | if (string.is8Bit()) |
| 244 | return { string, { } }; |
| 245 | |
| 246 | UErrorCode status = U_ZERO_ERROR; |
| 247 | const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); |
| 248 | ASSERT(U_SUCCESS(status)); |
| 249 | |
| 250 | // No need to normalize if already normalized. |
| 251 | UBool checkResult = unorm2_isNormalized(normalizer, string.characters16(), string.length(), &status); |
| 252 | if (checkResult) |
| 253 | return { string, { } }; |
| 254 | |
| 255 | unsigned normalizedLength = unorm2_normalize(normalizer, string.characters16(), string.length(), nullptr, 0, &status); |
| 256 | ASSERT(status == U_BUFFER_OVERFLOW_ERROR); |
| 257 | |
| 258 | UChar* characters; |
| 259 | String result = String::createUninitialized(normalizedLength, characters); |
| 260 | |
| 261 | status = U_ZERO_ERROR; |
| 262 | unorm2_normalize(normalizer, string.characters16(), string.length(), characters, normalizedLength, &status); |
| 263 | ASSERT(U_SUCCESS(status)); |
| 264 | |
| 265 | StringView view { result }; |
| 266 | return { view, WTFMove(result) }; |
| 267 | } |
| 268 | |
| 269 | String normalizedNFC(const String& string) |
| 270 | { |
| 271 | auto result = normalizedNFC(StringView { string }); |
| 272 | if (result.underlyingString.isNull()) |
| 273 | return string; |
| 274 | return result.underlyingString; |
| 275 | } |
| 276 | |
| 277 | #if CHECK_STRINGVIEW_LIFETIME |
| 278 | |
| 279 | // Manage reference count manually so UnderlyingString does not need to be defined in the header. |
| 280 | |
| 281 | struct StringView::UnderlyingString { |
| 282 | std::atomic_uint refCount { 1u }; |
| 283 | bool isValid { true }; |
| 284 | const StringImpl& string; |
| 285 | explicit UnderlyingString(const StringImpl&); |
| 286 | }; |
| 287 | |
| 288 | StringView::UnderlyingString::UnderlyingString(const StringImpl& string) |
| 289 | : string(string) |
| 290 | { |
| 291 | } |
| 292 | |
| 293 | static Lock underlyingStringsMutex; |
| 294 | |
| 295 | static HashMap<const StringImpl*, StringView::UnderlyingString*>& underlyingStrings() |
| 296 | { |
| 297 | static NeverDestroyed<HashMap<const StringImpl*, StringView::UnderlyingString*>> map; |
| 298 | return map; |
| 299 | } |
| 300 | |
| 301 | void StringView::invalidate(const StringImpl& stringToBeDestroyed) |
| 302 | { |
| 303 | UnderlyingString* underlyingString; |
| 304 | { |
| 305 | std::lock_guard<Lock> lock(underlyingStringsMutex); |
| 306 | underlyingString = underlyingStrings().take(&stringToBeDestroyed); |
| 307 | if (!underlyingString) |
| 308 | return; |
| 309 | } |
| 310 | ASSERT(underlyingString->isValid); |
| 311 | underlyingString->isValid = false; |
| 312 | } |
| 313 | |
| 314 | bool StringView::underlyingStringIsValid() const |
| 315 | { |
| 316 | return !m_underlyingString || m_underlyingString->isValid; |
| 317 | } |
| 318 | |
| 319 | void StringView::adoptUnderlyingString(UnderlyingString* underlyingString) |
| 320 | { |
| 321 | if (m_underlyingString) { |
| 322 | std::lock_guard<Lock> lock(underlyingStringsMutex); |
| 323 | if (!--m_underlyingString->refCount) { |
| 324 | if (m_underlyingString->isValid) { |
| 325 | underlyingStrings().remove(&m_underlyingString->string); |
| 326 | } |
| 327 | delete m_underlyingString; |
| 328 | } |
| 329 | } |
| 330 | m_underlyingString = underlyingString; |
| 331 | } |
| 332 | |
| 333 | void StringView::setUnderlyingString(const StringImpl* string) |
| 334 | { |
| 335 | UnderlyingString* underlyingString; |
| 336 | if (!string) |
| 337 | underlyingString = nullptr; |
| 338 | else { |
| 339 | std::lock_guard<Lock> lock(underlyingStringsMutex); |
| 340 | auto result = underlyingStrings().add(string, nullptr); |
| 341 | if (result.isNewEntry) |
| 342 | result.iterator->value = new UnderlyingString(*string); |
| 343 | else |
| 344 | ++result.iterator->value->refCount; |
| 345 | underlyingString = result.iterator->value; |
| 346 | } |
| 347 | adoptUnderlyingString(underlyingString); |
| 348 | } |
| 349 | |
| 350 | void StringView::setUnderlyingString(const StringView& otherString) |
| 351 | { |
| 352 | UnderlyingString* underlyingString = otherString.m_underlyingString; |
| 353 | if (underlyingString) |
| 354 | ++underlyingString->refCount; |
| 355 | adoptUnderlyingString(underlyingString); |
| 356 | } |
| 357 | |
| 358 | #endif // CHECK_STRINGVIEW_LIFETIME |
| 359 | |
| 360 | } // namespace WTF |
| 361 | |