1 | /* |
2 | * Copyright (C) 2003-2019 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2008 Holger Hans Peter Freyther |
4 | * Copyright (C) Research In Motion Limited 2011. All rights reserved. |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Library General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Library General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Library General Public License |
17 | * along with this library; see the file COPYING.LIB. If not, write to |
18 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
19 | * Boston, MA 02110-1301, USA. |
20 | * |
21 | */ |
22 | |
23 | #include "config.h" |
24 | #include "SurrogatePairAwareTextIterator.h" |
25 | |
26 | #include <unicode/unorm2.h> |
27 | |
28 | namespace WebCore { |
29 | |
30 | SurrogatePairAwareTextIterator::SurrogatePairAwareTextIterator(const UChar* characters, unsigned currentIndex, unsigned lastIndex, unsigned endIndex) |
31 | : m_characters(characters) |
32 | , m_currentIndex(currentIndex) |
33 | , m_lastIndex(lastIndex) |
34 | , m_endIndex(endIndex) |
35 | { |
36 | } |
37 | |
38 | bool SurrogatePairAwareTextIterator::consumeSlowCase(UChar32& character, unsigned& clusterLength) |
39 | { |
40 | if (character <= 0x30FE) { |
41 | // Deal with Hiragana and Katakana voiced and semi-voiced syllables. |
42 | // Normalize into composed form, and then look for glyph with base + combined mark. |
43 | // Check above for character range to minimize performance impact. |
44 | if (UChar32 normalized = normalizeVoicingMarks()) { |
45 | character = normalized; |
46 | clusterLength = 2; |
47 | } |
48 | return true; |
49 | } |
50 | |
51 | if (!U16_IS_SURROGATE(character)) |
52 | return true; |
53 | |
54 | // If we have a surrogate pair, make sure it starts with the high part. |
55 | if (!U16_IS_SURROGATE_LEAD(character)) |
56 | return false; |
57 | |
58 | // Do we have a surrogate pair? If so, determine the full Unicode (32 bit) code point before glyph lookup. |
59 | // Make sure we have another character and it's a low surrogate. |
60 | if (m_currentIndex + 1 >= m_endIndex) |
61 | return false; |
62 | |
63 | UChar low = m_characters[1]; |
64 | if (!U16_IS_TRAIL(low)) |
65 | return false; |
66 | |
67 | character = U16_GET_SUPPLEMENTARY(character, low); |
68 | clusterLength = 2; |
69 | return true; |
70 | } |
71 | |
72 | UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks() |
73 | { |
74 | // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values |
75 | static constexpr uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8; |
76 | |
77 | if (m_currentIndex + 1 >= m_endIndex) |
78 | return 0; |
79 | |
80 | if (u_getCombiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) { |
81 | UErrorCode status = U_ZERO_ERROR; |
82 | const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); |
83 | ASSERT(U_SUCCESS(status)); |
84 | auto composedCharacter = unorm2_composePair(normalizer, m_characters[0], m_characters[1]); |
85 | if (composedCharacter > 0) |
86 | return composedCharacter; |
87 | } |
88 | |
89 | return 0; |
90 | } |
91 | |
92 | } |
93 | |