1/*
2 * Copyright (C) 2003-2019 Apple Inc. All rights reserved.
3 * Copyright (C) 2008 Holger Hans Peter Freyther
4 * Copyright (C) Research In Motion Limited 2011. All rights reserved.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23#include "config.h"
24#include "SurrogatePairAwareTextIterator.h"
25
26#include <unicode/unorm2.h>
27
28namespace WebCore {
29
// Constructs an iterator over UTF-16 code units by storing the pointer and the three indices
// into the corresponding members; no validation or other work is done here.
// NOTE(review): the other functions in this file read m_characters[0] / m_characters[1] as the
// current and the following code unit, and bounds-check that access with m_currentIndex + 1
// against m_endIndex. So 'characters' presumably points at the code unit located at
// 'currentIndex' (not at the start of the buffer) — confirm against the header.
SurrogatePairAwareTextIterator::SurrogatePairAwareTextIterator(const UChar* characters, unsigned currentIndex, unsigned lastIndex, unsigned endIndex)
    : m_characters(characters)
    , m_currentIndex(currentIndex)
    , m_lastIndex(lastIndex)
    , m_endIndex(endIndex)
{
}
37
38bool SurrogatePairAwareTextIterator::consumeSlowCase(UChar32& character, unsigned& clusterLength)
39{
40 if (character <= 0x30FE) {
41 // Deal with Hiragana and Katakana voiced and semi-voiced syllables.
42 // Normalize into composed form, and then look for glyph with base + combined mark.
43 // Check above for character range to minimize performance impact.
44 if (UChar32 normalized = normalizeVoicingMarks()) {
45 character = normalized;
46 clusterLength = 2;
47 }
48 return true;
49 }
50
51 if (!U16_IS_SURROGATE(character))
52 return true;
53
54 // If we have a surrogate pair, make sure it starts with the high part.
55 if (!U16_IS_SURROGATE_LEAD(character))
56 return false;
57
58 // Do we have a surrogate pair? If so, determine the full Unicode (32 bit) code point before glyph lookup.
59 // Make sure we have another character and it's a low surrogate.
60 if (m_currentIndex + 1 >= m_endIndex)
61 return false;
62
63 UChar low = m_characters[1];
64 if (!U16_IS_TRAIL(low))
65 return false;
66
67 character = U16_GET_SUPPLEMENTARY(character, low);
68 clusterLength = 2;
69 return true;
70}
71
72UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks()
73{
74 // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values
75 static constexpr uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8;
76
77 if (m_currentIndex + 1 >= m_endIndex)
78 return 0;
79
80 if (u_getCombiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) {
81 UErrorCode status = U_ZERO_ERROR;
82 const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
83 ASSERT(U_SUCCESS(status));
84 auto composedCharacter = unorm2_composePair(normalizer, m_characters[0], m_characters[1]);
85 if (composedCharacter > 0)
86 return composedCharacter;
87 }
88
89 return 0;
90}
91
92}
93