1/*
2 * Copyright (C) 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2015 Igalia S.L.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
15 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
18 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
24 * THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "Hyphenation.h"
29
30#if USE(LIBHYPHEN)
31
32#include <hyphen.h>
33#include <limits>
34#include <stdlib.h>
35#include <wtf/FileSystem.h>
36#include <wtf/HashMap.h>
37#include <wtf/NeverDestroyed.h>
38#include <wtf/TinyLRUCache.h>
39#include <wtf/glib/GLibUtilities.h>
40#include <wtf/glib/GUniquePtr.h>
41#include <wtf/text/AtomicStringHash.h>
42#include <wtf/text/CString.h>
43#include <wtf/text/StringView.h>
44
45namespace WebCore {
46
// System-wide directories that are searched for libhyphen dictionary files.
static constexpr const char* gDictionaryDirectories[] = {
    "/usr/share/hyphen",
    "/usr/local/share/hyphen",
};
51
52static String extractLocaleFromDictionaryFilePath(const String& filePath)
53{
54 // Dictionary files always have the form "hyph_<locale name>.dic"
55 // so we strip everything except the locale.
56 String fileName = FileSystem::pathGetFileName(filePath);
57 static const int prefixLength = 5;
58 static const int suffixLength = 4;
59 return fileName.substring(prefixLength, fileName.length() - prefixLength - suffixLength);
60}
61
62static void scanDirectoryForDictionaries(const char* directoryPath, HashMap<AtomicString, Vector<String>>& availableLocales)
63{
64 for (auto& filePath : FileSystem::listDirectory(directoryPath, "hyph_*.dic")) {
65 String locale = extractLocaleFromDictionaryFilePath(filePath).convertToASCIILowercase();
66
67 char normalizedPath[PATH_MAX];
68 if (!realpath(FileSystem::fileSystemRepresentation(filePath).data(), normalizedPath))
69 continue;
70
71 filePath = FileSystem::stringFromFileSystemRepresentation(normalizedPath);
72 availableLocales.add(locale, Vector<String>()).iterator->value.append(filePath);
73
74 String localeReplacingUnderscores = String(locale);
75 localeReplacingUnderscores.replace('_', '-');
76 if (locale != localeReplacingUnderscores)
77 availableLocales.add(localeReplacingUnderscores, Vector<String>()).iterator->value.append(filePath);
78
79 size_t dividerPosition = localeReplacingUnderscores.find('-');
80 if (dividerPosition != notFound) {
81 localeReplacingUnderscores.truncate(dividerPosition);
82 availableLocales.add(localeReplacingUnderscores, Vector<String>()).iterator->value.append(filePath);
83 }
84 }
85}
86
87#if ENABLE(DEVELOPER_MODE)
88static CString topLevelPath()
89{
90 if (const char* topLevelDirectory = g_getenv("WEBKIT_TOP_LEVEL"))
91 return topLevelDirectory;
92
93 // If the environment variable wasn't provided then assume we were built into
94 // WebKitBuild/Debug or WebKitBuild/Release. Obviously this will fail if the build
95 // directory is non-standard, but we can't do much more about this.
96 GUniquePtr<char> parentPath(g_path_get_dirname(getCurrentExecutablePath().data()));
97 GUniquePtr<char> layoutTestsPath(g_build_filename(parentPath.get(), "..", "..", "..", nullptr));
98 GUniquePtr<char> absoluteTopLevelPath(realpath(layoutTestsPath.get(), 0));
99 return absoluteTopLevelPath.get();
100}
101
102static CString webkitBuildDirectory()
103{
104 const char* webkitOutputDir = g_getenv("WEBKIT_OUTPUTDIR");
105 if (webkitOutputDir)
106 return webkitOutputDir;
107
108 GUniquePtr<char> outputDir(g_build_filename(topLevelPath().data(), "WebKitBuild", nullptr));
109 return outputDir.get();
110}
111
112static void scanTestDictionariesDirectoryIfNecessary(HashMap<AtomicString, Vector<String>>& availableLocales)
113{
114 // It's unfortunate that we need to look for the dictionaries this way, but
115 // libhyphen doesn't have the concept of installed dictionaries. Instead,
116 // we have this special case for WebKit tests.
117#if PLATFORM(GTK)
118 CString buildDirectory = webkitBuildDirectory();
119 GUniquePtr<char> dictionariesPath(g_build_filename(buildDirectory.data(), "DependenciesGTK", "Root", "webkitgtk-test-dicts", nullptr));
120 if (g_file_test(dictionariesPath.get(), static_cast<GFileTest>(G_FILE_TEST_IS_DIR))) {
121 scanDirectoryForDictionaries(dictionariesPath.get(), availableLocales);
122 return;
123 }
124
125 // Try alternative dictionaries path for people not using JHBuild.
126 dictionariesPath.reset(g_build_filename(buildDirectory.data(), "webkitgtk-test-dicts", nullptr));
127 scanDirectoryForDictionaries(dictionariesPath.get(), availableLocales);
128#elif defined(TEST_HYPHENATAION_PATH)
129 scanDirectoryForDictionaries(TEST_HYPHENATAION_PATH, availableLocales);
130#else
131 UNUSED_PARAM(availableLocales);
132#endif
133}
134#endif
135
136static HashMap<AtomicString, Vector<String>>& availableLocales()
137{
138 static bool scannedLocales = false;
139 static HashMap<AtomicString, Vector<String>> availableLocales;
140
141 if (!scannedLocales) {
142 for (size_t i = 0; i < WTF_ARRAY_LENGTH(gDictionaryDirectories); i++)
143 scanDirectoryForDictionaries(gDictionaryDirectories[i], availableLocales);
144
145#if ENABLE(DEVELOPER_MODE)
146 scanTestDictionariesDirectoryIfNecessary(availableLocales);
147#endif
148
149 scannedLocales = true;
150 }
151
152 return availableLocales;
153}
154
155bool canHyphenate(const AtomicString& localeIdentifier)
156{
157 if (localeIdentifier.isNull())
158 return false;
159 if (availableLocales().contains(localeIdentifier))
160 return true;
161 return availableLocales().contains(AtomicString(localeIdentifier.string().convertToASCIILowercase()));
162}
163
164class HyphenationDictionary : public RefCounted<HyphenationDictionary> {
165 WTF_MAKE_NONCOPYABLE(HyphenationDictionary);
166 WTF_MAKE_FAST_ALLOCATED;
167public:
168 typedef std::unique_ptr<HyphenDict, void(*)(HyphenDict*)> HyphenDictUniquePtr;
169
170 virtual ~HyphenationDictionary() = default;
171
172 static Ref<HyphenationDictionary> createNull()
173 {
174 return adoptRef(*new HyphenationDictionary());
175 }
176
177 static Ref<HyphenationDictionary> create(const CString& dictPath)
178 {
179 return adoptRef(*new HyphenationDictionary(dictPath));
180 }
181
182 HyphenDict* libhyphenDictionary() const
183 {
184 return m_libhyphenDictionary.get();
185 }
186
187private:
188 HyphenationDictionary(const CString& dictPath)
189 : m_libhyphenDictionary(HyphenDictUniquePtr(hnj_hyphen_load(dictPath.data()), hnj_hyphen_free))
190 {
191 }
192
193 HyphenationDictionary()
194 : m_libhyphenDictionary(HyphenDictUniquePtr(nullptr, hnj_hyphen_free))
195 {
196 }
197
198 HyphenDictUniquePtr m_libhyphenDictionary;
199};
200
201} // namespace WebCore
202
203namespace WTF {
204
205template<>
206class TinyLRUCachePolicy<AtomicString, RefPtr<WebCore::HyphenationDictionary>>
207{
208public:
209 static TinyLRUCache<AtomicString, RefPtr<WebCore::HyphenationDictionary>, 32>& cache()
210 {
211 static NeverDestroyed<TinyLRUCache<AtomicString, RefPtr<WebCore::HyphenationDictionary>, 32>> cache;
212 return cache;
213 }
214
215 static bool isKeyNull(const AtomicString& localeIdentifier)
216 {
217 return localeIdentifier.isNull();
218 }
219
220 static RefPtr<WebCore::HyphenationDictionary> createValueForNullKey()
221 {
222 return WebCore::HyphenationDictionary::createNull();
223 }
224
225 static RefPtr<WebCore::HyphenationDictionary> createValueForKey(const AtomicString& dictionaryPath)
226 {
227 return WebCore::HyphenationDictionary::create(FileSystem::fileSystemRepresentation(dictionaryPath.string()));
228 }
229};
230
231} // namespace WTF
232
233namespace WebCore {
234
235static void countLeadingSpaces(const CString& utf8String, int32_t& pointerOffset, int32_t& characterOffset)
236{
237 pointerOffset = 0;
238 characterOffset = 0;
239 const char* stringData = utf8String.data();
240 UChar32 character = 0;
241 while (static_cast<unsigned>(pointerOffset) < utf8String.length()) {
242 int32_t nextPointerOffset = pointerOffset;
243 U8_NEXT(stringData, nextPointerOffset, static_cast<int32_t>(utf8String.length()), character);
244
245 if (character < 0 || !u_isUWhiteSpace(character))
246 return;
247
248 pointerOffset = nextPointerOffset;
249 characterOffset++;
250 }
251}
252
253size_t lastHyphenLocation(StringView string, size_t beforeIndex, const AtomicString& localeIdentifier)
254{
255 // libhyphen accepts strings in UTF-8 format, but WebCore can only provide StringView
256 // which stores either UTF-16 or Latin1 data. This is unfortunate for performance
257 // reasons and we should consider switching to a more flexible hyphenation library
258 // if it is available.
259 CString utf8StringCopy = string.toStringWithoutCopying().utf8();
260
261 // WebCore often passes strings like " wordtohyphenate" to the platform layer. Since
262 // libhyphen isn't advanced enough to deal with leading spaces (presumably CoreFoundation
263 // can), we should find the appropriate indexes into the string to skip them.
264 int32_t leadingSpaceBytes;
265 int32_t leadingSpaceCharacters;
266 countLeadingSpaces(utf8StringCopy, leadingSpaceBytes, leadingSpaceCharacters);
267
268 // The libhyphen documentation specifies that this array should be 5 bytes longer than
269 // the byte length of the input string.
270 Vector<char> hyphenArray(utf8StringCopy.length() - leadingSpaceBytes + 5);
271 char* hyphenArrayData = hyphenArray.data();
272
273 String lowercaseLocaleIdentifier = AtomicString(localeIdentifier.string().convertToASCIILowercase());
274
275 // Web content may specify strings for locales which do not exist or that we do not have.
276 if (!availableLocales().contains(lowercaseLocaleIdentifier))
277 return 0;
278
279 for (const auto& dictionaryPath : availableLocales().get(lowercaseLocaleIdentifier)) {
280 RefPtr<HyphenationDictionary> dictionary = WTF::TinyLRUCachePolicy<AtomicString, RefPtr<HyphenationDictionary>>::cache().get(AtomicString(dictionaryPath));
281
282 char** replacements = nullptr;
283 int* positions = nullptr;
284 int* removedCharacterCounts = nullptr;
285 hnj_hyphen_hyphenate2(dictionary->libhyphenDictionary(),
286 utf8StringCopy.data() + leadingSpaceBytes,
287 utf8StringCopy.length() - leadingSpaceBytes,
288 hyphenArrayData,
289 nullptr, /* output parameter for hyphenated word */
290 &replacements,
291 &positions,
292 &removedCharacterCounts);
293
294 if (replacements) {
295 for (unsigned i = 0; i < utf8StringCopy.length() - leadingSpaceBytes - 1; i++)
296 free(replacements[i]);
297 free(replacements);
298 }
299
300 free(positions);
301 free(removedCharacterCounts);
302
303 for (int i = beforeIndex - leadingSpaceCharacters - 2; i >= 0; i--) {
304 // libhyphen will put an odd number in hyphenArrayData at all
305 // hyphenation points. A number & 1 will be true for odd numbers.
306 if (hyphenArrayData[i] & 1)
307 return i + 1 + leadingSpaceCharacters;
308 }
309 }
310
311 return 0;
312}
313
314} // namespace WebCore
315
316#endif // USE(LIBHYPHEN)
317