1 | /* |
2 | * Copyright (C) 2004-2017 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #include "config.h" |
28 | #include "TextCodecICU.h" |
29 | |
30 | #include "TextEncoding.h" |
31 | #include "TextEncodingRegistry.h" |
32 | #include "ThreadGlobalData.h" |
33 | #include <array> |
34 | #include <unicode/ucnv_cb.h> |
35 | #include <wtf/Threading.h> |
36 | #include <wtf/text/CString.h> |
37 | #include <wtf/text/StringBuilder.h> |
38 | #include <wtf/unicode/CharacterNames.h> |
39 | |
40 | namespace WebCore { |
41 | |
// Number of elements in the on-stack scratch buffers that decode() and encode() hand to ICU on each conversion pass.
static constexpr size_t ConversionBufferSize = 16384;
43 | |
// Alias tables, one array per encoding. Each array is named <encoding>_aliases, with the characters of the
// encoding name that are not valid in an identifier replaced by underscores.
//
// From https://encoding.spec.whatwg.org. Plus a few extra aliases that macOS had historically from TEC.
static const char* const IBM866_aliases[] { "866", "cp866", "csibm866" };
static const char* const ISO_8859_2_aliases[] {
    "csisolatin2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"
};
static const char* const ISO_8859_3_aliases[] {
    "csisolatin3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"
};
static const char* const ISO_8859_4_aliases[] {
    "csisolatin4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4"
};
static const char* const ISO_8859_5_aliases[] {
    "csisolatincyrillic", "cyrillic", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"
};
static const char* const ISO_8859_6_aliases[] {
    "arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6-e",
    "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"
};
static const char* const ISO_8859_7_aliases[] {
    "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-ir-126", "iso8859-7", "iso88597",
    "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek"
};
static const char* const ISO_8859_8_aliases[] {
    "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598",
    "iso_8859-8", "iso_8859-8:1988", "visual"
};
static const char* const ISO_8859_8_I_aliases[] { "csiso88598i", "logical" };
static const char* const ISO_8859_10_aliases[] {
    "csisolatin6", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6", "iso8859101992", "isoir157"
};
static const char* const ISO_8859_13_aliases[] { "iso8859-13", "iso885913" };
static const char* const ISO_8859_14_aliases[] {
    "iso8859-14", "iso885914", "isoceltic", "iso8859141998", "isoir199", "latin8", "l8"
};
static const char* const ISO_8859_15_aliases[] { "csisolatin9", "iso8859-15", "iso885915", "iso_8859-15", "l9" };
static const char* const ISO_8859_16_aliases[] { "isoir226", "iso8859162001", "l10", "latin10" };
static const char* const KOI8_R_aliases[] { "cskoi8r", "koi", "koi8", "koi8_r" };
static const char* const KOI8_U_aliases[] { "koi8-ru" };
static const char* const macintosh_aliases[] { "csmacintosh", "mac", "x-mac-roman", "macroman", "x-macroman" };
static const char* const windows_874_aliases[] { "dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620" };
static const char* const EUC_KR_aliases[] {
    "windows-949", "cseuckr", "csksc56011987", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989",
    "ksc5601", "ksc_5601", "ms949", "x-KSC5601", "x-windows-949", "x-uhc"
};
static const char* const windows_1250_aliases[] { "cp1250", "x-cp1250", "winlatin2" };
static const char* const windows_1251_aliases[] { "cp1251", "wincyrillic", "x-cp1251" };
static const char* const windows_1253_aliases[] { "wingreek", "cp1253", "x-cp1253" };
static const char* const windows_1254_aliases[] {
    "winturkish", "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9",
    "iso_8859-9:1989", "l5", "latin5", "x-cp1254"
};
static const char* const windows_1255_aliases[] { "winhebrew", "cp1255", "x-cp1255" };
static const char* const windows_1256_aliases[] { "winarabic", "cp1256", "x-cp1256" };
static const char* const windows_1257_aliases[] { "winbaltic", "cp1257", "x-cp1257" };
static const char* const windows_1258_aliases[] { "winvietnamese", "cp1258", "x-cp1258" };
static const char* const x_mac_cyrillic_aliases[] {
    "maccyrillic", "x-mac-ukrainian", "windows-10007", "mac-cyrillic", "maccy", "x-MacCyrillic", "x-MacUkraine"
};
static const char* const GBK_aliases[] {
    "cn-gb", "csgb231280", "x-euc-cn", "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
    "gb_2312-80", "iso-ir-58", "x-gbk", "euc-cn", "cp936", "ms936", "gb2312-1980", "windows-936",
    "windows-936-2000"
};
static const char* const gb18030_aliases[] { "ibm-1392", "windows-54936" };
static const char* const Big5_aliases[] {
    "cn-big5", "x-x-big5", "csbig5", "windows-950", "windows-950-2000", "ms950", "x-windows-950", "x-big5"
};
static const char* const EUC_JP_aliases[] { "x-euc", "cseucpkdfmtjapanese", "x-euc-jp" };
static const char* const ISO_2022_JP_aliases[] { "jis7", "csiso2022jp" };
static const char* const Shift_JIS_aliases[] {
    "shift-jis", "csshiftjis", "ms932", "ms_kanji", "sjis", "windows-31j", "x-sjis"
};
// Encodings below are not in the standard.
static const char* const x_mac_greek_aliases[] { "windows-10006", "macgr", "x-MacGreek" };
static const char* const x_mac_centraleurroman_aliases[] {
    "windows-10029", "x-mac-ce", "macce", "maccentraleurope", "x-MacCentralEurope"
};
static const char* const x_mac_turkish_aliases[] { "windows-10081", "mactr", "x-MacTurkish" };
static const char* const Big5_HKSCS_aliases[] { "big5hk", "HKSCS-BIG5", "ibm-1375", "ibm-1375_P100-2008" };
87 | |
88 | #define DECLARE_ENCODING_NAME(encoding, alias_array) \ |
89 | { encoding, WTF_ARRAY_LENGTH(alias_array##_aliases), alias_array##_aliases } |
90 | |
91 | #define DECLARE_ENCODING_NAME_NO_ALIASES(encoding) \ |
92 | { encoding, 0, nullptr } |
93 | |
94 | static const struct EncodingName { |
95 | const char* name; |
96 | unsigned aliasCount; |
97 | const char* const * aliases; |
98 | } encodingNames[] = { |
99 | DECLARE_ENCODING_NAME("IBM866" , IBM866), |
100 | DECLARE_ENCODING_NAME("ISO-8859-2" , ISO_8859_2), |
101 | DECLARE_ENCODING_NAME("ISO-8859-3" , ISO_8859_3), |
102 | DECLARE_ENCODING_NAME("ISO-8859-4" , ISO_8859_4), |
103 | DECLARE_ENCODING_NAME("ISO-8859-5" , ISO_8859_5), |
104 | DECLARE_ENCODING_NAME("ISO-8859-6" , ISO_8859_6), |
105 | DECLARE_ENCODING_NAME("ISO-8859-7" , ISO_8859_7), |
106 | DECLARE_ENCODING_NAME("ISO-8859-8" , ISO_8859_8), |
107 | DECLARE_ENCODING_NAME("ISO-8859-8-I" , ISO_8859_8_I), |
108 | DECLARE_ENCODING_NAME("ISO-8859-10" , ISO_8859_10), |
109 | DECLARE_ENCODING_NAME("ISO-8859-13" , ISO_8859_13), |
110 | DECLARE_ENCODING_NAME("ISO-8859-14" , ISO_8859_14), |
111 | DECLARE_ENCODING_NAME("ISO-8859-15" , ISO_8859_15), |
112 | DECLARE_ENCODING_NAME("ISO-8859-16" , ISO_8859_16), |
113 | DECLARE_ENCODING_NAME("KOI8-R" , KOI8_R), |
114 | DECLARE_ENCODING_NAME("KOI8-U" , KOI8_U), |
115 | DECLARE_ENCODING_NAME("macintosh" , macintosh), |
116 | DECLARE_ENCODING_NAME("windows-874" , windows_874), |
117 | DECLARE_ENCODING_NAME("EUC-KR" , EUC_KR), |
118 | DECLARE_ENCODING_NAME("windows-1250" , windows_1250), |
119 | DECLARE_ENCODING_NAME("windows-1251" , windows_1251), |
120 | DECLARE_ENCODING_NAME("windows-1253" , windows_1253), |
121 | DECLARE_ENCODING_NAME("windows-1254" , windows_1254), |
122 | DECLARE_ENCODING_NAME("windows-1255" , windows_1255), |
123 | DECLARE_ENCODING_NAME("windows-1256" , windows_1256), |
124 | DECLARE_ENCODING_NAME("windows-1257" , windows_1257), |
125 | DECLARE_ENCODING_NAME("windows-1258" , windows_1258), |
126 | DECLARE_ENCODING_NAME("x-mac-cyrillic" , x_mac_cyrillic), |
127 | DECLARE_ENCODING_NAME("GBK" , GBK), |
128 | DECLARE_ENCODING_NAME("gb18030" , gb18030), |
129 | DECLARE_ENCODING_NAME("Big5" , Big5), |
130 | DECLARE_ENCODING_NAME("EUC-JP" , EUC_JP), |
131 | DECLARE_ENCODING_NAME("ISO-2022-JP" , ISO_2022_JP), |
132 | DECLARE_ENCODING_NAME("Shift_JIS" , Shift_JIS), |
133 | // Encodings below are not in the standard. |
134 | DECLARE_ENCODING_NAME("x-mac-greek" , x_mac_greek), |
135 | DECLARE_ENCODING_NAME("x-mac-centraleurroman" , x_mac_centraleurroman), |
136 | DECLARE_ENCODING_NAME("x-mac-turkish" , x_mac_turkish), |
137 | DECLARE_ENCODING_NAME("Big5-HKSCS" , Big5_HKSCS), |
138 | DECLARE_ENCODING_NAME_NO_ALIASES("EUC-TW" ), |
139 | }; |
140 | |
141 | void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
142 | { |
143 | for (auto& encodingName : encodingNames) { |
144 | registrar(encodingName.name, encodingName.name); |
145 | for (size_t i = 0; i < encodingName.aliasCount; ++i) |
146 | registrar(encodingName.aliases[i], encodingName.name); |
147 | } |
148 | } |
149 | |
150 | void TextCodecICU::registerCodecs(TextCodecRegistrar registrar) |
151 | { |
152 | for (auto& encodingName : encodingNames) { |
153 | const char* name = encodingName.name; |
154 | |
155 | // These encodings currently don't have standard names, so we need to register encoders manually. |
156 | // http://demo.icu-project.org/icu-bin/convexp |
157 | if (!strcmp(name, "windows-874" )) { |
158 | registrar(name, [name] { |
159 | return std::make_unique<TextCodecICU>(name, "windows-874-2000" ); |
160 | }); |
161 | continue; |
162 | } |
163 | if (!strcmp(name, "windows-949" )) { |
164 | registrar(name, [name] { |
165 | return std::make_unique<TextCodecICU>(name, "windows-949-2000" ); |
166 | }); |
167 | continue; |
168 | } |
169 | if (!strcmp(name, "x-mac-cyrillic" )) { |
170 | registrar(name, [name] { |
171 | return std::make_unique<TextCodecICU>(name, "macos-7_3-10.2" ); |
172 | }); |
173 | continue; |
174 | } |
175 | if (!strcmp(name, "x-mac-greek" )) { |
176 | registrar(name, [name] { |
177 | return std::make_unique<TextCodecICU>(name, "macos-6_2-10.4" ); |
178 | }); |
179 | continue; |
180 | } |
181 | if (!strcmp(name, "x-mac-centraleurroman" )) { |
182 | registrar(name, [name] { |
183 | return std::make_unique<TextCodecICU>(name, "macos-29-10.2" ); |
184 | }); |
185 | continue; |
186 | } |
187 | if (!strcmp(name, "x-mac-turkish" )) { |
188 | registrar(name, [name] { |
189 | return std::make_unique<TextCodecICU>(name, "macos-35-10.2" ); |
190 | }); |
191 | continue; |
192 | } |
193 | if (!strcmp(name, "EUC-KR" )) { |
194 | registrar(name, [name] { |
195 | return std::make_unique<TextCodecICU>(name, "windows-949" ); |
196 | }); |
197 | continue; |
198 | } |
199 | |
200 | UErrorCode error = U_ZERO_ERROR; |
201 | const char* canonicalConverterName = ucnv_getCanonicalName(name, "IANA" , &error); |
202 | ASSERT(U_SUCCESS(error)); |
203 | registrar(name, [name, canonicalConverterName] { |
204 | return std::make_unique<TextCodecICU>(name, canonicalConverterName); |
205 | }); |
206 | } |
207 | } |
208 | |
209 | TextCodecICU::TextCodecICU(const char* encoding, const char* canonicalConverterName) |
210 | : m_encodingName(encoding) |
211 | , m_canonicalConverterName(canonicalConverterName) |
212 | { |
213 | } |
214 | |
215 | TextCodecICU::~TextCodecICU() |
216 | { |
217 | if (m_converter) { |
218 | ucnv_reset(m_converter.get()); |
219 | threadGlobalData().cachedConverterICU().converter = WTFMove(m_converter); |
220 | } |
221 | } |
222 | |
223 | void TextCodecICU::createICUConverter() const |
224 | { |
225 | ASSERT(!m_converter); |
226 | |
227 | m_needsGBKFallbacks = !strcmp(m_encodingName, "GBK" ); |
228 | |
229 | auto& cachedConverter = threadGlobalData().cachedConverterICU().converter; |
230 | if (cachedConverter) { |
231 | UErrorCode error = U_ZERO_ERROR; |
232 | const char* cachedConverterName = ucnv_getName(cachedConverter.get(), &error); |
233 | if (U_SUCCESS(error) && !strcmp(m_canonicalConverterName, cachedConverterName)) { |
234 | m_converter = WTFMove(cachedConverter); |
235 | return; |
236 | } |
237 | } |
238 | |
239 | UErrorCode error = U_ZERO_ERROR; |
240 | m_converter = ICUConverterPtr { ucnv_open(m_canonicalConverterName, &error), ucnv_close }; |
241 | if (m_converter) |
242 | ucnv_setFallback(m_converter.get(), TRUE); |
243 | } |
244 | |
245 | int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& error) |
246 | { |
247 | UChar* targetStart = target; |
248 | error = U_ZERO_ERROR; |
249 | ucnv_toUnicode(m_converter.get(), &target, targetLimit, &source, sourceLimit, offsets, flush, &error); |
250 | return target - targetStart; |
251 | } |
252 | |
253 | class ErrorCallbackSetter { |
254 | public: |
255 | ErrorCallbackSetter(UConverter& converter, bool stopOnError) |
256 | : m_converter(converter) |
257 | , m_shouldStopOnEncodingErrors(stopOnError) |
258 | { |
259 | if (m_shouldStopOnEncodingErrors) { |
260 | UErrorCode err = U_ZERO_ERROR; |
261 | ucnv_setToUCallBack(&m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction, &m_savedContext, &err); |
262 | ASSERT(err == U_ZERO_ERROR); |
263 | } |
264 | } |
265 | ~ErrorCallbackSetter() |
266 | { |
267 | if (m_shouldStopOnEncodingErrors) { |
268 | UErrorCode err = U_ZERO_ERROR; |
269 | const void* oldContext; |
270 | UConverterToUCallback oldAction; |
271 | ucnv_setToUCallBack(&m_converter, m_savedAction, m_savedContext, &oldAction, &oldContext, &err); |
272 | ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE); |
273 | ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL)); |
274 | ASSERT(err == U_ZERO_ERROR); |
275 | } |
276 | } |
277 | |
278 | private: |
279 | UConverter& m_converter; |
280 | bool m_shouldStopOnEncodingErrors; |
281 | const void* m_savedContext; |
282 | UConverterToUCallback m_savedAction; |
283 | }; |
284 | |
285 | String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) |
286 | { |
287 | // Get a converter for the passed-in encoding. |
288 | if (!m_converter) { |
289 | createICUConverter(); |
290 | if (!m_converter) { |
291 | LOG_ERROR("error creating ICU encoder even though encoding was in table" ); |
292 | sawError = true; |
293 | return { }; |
294 | } |
295 | } |
296 | |
297 | ErrorCallbackSetter callbackSetter(*m_converter, stopOnError); |
298 | |
299 | StringBuilder result; |
300 | |
301 | UChar buffer[ConversionBufferSize]; |
302 | UChar* bufferLimit = buffer + ConversionBufferSize; |
303 | const char* source = reinterpret_cast<const char*>(bytes); |
304 | const char* sourceLimit = source + length; |
305 | int32_t* offsets = NULL; |
306 | UErrorCode err = U_ZERO_ERROR; |
307 | |
308 | do { |
309 | int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush, err); |
310 | result.append(buffer, ucharsDecoded); |
311 | } while (err == U_BUFFER_OVERFLOW_ERROR); |
312 | |
313 | if (U_FAILURE(err)) { |
314 | // flush the converter so it can be reused, and not be bothered by this error. |
315 | do { |
316 | decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, err); |
317 | } while (source < sourceLimit); |
318 | sawError = true; |
319 | } |
320 | |
321 | String resultString = result.toString(); |
322 | |
323 | // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. |
324 | if (!strcmp(m_encodingName, "GBK" ) || equalLettersIgnoringASCIICase(m_encodingName, "gb18030" )) |
325 | resultString.replace(0xE5E5, ideographicSpace); |
326 | |
327 | return resultString; |
328 | } |
329 | |
330 | // We need to apply these fallbacks ourselves as they are not currently supported by ICU and |
331 | // they were provided by the Mac TEC encoding path. Needed to fix <rdar://problem/4708689>. |
332 | static UChar fallbackForGBK(UChar32 character) |
333 | { |
334 | switch (character) { |
335 | case 0x01F9: |
336 | return 0xE7C8; |
337 | case 0x1E3F: |
338 | return 0xE7C7; |
339 | case 0x22EF: |
340 | return 0x2026; |
341 | case 0x301C: |
342 | return 0xFF5E; |
343 | } |
344 | return 0; |
345 | } |
346 | |
347 | // Invalid character handler when writing escaped entities for unrepresentable |
348 | // characters. See the declaration of TextCodec::encode for more. |
349 | static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
350 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
351 | { |
352 | if (reason == UCNV_UNASSIGNED) { |
353 | *error = U_ZERO_ERROR; |
354 | UnencodableReplacementArray entity; |
355 | int entityLen = TextCodec::getUnencodableReplacement(codePoint, UnencodableHandling::URLEncodedEntities, entity); |
356 | ucnv_cbFromUWriteBytes(fromUArgs, entity.data(), entityLen, 0, error); |
357 | } else |
358 | UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
359 | } |
360 | |
361 | // Substitutes special GBK characters, escaping all other unassigned entities. |
362 | static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
363 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
364 | { |
365 | UChar outChar; |
366 | if (reason == UCNV_UNASSIGNED && (outChar = fallbackForGBK(codePoint))) { |
367 | const UChar* source = &outChar; |
368 | *error = U_ZERO_ERROR; |
369 | ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, error); |
370 | return; |
371 | } |
372 | UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
373 | } |
374 | |
375 | // Combines both gbkUrlEscapedEntityCallback and GBK character substitution. |
376 | static void gbkUrlEscapedEntityCallack(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
377 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
378 | { |
379 | if (reason == UCNV_UNASSIGNED) { |
380 | if (UChar outChar = fallbackForGBK(codePoint)) { |
381 | const UChar* source = &outChar; |
382 | *error = U_ZERO_ERROR; |
383 | ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, error); |
384 | return; |
385 | } |
386 | urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
387 | return; |
388 | } |
389 | UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
390 | } |
391 | |
392 | static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
393 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
394 | { |
395 | UChar outChar; |
396 | if (reason == UCNV_UNASSIGNED && (outChar = fallbackForGBK(codePoint))) { |
397 | const UChar* source = &outChar; |
398 | *error = U_ZERO_ERROR; |
399 | ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, error); |
400 | return; |
401 | } |
402 | UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
403 | } |
404 | |
405 | Vector<uint8_t> TextCodecICU::encode(StringView string, UnencodableHandling handling) |
406 | { |
407 | if (string.isEmpty()) |
408 | return { }; |
409 | |
410 | if (!m_converter) { |
411 | createICUConverter(); |
412 | if (!m_converter) |
413 | return { }; |
414 | } |
415 | |
416 | // FIXME: We should see if there is "force ASCII range" mode in ICU; |
417 | // until then, we change the backslash into a yen sign. |
418 | // Encoding will change the yen sign back into a backslash. |
419 | String copy; |
420 | if (shouldShowBackslashAsCurrencySymbolIn(m_encodingName)) { |
421 | copy = string.toStringWithoutCopying(); |
422 | copy.replace('\\', yenSign); |
423 | string = copy; |
424 | } |
425 | |
426 | UErrorCode error; |
427 | switch (handling) { |
428 | case UnencodableHandling::QuestionMarks: |
429 | error = U_ZERO_ERROR; |
430 | ucnv_setSubstChars(m_converter.get(), "?" , 1, &error); |
431 | if (U_FAILURE(error)) |
432 | return { }; |
433 | error = U_ZERO_ERROR; |
434 | ucnv_setFromUCallBack(m_converter.get(), m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &error); |
435 | if (U_FAILURE(error)) |
436 | return { }; |
437 | break; |
438 | case UnencodableHandling::Entities: |
439 | error = U_ZERO_ERROR; |
440 | ucnv_setFromUCallBack(m_converter.get(), m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &error); |
441 | if (U_FAILURE(error)) |
442 | return { }; |
443 | break; |
444 | case UnencodableHandling::URLEncodedEntities: |
445 | error = U_ZERO_ERROR; |
446 | ucnv_setFromUCallBack(m_converter.get(), m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &error); |
447 | if (U_FAILURE(error)) |
448 | return { }; |
449 | break; |
450 | } |
451 | |
452 | auto upconvertedCharacters = string.upconvertedCharacters(); |
453 | auto* source = upconvertedCharacters.get(); |
454 | auto* sourceLimit = source + string.length(); |
455 | |
456 | Vector<uint8_t> result; |
457 | do { |
458 | char buffer[ConversionBufferSize]; |
459 | char* target = buffer; |
460 | char* targetLimit = target + ConversionBufferSize; |
461 | error = U_ZERO_ERROR; |
462 | ucnv_fromUnicode(m_converter.get(), &target, targetLimit, &source, sourceLimit, 0, true, &error); |
463 | result.append(reinterpret_cast<uint8_t*>(buffer), target - buffer); |
464 | } while (error == U_BUFFER_OVERFLOW_ERROR); |
465 | return result; |
466 | } |
467 | |
468 | } // namespace WebCore |
469 | |