| 1 | /* |
| 2 | * Copyright (C) 2004-2017 Apple Inc. All rights reserved. |
| 3 | * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> |
| 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions |
| 7 | * are met: |
| 8 | * 1. Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer in the |
| 12 | * documentation and/or other materials provided with the distribution. |
| 13 | * |
| 14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| 15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| 18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| 22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 25 | */ |
| 26 | |
| 27 | #include "config.h" |
| 28 | #include "TextCodecICU.h" |
| 29 | |
| 30 | #include "TextEncoding.h" |
| 31 | #include "TextEncodingRegistry.h" |
| 32 | #include "ThreadGlobalData.h" |
| 33 | #include <array> |
| 34 | #include <unicode/ucnv_cb.h> |
| 35 | #include <wtf/Threading.h> |
| 36 | #include <wtf/text/CString.h> |
| 37 | #include <wtf/text/StringBuilder.h> |
| 38 | #include <wtf/unicode/CharacterNames.h> |
| 39 | |
| 40 | namespace WebCore { |
| 41 | |
// Number of elements in the on-stack scratch buffers that decode() and encode() convert through.
constexpr size_t ConversionBufferSize = 16384;
| 43 | |
// Defines a static array named <encoding>_aliases holding the given alias strings.
#define DECLARE_ALIASES(encoding, ...) \
    static const char* const encoding##_aliases[] = { __VA_ARGS__ }

// From https://encoding.spec.whatwg.org, plus a few extra aliases that macOS historically had from TEC.
DECLARE_ALIASES(IBM866, "866", "cp866", "csibm866");
DECLARE_ALIASES(ISO_8859_2,
    "csisolatin2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
DECLARE_ALIASES(ISO_8859_3,
    "csisolatin3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
DECLARE_ALIASES(ISO_8859_4,
    "csisolatin4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
DECLARE_ALIASES(ISO_8859_5,
    "csisolatincyrillic", "cyrillic", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
DECLARE_ALIASES(ISO_8859_6,
    "arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6-e",
    "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
DECLARE_ALIASES(ISO_8859_7,
    "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-ir-126", "iso8859-7", "iso88597",
    "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek");
DECLARE_ALIASES(ISO_8859_8,
    "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598",
    "iso_8859-8", "iso_8859-8:1988", "visual");
DECLARE_ALIASES(ISO_8859_8_I, "csiso88598i", "logical");
DECLARE_ALIASES(ISO_8859_10,
    "csisolatin6", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6", "iso8859101992", "isoir157");
DECLARE_ALIASES(ISO_8859_13, "iso8859-13", "iso885913");
DECLARE_ALIASES(ISO_8859_14,
    "iso8859-14", "iso885914", "isoceltic", "iso8859141998", "isoir199", "latin8", "l8");
DECLARE_ALIASES(ISO_8859_15, "csisolatin9", "iso8859-15", "iso885915", "iso_8859-15", "l9");
DECLARE_ALIASES(ISO_8859_16, "isoir226", "iso8859162001", "l10", "latin10");
DECLARE_ALIASES(KOI8_R, "cskoi8r", "koi", "koi8", "koi8_r");
DECLARE_ALIASES(KOI8_U, "koi8-ru");
DECLARE_ALIASES(macintosh, "csmacintosh", "mac", "x-mac-roman", "macroman", "x-macroman");
DECLARE_ALIASES(windows_874, "dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620");
DECLARE_ALIASES(EUC_KR,
    "windows-949", "cseuckr", "csksc56011987", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989",
    "ksc5601", "ksc_5601", "ms949", "x-KSC5601", "x-windows-949", "x-uhc");
DECLARE_ALIASES(windows_1250, "cp1250", "x-cp1250", "winlatin2");
DECLARE_ALIASES(windows_1251, "cp1251", "wincyrillic", "x-cp1251");
DECLARE_ALIASES(windows_1253, "wingreek", "cp1253", "x-cp1253");
DECLARE_ALIASES(windows_1254,
    "winturkish", "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9",
    "iso_8859-9:1989", "l5", "latin5", "x-cp1254");
DECLARE_ALIASES(windows_1255, "winhebrew", "cp1255", "x-cp1255");
DECLARE_ALIASES(windows_1256, "winarabic", "cp1256", "x-cp1256");
DECLARE_ALIASES(windows_1257, "winbaltic", "cp1257", "x-cp1257");
DECLARE_ALIASES(windows_1258, "winvietnamese", "cp1258", "x-cp1258");
DECLARE_ALIASES(x_mac_cyrillic,
    "maccyrillic", "x-mac-ukrainian", "windows-10007", "mac-cyrillic", "maccy", "x-MacCyrillic", "x-MacUkraine");
DECLARE_ALIASES(GBK,
    "cn-gb", "csgb231280", "x-euc-cn", "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
    "gb_2312-80", "iso-ir-58", "x-gbk", "euc-cn", "cp936", "ms936", "gb2312-1980", "windows-936",
    "windows-936-2000");
DECLARE_ALIASES(gb18030, "ibm-1392", "windows-54936");
DECLARE_ALIASES(Big5,
    "cn-big5", "x-x-big5", "csbig5", "windows-950", "windows-950-2000", "ms950", "x-windows-950", "x-big5");
DECLARE_ALIASES(EUC_JP, "x-euc", "cseucpkdfmtjapanese", "x-euc-jp");
DECLARE_ALIASES(ISO_2022_JP, "jis7", "csiso2022jp");
DECLARE_ALIASES(Shift_JIS, "shift-jis", "csshiftjis", "ms932", "ms_kanji", "sjis", "windows-31j", "x-sjis");
// Encodings below are not in the standard.
DECLARE_ALIASES(x_mac_greek, "windows-10006", "macgr", "x-MacGreek");
DECLARE_ALIASES(x_mac_centraleurroman,
    "windows-10029", "x-mac-ce", "macce", "maccentraleurope", "x-MacCentralEurope");
DECLARE_ALIASES(x_mac_turkish, "windows-10081", "mactr", "x-MacTurkish");
DECLARE_ALIASES(Big5_HKSCS, "big5hk", "HKSCS-BIG5", "ibm-1375", "ibm-1375_P100-2008");
| 87 | |
| 88 | #define DECLARE_ENCODING_NAME(encoding, alias_array) \ |
| 89 | { encoding, WTF_ARRAY_LENGTH(alias_array##_aliases), alias_array##_aliases } |
| 90 | |
| 91 | #define DECLARE_ENCODING_NAME_NO_ALIASES(encoding) \ |
| 92 | { encoding, 0, nullptr } |
| 93 | |
| 94 | static const struct EncodingName { |
| 95 | const char* name; |
| 96 | unsigned aliasCount; |
| 97 | const char* const * aliases; |
| 98 | } encodingNames[] = { |
| 99 | DECLARE_ENCODING_NAME("IBM866" , IBM866), |
| 100 | DECLARE_ENCODING_NAME("ISO-8859-2" , ISO_8859_2), |
| 101 | DECLARE_ENCODING_NAME("ISO-8859-3" , ISO_8859_3), |
| 102 | DECLARE_ENCODING_NAME("ISO-8859-4" , ISO_8859_4), |
| 103 | DECLARE_ENCODING_NAME("ISO-8859-5" , ISO_8859_5), |
| 104 | DECLARE_ENCODING_NAME("ISO-8859-6" , ISO_8859_6), |
| 105 | DECLARE_ENCODING_NAME("ISO-8859-7" , ISO_8859_7), |
| 106 | DECLARE_ENCODING_NAME("ISO-8859-8" , ISO_8859_8), |
| 107 | DECLARE_ENCODING_NAME("ISO-8859-8-I" , ISO_8859_8_I), |
| 108 | DECLARE_ENCODING_NAME("ISO-8859-10" , ISO_8859_10), |
| 109 | DECLARE_ENCODING_NAME("ISO-8859-13" , ISO_8859_13), |
| 110 | DECLARE_ENCODING_NAME("ISO-8859-14" , ISO_8859_14), |
| 111 | DECLARE_ENCODING_NAME("ISO-8859-15" , ISO_8859_15), |
| 112 | DECLARE_ENCODING_NAME("ISO-8859-16" , ISO_8859_16), |
| 113 | DECLARE_ENCODING_NAME("KOI8-R" , KOI8_R), |
| 114 | DECLARE_ENCODING_NAME("KOI8-U" , KOI8_U), |
| 115 | DECLARE_ENCODING_NAME("macintosh" , macintosh), |
| 116 | DECLARE_ENCODING_NAME("windows-874" , windows_874), |
| 117 | DECLARE_ENCODING_NAME("EUC-KR" , EUC_KR), |
| 118 | DECLARE_ENCODING_NAME("windows-1250" , windows_1250), |
| 119 | DECLARE_ENCODING_NAME("windows-1251" , windows_1251), |
| 120 | DECLARE_ENCODING_NAME("windows-1253" , windows_1253), |
| 121 | DECLARE_ENCODING_NAME("windows-1254" , windows_1254), |
| 122 | DECLARE_ENCODING_NAME("windows-1255" , windows_1255), |
| 123 | DECLARE_ENCODING_NAME("windows-1256" , windows_1256), |
| 124 | DECLARE_ENCODING_NAME("windows-1257" , windows_1257), |
| 125 | DECLARE_ENCODING_NAME("windows-1258" , windows_1258), |
| 126 | DECLARE_ENCODING_NAME("x-mac-cyrillic" , x_mac_cyrillic), |
| 127 | DECLARE_ENCODING_NAME("GBK" , GBK), |
| 128 | DECLARE_ENCODING_NAME("gb18030" , gb18030), |
| 129 | DECLARE_ENCODING_NAME("Big5" , Big5), |
| 130 | DECLARE_ENCODING_NAME("EUC-JP" , EUC_JP), |
| 131 | DECLARE_ENCODING_NAME("ISO-2022-JP" , ISO_2022_JP), |
| 132 | DECLARE_ENCODING_NAME("Shift_JIS" , Shift_JIS), |
| 133 | // Encodings below are not in the standard. |
| 134 | DECLARE_ENCODING_NAME("x-mac-greek" , x_mac_greek), |
| 135 | DECLARE_ENCODING_NAME("x-mac-centraleurroman" , x_mac_centraleurroman), |
| 136 | DECLARE_ENCODING_NAME("x-mac-turkish" , x_mac_turkish), |
| 137 | DECLARE_ENCODING_NAME("Big5-HKSCS" , Big5_HKSCS), |
| 138 | DECLARE_ENCODING_NAME_NO_ALIASES("EUC-TW" ), |
| 139 | }; |
| 140 | |
| 141 | void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
| 142 | { |
| 143 | for (auto& encodingName : encodingNames) { |
| 144 | registrar(encodingName.name, encodingName.name); |
| 145 | for (size_t i = 0; i < encodingName.aliasCount; ++i) |
| 146 | registrar(encodingName.aliases[i], encodingName.name); |
| 147 | } |
| 148 | } |
| 149 | |
| 150 | void TextCodecICU::registerCodecs(TextCodecRegistrar registrar) |
| 151 | { |
| 152 | for (auto& encodingName : encodingNames) { |
| 153 | const char* name = encodingName.name; |
| 154 | |
| 155 | // These encodings currently don't have standard names, so we need to register encoders manually. |
| 156 | // http://demo.icu-project.org/icu-bin/convexp |
| 157 | if (!strcmp(name, "windows-874" )) { |
| 158 | registrar(name, [name] { |
| 159 | return std::make_unique<TextCodecICU>(name, "windows-874-2000" ); |
| 160 | }); |
| 161 | continue; |
| 162 | } |
| 163 | if (!strcmp(name, "windows-949" )) { |
| 164 | registrar(name, [name] { |
| 165 | return std::make_unique<TextCodecICU>(name, "windows-949-2000" ); |
| 166 | }); |
| 167 | continue; |
| 168 | } |
| 169 | if (!strcmp(name, "x-mac-cyrillic" )) { |
| 170 | registrar(name, [name] { |
| 171 | return std::make_unique<TextCodecICU>(name, "macos-7_3-10.2" ); |
| 172 | }); |
| 173 | continue; |
| 174 | } |
| 175 | if (!strcmp(name, "x-mac-greek" )) { |
| 176 | registrar(name, [name] { |
| 177 | return std::make_unique<TextCodecICU>(name, "macos-6_2-10.4" ); |
| 178 | }); |
| 179 | continue; |
| 180 | } |
| 181 | if (!strcmp(name, "x-mac-centraleurroman" )) { |
| 182 | registrar(name, [name] { |
| 183 | return std::make_unique<TextCodecICU>(name, "macos-29-10.2" ); |
| 184 | }); |
| 185 | continue; |
| 186 | } |
| 187 | if (!strcmp(name, "x-mac-turkish" )) { |
| 188 | registrar(name, [name] { |
| 189 | return std::make_unique<TextCodecICU>(name, "macos-35-10.2" ); |
| 190 | }); |
| 191 | continue; |
| 192 | } |
| 193 | if (!strcmp(name, "EUC-KR" )) { |
| 194 | registrar(name, [name] { |
| 195 | return std::make_unique<TextCodecICU>(name, "windows-949" ); |
| 196 | }); |
| 197 | continue; |
| 198 | } |
| 199 | |
| 200 | UErrorCode error = U_ZERO_ERROR; |
| 201 | const char* canonicalConverterName = ucnv_getCanonicalName(name, "IANA" , &error); |
| 202 | ASSERT(U_SUCCESS(error)); |
| 203 | registrar(name, [name, canonicalConverterName] { |
| 204 | return std::make_unique<TextCodecICU>(name, canonicalConverterName); |
| 205 | }); |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | TextCodecICU::TextCodecICU(const char* encoding, const char* canonicalConverterName) |
| 210 | : m_encodingName(encoding) |
| 211 | , m_canonicalConverterName(canonicalConverterName) |
| 212 | { |
| 213 | } |
| 214 | |
| 215 | TextCodecICU::~TextCodecICU() |
| 216 | { |
| 217 | if (m_converter) { |
| 218 | ucnv_reset(m_converter.get()); |
| 219 | threadGlobalData().cachedConverterICU().converter = WTFMove(m_converter); |
| 220 | } |
| 221 | } |
| 222 | |
| 223 | void TextCodecICU::createICUConverter() const |
| 224 | { |
| 225 | ASSERT(!m_converter); |
| 226 | |
| 227 | m_needsGBKFallbacks = !strcmp(m_encodingName, "GBK" ); |
| 228 | |
| 229 | auto& cachedConverter = threadGlobalData().cachedConverterICU().converter; |
| 230 | if (cachedConverter) { |
| 231 | UErrorCode error = U_ZERO_ERROR; |
| 232 | const char* cachedConverterName = ucnv_getName(cachedConverter.get(), &error); |
| 233 | if (U_SUCCESS(error) && !strcmp(m_canonicalConverterName, cachedConverterName)) { |
| 234 | m_converter = WTFMove(cachedConverter); |
| 235 | return; |
| 236 | } |
| 237 | } |
| 238 | |
| 239 | UErrorCode error = U_ZERO_ERROR; |
| 240 | m_converter = ICUConverterPtr { ucnv_open(m_canonicalConverterName, &error), ucnv_close }; |
| 241 | if (m_converter) |
| 242 | ucnv_setFallback(m_converter.get(), TRUE); |
| 243 | } |
| 244 | |
| 245 | int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& error) |
| 246 | { |
| 247 | UChar* targetStart = target; |
| 248 | error = U_ZERO_ERROR; |
| 249 | ucnv_toUnicode(m_converter.get(), &target, targetLimit, &source, sourceLimit, offsets, flush, &error); |
| 250 | return target - targetStart; |
| 251 | } |
| 252 | |
| 253 | class ErrorCallbackSetter { |
| 254 | public: |
| 255 | ErrorCallbackSetter(UConverter& converter, bool stopOnError) |
| 256 | : m_converter(converter) |
| 257 | , m_shouldStopOnEncodingErrors(stopOnError) |
| 258 | { |
| 259 | if (m_shouldStopOnEncodingErrors) { |
| 260 | UErrorCode err = U_ZERO_ERROR; |
| 261 | ucnv_setToUCallBack(&m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction, &m_savedContext, &err); |
| 262 | ASSERT(err == U_ZERO_ERROR); |
| 263 | } |
| 264 | } |
| 265 | ~ErrorCallbackSetter() |
| 266 | { |
| 267 | if (m_shouldStopOnEncodingErrors) { |
| 268 | UErrorCode err = U_ZERO_ERROR; |
| 269 | const void* oldContext; |
| 270 | UConverterToUCallback oldAction; |
| 271 | ucnv_setToUCallBack(&m_converter, m_savedAction, m_savedContext, &oldAction, &oldContext, &err); |
| 272 | ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE); |
| 273 | ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL)); |
| 274 | ASSERT(err == U_ZERO_ERROR); |
| 275 | } |
| 276 | } |
| 277 | |
| 278 | private: |
| 279 | UConverter& m_converter; |
| 280 | bool m_shouldStopOnEncodingErrors; |
| 281 | const void* m_savedContext; |
| 282 | UConverterToUCallback m_savedAction; |
| 283 | }; |
| 284 | |
| 285 | String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) |
| 286 | { |
| 287 | // Get a converter for the passed-in encoding. |
| 288 | if (!m_converter) { |
| 289 | createICUConverter(); |
| 290 | if (!m_converter) { |
| 291 | LOG_ERROR("error creating ICU encoder even though encoding was in table" ); |
| 292 | sawError = true; |
| 293 | return { }; |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | ErrorCallbackSetter callbackSetter(*m_converter, stopOnError); |
| 298 | |
| 299 | StringBuilder result; |
| 300 | |
| 301 | UChar buffer[ConversionBufferSize]; |
| 302 | UChar* bufferLimit = buffer + ConversionBufferSize; |
| 303 | const char* source = reinterpret_cast<const char*>(bytes); |
| 304 | const char* sourceLimit = source + length; |
| 305 | int32_t* offsets = NULL; |
| 306 | UErrorCode err = U_ZERO_ERROR; |
| 307 | |
| 308 | do { |
| 309 | int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush, err); |
| 310 | result.append(buffer, ucharsDecoded); |
| 311 | } while (err == U_BUFFER_OVERFLOW_ERROR); |
| 312 | |
| 313 | if (U_FAILURE(err)) { |
| 314 | // flush the converter so it can be reused, and not be bothered by this error. |
| 315 | do { |
| 316 | decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, err); |
| 317 | } while (source < sourceLimit); |
| 318 | sawError = true; |
| 319 | } |
| 320 | |
| 321 | String resultString = result.toString(); |
| 322 | |
| 323 | // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. |
| 324 | if (!strcmp(m_encodingName, "GBK" ) || equalLettersIgnoringASCIICase(m_encodingName, "gb18030" )) |
| 325 | resultString.replace(0xE5E5, ideographicSpace); |
| 326 | |
| 327 | return resultString; |
| 328 | } |
| 329 | |
| 330 | // We need to apply these fallbacks ourselves as they are not currently supported by ICU and |
| 331 | // they were provided by the Mac TEC encoding path. Needed to fix <rdar://problem/4708689>. |
| 332 | static UChar fallbackForGBK(UChar32 character) |
| 333 | { |
| 334 | switch (character) { |
| 335 | case 0x01F9: |
| 336 | return 0xE7C8; |
| 337 | case 0x1E3F: |
| 338 | return 0xE7C7; |
| 339 | case 0x22EF: |
| 340 | return 0x2026; |
| 341 | case 0x301C: |
| 342 | return 0xFF5E; |
| 343 | } |
| 344 | return 0; |
| 345 | } |
| 346 | |
| 347 | // Invalid character handler when writing escaped entities for unrepresentable |
| 348 | // characters. See the declaration of TextCodec::encode for more. |
| 349 | static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
| 350 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
| 351 | { |
| 352 | if (reason == UCNV_UNASSIGNED) { |
| 353 | *error = U_ZERO_ERROR; |
| 354 | UnencodableReplacementArray entity; |
| 355 | int entityLen = TextCodec::getUnencodableReplacement(codePoint, UnencodableHandling::URLEncodedEntities, entity); |
| 356 | ucnv_cbFromUWriteBytes(fromUArgs, entity.data(), entityLen, 0, error); |
| 357 | } else |
| 358 | UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
| 359 | } |
| 360 | |
| 361 | // Substitutes special GBK characters, escaping all other unassigned entities. |
| 362 | static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
| 363 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
| 364 | { |
| 365 | UChar outChar; |
| 366 | if (reason == UCNV_UNASSIGNED && (outChar = fallbackForGBK(codePoint))) { |
| 367 | const UChar* source = &outChar; |
| 368 | *error = U_ZERO_ERROR; |
| 369 | ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, error); |
| 370 | return; |
| 371 | } |
| 372 | UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
| 373 | } |
| 374 | |
| 375 | // Combines both gbkUrlEscapedEntityCallback and GBK character substitution. |
| 376 | static void gbkUrlEscapedEntityCallack(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
| 377 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
| 378 | { |
| 379 | if (reason == UCNV_UNASSIGNED) { |
| 380 | if (UChar outChar = fallbackForGBK(codePoint)) { |
| 381 | const UChar* source = &outChar; |
| 382 | *error = U_ZERO_ERROR; |
| 383 | ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, error); |
| 384 | return; |
| 385 | } |
| 386 | urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
| 387 | return; |
| 388 | } |
| 389 | UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
| 390 | } |
| 391 | |
| 392 | static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
| 393 | UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error) |
| 394 | { |
| 395 | UChar outChar; |
| 396 | if (reason == UCNV_UNASSIGNED && (outChar = fallbackForGBK(codePoint))) { |
| 397 | const UChar* source = &outChar; |
| 398 | *error = U_ZERO_ERROR; |
| 399 | ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, error); |
| 400 | return; |
| 401 | } |
| 402 | UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, error); |
| 403 | } |
| 404 | |
| 405 | Vector<uint8_t> TextCodecICU::encode(StringView string, UnencodableHandling handling) |
| 406 | { |
| 407 | if (string.isEmpty()) |
| 408 | return { }; |
| 409 | |
| 410 | if (!m_converter) { |
| 411 | createICUConverter(); |
| 412 | if (!m_converter) |
| 413 | return { }; |
| 414 | } |
| 415 | |
| 416 | // FIXME: We should see if there is "force ASCII range" mode in ICU; |
| 417 | // until then, we change the backslash into a yen sign. |
| 418 | // Encoding will change the yen sign back into a backslash. |
| 419 | String copy; |
| 420 | if (shouldShowBackslashAsCurrencySymbolIn(m_encodingName)) { |
| 421 | copy = string.toStringWithoutCopying(); |
| 422 | copy.replace('\\', yenSign); |
| 423 | string = copy; |
| 424 | } |
| 425 | |
| 426 | UErrorCode error; |
| 427 | switch (handling) { |
| 428 | case UnencodableHandling::QuestionMarks: |
| 429 | error = U_ZERO_ERROR; |
| 430 | ucnv_setSubstChars(m_converter.get(), "?" , 1, &error); |
| 431 | if (U_FAILURE(error)) |
| 432 | return { }; |
| 433 | error = U_ZERO_ERROR; |
| 434 | ucnv_setFromUCallBack(m_converter.get(), m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &error); |
| 435 | if (U_FAILURE(error)) |
| 436 | return { }; |
| 437 | break; |
| 438 | case UnencodableHandling::Entities: |
| 439 | error = U_ZERO_ERROR; |
| 440 | ucnv_setFromUCallBack(m_converter.get(), m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &error); |
| 441 | if (U_FAILURE(error)) |
| 442 | return { }; |
| 443 | break; |
| 444 | case UnencodableHandling::URLEncodedEntities: |
| 445 | error = U_ZERO_ERROR; |
| 446 | ucnv_setFromUCallBack(m_converter.get(), m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &error); |
| 447 | if (U_FAILURE(error)) |
| 448 | return { }; |
| 449 | break; |
| 450 | } |
| 451 | |
| 452 | auto upconvertedCharacters = string.upconvertedCharacters(); |
| 453 | auto* source = upconvertedCharacters.get(); |
| 454 | auto* sourceLimit = source + string.length(); |
| 455 | |
| 456 | Vector<uint8_t> result; |
| 457 | do { |
| 458 | char buffer[ConversionBufferSize]; |
| 459 | char* target = buffer; |
| 460 | char* targetLimit = target + ConversionBufferSize; |
| 461 | error = U_ZERO_ERROR; |
| 462 | ucnv_fromUnicode(m_converter.get(), &target, targetLimit, &source, sourceLimit, 0, true, &error); |
| 463 | result.append(reinterpret_cast<uint8_t*>(buffer), target - buffer); |
| 464 | } while (error == U_BUFFER_OVERFLOW_ERROR); |
| 465 | return result; |
| 466 | } |
| 467 | |
| 468 | } // namespace WebCore |
| 469 | |