UTF8Conversion.cpp source code [webkit/Source/WTF/wtf/unicode/UTF8Conversion.cpp]

1	/*
2	* Copyright (C) 2007-2019 Apple Inc. All rights reserved.
3	* Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include <wtf/unicode/UTF8Conversion.h>
29
30	#include <wtf/ASCIICType.h>
31	#include <wtf/text/StringHasher.h>
32	#include <wtf/unicode/CharacterNames.h>
33
34	namespace WTF {
35	namespace Unicode {
36
37	bool convertLatin1ToUTF8(const LChar** sourceStart, const LChar* sourceEnd, char** targetStart, char* targetEnd)
38	{
39	const LChar* source;
40	char* target = *targetStart;
41	int i = `0`;
42	for (source = *sourceStart; source < sourceEnd; ++source) {
43	UBool sawError = false;
44	// Work around bug in either Windows compiler or old version of ICU, where passing a uint8_t to
45	// U8_APPEND warns, by converting from uint8_t to a wider type.
46	UChar32 character = *source;
47	U8_APPEND(reinterpret_cast<uint8_t>(target), i, targetEnd - targetStart, character, sawError);
48	if (sawError)
49	return false;
50	}
51	*sourceStart = source;
52	*targetStart = target + i;
53	return true;
54	}
55
56	ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, const UChar* sourceEnd, char** targetStart, char* targetEnd, bool strict)
57	{
58	ConversionResult result = ConversionOK;
59	const UChar* source = *sourceStart;
60	char* target = *targetStart;
61	UBool sawError = false;
62	int i = `0`;
63	while (source < sourceEnd) {
64	UChar32 ch;
65	int j = `0`;
66	U16_NEXT(source, j, sourceEnd - source, ch);
67	if (U_IS_SURROGATE(ch)) {
68	if (source + j == sourceEnd && U_IS_SURROGATE_LEAD(ch)) {
69	result = SourceExhausted;
70	break;
71	}
72	if (strict) {
73	result = SourceIllegal;
74	break;
75	}
76	ch = replacementCharacter;
77	}
78	U8_APPEND(reinterpret_cast<uint8_t*>(target), i, targetEnd - target, ch, sawError);
79	if (sawError) {
80	result = TargetExhausted;
81	break;
82	}
83	source += j;
84	}
85	*sourceStart = source;
86	*targetStart = target + i;
87	return result;
88	}
89
90	bool convertUTF8ToUTF16(const char* source, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII)
91	{
92	RELEASE_ASSERT(sourceEnd - source <= std::numeric_limits<int>::max());
93	UBool error = false;
94	UChar* target = *targetStart;
95	UChar32 orAllData = `0`;
96	int targetOffset = `0`;
97	for (int sourceOffset = `0`; sourceOffset < sourceEnd - source; ) {
98	UChar32 character;
99	U8_NEXT(reinterpret_cast<const uint8_t*>(source), sourceOffset, sourceEnd - source, character);
100	if (character < `0`)
101	return false;
102	U16_APPEND(target, targetOffset, targetEnd - target, character, error);
103	if (error)
104	return false;
105	orAllData \|= character;
106	}
107	*targetStart = target + targetOffset;
108	if (sourceAllASCII)
109	*sourceAllASCII = isASCII(orAllData);
110	return true;
111	}
112
113	unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
114	{
115	StringHasher stringHasher;
116	utf16Length = `0`;
117
118	int inputOffset = `0`;
119	int inputLength = dataEnd - data;
120	while (inputOffset < inputLength) {
121	UChar32 character;
122	U8_NEXT(reinterpret_cast<const uint8_t*>(data), inputOffset, inputLength, character);
123	if (character < `0`)
124	return `0`;
125
126	if (U_IS_BMP(character)) {
127	ASSERT(!U_IS_SURROGATE(character));
128	stringHasher.addCharacter(character);
129	utf16Length++;
130	} else {
131	ASSERT(U_IS_SUPPLEMENTARY(character));
132	stringHasher.addCharacters(U16_LEAD(character), U16_TRAIL(character));
133	utf16Length += `2`;
134	}
135	}
136
137	dataLength = inputOffset;
138	return stringHasher.hashWithTop8BitsMasked();
139	}
140
141	bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd)
142	{
143	while (b < bEnd) {
144	int offset = `0`;
145	UChar32 character;
146	U8_NEXT(reinterpret_cast<const uint8_t*>(b), offset, bEnd - b, character);
147	if (character < `0`)
148	return false;
149	b += offset;
150
151	if (U_IS_BMP(character)) {
152	ASSERT(!U_IS_SURROGATE(character));
153	if (*a++ != character)
154	return false;
155	} else {
156	ASSERT(U_IS_SUPPLEMENTARY(character));
157	if (*a++ != U16_LEAD(character))
158	return false;
159	if (*a++ != U16_TRAIL(character))
160	return false;
161	}
162	}
163
164	return true;
165	}
166
167	bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
168	{
169	while (b < bEnd) {
170	if (isASCII(a) \|\| isASCII(b)) {
171	if (a++ != b++)
172	return false;
173	continue;
174	}
175
176	if (b + `1` == bEnd)
177	return false;
178
179	if ((b[`0`] & `0xE0`) != `0xC0` \|\| (b[`1`] & `0xC0`) != `0x80`)
180	return false;
181
182	LChar character = ((b[`0`] & `0x1F`) << `6`) \| (b[`1`] & `0x3F`);
183
184	b += `2`;
185
186	if (*a++ != character)
187	return false;
188	}
189
190	return true;
191	}
192
193	} // namespace Unicode
194	} // namespace WTF
195

Browse the source code of webkit/Source/WTF/wtf/unicode/UTF8Conversion.cpp