TextCodecLatin1.cpp source code [webkit/Source/WebCore/platform/text/TextCodecLatin1.cpp]

1	/*
2	* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include "TextCodecLatin1.h"
28
29	#include "TextCodecASCIIFastPath.h"
30	#include <array>
31	#include <wtf/text/CString.h>
32	#include <wtf/text/WTFString.h>
33
34	namespace WebCore {
35
36	static const UChar latin1ConversionTable[`256`] = {
37	`0x0000`, `0x0001`, `0x0002`, `0x0003`, `0x0004`, `0x0005`, `0x0006`, `0x0007`, // 00-07
38	`0x0008`, `0x0009`, `0x000A`, `0x000B`, `0x000C`, `0x000D`, `0x000E`, `0x000F`, // 08-0F
39	`0x0010`, `0x0011`, `0x0012`, `0x0013`, `0x0014`, `0x0015`, `0x0016`, `0x0017`, // 10-17
40	`0x0018`, `0x0019`, `0x001A`, `0x001B`, `0x001C`, `0x001D`, `0x001E`, `0x001F`, // 18-1F
41	`0x0020`, `0x0021`, `0x0022`, `0x0023`, `0x0024`, `0x0025`, `0x0026`, `0x0027`, // 20-27
42	`0x0028`, `0x0029`, `0x002A`, `0x002B`, `0x002C`, `0x002D`, `0x002E`, `0x002F`, // 28-2F
43	`0x0030`, `0x0031`, `0x0032`, `0x0033`, `0x0034`, `0x0035`, `0x0036`, `0x0037`, // 30-37
44	`0x0038`, `0x0039`, `0x003A`, `0x003B`, `0x003C`, `0x003D`, `0x003E`, `0x003F`, // 38-3F
45	`0x0040`, `0x0041`, `0x0042`, `0x0043`, `0x0044`, `0x0045`, `0x0046`, `0x0047`, // 40-47
46	`0x0048`, `0x0049`, `0x004A`, `0x004B`, `0x004C`, `0x004D`, `0x004E`, `0x004F`, // 48-4F
47	`0x0050`, `0x0051`, `0x0052`, `0x0053`, `0x0054`, `0x0055`, `0x0056`, `0x0057`, // 50-57
48	`0x0058`, `0x0059`, `0x005A`, `0x005B`, `0x005C`, `0x005D`, `0x005E`, `0x005F`, // 58-5F
49	`0x0060`, `0x0061`, `0x0062`, `0x0063`, `0x0064`, `0x0065`, `0x0066`, `0x0067`, // 60-67
50	`0x0068`, `0x0069`, `0x006A`, `0x006B`, `0x006C`, `0x006D`, `0x006E`, `0x006F`, // 68-6F
51	`0x0070`, `0x0071`, `0x0072`, `0x0073`, `0x0074`, `0x0075`, `0x0076`, `0x0077`, // 70-77
52	`0x0078`, `0x0079`, `0x007A`, `0x007B`, `0x007C`, `0x007D`, `0x007E`, `0x007F`, // 78-7F
53	`0x20AC`, `0x0081`, `0x201A`, `0x0192`, `0x201E`, `0x2026`, `0x2020`, `0x2021`, // 80-87
54	`0x02C6`, `0x2030`, `0x0160`, `0x2039`, `0x0152`, `0x008D`, `0x017D`, `0x008F`, // 88-8F
55	`0x0090`, `0x2018`, `0x2019`, `0x201C`, `0x201D`, `0x2022`, `0x2013`, `0x2014`, // 90-97
56	`0x02DC`, `0x2122`, `0x0161`, `0x203A`, `0x0153`, `0x009D`, `0x017E`, `0x0178`, // 98-9F
57	`0x00A0`, `0x00A1`, `0x00A2`, `0x00A3`, `0x00A4`, `0x00A5`, `0x00A6`, `0x00A7`, // A0-A7
58	`0x00A8`, `0x00A9`, `0x00AA`, `0x00AB`, `0x00AC`, `0x00AD`, `0x00AE`, `0x00AF`, // A8-AF
59	`0x00B0`, `0x00B1`, `0x00B2`, `0x00B3`, `0x00B4`, `0x00B5`, `0x00B6`, `0x00B7`, // B0-B7
60	`0x00B8`, `0x00B9`, `0x00BA`, `0x00BB`, `0x00BC`, `0x00BD`, `0x00BE`, `0x00BF`, // B8-BF
61	`0x00C0`, `0x00C1`, `0x00C2`, `0x00C3`, `0x00C4`, `0x00C5`, `0x00C6`, `0x00C7`, // C0-C7
62	`0x00C8`, `0x00C9`, `0x00CA`, `0x00CB`, `0x00CC`, `0x00CD`, `0x00CE`, `0x00CF`, // C8-CF
63	`0x00D0`, `0x00D1`, `0x00D2`, `0x00D3`, `0x00D4`, `0x00D5`, `0x00D6`, `0x00D7`, // D0-D7
64	`0x00D8`, `0x00D9`, `0x00DA`, `0x00DB`, `0x00DC`, `0x00DD`, `0x00DE`, `0x00DF`, // D8-DF
65	`0x00E0`, `0x00E1`, `0x00E2`, `0x00E3`, `0x00E4`, `0x00E5`, `0x00E6`, `0x00E7`, // E0-E7
66	`0x00E8`, `0x00E9`, `0x00EA`, `0x00EB`, `0x00EC`, `0x00ED`, `0x00EE`, `0x00EF`, // E8-EF
67	`0x00F0`, `0x00F1`, `0x00F2`, `0x00F3`, `0x00F4`, `0x00F5`, `0x00F6`, `0x00F7`, // F0-F7
68	`0x00F8`, `0x00F9`, `0x00FA`, `0x00FB`, `0x00FC`, `0x00FD`, `0x00FE`, `0x00FF` // F8-FF
69	};
70
71	void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar)
72	{
73	// From https://encoding.spec.whatwg.org.
74	registrar("windows-1252", "windows-1252");
75	registrar("ansi_x3.4-1968", "windows-1252");
76	registrar("ascii", "windows-1252");
77	registrar("cp1252", "windows-1252");
78	registrar("cp819", "windows-1252");
79	registrar("csisolatin1", "windows-1252");
80	registrar("ibm819", "windows-1252");
81	registrar("iso-8859-1", "windows-1252");
82	registrar("iso-ir-100", "windows-1252");
83	registrar("iso8859-1", "windows-1252");
84	registrar("iso88591", "windows-1252");
85	registrar("iso_8859-1", "windows-1252");
86	registrar("iso_8859-1:1987", "windows-1252");
87	registrar("l1", "windows-1252");
88	registrar("latin1", "windows-1252");
89	registrar("us-ascii", "windows-1252");
90	registrar("x-cp1252", "windows-1252");
91	}
92
93	void TextCodecLatin1::registerCodecs(TextCodecRegistrar registrar)
94	{
95	registrar("windows-1252", [] {
96	return std::make_unique<TextCodecLatin1>();
97	});
98	}
99
100	String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
101	{
102	LChar* characters;
103	if (!length)
104	return emptyString();
105	String result = String::createUninitialized(length, characters);
106
107	const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
108	const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
109	const uint8_t* alignedEnd = WTF::alignToMachineWord(end);
110	LChar* destination = characters;
111
112	while (source < end) {
113	if (isASCII(*source)) {
114	// Fast path for ASCII. Most Latin-1 text will be ASCII.
115	if (WTF::isAlignedToMachineWord(source)) {
116	while (source < alignedEnd) {
117	auto chunk = reinterpret_cast_ptr<const* WTF::MachineWord*>(source);
118
119	if (!WTF::isAllASCII<LChar>(chunk))
120	goto useLookupTable;
121
122	copyASCIIMachineWord(destination, source);
123	source += sizeof(WTF::MachineWord);
124	destination += sizeof(WTF::MachineWord);
125	}
126
127	if (source == end)
128	break;
129
130	// source may not be ASCII anymore if source moves inside the loop of the fast code path*
131	if (!isASCII(*source))
132	goto useLookupTable;
133	}
134	destination = source;
135	} else {
136	useLookupTable:
137	if (latin1ConversionTable[*source] > `0xff`)
138	goto upConvertTo16Bit;
139
140	destination = latin1ConversionTable[source];
141	}
142
143	++source;
144	++destination;
145	}
146
147	return result;
148
149	upConvertTo16Bit:
150	UChar* characters16;
151	String result16 = String::createUninitialized(length, characters16);
152
153	UChar* destination16 = characters16;
154
155	// Zero extend and copy already processed 8 bit data
156	LChar* ptr8 = characters;
157	LChar* endPtr8 = destination;
158
159	while (ptr8 < endPtr8)
160	destination16++ = ptr8++;
161
162	// Handle the character that triggered the 16 bit path
163	destination16 = latin1ConversionTable[source];
164	++source;
165	++destination16;
166
167	while (source < end) {
168	if (isASCII(*source)) {
169	// Fast path for ASCII. Most Latin-1 text will be ASCII.
170	if (WTF::isAlignedToMachineWord(source)) {
171	while (source < alignedEnd) {
172	auto chunk = reinterpret_cast_ptr<const* WTF::MachineWord*>(source);
173
174	if (!WTF::isAllASCII<LChar>(chunk))
175	goto useLookupTable16;
176
177	copyASCIIMachineWord(destination16, source);
178	source += sizeof(WTF::MachineWord);
179	destination16 += sizeof(WTF::MachineWord);
180	}
181
182	if (source == end)
183	break;
184
185	// source may not be ASCII anymore if source moves inside the loop of the fast code path*
186	if (!isASCII(*source))
187	goto useLookupTable16;
188	}
189	destination16 = source;
190	} else {
191	useLookupTable16:
192	destination16 = latin1ConversionTable[source];
193	}
194
195	++source;
196	++destination16;
197	}
198
199	return result16;
200	}
201
202	static Vector<uint8_t> encodeComplexWindowsLatin1(StringView string, UnencodableHandling handling)
203	{
204	Vector<uint8_t> result;
205
206	for (auto character : string.codePoints()) {
207	uint8_t b = character;
208	// Do an efficient check to detect characters other than 00-7F and A0-FF.
209	if (b != character \|\| (character & `0xE0`) == `0x80`) {
210	// Look for a way to encode this with Windows Latin-1.
211	for (b = `0x80`; b < `0xA0`; ++b) {
212	if (latin1ConversionTable[b] == character)
213	goto gotByte;
214	}
215	// No way to encode this character with Windows Latin-1.
216	UnencodableReplacementArray replacement;
217	int replacementLength = TextCodec::getUnencodableReplacement(character, handling, replacement);
218	result.append(replacement.data(), replacementLength);
219	continue;
220	}
221	gotByte:
222	result.append(b);
223	}
224
225	return result;
226	}
227
228	Vector<uint8_t> TextCodecLatin1::encode(StringView string, UnencodableHandling handling)
229	{
230	{
231	Vector<uint8_t> result(string.length());
232	auto* bytes = result.data();
233
234	// Convert and simultaneously do a check to see if it's all ASCII.
235	UChar ored = `0`;
236	for (auto character : string.codeUnits()) {
237	*bytes++ = character;
238	ored \|= character;
239	}
240
241	if (!(ored & `0xFF80`))
242	return result;
243	}
244
245	// If it wasn't all ASCII, call the function that handles more-complex cases.
246	return encodeComplexWindowsLatin1(string, handling);
247	}
248
249	} // namespace WebCore
250

Browse the source code of webkit/Source/WebCore/platform/text/TextCodecLatin1.cpp