DecodeEscapeSequences.h source code [webkit/Source/WebCore/platform/text/DecodeEscapeSequences.h]

1	/*
2	* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
3	* Copyright (c) 2012 Google, inc. All Rights Reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	* 3. Neither the name of Google Inc. nor the names of its
14	* contributors may be used to endorse or promote products derived from
15	* this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
18	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
21	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28	*/
29
30	#pragma once
31
32	#include "TextEncoding.h"
33	#include <wtf/ASCIICType.h>
34	#include <wtf/Assertions.h>
35	#include <wtf/text/StringBuilder.h>
36
37	namespace WebCore {
38
39	// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
40	struct Unicode16BitEscapeSequence {
41	enum { sequenceSize = `6` }; // e.g. %u26C4
42	static size_t findInString(StringView string, size_t startPosition) { return string.find(StringView ("%u"), startPosition); }
43	static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
44	{
45	size_t runEnd = startPosition;
46	while (endPosition - runEnd >= sequenceSize && string [runEnd] == `'%'` && string [runEnd + `1`] == `'u'`
47	&& isASCIIHexDigit(string [runEnd + `2`]) && isASCIIHexDigit(string [runEnd + `3`])
48	&& isASCIIHexDigit(string [runEnd + `4`]) && isASCIIHexDigit(string [runEnd + `5`])) {
49	runEnd += sequenceSize;
50	}
51	return runEnd;
52	}
53	static String decodeRun(StringView run, const TextEncoding&)
54	{
55	// Each %u-escape sequence represents a UTF-16 code unit.
56	// See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
57	// For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
58	// without any intervening characters, so decode the run without additional checks.
59	auto numberOfSequences = run.length() / sequenceSize;
60	StringBuilder builder;
61	builder.reserveCapacity(numberOfSequences);
62	while (numberOfSequences--) {
63	UChar codeUnit = (toASCIIHexValue(run [`2`]) << `12`) \| (toASCIIHexValue(run [`3`]) << `8`) \| (toASCIIHexValue(run [`4`]) << `4`) \| toASCIIHexValue(run [`5`]);
64	builder.append(codeUnit);
65	run = run.substring(sequenceSize);
66	}
67	return builder.toString();
68	}
69	};
70
71	struct URLEscapeSequence {
72	enum { sequenceSize = `3` }; // e.g. %41
73	static size_t findInString(StringView string, size_t startPosition) { return string.find(`'%'`, startPosition); }
74	static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
75	{
76	// Make the simplifying assumption that supported encodings may have up to two unescaped characters
77	// in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
78	// decoder as part of the run. In other words, we end the run at the first value outside of the
79	// 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
80	// escape sequence.
81	size_t runEnd = startPosition;
82	int numberOfTrailingCharacters = `0`;
83	while (runEnd < endPosition) {
84	if (string [runEnd] == `'%'`) {
85	if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string [runEnd + `1`]) && isASCIIHexDigit(string [runEnd + `2`])) {
86	runEnd += sequenceSize;
87	numberOfTrailingCharacters = `0`;
88	} else
89	break;
90	} else if (string [runEnd] >= `0x40` && string [runEnd] <= `0x7F` && numberOfTrailingCharacters < `2`) {
91	runEnd += `1`;
92	numberOfTrailingCharacters += `1`;
93	} else
94	break;
95	}
96	return runEnd;
97	}
98
99	static Vector<char, `512`> decodeRun(StringView run)
100	{
101	// For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
102	// a valid escape sequence, but there may be characters between the sequences.
103	Vector<char, `512`> buffer;
104	buffer.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
105	char* p = buffer.data();
106	while (!run.isEmpty()) {
107	if (run [`0`] == `'%'`) {
108	*p++ = (toASCIIHexValue(run [`1`]) << `4`) \| toASCIIHexValue(run [`2`]);
109	run = run.substring(sequenceSize);
110	} else {
111	*p++ = run [`0`];
112	run = run.substring(`1`);
113	}
114	}
115	ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
116	buffer.shrink(p - buffer.data());
117	return buffer;
118	}
119
120	static String decodeRun(StringView run, const TextEncoding& encoding)
121	{
122	auto buffer = decodeRun(run);
123	if (!encoding.isValid())
124	return UTF8Encoding().decode(buffer.data(), buffer.size());
125	return encoding.decode(buffer.data(), buffer.size());
126	}
127	};
128
129	template<typename EscapeSequence>
130	String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
131	{
132	StringBuilder result;
133	size_t length = string.length();
134	size_t decodedPosition = `0`;
135	size_t searchPosition = `0`;
136	size_t encodedRunPosition;
137	while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != notFound) {
138	size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
139	searchPosition = encodedRunEnd;
140	if (encodedRunEnd == encodedRunPosition) {
141	++searchPosition;
142	continue;
143	}
144
145	String decoded = EscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition), encoding);
146	if (decoded.isEmpty())
147	continue;
148
149	result.append(string.substring(decodedPosition, encodedRunPosition - decodedPosition));
150	result.append(decoded);
151	decodedPosition = encodedRunEnd;
152	}
153	result.append(string.substring(decodedPosition, length - decodedPosition));
154	return result.toString();
155	}
156
157	inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string, const TextEncoding& encoding)
158	{
159	ASSERT(encoding.isValid());
160
161	Vector<uint8_t> result;
162	size_t decodedPosition = `0`;
163	size_t searchPosition = `0`;
164	while (true) {
165	size_t encodedRunPosition = URLEscapeSequence::findInString(string, searchPosition);
166	size_t encodedRunEnd = `0`;
167	if (encodedRunPosition != notFound) {
168	encodedRunEnd = URLEscapeSequence::findEndOfRun(string, encodedRunPosition, string.length());
169	searchPosition = encodedRunEnd;
170	if (encodedRunEnd == encodedRunPosition) {
171	++searchPosition;
172	continue;
173	}
174	}
175
176	// Strings are encoded as requested.
177	result.appendVector(encoding.encode(string.substring(decodedPosition, encodedRunPosition - decodedPosition), UnencodableHandling::URLEncodedEntities));
178
179	if (encodedRunPosition == notFound)
180	return result;
181
182	// Bytes go through as-is.
183	auto decodedEscapeSequence = URLEscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition));
184	ASSERT(!decodedEscapeSequence.isEmpty());
185	result.appendVector(decodedEscapeSequence);
186
187	decodedPosition = encodedRunEnd;
188	}
189	}
190
191	} // namespace WebCore
192
193

Browse the source code of webkit/Source/WebCore/platform/text/DecodeEscapeSequences.h