1/*
2 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
3 * Copyright (c) 2012 Google, inc. All Rights Reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Google Inc. nor the names of its
14 * contributors may be used to endorse or promote products derived from
15 * this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#pragma once
31
32#include "TextEncoding.h"
33#include <wtf/ASCIICType.h>
34#include <wtf/Assertions.h>
35#include <wtf/text/StringBuilder.h>
36
37namespace WebCore {
38
39// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
40struct Unicode16BitEscapeSequence {
41 enum { sequenceSize = 6 }; // e.g. %u26C4
42 static size_t findInString(StringView string, size_t startPosition) { return string.find(StringView("%u"), startPosition); }
43 static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
44 {
45 size_t runEnd = startPosition;
46 while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
47 && isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
48 && isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
49 runEnd += sequenceSize;
50 }
51 return runEnd;
52 }
53 static String decodeRun(StringView run, const TextEncoding&)
54 {
55 // Each %u-escape sequence represents a UTF-16 code unit.
56 // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
57 // For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
58 // without any intervening characters, so decode the run without additional checks.
59 auto numberOfSequences = run.length() / sequenceSize;
60 StringBuilder builder;
61 builder.reserveCapacity(numberOfSequences);
62 while (numberOfSequences--) {
63 UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
64 builder.append(codeUnit);
65 run = run.substring(sequenceSize);
66 }
67 return builder.toString();
68 }
69};
70
71struct URLEscapeSequence {
72 enum { sequenceSize = 3 }; // e.g. %41
73 static size_t findInString(StringView string, size_t startPosition) { return string.find('%', startPosition); }
74 static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
75 {
76 // Make the simplifying assumption that supported encodings may have up to two unescaped characters
77 // in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
78 // decoder as part of the run. In other words, we end the run at the first value outside of the
79 // 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
80 // escape sequence.
81 size_t runEnd = startPosition;
82 int numberOfTrailingCharacters = 0;
83 while (runEnd < endPosition) {
84 if (string[runEnd] == '%') {
85 if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
86 runEnd += sequenceSize;
87 numberOfTrailingCharacters = 0;
88 } else
89 break;
90 } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
91 runEnd += 1;
92 numberOfTrailingCharacters += 1;
93 } else
94 break;
95 }
96 return runEnd;
97 }
98
99 static Vector<char, 512> decodeRun(StringView run)
100 {
101 // For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
102 // a valid escape sequence, but there may be characters between the sequences.
103 Vector<char, 512> buffer;
104 buffer.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
105 char* p = buffer.data();
106 while (!run.isEmpty()) {
107 if (run[0] == '%') {
108 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
109 run = run.substring(sequenceSize);
110 } else {
111 *p++ = run[0];
112 run = run.substring(1);
113 }
114 }
115 ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
116 buffer.shrink(p - buffer.data());
117 return buffer;
118 }
119
120 static String decodeRun(StringView run, const TextEncoding& encoding)
121 {
122 auto buffer = decodeRun(run);
123 if (!encoding.isValid())
124 return UTF8Encoding().decode(buffer.data(), buffer.size());
125 return encoding.decode(buffer.data(), buffer.size());
126 }
127};
128
129template<typename EscapeSequence>
130String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
131{
132 StringBuilder result;
133 size_t length = string.length();
134 size_t decodedPosition = 0;
135 size_t searchPosition = 0;
136 size_t encodedRunPosition;
137 while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != notFound) {
138 size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
139 searchPosition = encodedRunEnd;
140 if (encodedRunEnd == encodedRunPosition) {
141 ++searchPosition;
142 continue;
143 }
144
145 String decoded = EscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition), encoding);
146 if (decoded.isEmpty())
147 continue;
148
149 result.append(string.substring(decodedPosition, encodedRunPosition - decodedPosition));
150 result.append(decoded);
151 decodedPosition = encodedRunEnd;
152 }
153 result.append(string.substring(decodedPosition, length - decodedPosition));
154 return result.toString();
155}
156
157inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string, const TextEncoding& encoding)
158{
159 ASSERT(encoding.isValid());
160
161 Vector<uint8_t> result;
162 size_t decodedPosition = 0;
163 size_t searchPosition = 0;
164 while (true) {
165 size_t encodedRunPosition = URLEscapeSequence::findInString(string, searchPosition);
166 size_t encodedRunEnd = 0;
167 if (encodedRunPosition != notFound) {
168 encodedRunEnd = URLEscapeSequence::findEndOfRun(string, encodedRunPosition, string.length());
169 searchPosition = encodedRunEnd;
170 if (encodedRunEnd == encodedRunPosition) {
171 ++searchPosition;
172 continue;
173 }
174 }
175
176 // Strings are encoded as requested.
177 result.appendVector(encoding.encode(string.substring(decodedPosition, encodedRunPosition - decodedPosition), UnencodableHandling::URLEncodedEntities));
178
179 if (encodedRunPosition == notFound)
180 return result;
181
182 // Bytes go through as-is.
183 auto decodedEscapeSequence = URLEscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition));
184 ASSERT(!decodedEscapeSequence.isEmpty());
185 result.appendVector(decodedEscapeSequence);
186
187 decodedPosition = encodedRunEnd;
188 }
189}
190
191} // namespace WebCore
192
193