URLParser.cpp source code [webkit/Source/WTF/wtf/URLParser.cpp]

1	/*
2	* Copyright (C) 2016-2018 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23	* THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include <wtf/URLParser.h>
28
29	#include <array>
30	#include <mutex>
31	#include <unicode/uidna.h>
32	#include <unicode/utf8.h>
33	#include <unicode/utypes.h>
34
35	namespace WTF {
36
// Compile-time switch for parser tracing. When non-zero, URL_PARSER_LOG
// forwards its arguments to WTFLogAlways.
#define URL_PARSER_DEBUGGING 0

#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) WTFLogAlways(__VA_ARGS__)
#else
// Tracing disabled: the macro expands to nothing.
#define URL_PARSER_LOG(...)
#endif
44
45	template<typename CharacterType>
46	class CodePointIterator {
47	public:
48	ALWAYS_INLINE CodePointIterator() { }
49	ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50	: m_begin(begin)
51	, m_end(end)
52	{
53	}
54
55	ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56	: CodePointIterator(begin.m_begin, end.m_begin)
57	{
58	ASSERT(end.m_begin >= begin.m_begin);
59	}
60
61	ALWAYS_INLINE UChar32 operator() const*;
62	ALWAYS_INLINE CodePointIterator& operator++();
63
64	ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65	{
66	return m_begin == other.m_begin
67	&& m_end == other.m_end;
68	}
69	ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70
71	ALWAYS_INLINE bool atEnd() const
72	{
73	ASSERT(m_begin <= m_end);
74	return m_begin >= m_end;
75	}
76
77	ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
78	{
79	ASSERT(m_begin >= reference);
80	return m_begin - reference;
81	}
82
83	ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
84	{
85	return codeUnitsSince(other.m_begin);
86	}
87
88	private:
89	const CharacterType* m_begin { nullptr };
90	const CharacterType* m_end { nullptr };
91	};
92
93	template<>
94	ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator() const*
95	{
96	ASSERT(!atEnd());
97	return *m_begin;
98	}
99
100	template<>
101	ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
102	{
103	m_begin++;
104	return *this;
105	}
106
107	template<>
108	ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator() const*
109	{
110	ASSERT(!atEnd());
111	UChar32 c;
112	U16_GET(m_begin, `0`, `0`, m_end - m_begin, c);
113	return c;
114	}
115
116	template<>
117	ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
118	{
119	unsigned i = `0`;
120	size_t length = m_end - m_begin;
121	U16_FWD_1(m_begin, i, length);
122	m_begin += i;
123	return *this;
124	}
125
126	ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
127	{
128	if (U_IS_BMP(codePoint)) {
129	destination.append(static_cast<UChar>(codePoint));
130	return;
131	}
132	destination.reserveCapacity(destination.size() + `2`);
133	destination.uncheckedAppend(U16_LEAD(codePoint));
134	destination.uncheckedAppend(U16_TRAIL(codePoint));
135	}
136
// Bit flags combined per byte in characterClassTable.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
145
146	static const uint8_t characterClassTable[`256`] = {
147	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // 0x0
148	UserInfo \| Default \| QueryPercent, // 0x1
149	UserInfo \| Default \| QueryPercent, // 0x2
150	UserInfo \| Default \| QueryPercent, // 0x3
151	UserInfo \| Default \| QueryPercent, // 0x4
152	UserInfo \| Default \| QueryPercent, // 0x5
153	UserInfo \| Default \| QueryPercent, // 0x6
154	UserInfo \| Default \| QueryPercent, // 0x7
155	UserInfo \| Default \| QueryPercent, // 0x8
156	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // 0x9
157	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // 0xA
158	UserInfo \| Default \| QueryPercent, // 0xB
159	UserInfo \| Default \| QueryPercent, // 0xC
160	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // 0xD
161	UserInfo \| Default \| QueryPercent, // 0xE
162	UserInfo \| Default \| QueryPercent, // 0xF
163	UserInfo \| Default \| QueryPercent, // 0x10
164	UserInfo \| Default \| QueryPercent, // 0x11
165	UserInfo \| Default \| QueryPercent, // 0x12
166	UserInfo \| Default \| QueryPercent, // 0x13
167	UserInfo \| Default \| QueryPercent, // 0x14
168	UserInfo \| Default \| QueryPercent, // 0x15
169	UserInfo \| Default \| QueryPercent, // 0x16
170	UserInfo \| Default \| QueryPercent, // 0x17
171	UserInfo \| Default \| QueryPercent, // 0x18
172	UserInfo \| Default \| QueryPercent, // 0x19
173	UserInfo \| Default \| QueryPercent, // 0x1A
174	UserInfo \| Default \| QueryPercent, // 0x1B
175	UserInfo \| Default \| QueryPercent, // 0x1C
176	UserInfo \| Default \| QueryPercent, // 0x1D
177	UserInfo \| Default \| QueryPercent, // 0x1E
178	UserInfo \| Default \| QueryPercent, // 0x1F
179	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // ' '
180	`0`, // '!'
181	UserInfo \| Default \| QueryPercent, // '"'
182	UserInfo \| Default \| QueryPercent \| SlashQuestionOrHash \| ForbiddenHost, // '#'
183	`0`, // '$'
184	ForbiddenHost, // '%'
185	`0`, // '&'
186	`0`, // '\''
187	`0`, // '('
188	`0`, // ')'
189	`0`, // ''*
190	ValidScheme, // '+'
191	`0`, // ','
192	ValidScheme, // '-'
193	ValidScheme, // '.'
194	UserInfo \| SlashQuestionOrHash \| ForbiddenHost, // '/'
195	ValidScheme, // '0'
196	ValidScheme, // '1'
197	ValidScheme, // '2'
198	ValidScheme, // '3'
199	ValidScheme, // '4'
200	ValidScheme, // '5'
201	ValidScheme, // '6'
202	ValidScheme, // '7'
203	ValidScheme, // '8'
204	ValidScheme, // '9'
205	UserInfo \| ForbiddenHost, // ':'
206	UserInfo, // ';'
207	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // '<'
208	UserInfo, // '='
209	UserInfo \| Default \| QueryPercent \| ForbiddenHost, // '>'
210	UserInfo \| Default \| SlashQuestionOrHash \| ForbiddenHost, // '?'
211	UserInfo \| ForbiddenHost, // '@'
212	ValidScheme, // 'A'
213	ValidScheme, // 'B'
214	ValidScheme, // 'C'
215	ValidScheme, // 'D'
216	ValidScheme, // 'E'
217	ValidScheme, // 'F'
218	ValidScheme, // 'G'
219	ValidScheme, // 'H'
220	ValidScheme, // 'I'
221	ValidScheme, // 'J'
222	ValidScheme, // 'K'
223	ValidScheme, // 'L'
224	ValidScheme, // 'M'
225	ValidScheme, // 'N'
226	ValidScheme, // 'O'
227	ValidScheme, // 'P'
228	ValidScheme, // 'Q'
229	ValidScheme, // 'R'
230	ValidScheme, // 'S'
231	ValidScheme, // 'T'
232	ValidScheme, // 'U'
233	ValidScheme, // 'V'
234	ValidScheme, // 'W'
235	ValidScheme, // 'X'
236	ValidScheme, // 'Y'
237	ValidScheme, // 'Z'
238	UserInfo \| ForbiddenHost, // '['
239	UserInfo \| SlashQuestionOrHash \| ForbiddenHost, // '\\'
240	UserInfo \| ForbiddenHost, // ']'
241	UserInfo, // '^'
242	`0`, // '_'
243	UserInfo \| Default, // '`'
244	ValidScheme, // 'a'
245	ValidScheme, // 'b'
246	ValidScheme, // 'c'
247	ValidScheme, // 'd'
248	ValidScheme, // 'e'
249	ValidScheme, // 'f'
250	ValidScheme, // 'g'
251	ValidScheme, // 'h'
252	ValidScheme, // 'i'
253	ValidScheme, // 'j'
254	ValidScheme, // 'k'
255	ValidScheme, // 'l'
256	ValidScheme, // 'm'
257	ValidScheme, // 'n'
258	ValidScheme, // 'o'
259	ValidScheme, // 'p'
260	ValidScheme, // 'q'
261	ValidScheme, // 'r'
262	ValidScheme, // 's'
263	ValidScheme, // 't'
264	ValidScheme, // 'u'
265	ValidScheme, // 'v'
266	ValidScheme, // 'w'
267	ValidScheme, // 'x'
268	ValidScheme, // 'y'
269	ValidScheme, // 'z'
270	UserInfo \| Default, // '{'
271	UserInfo, // '\|'
272	UserInfo \| Default, // '}'
273	`0`, // '~'
274	QueryPercent, // 0x7F
275	QueryPercent, // 0x80
276	QueryPercent, // 0x81
277	QueryPercent, // 0x82
278	QueryPercent, // 0x83
279	QueryPercent, // 0x84
280	QueryPercent, // 0x85
281	QueryPercent, // 0x86
282	QueryPercent, // 0x87
283	QueryPercent, // 0x88
284	QueryPercent, // 0x89
285	QueryPercent, // 0x8A
286	QueryPercent, // 0x8B
287	QueryPercent, // 0x8C
288	QueryPercent, // 0x8D
289	QueryPercent, // 0x8E
290	QueryPercent, // 0x8F
291	QueryPercent, // 0x90
292	QueryPercent, // 0x91
293	QueryPercent, // 0x92
294	QueryPercent, // 0x93
295	QueryPercent, // 0x94
296	QueryPercent, // 0x95
297	QueryPercent, // 0x96
298	QueryPercent, // 0x97
299	QueryPercent, // 0x98
300	QueryPercent, // 0x99
301	QueryPercent, // 0x9A
302	QueryPercent, // 0x9B
303	QueryPercent, // 0x9C
304	QueryPercent, // 0x9D
305	QueryPercent, // 0x9E
306	QueryPercent, // 0x9F
307	QueryPercent, // 0xA0
308	QueryPercent, // 0xA1
309	QueryPercent, // 0xA2
310	QueryPercent, // 0xA3
311	QueryPercent, // 0xA4
312	QueryPercent, // 0xA5
313	QueryPercent, // 0xA6
314	QueryPercent, // 0xA7
315	QueryPercent, // 0xA8
316	QueryPercent, // 0xA9
317	QueryPercent, // 0xAA
318	QueryPercent, // 0xAB
319	QueryPercent, // 0xAC
320	QueryPercent, // 0xAD
321	QueryPercent, // 0xAE
322	QueryPercent, // 0xAF
323	QueryPercent, // 0xB0
324	QueryPercent, // 0xB1
325	QueryPercent, // 0xB2
326	QueryPercent, // 0xB3
327	QueryPercent, // 0xB4
328	QueryPercent, // 0xB5
329	QueryPercent, // 0xB6
330	QueryPercent, // 0xB7
331	QueryPercent, // 0xB8
332	QueryPercent, // 0xB9
333	QueryPercent, // 0xBA
334	QueryPercent, // 0xBB
335	QueryPercent, // 0xBC
336	QueryPercent, // 0xBD
337	QueryPercent, // 0xBE
338	QueryPercent, // 0xBF
339	QueryPercent, // 0xC0
340	QueryPercent, // 0xC1
341	QueryPercent, // 0xC2
342	QueryPercent, // 0xC3
343	QueryPercent, // 0xC4
344	QueryPercent, // 0xC5
345	QueryPercent, // 0xC6
346	QueryPercent, // 0xC7
347	QueryPercent, // 0xC8
348	QueryPercent, // 0xC9
349	QueryPercent, // 0xCA
350	QueryPercent, // 0xCB
351	QueryPercent, // 0xCC
352	QueryPercent, // 0xCD
353	QueryPercent, // 0xCE
354	QueryPercent, // 0xCF
355	QueryPercent, // 0xD0
356	QueryPercent, // 0xD1
357	QueryPercent, // 0xD2
358	QueryPercent, // 0xD3
359	QueryPercent, // 0xD4
360	QueryPercent, // 0xD5
361	QueryPercent, // 0xD6
362	QueryPercent, // 0xD7
363	QueryPercent, // 0xD8
364	QueryPercent, // 0xD9
365	QueryPercent, // 0xDA
366	QueryPercent, // 0xDB
367	QueryPercent, // 0xDC
368	QueryPercent, // 0xDD
369	QueryPercent, // 0xDE
370	QueryPercent, // 0xDF
371	QueryPercent, // 0xE0
372	QueryPercent, // 0xE1
373	QueryPercent, // 0xE2
374	QueryPercent, // 0xE3
375	QueryPercent, // 0xE4
376	QueryPercent, // 0xE5
377	QueryPercent, // 0xE6
378	QueryPercent, // 0xE7
379	QueryPercent, // 0xE8
380	QueryPercent, // 0xE9
381	QueryPercent, // 0xEA
382	QueryPercent, // 0xEB
383	QueryPercent, // 0xEC
384	QueryPercent, // 0xED
385	QueryPercent, // 0xEE
386	QueryPercent, // 0xEF
387	QueryPercent, // 0xF0
388	QueryPercent, // 0xF1
389	QueryPercent, // 0xF2
390	QueryPercent, // 0xF3
391	QueryPercent, // 0xF4
392	QueryPercent, // 0xF5
393	QueryPercent, // 0xF6
394	QueryPercent, // 0xF7
395	QueryPercent, // 0xF8
396	QueryPercent, // 0xF9
397	QueryPercent, // 0xFA
398	QueryPercent, // 0xFB
399	QueryPercent, // 0xFC
400	QueryPercent, // 0xFD
401	QueryPercent, // 0xFE
402	QueryPercent, // 0xFF
403	};
404
405	template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= `0x1F`; }
406	template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= `0x20`; }
407	template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= `0xD` && character >= `0x9` && character != `0xB` && character != `0xC`; }
408	template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > `0x7E` \|\| isC0Control(character); }
409	template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > `0x7E` \|\| characterClassTable[character] & Default; }
410	template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > `0x7E` \|\| characterClassTable[character] & UserInfo; }
411	template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) \|\| character == `'%'`; }
412	template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= `'\\'` && characterClassTable[character] & SlashQuestionOrHash; }
413	template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= `'z'` && characterClassTable[character] & ValidScheme; }
414	template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= `']'` && characterClassTable[character] & ForbiddenHost; }
415	ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
416	{
417	if (characterClassTable[byte] & QueryPercent)
418	return true;
419	if (byte == `'\''` && urlIsSpecial)
420	return true;
421	return false;
422	}
423
424	bool URLParser::isInUserInfoEncodeSet(UChar c)
425	{
426	return WTF::isInUserInfoEncodeSet(c);
427	}
428
429	template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
430	ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
431	{
432	++iterator;
433	while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
434	if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
435	syntaxViolation(iteratorForSyntaxViolationPosition);
436	++iterator;
437	}
438	}
439
440	template<typename CharacterType>
441	bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
442	{
443	if (iterator.atEnd())
444	return false;
445	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446	if (iterator.atEnd())
447	return false;
448	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
449	return iterator.atEnd();
450	}
451
452	template<typename CharacterType>
453	ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
454	{
455	if (iterator.atEnd() \|\| !isASCIIAlpha(*iterator))
456	return false;
457	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
458	if (iterator.atEnd())
459	return false;
460	if (*iterator == `':'`)
461	return true;
462	if (UNLIKELY(*iterator == `'\|'`))
463	return true;
464	return false;
465	}
466
467	ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
468	{
469	ASSERT(isASCII(codePoint));
470	if (UNLIKELY(m_didSeeSyntaxViolation))
471	m_asciiBuffer.append(codePoint);
472	}
473
474	ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
475	{
476	if (UNLIKELY(m_didSeeSyntaxViolation))
477	m_asciiBuffer.append(characters, length);
478	}
479
480	template<typename CharacterType>
481	void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
482	{
483	ASSERT(isWindowsDriveLetter(iterator));
484	appendToASCIIBuffer(*iterator);
485	advance(iterator);
486	ASSERT(!iterator.atEnd());
487	ASSERT(iterator == `':'` \|\| iterator == `'\|'`);
488	if (*iterator == `'\|'`)
489	syntaxViolation(iterator);
490	appendToASCIIBuffer(`':'`);
491	advance(iterator);
492	}
493
494	bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
495	{
496	if (base.protocolIs("file")) {
497	RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
498	if (base.m_string.is8Bit()) {
499	const LChar* begin = base.m_string.characters8();
500	CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + `1`, begin + base.m_string.length());
501	if (isWindowsDriveLetter(c)) {
502	appendWindowsDriveLetter(c);
503	return true;
504	}
505	} else {
506	const UChar* begin = base.m_string.characters16();
507	CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + `1`, begin + base.m_string.length());
508	if (isWindowsDriveLetter(c)) {
509	appendWindowsDriveLetter(c);
510	return true;
511	}
512	}
513	}
514	return false;
515	}
516
517	template<typename CharacterType>
518	bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
519	{
520	if (!isWindowsDriveLetter(iterator))
521	return true;
522	if (iterator.atEnd())
523	return false;
524	advance(iterator);
525	if (iterator.atEnd())
526	return true;
527	advance(iterator);
528	if (iterator.atEnd())
529	return true;
530	return !isSlashQuestionOrHash(*iterator);
531	}
532
533	static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
534	{
535	buffer.append(`'%'`);
536	buffer.append(upperNibbleToASCIIHexDigit(byte));
537	buffer.append(lowerNibbleToASCIIHexDigit(byte));
538	}
539
540	void URLParser::percentEncodeByte(uint8_t byte)
541	{
542	ASSERT(m_didSeeSyntaxViolation);
543	appendToASCIIBuffer(`'%'`);
544	appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
545	appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
546	}
547
// Text appended by the encoders below in place of a code point for which
// U_IS_UNICODE_CHAR is false.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Length of the text above without its terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1;
550
551	template<bool(isInCodeSet)(UChar32), typename* CharacterType>
552	ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
553	{
554	ASSERT(!iterator.atEnd());
555	UChar32 codePoint = *iterator;
556	if (LIKELY(isASCII(codePoint))) {
557	if (UNLIKELY(isInCodeSet(codePoint))) {
558	syntaxViolation(iterator);
559	percentEncodeByte(codePoint);
560	} else
561	appendToASCIIBuffer(codePoint);
562	return;
563	}
564	ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
565	syntaxViolation(iterator);
566
567	if (!U_IS_UNICODE_CHAR(codePoint)) {
568	appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
569	return;
570	}
571
572	uint8_t buffer[U8_MAX_LENGTH];
573	int32_t offset = `0`;
574	U8_APPEND_UNSAFE(buffer, offset, codePoint);
575	for (int32_t i = `0`; i < offset; ++i)
576	percentEncodeByte(buffer[i]);
577	}
578
579	template<typename CharacterType>
580	ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
581	{
582	ASSERT(!iterator.atEnd());
583	UChar32 codePoint = *iterator;
584	if (LIKELY(isASCII(codePoint))) {
585	if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
586	syntaxViolation(iterator);
587	percentEncodeByte(codePoint);
588	} else
589	appendToASCIIBuffer(codePoint);
590	return;
591	}
592
593	syntaxViolation(iterator);
594
595	if (!U_IS_UNICODE_CHAR(codePoint)) {
596	appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
597	return;
598	}
599
600	uint8_t buffer[U8_MAX_LENGTH];
601	int32_t offset = `0`;
602	U8_APPEND_UNSAFE(buffer, offset, codePoint);
603	for (int32_t i = `0`; i < offset; ++i) {
604	auto byte = buffer[i];
605	if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
606	percentEncodeByte(byte);
607	else
608	appendToASCIIBuffer(byte);
609	}
610	}
611
612	template<typename CharacterType>
613	void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator)
614	{
615	auto encoded = encoding.encodeForURLParsing(StringView (source.data(), source.size()));
616	auto* data = encoded.data();
617	size_t length = encoded.size();
618
619	if (!length == !iterator.atEnd()) {
620	syntaxViolation(iterator);
621	return;
622	}
623
624	size_t i = `0`;
625	for (; i < length; ++i) {
626	ASSERT(!iterator.atEnd());
627	uint8_t byte = data[i];
628	if (UNLIKELY(byte != *iterator)) {
629	syntaxViolation(iterator);
630	break;
631	}
632	if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
633	syntaxViolation(iterator);
634	break;
635	}
636	appendToASCIIBuffer(byte);
637	++iterator;
638	}
639	while (!iterator.atEnd() && isTabOrNewline(*iterator))
640	++iterator;
641	ASSERT((i == length) == iterator.atEnd());
642	for (; i < length; ++i) {
643	ASSERT(m_didSeeSyntaxViolation);
644	uint8_t byte = data[i];
645	if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
646	percentEncodeByte(byte);
647	else
648	appendToASCIIBuffer(byte);
649	}
650	}
651
652	Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
653	{
654	static const uint16_t ftpPort = `21`;
655	static const uint16_t gopherPort = `70`;
656	static const uint16_t httpPort = `80`;
657	static const uint16_t httpsPort = `443`;
658	static const uint16_t wsPort = `80`;
659	static const uint16_t wssPort = `443`;
660
661	auto length = scheme.length();
662	if (!length)
663	return WTF::nullopt;
664	switch (scheme [`0`]) {
665	case `'w'`:
666	switch (length) {
667	case `2`:
668	if (scheme [`1`] == `'s'`)
669	return wsPort;
670	return WTF::nullopt;
671	case `3`:
672	if (scheme [`1`] == `'s'`
673	&& scheme [`2`] == `'s'`)
674	return wssPort;
675	return WTF::nullopt;
676	default:
677	return false;
678	}
679	case `'h'`:
680	switch (length) {
681	case `4`:
682	if (scheme [`1`] == `'t'`
683	&& scheme [`2`] == `'t'`
684	&& scheme [`3`] == `'p'`)
685	return httpPort;
686	return WTF::nullopt;
687	case `5`:
688	if (scheme [`1`] == `'t'`
689	&& scheme [`2`] == `'t'`
690	&& scheme [`3`] == `'p'`
691	&& scheme [`4`] == `'s'`)
692	return httpsPort;
693	return WTF::nullopt;
694	default:
695	return WTF::nullopt;
696	}
697	case `'g'`:
698	if (length == `6`
699	&& scheme [`1`] == `'o'`
700	&& scheme [`2`] == `'p'`
701	&& scheme [`3`] == `'h'`
702	&& scheme [`4`] == `'e'`
703	&& scheme [`5`] == `'r'`)
704	return gopherPort;
705	return WTF::nullopt;
706	case `'f'`:
707	if (length == `3`
708	&& scheme [`1`] == `'t'`
709	&& scheme [`2`] == `'p'`)
710	return ftpPort;
711	return WTF::nullopt;
712	default:
713	return WTF::nullopt;
714	}
715	}
716
// Schemes recognized by scheme() below; anything else maps to NonSpecial.
enum class Scheme {
    WS,
    WSS,
    File,
    FTP,
    Gopher,
    HTTP,
    HTTPS,
    NonSpecial, // Returned for every scheme not listed above.
};
727
728	ALWAYS_INLINE static Scheme scheme(StringView scheme)
729	{
730	auto length = scheme.length();
731	if (!length)
732	return Scheme::NonSpecial;
733	switch (scheme [`0`]) {
734	case `'f'`:
735	switch (length) {
736	case `3`:
737	if (scheme [`1`] == `'t'`
738	&& scheme [`2`] == `'p'`)
739	return Scheme::FTP;
740	return Scheme::NonSpecial;
741	case `4`:
742	if (scheme [`1`] == `'i'`
743	&& scheme [`2`] == `'l'`
744	&& scheme [`3`] == `'e'`)
745	return Scheme::File;
746	return Scheme::NonSpecial;
747	default:
748	return Scheme::NonSpecial;
749	}
750	case `'g'`:
751	if (length == `6`
752	&& scheme [`1`] == `'o'`
753	&& scheme [`2`] == `'p'`
754	&& scheme [`3`] == `'h'`
755	&& scheme [`4`] == `'e'`
756	&& scheme [`5`] == `'r'`)
757	return Scheme::Gopher;
758	return Scheme::NonSpecial;
759	case `'h'`:
760	switch (length) {
761	case `4`:
762	if (scheme [`1`] == `'t'`
763	&& scheme [`2`] == `'t'`
764	&& scheme [`3`] == `'p'`)
765	return Scheme::HTTP;
766	return Scheme::NonSpecial;
767	case `5`:
768	if (scheme [`1`] == `'t'`
769	&& scheme [`2`] == `'t'`
770	&& scheme [`3`] == `'p'`
771	&& scheme [`4`] == `'s'`)
772	return Scheme::HTTPS;
773	return Scheme::NonSpecial;
774	default:
775	return Scheme::NonSpecial;
776	}
777	case `'w'`:
778	switch (length) {
779	case `2`:
780	if (scheme [`1`] == `'s'`)
781	return Scheme::WS;
782	return Scheme::NonSpecial;
783	case `3`:
784	if (scheme [`1`] == `'s'`
785	&& scheme [`2`] == `'s'`)
786	return Scheme::WSS;
787	return Scheme::NonSpecial;
788	default:
789	return Scheme::NonSpecial;
790	}
791	default:
792	return Scheme::NonSpecial;
793	}
794	}
795
796	Optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
797	{
798	if (scheme.isEmpty())
799	return WTF::nullopt;
800
801	if (!isASCIIAlpha(scheme [`0`]))
802	return WTF::nullopt;
803
804	for (size_t i = `1`; i < scheme.length(); ++i) {
805	if (isASCIIAlphanumeric(scheme [i]) \|\| scheme [i] == `'+'` \|\| scheme [i] == `'-'` \|\| scheme [i] == `'.'`)
806	continue;
807	return WTF::nullopt;
808	}
809
810	return scheme.convertToASCIILowercase();
811	}
812
813	bool URLParser::isSpecialScheme(const String& schemeArg)
814	{
815	return scheme(schemeArg) != Scheme::NonSpecial;
816	}
817
818	enum class URLParser::URLPart {
819	SchemeEnd,
820	UserStart,
821	UserEnd,
822	PasswordEnd,
823	HostEnd,
824	PortEnd,
825	PathAfterLastSlash,
826	PathEnd,
827	QueryEnd,
828	};
829
830	size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
831	{
832	switch (part) {
833	case URLPart::QueryEnd:
834	return url.m_queryEnd;
835	case URLPart::PathEnd:
836	return url.m_pathEnd;
837	case URLPart::PathAfterLastSlash:
838	return url.m_pathAfterLastSlash;
839	case URLPart::PortEnd:
840	return url.m_hostEnd + url.m_portLength;
841	case URLPart::HostEnd:
842	return url.m_hostEnd;
843	case URLPart::PasswordEnd:
844	return url.m_passwordEnd;
845	case URLPart::UserEnd:
846	return url.m_userEnd;
847	case URLPart::UserStart:
848	return url.m_userStart;
849	case URLPart::SchemeEnd:
850	return url.m_schemeEnd;
851	}
852	ASSERT_NOT_REACHED();
853	return `0`;
854	}
855
856	void URLParser::copyASCIIStringUntil(const String& string, size_t length)
857	{
858	RELEASE_ASSERT(length <= string.length());
859	if (string.isNull())
860	return;
861	ASSERT(m_asciiBuffer.isEmpty());
862	if (string.is8Bit())
863	appendToASCIIBuffer(string.characters8(), length);
864	else {
865	const UChar* characters = string.characters16();
866	for (size_t i = `0`; i < length; ++i) {
867	UChar c = characters[i];
868	ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
869	appendToASCIIBuffer(c);
870	}
871	}
872	}
873
874	template<typename CharacterType>
875	void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding)
876	{
877	syntaxViolation(iterator);
878
879	m_asciiBuffer.clear();
880	copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
881	switch (part) {
882	case URLPart::QueryEnd:
883	m_url.m_queryEnd = base.m_queryEnd;
884	FALLTHROUGH;
885	case URLPart::PathEnd:
886	m_url.m_pathEnd = base.m_pathEnd;
887	FALLTHROUGH;
888	case URLPart::PathAfterLastSlash:
889	m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
890	FALLTHROUGH;
891	case URLPart::PortEnd:
892	m_url.m_portLength = base.m_portLength;
893	FALLTHROUGH;
894	case URLPart::HostEnd:
895	m_url.m_hostEnd = base.m_hostEnd;
896	FALLTHROUGH;
897	case URLPart::PasswordEnd:
898	m_url.m_passwordEnd = base.m_passwordEnd;
899	FALLTHROUGH;
900	case URLPart::UserEnd:
901	m_url.m_userEnd = base.m_userEnd;
902	FALLTHROUGH;
903	case URLPart::UserStart:
904	m_url.m_userStart = base.m_userStart;
905	FALLTHROUGH;
906	case URLPart::SchemeEnd:
907	m_url.m_isValid = base.m_isValid;
908	m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
909	m_url.m_schemeEnd = base.m_schemeEnd;
910	}
911	switch (scheme(StringView (m_asciiBuffer.data(), m_url.m_schemeEnd))) {
912	case Scheme::WS:
913	case Scheme::WSS:
914	nonUTF8QueryEncoding = nullptr;
915	m_urlIsSpecial = true;
916	return;
917	case Scheme::File:
918	m_urlIsFile = true;
919	FALLTHROUGH;
920	case Scheme::FTP:
921	case Scheme::Gopher:
922	case Scheme::HTTP:
923	case Scheme::HTTPS:
924	m_urlIsSpecial = true;
925	return;
926	case Scheme::NonSpecial:
927	m_urlIsSpecial = false;
928	nonUTF8QueryEncoding = nullptr;
929	return;
930	}
931	ASSERT_NOT_REACHED();
932	}
933
// The two characters expected after '%' when the dot-segment helpers match a
// percent-encoded '.'; the second one is compared case-insensitively.
static const char dotASCIICode[2] = { '2', 'e' };
935
936	template<typename CharacterType>
937	ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
938	{
939	if (c.atEnd())
940	return false;
941	if (*c == `'.'`) {
942	advance<CharacterType, ReportSyntaxViolation::No>(c);
943	return c.atEnd() \|\| isSlashQuestionOrHash(*c);
944	}
945	if (*c != `'%'`)
946	return false;
947	advance<CharacterType, ReportSyntaxViolation::No>(c);
948	if (c.atEnd() \|\| *c != dotASCIICode[`0`])
949	return false;
950	advance<CharacterType, ReportSyntaxViolation::No>(c);
951	if (c.atEnd())
952	return false;
953	if (toASCIILower(*c) == dotASCIICode[`1`]) {
954	advance<CharacterType, ReportSyntaxViolation::No>(c);
955	return c.atEnd() \|\| isSlashQuestionOrHash(*c);
956	}
957	return false;
958	}
959
960	template<typename CharacterType>
961	ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
962	{
963	if (c.atEnd())
964	return false;
965	if (*c == `'.'`) {
966	advance<CharacterType, ReportSyntaxViolation::No>(c);
967	return isSingleDotPathSegment(c);
968	}
969	if (*c != `'%'`)
970	return false;
971	advance<CharacterType, ReportSyntaxViolation::No>(c);
972	if (c.atEnd() \|\| *c != dotASCIICode[`0`])
973	return false;
974	advance<CharacterType, ReportSyntaxViolation::No>(c);
975	if (c.atEnd())
976	return false;
977	if (toASCIILower(*c) == dotASCIICode[`1`]) {
978	advance<CharacterType, ReportSyntaxViolation::No>(c);
979	return isSingleDotPathSegment(c);
980	}
981	return false;
982	}
983
984	template<typename CharacterType>
985	void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
986	{
987	ASSERT(isSingleDotPathSegment(c));
988	if (*c == `'.'`) {
989	advance(c);
990	if (!c.atEnd()) {
991	if (c == `'/'` \|\| c == `'\\'`)
992	advance(c);
993	else
994	ASSERT(c == `'?'` \|\| c == `'#'`);
995	}
996	} else {
997	ASSERT(*c == `'%'`);
998	advance(c);
999	ASSERT(*c == dotASCIICode[`0`]);
1000	advance(c);
1001	ASSERT(toASCIILower(*c) == dotASCIICode[`1`]);
1002	advance(c);
1003	if (!c.atEnd()) {
1004	if (c == `'/'` \|\| c == `'\\'`)
1005	advance(c);
1006	else
1007	ASSERT(c == `'?'` \|\| c == `'#'`);
1008	}
1009	}
1010	}
1011
1012	template<typename CharacterType>
1013	void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1014	{
1015	ASSERT(isDoubleDotPathSegment(c));
1016	if (*c == `'.'`)
1017	advance(c);
1018	else {
1019	ASSERT(*c == `'%'`);
1020	advance(c);
1021	ASSERT(*c == dotASCIICode[`0`]);
1022	advance(c);
1023	ASSERT(toASCIILower(*c) == dotASCIICode[`1`]);
1024	advance(c);
1025	}
1026	consumeSingleDotPathSegment(c);
1027	}
1028
1029	bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1030	{
1031	ASSERT(m_didSeeSyntaxViolation);
1032	if (!m_urlIsFile)
1033	return true;
1034
1035	ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1036	CodePointIterator<LChar> componentToPop(&m_asciiBuffer [newPathAfterLastSlash], &m_asciiBuffer [`0`] + m_url.m_pathAfterLastSlash);
1037	if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + `1` && isWindowsDriveLetter(componentToPop))
1038	return false;
1039	return true;
1040	}
1041
1042	void URLParser::popPath()
1043	{
1044	ASSERT(m_didSeeSyntaxViolation);
1045	if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + `1`) {
1046	auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - `1`;
1047	if (m_asciiBuffer [newPathAfterLastSlash] == `'/'`)
1048	newPathAfterLastSlash--;
1049	while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer [newPathAfterLastSlash] != `'/'`)
1050	newPathAfterLastSlash--;
1051	newPathAfterLastSlash++;
1052	if (shouldPopPath(newPathAfterLastSlash))
1053	m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1054	}
1055	m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1056	}
1057
1058	template<typename CharacterType>
1059	void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1060	{
1061	if (m_didSeeSyntaxViolation)
1062	return;
1063	m_didSeeSyntaxViolation = true;
1064
1065	ASSERT(m_asciiBuffer.isEmpty());
1066	size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1067	RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1068	m_asciiBuffer.reserveCapacity(m_inputString.length());
1069	for (size_t i = `0`; i < codeUnitsToCopy; ++i) {
1070	ASSERT(isASCII(m_inputString[i]));
1071	m_asciiBuffer.uncheckedAppend(m_inputString [i]);
1072	}
1073	}
1074
1075	void URLParser::failure()
1076	{
1077	m_url.invalidate();
1078	m_url.m_string = m_inputString;
1079	}
1080
1081	template<typename CharacterType>
1082	bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1083	{
1084	if (iterator.atEnd() \|\| toASCIILower(*iterator) != codePoint)
1085	return false;
1086	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1087	return true;
1088	}
1089
1090	template<typename CharacterType>
1091	bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1092	{
1093	if (!checkLocalhostCodePoint(iterator, `'l'`))
1094	return false;
1095	if (!checkLocalhostCodePoint(iterator, `'o'`))
1096	return false;
1097	if (!checkLocalhostCodePoint(iterator, `'c'`))
1098	return false;
1099	if (!checkLocalhostCodePoint(iterator, `'a'`))
1100	return false;
1101	if (!checkLocalhostCodePoint(iterator, `'l'`))
1102	return false;
1103	if (!checkLocalhostCodePoint(iterator, `'h'`))
1104	return false;
1105	if (!checkLocalhostCodePoint(iterator, `'o'`))
1106	return false;
1107	if (!checkLocalhostCodePoint(iterator, `'s'`))
1108	return false;
1109	if (!checkLocalhostCodePoint(iterator, `'t'`))
1110	return false;
1111	return iterator.atEnd();
1112	}
1113
1114	bool URLParser::isLocalhost(StringView view)
1115	{
1116	if (view.is8Bit())
1117	return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1118	return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1119	}
1120
1121	ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1122	{
1123	if (UNLIKELY(m_didSeeSyntaxViolation)) {
1124	ASSERT(start + length <= m_asciiBuffer.size());
1125	return StringView (m_asciiBuffer.data() + start, length);
1126	}
1127	ASSERT(start + length <= m_inputString.length());
1128	return StringView (m_inputString).substring(start, length);
1129	}
1130
1131	ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1132	{
1133	if (UNLIKELY(m_didSeeSyntaxViolation))
1134	return m_asciiBuffer [position];
1135	return m_inputString [position];
1136	}
1137
1138	template<typename CharacterType>
1139	ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1140	{
1141	if (UNLIKELY(m_didSeeSyntaxViolation))
1142	return m_asciiBuffer.size();
1143
1144	return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1145	}
1146
1147	URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1148	: m_inputString (input)
1149	{
1150	if (input.isNull()) {
1151	if (base.isValid() && !base.m_cannotBeABaseURL) {
1152	m_url = base;
1153	m_url.removeFragmentIdentifier();
1154	}
1155	return;
1156	}
1157
1158	if (input.is8Bit()) {
1159	m_inputBegin = input.characters8();
1160	parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding);
1161	} else {
1162	m_inputBegin = input.characters16();
1163	parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding);
1164	}
1165
1166	ASSERT(!m_url.m_isValid
1167	\|\| m_didSeeSyntaxViolation == (m_url.string() != input)
1168	\|\| (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1169	&& m_url.m_string == base.m_string.left(base.m_queryEnd)));
1170	ASSERT(internalValuesConsistent(m_url));
1171	#if !ASSERT_DISABLED
1172	if (!m_didSeeSyntaxViolation) {
1173	// Force a syntax violation at the beginning to make sure we get the same result.
1174	URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding);
1175	URL parsed = parser.result();
1176	if (parsed.isValid())
1177	ASSERT(allValuesEqual(parser.result(), m_url));
1178	}
1179	#endif
1180	}
1181
1182	template<typename CharacterType>
1183	void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1184	{
1185	URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1186	m_url = { };
1187	ASSERT(m_asciiBuffer.isEmpty());
1188
1189	Vector<UChar> queryBuffer;
1190
1191	unsigned endIndex = length;
1192	while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - `1`]))) {
1193	syntaxViolation(CodePointIterator<CharacterType>(input, input));
1194	endIndex--;
1195	}
1196	CodePointIterator<CharacterType> c(input, input + endIndex);
1197	CodePointIterator<CharacterType> authorityOrHostBegin;
1198	CodePointIterator<CharacterType> queryBegin;
1199	while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1200	syntaxViolation(c);
1201	++c;
1202	}
1203	auto beginAfterControlAndSpace = c;
1204
1205	enum class State : uint8_t {
1206	SchemeStart,
1207	Scheme,
1208	NoScheme,
1209	SpecialRelativeOrAuthority,
1210	PathOrAuthority,
1211	Relative,
1212	RelativeSlash,
1213	SpecialAuthoritySlashes,
1214	SpecialAuthorityIgnoreSlashes,
1215	AuthorityOrHost,
1216	Host,
1217	File,
1218	FileSlash,
1219	FileHost,
1220	PathStart,
1221	Path,
1222	CannotBeABaseURLPath,
1223	UTF8Query,
1224	NonUTF8Query,
1225	Fragment,
1226	};
1227
1228	#define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1229	#define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1230
1231	State state = State::SchemeStart;
1232	while (!c.atEnd()) {
1233	if (UNLIKELY(isTabOrNewline(*c))) {
1234	syntaxViolation(c);
1235	++c;
1236	continue;
1237	}
1238
1239	switch (state) {
1240	case State::SchemeStart:
1241	LOG_STATE("SchemeStart");
1242	if (isASCIIAlpha(*c)) {
1243	if (UNLIKELY(isASCIIUpper(*c)))
1244	syntaxViolation(c);
1245	appendToASCIIBuffer(toASCIILower(*c));
1246	advance(c);
1247	if (c.atEnd()) {
1248	m_asciiBuffer.clear();
1249	state = State::NoScheme;
1250	c = beginAfterControlAndSpace;
1251	break;
1252	}
1253	state = State::Scheme;
1254	} else
1255	state = State::NoScheme;
1256	break;
1257	case State::Scheme:
1258	LOG_STATE("Scheme");
1259	if (isValidSchemeCharacter(*c)) {
1260	if (UNLIKELY(isASCIIUpper(*c)))
1261	syntaxViolation(c);
1262	appendToASCIIBuffer(toASCIILower(*c));
1263	} else if (*c == `':'`) {
1264	unsigned schemeEnd = currentPosition(c);
1265	if (schemeEnd > URL::maxSchemeLength) {
1266	failure();
1267	return;
1268	}
1269	m_url.m_schemeEnd = schemeEnd;
1270	StringView urlScheme = parsedDataView(`0`, m_url.m_schemeEnd);
1271	appendToASCIIBuffer(`':'`);
1272	switch (scheme(urlScheme)) {
1273	case Scheme::File:
1274	m_urlIsSpecial = true;
1275	m_urlIsFile = true;
1276	state = State::File;
1277	++c;
1278	break;
1279	case Scheme::WS:
1280	case Scheme::WSS:
1281	nonUTF8QueryEncoding = nullptr;
1282	m_urlIsSpecial = true;
1283	if (base.protocolIs(urlScheme))
1284	state = State::SpecialRelativeOrAuthority;
1285	else
1286	state = State::SpecialAuthoritySlashes;
1287	++c;
1288	break;
1289	case Scheme::HTTP:
1290	case Scheme::HTTPS:
1291	m_url.m_protocolIsInHTTPFamily = true;
1292	FALLTHROUGH;
1293	case Scheme::FTP:
1294	case Scheme::Gopher:
1295	m_urlIsSpecial = true;
1296	if (base.protocolIs(urlScheme))
1297	state = State::SpecialRelativeOrAuthority;
1298	else
1299	state = State::SpecialAuthoritySlashes;
1300	++c;
1301	break;
1302	case Scheme::NonSpecial:
1303	nonUTF8QueryEncoding = nullptr;
1304	auto maybeSlash = c;
1305	advance(maybeSlash);
1306	if (!maybeSlash.atEnd() && *maybeSlash == `'/'`) {
1307	appendToASCIIBuffer(`'/'`);
1308	c = maybeSlash;
1309	state = State::PathOrAuthority;
1310	ASSERT(*c == `'/'`);
1311	++c;
1312	m_url.m_userStart = currentPosition(c);
1313	} else {
1314	++c;
1315	m_url.m_userStart = currentPosition(c);
1316	m_url.m_userEnd = m_url.m_userStart;
1317	m_url.m_passwordEnd = m_url.m_userStart;
1318	m_url.m_hostEnd = m_url.m_userStart;
1319	m_url.m_portLength = `0`;
1320	m_url.m_pathAfterLastSlash = m_url.m_userStart;
1321	m_url.m_cannotBeABaseURL = true;
1322	state = State::CannotBeABaseURLPath;
1323	}
1324	break;
1325	}
1326	break;
1327	} else {
1328	m_asciiBuffer.clear();
1329	state = State::NoScheme;
1330	c = beginAfterControlAndSpace;
1331	break;
1332	}
1333	advance(c);
1334	if (c.atEnd()) {
1335	m_asciiBuffer.clear();
1336	state = State::NoScheme;
1337	c = beginAfterControlAndSpace;
1338	}
1339	break;
1340	case State::NoScheme:
1341	LOG_STATE("NoScheme");
1342	if (!base.isValid() \|\| (base.m_cannotBeABaseURL && *c != `'#'`)) {
1343	failure();
1344	return;
1345	}
1346	if (base.m_cannotBeABaseURL && *c == `'#'`) {
1347	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1348	state = State::Fragment;
1349	appendToASCIIBuffer(`'#'`);
1350	++c;
1351	break;
1352	}
1353	if (!base.protocolIs("file")) {
1354	state = State::Relative;
1355	break;
1356	}
1357	copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1358	appendToASCIIBuffer(`':'`);
1359	state = State::File;
1360	break;
1361	case State::SpecialRelativeOrAuthority:
1362	LOG_STATE("SpecialRelativeOrAuthority");
1363	if (*c == `'/'`) {
1364	appendToASCIIBuffer(`'/'`);
1365	advance(c);
1366	if (c.atEnd()) {
1367	failure();
1368	return;
1369	}
1370	if (*c == `'/'`) {
1371	appendToASCIIBuffer(`'/'`);
1372	state = State::SpecialAuthorityIgnoreSlashes;
1373	++c;
1374	} else
1375	state = State::RelativeSlash;
1376	} else
1377	state = State::Relative;
1378	break;
1379	case State::PathOrAuthority:
1380	LOG_STATE("PathOrAuthority");
1381	if (*c == `'/'`) {
1382	appendToASCIIBuffer(`'/'`);
1383	state = State::AuthorityOrHost;
1384	advance(c);
1385	m_url.m_userStart = currentPosition(c);
1386	authorityOrHostBegin = c;
1387	} else {
1388	ASSERT(parsedDataView(currentPosition(c) - `1`) == `'/'`);
1389	m_url.m_userStart = currentPosition(c) - `1`;
1390	m_url.m_userEnd = m_url.m_userStart;
1391	m_url.m_passwordEnd = m_url.m_userStart;
1392	m_url.m_hostEnd = m_url.m_userStart;
1393	m_url.m_portLength = `0`;
1394	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1395	state = State::Path;
1396	}
1397	break;
1398	case State::Relative:
1399	LOG_STATE("Relative");
1400	switch (*c) {
1401	case `'/'`:
1402	case `'\\'`:
1403	state = State::RelativeSlash;
1404	++c;
1405	break;
1406	case `'?'`:
1407	copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1408	appendToASCIIBuffer(`'?'`);
1409	++c;
1410	if (nonUTF8QueryEncoding) {
1411	queryBegin = c;
1412	state = State::NonUTF8Query;
1413	} else
1414	state = State::UTF8Query;
1415	break;
1416	case `'#'`:
1417	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1418	appendToASCIIBuffer(`'#'`);
1419	state = State::Fragment;
1420	++c;
1421	break;
1422	default:
1423	copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1424	if (currentPosition(c) && parsedDataView(currentPosition(c) - `1`) != `'/'`) {
1425	appendToASCIIBuffer(`'/'`);
1426	m_url.m_pathAfterLastSlash = currentPosition(c);
1427	}
1428	state = State::Path;
1429	break;
1430	}
1431	break;
1432	case State::RelativeSlash:
1433	LOG_STATE("RelativeSlash");
1434	if (c == `'/'` \|\| c == `'\\'`) {
1435	++c;
1436	copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1437	appendToASCIIBuffer("://", `3`);
1438	if (m_urlIsSpecial)
1439	state = State::SpecialAuthorityIgnoreSlashes;
1440	else {
1441	m_url.m_userStart = currentPosition(c);
1442	state = State::AuthorityOrHost;
1443	authorityOrHostBegin = c;
1444	}
1445	} else {
1446	copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1447	appendToASCIIBuffer(`'/'`);
1448	m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + `1`;
1449	state = State::Path;
1450	}
1451	break;
1452	case State::SpecialAuthoritySlashes:
1453	LOG_STATE("SpecialAuthoritySlashes");
1454	if (LIKELY(c == `'/'` \|\| c == `'\\'`)) {
1455	if (UNLIKELY(*c == `'\\'`))
1456	syntaxViolation(c);
1457	appendToASCIIBuffer(`'/'`);
1458	advance(c);
1459	if (LIKELY(!c.atEnd() && (c == `'/'` \|\| c == `'\\'`))) {
1460	if (UNLIKELY(*c == `'\\'`))
1461	syntaxViolation(c);
1462	++c;
1463	appendToASCIIBuffer(`'/'`);
1464	} else {
1465	syntaxViolation(c);
1466	appendToASCIIBuffer(`'/'`);
1467	}
1468	} else {
1469	syntaxViolation(c);
1470	appendToASCIIBuffer("//", `2`);
1471	}
1472	state = State::SpecialAuthorityIgnoreSlashes;
1473	break;
1474	case State::SpecialAuthorityIgnoreSlashes:
1475	LOG_STATE("SpecialAuthorityIgnoreSlashes");
1476	if (c == `'/'` \|\| c == `'\\'`) {
1477	syntaxViolation(c);
1478	++c;
1479	} else {
1480	m_url.m_userStart = currentPosition(c);
1481	state = State::AuthorityOrHost;
1482	authorityOrHostBegin = c;
1483	}
1484	break;
1485	case State::AuthorityOrHost:
1486	do {
1487	LOG_STATE("AuthorityOrHost");
1488	if (*c == `'@'`) {
1489	auto lastAt = c;
1490	auto findLastAt = c;
1491	while (!findLastAt.atEnd()) {
1492	URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1493	if (*findLastAt == `'@'`)
1494	lastAt = findLastAt;
1495	bool isSlash = findLastAt == `'/'` \|\| (m_urlIsSpecial && findLastAt == `'\\'`);
1496	if (isSlash \|\| findLastAt == `'?'` \|\| findLastAt == `'#'`)
1497	break;
1498	++findLastAt;
1499	}
1500	parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1501	c = lastAt;
1502	advance(c);
1503	authorityOrHostBegin = c;
1504	state = State::Host;
1505	m_hostHasPercentOrNonASCII = false;
1506	break;
1507	}
1508	bool isSlash = c == `'/'` \|\| (m_urlIsSpecial && c == `'\\'`);
1509	if (isSlash \|\| c == `'?'` \|\| c == `'#'`) {
1510	auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1511	if (iterator.atEnd()) {
1512	if (m_urlIsSpecial)
1513	return failure();
1514	m_url.m_userEnd = currentPosition(c);
1515	m_url.m_passwordEnd = m_url.m_userEnd;
1516	m_url.m_hostEnd = m_url.m_userEnd;
1517	m_url.m_portLength = `0`;
1518	m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1519	} else {
1520	m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1521	m_url.m_passwordEnd = m_url.m_userEnd;
1522	if (!parseHostAndPort(iterator)) {
1523	failure();
1524	return;
1525	}
1526	if (UNLIKELY(!isSlash)) {
1527	if (m_urlIsSpecial) {
1528	syntaxViolation(c);
1529	appendToASCIIBuffer(`'/'`);
1530	}
1531	m_url.m_pathAfterLastSlash = currentPosition(c);
1532	}
1533	}
1534	state = State::Path;
1535	break;
1536	}
1537	if (isPercentOrNonASCII(*c))
1538	m_hostHasPercentOrNonASCII = true;
1539	++c;
1540	} while (!c.atEnd());
1541	break;
1542	case State::Host:
1543	do {
1544	LOG_STATE("Host");
1545	if (c == `'/'` \|\| c == `'?'` \|\| *c == `'#'`) {
1546	if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1547	failure();
1548	return;
1549	}
1550	if (c == `'?'` \|\| c == `'#'`) {
1551	syntaxViolation(c);
1552	appendToASCIIBuffer(`'/'`);
1553	m_url.m_pathAfterLastSlash = currentPosition(c);
1554	}
1555	state = State::Path;
1556	break;
1557	}
1558	if (isPercentOrNonASCII(*c))
1559	m_hostHasPercentOrNonASCII = true;
1560	++c;
1561	} while (!c.atEnd());
1562	break;
1563	case State::File:
1564	LOG_STATE("File");
1565	switch (*c) {
1566	case `'\\'`:
1567	syntaxViolation(c);
1568	FALLTHROUGH;
1569	case `'/'`:
1570	appendToASCIIBuffer(`'/'`);
1571	state = State::FileSlash;
1572	++c;
1573	break;
1574	case `'?'`:
1575	syntaxViolation(c);
1576	if (base.isValid() && base.protocolIs("file")) {
1577	copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1578	appendToASCIIBuffer(`'?'`);
1579	++c;
1580	} else {
1581	appendToASCIIBuffer("///?", `4`);
1582	++c;
1583	m_url.m_userStart = currentPosition(c) - `2`;
1584	m_url.m_userEnd = m_url.m_userStart;
1585	m_url.m_passwordEnd = m_url.m_userStart;
1586	m_url.m_hostEnd = m_url.m_userStart;
1587	m_url.m_portLength = `0`;
1588	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1589	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1590	}
1591	if (nonUTF8QueryEncoding) {
1592	queryBegin = c;
1593	state = State::NonUTF8Query;
1594	} else
1595	state = State::UTF8Query;
1596	break;
1597	case `'#'`:
1598	syntaxViolation(c);
1599	if (base.isValid() && base.protocolIs("file")) {
1600	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1601	appendToASCIIBuffer(`'#'`);
1602	} else {
1603	appendToASCIIBuffer("///#", `4`);
1604	m_url.m_userStart = currentPosition(c) - `2`;
1605	m_url.m_userEnd = m_url.m_userStart;
1606	m_url.m_passwordEnd = m_url.m_userStart;
1607	m_url.m_hostEnd = m_url.m_userStart;
1608	m_url.m_portLength = `0`;
1609	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1610	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1611	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1612	}
1613	state = State::Fragment;
1614	++c;
1615	break;
1616	default:
1617	syntaxViolation(c);
1618	if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1619	copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1620	else {
1621	appendToASCIIBuffer("///", `3`);
1622	m_url.m_userStart = currentPosition(c) - `1`;
1623	m_url.m_userEnd = m_url.m_userStart;
1624	m_url.m_passwordEnd = m_url.m_userStart;
1625	m_url.m_hostEnd = m_url.m_userStart;
1626	m_url.m_portLength = `0`;
1627	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1628	if (isWindowsDriveLetter(c))
1629	appendWindowsDriveLetter(c);
1630	}
1631	state = State::Path;
1632	break;
1633	}
1634	break;
1635	case State::FileSlash:
1636	LOG_STATE("FileSlash");
1637	if (LIKELY(c == `'/'` \|\| c == `'\\'`)) {
1638	if (UNLIKELY(*c == `'\\'`))
1639	syntaxViolation(c);
1640	appendToASCIIBuffer(`'/'`);
1641	advance(c);
1642	m_url.m_userStart = currentPosition(c);
1643	m_url.m_userEnd = m_url.m_userStart;
1644	m_url.m_passwordEnd = m_url.m_userStart;
1645	m_url.m_hostEnd = m_url.m_userStart;
1646	m_url.m_portLength = `0`;
1647	authorityOrHostBegin = c;
1648	state = State::FileHost;
1649	break;
1650	}
1651	syntaxViolation(c);
1652	appendToASCIIBuffer("//", `2`);
1653	m_url.m_userStart = currentPosition(c) - `1`;
1654	m_url.m_userEnd = m_url.m_userStart;
1655	m_url.m_passwordEnd = m_url.m_userStart;
1656	m_url.m_hostEnd = m_url.m_userStart;
1657	m_url.m_portLength = `0`;
1658	if (isWindowsDriveLetter(c)) {
1659	appendWindowsDriveLetter(c);
1660	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1661	} else if (copyBaseWindowsDriveLetter(base)) {
1662	appendToASCIIBuffer(`'/'`);
1663	m_url.m_pathAfterLastSlash = m_url.m_userStart + `4`;
1664	} else
1665	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1666	state = State::Path;
1667	break;
1668	case State::FileHost:
1669	do {
1670	LOG_STATE("FileHost");
1671	if (isSlashQuestionOrHash(*c)) {
1672	bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1673	&& isWindowsDriveLetter(authorityOrHostBegin);
1674	if (windowsQuirk) {
1675	syntaxViolation(authorityOrHostBegin);
1676	appendToASCIIBuffer(`'/'`);
1677	appendWindowsDriveLetter(authorityOrHostBegin);
1678	}
1679	if (windowsQuirk \|\| authorityOrHostBegin == c) {
1680	ASSERT(windowsQuirk \|\| parsedDataView(currentPosition(c) - `1`) == `'/'`);
1681	if (UNLIKELY(*c == `'?'`)) {
1682	syntaxViolation(c);
1683	appendToASCIIBuffer("/?", `2`);
1684	++c;
1685	if (nonUTF8QueryEncoding) {
1686	queryBegin = c;
1687	state = State::NonUTF8Query;
1688	} else
1689	state = State::UTF8Query;
1690	m_url.m_pathAfterLastSlash = currentPosition(c) - `1`;
1691	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1692	break;
1693	}
1694	if (UNLIKELY(*c == `'#'`)) {
1695	syntaxViolation(c);
1696	appendToASCIIBuffer("/#", `2`);
1697	++c;
1698	m_url.m_pathAfterLastSlash = currentPosition(c) - `1`;
1699	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1700	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1701	state = State::Fragment;
1702	break;
1703	}
1704	state = State::Path;
1705	break;
1706	}
1707	if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1708	failure();
1709	return;
1710	}
1711	if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1712	syntaxViolation(c);
1713	m_asciiBuffer.shrink(m_url.m_passwordEnd);
1714	m_url.m_hostEnd = currentPosition(c);
1715	m_url.m_portLength = `0`;
1716	}
1717
1718	state = State::PathStart;
1719	break;
1720	}
1721	if (isPercentOrNonASCII(*c))
1722	m_hostHasPercentOrNonASCII = true;
1723	++c;
1724	} while (!c.atEnd());
1725	break;
1726	case State::PathStart:
1727	LOG_STATE("PathStart");
1728	if (c != `'/'` && c != `'\\'`) {
1729	syntaxViolation(c);
1730	appendToASCIIBuffer(`'/'`);
1731	}
1732	m_url.m_pathAfterLastSlash = currentPosition(c);
1733	state = State::Path;
1734	break;
1735	case State::Path:
1736	LOG_STATE("Path");
1737	if (c == `'/'` \|\| (m_urlIsSpecial && c == `'\\'`)) {
1738	if (UNLIKELY(m_urlIsSpecial && *c == `'\\'`))
1739	syntaxViolation(c);
1740	appendToASCIIBuffer(`'/'`);
1741	++c;
1742	m_url.m_pathAfterLastSlash = currentPosition(c);
1743	break;
1744	}
1745	if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - `1`) == `'/'`)) {
1746	if (UNLIKELY(isDoubleDotPathSegment(c))) {
1747	syntaxViolation(c);
1748	consumeDoubleDotPathSegment(c);
1749	popPath();
1750	break;
1751	}
1752	if (UNLIKELY(isSingleDotPathSegment(c))) {
1753	syntaxViolation(c);
1754	consumeSingleDotPathSegment(c);
1755	break;
1756	}
1757	}
1758	if (*c == `'?'`) {
1759	m_url.m_pathEnd = currentPosition(c);
1760	appendToASCIIBuffer(`'?'`);
1761	++c;
1762	if (nonUTF8QueryEncoding) {
1763	queryBegin = c;
1764	state = State::NonUTF8Query;
1765	} else
1766	state = State::UTF8Query;
1767	break;
1768	}
1769	if (*c == `'#'`) {
1770	m_url.m_pathEnd = currentPosition(c);
1771	m_url.m_queryEnd = m_url.m_pathEnd;
1772	state = State::Fragment;
1773	break;
1774	}
1775	utf8PercentEncode<isInDefaultEncodeSet>(c);
1776	++c;
1777	break;
1778	case State::CannotBeABaseURLPath:
1779	LOG_STATE("CannotBeABaseURLPath");
1780	if (*c == `'?'`) {
1781	m_url.m_pathEnd = currentPosition(c);
1782	appendToASCIIBuffer(`'?'`);
1783	++c;
1784	if (nonUTF8QueryEncoding) {
1785	queryBegin = c;
1786	state = State::NonUTF8Query;
1787	} else
1788	state = State::UTF8Query;
1789	} else if (*c == `'#'`) {
1790	m_url.m_pathEnd = currentPosition(c);
1791	m_url.m_queryEnd = m_url.m_pathEnd;
1792	state = State::Fragment;
1793	} else if (*c == `'/'`) {
1794	appendToASCIIBuffer(`'/'`);
1795	++c;
1796	m_url.m_pathAfterLastSlash = currentPosition(c);
1797	} else {
1798	utf8PercentEncode<isInSimpleEncodeSet>(c);
1799	++c;
1800	}
1801	break;
1802	case State::UTF8Query:
1803	LOG_STATE("UTF8Query");
1804	ASSERT(queryBegin == CodePointIterator<CharacterType>());
1805	if (*c == `'#'`) {
1806	m_url.m_queryEnd = currentPosition(c);
1807	state = State::Fragment;
1808	break;
1809	}
1810	ASSERT(!nonUTF8QueryEncoding);
1811	utf8QueryEncode(c);
1812	++c;
1813	break;
1814	case State::NonUTF8Query:
1815	do {
1816	LOG_STATE("NonUTF8Query");
1817	ASSERT(queryBegin != CodePointIterator<CharacterType>());
1818	if (*c == `'#'`) {
1819	encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
1820	m_url.m_queryEnd = currentPosition(c);
1821	state = State::Fragment;
1822	break;
1823	}
1824	appendCodePoint(queryBuffer, *c);
1825	advance(c, queryBegin);
1826	} while (!c.atEnd());
1827	break;
1828	case State::Fragment:
1829	URL_PARSER_LOG("State Fragment");
1830	utf8PercentEncode<isInSimpleEncodeSet>(c);
1831	++c;
1832	break;
1833	}
1834	}
1835
1836	switch (state) {
1837	case State::SchemeStart:
1838	LOG_FINAL_STATE("SchemeStart");
1839	if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1840	m_url = base;
1841	m_url.removeFragmentIdentifier();
1842	return;
1843	}
1844	failure();
1845	return;
1846	case State::Scheme:
1847	LOG_FINAL_STATE("Scheme");
1848	failure();
1849	return;
1850	case State::NoScheme:
1851	LOG_FINAL_STATE("NoScheme");
1852	RELEASE_ASSERT_NOT_REACHED();
1853	case State::SpecialRelativeOrAuthority:
1854	LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1855	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1856	break;
1857	case State::PathOrAuthority:
1858	LOG_FINAL_STATE("PathOrAuthority");
1859	ASSERT(m_url.m_userStart);
1860	ASSERT(m_url.m_userStart == currentPosition(c));
1861	ASSERT(parsedDataView(currentPosition(c) - `1`) == `'/'`);
1862	m_url.m_userStart--;
1863	m_url.m_userEnd = m_url.m_userStart;
1864	m_url.m_passwordEnd = m_url.m_userStart;
1865	m_url.m_hostEnd = m_url.m_userStart;
1866	m_url.m_portLength = `0`;
1867	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1868	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1869	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1870	break;
1871	case State::Relative:
1872	LOG_FINAL_STATE("Relative");
1873	RELEASE_ASSERT_NOT_REACHED();
1874	case State::RelativeSlash:
1875	LOG_FINAL_STATE("RelativeSlash");
1876	copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1877	appendToASCIIBuffer(`'/'`);
1878	m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + `1`;
1879	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1880	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1881	break;
1882	case State::SpecialAuthoritySlashes:
1883	LOG_FINAL_STATE("SpecialAuthoritySlashes");
1884	m_url.m_userStart = currentPosition(c);
1885	m_url.m_userEnd = m_url.m_userStart;
1886	m_url.m_passwordEnd = m_url.m_userStart;
1887	m_url.m_hostEnd = m_url.m_userStart;
1888	m_url.m_portLength = `0`;
1889	m_url.m_pathAfterLastSlash = m_url.m_userStart;
1890	m_url.m_pathEnd = m_url.m_userStart;
1891	m_url.m_queryEnd = m_url.m_userStart;
1892	break;
1893	case State::SpecialAuthorityIgnoreSlashes:
1894	LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1895	failure();
1896	return;
1897	case State::AuthorityOrHost:
1898	LOG_FINAL_STATE("AuthorityOrHost");
1899	m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1900	m_url.m_passwordEnd = m_url.m_userEnd;
1901	if (authorityOrHostBegin.atEnd()) {
1902	m_url.m_userEnd = m_url.m_userStart;
1903	m_url.m_passwordEnd = m_url.m_userStart;
1904	m_url.m_hostEnd = m_url.m_userStart;
1905	m_url.m_portLength = `0`;
1906	m_url.m_pathEnd = m_url.m_userStart;
1907	} else if (!parseHostAndPort(authorityOrHostBegin)) {
1908	failure();
1909	return;
1910	} else {
1911	if (m_urlIsSpecial) {
1912	syntaxViolation(c);
1913	appendToASCIIBuffer(`'/'`);
1914	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + `1`;
1915	} else
1916	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1917	}
1918	m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1919	m_url.m_queryEnd = m_url.m_pathEnd;
1920	break;
1921	case State::Host:
1922	LOG_FINAL_STATE("Host");
1923	if (!parseHostAndPort(authorityOrHostBegin)) {
1924	failure();
1925	return;
1926	}
1927	if (m_urlIsSpecial) {
1928	syntaxViolation(c);
1929	appendToASCIIBuffer(`'/'`);
1930	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + `1`;
1931	} else
1932	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1933	m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1934	m_url.m_queryEnd = m_url.m_pathEnd;
1935	break;
1936	case State::File:
1937	LOG_FINAL_STATE("File");
1938	if (base.isValid() && base.protocolIs("file")) {
1939	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1940	break;
1941	}
1942	syntaxViolation(c);
1943	appendToASCIIBuffer("///", `3`);
1944	m_url.m_userStart = currentPosition(c) - `1`;
1945	m_url.m_userEnd = m_url.m_userStart;
1946	m_url.m_passwordEnd = m_url.m_userStart;
1947	m_url.m_hostEnd = m_url.m_userStart;
1948	m_url.m_portLength = `0`;
1949	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1950	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1951	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1952	break;
1953	case State::FileSlash:
1954	LOG_FINAL_STATE("FileSlash");
1955	syntaxViolation(c);
1956	m_url.m_userStart = currentPosition(c) + `1`;
1957	appendToASCIIBuffer("//", `2`);
1958	m_url.m_userEnd = m_url.m_userStart;
1959	m_url.m_passwordEnd = m_url.m_userStart;
1960	m_url.m_hostEnd = m_url.m_userStart;
1961	m_url.m_portLength = `0`;
1962	if (copyBaseWindowsDriveLetter(base)) {
1963	appendToASCIIBuffer(`'/'`);
1964	m_url.m_pathAfterLastSlash = m_url.m_userStart + `4`;
1965	} else
1966	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1967	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1968	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1969	break;
1970	case State::FileHost:
1971	LOG_FINAL_STATE("FileHost");
1972	if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1973	&& isWindowsDriveLetter(authorityOrHostBegin)) {
1974	syntaxViolation(authorityOrHostBegin);
1975	appendToASCIIBuffer(`'/'`);
1976	appendWindowsDriveLetter(authorityOrHostBegin);
1977	m_url.m_pathAfterLastSlash = currentPosition(c);
1978	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1979	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1980	break;
1981	}
1982
1983	if (authorityOrHostBegin == c) {
1984	syntaxViolation(c);
1985	appendToASCIIBuffer(`'/'`);
1986	m_url.m_userStart = currentPosition(c) - `1`;
1987	m_url.m_userEnd = m_url.m_userStart;
1988	m_url.m_passwordEnd = m_url.m_userStart;
1989	m_url.m_hostEnd = m_url.m_userStart;
1990	m_url.m_portLength = `0`;
1991	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1992	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1993	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1994	break;
1995	}
1996
1997	if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1998	failure();
1999	return;
2000	}
2001
2002	syntaxViolation(c);
2003	if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2004	m_asciiBuffer.shrink(m_url.m_passwordEnd);
2005	m_url.m_hostEnd = currentPosition(c);
2006	m_url.m_portLength = `0`;
2007	}
2008	appendToASCIIBuffer(`'/'`);
2009	m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + `1`;
2010	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2011	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2012	break;
2013	case State::PathStart:
2014	LOG_FINAL_STATE("PathStart");
2015	RELEASE_ASSERT_NOT_REACHED();
2016	case State::Path:
2017	LOG_FINAL_STATE("Path");
2018	m_url.m_pathEnd = currentPosition(c);
2019	m_url.m_queryEnd = m_url.m_pathEnd;
2020	break;
2021	case State::CannotBeABaseURLPath:
2022	LOG_FINAL_STATE("CannotBeABaseURLPath");
2023	m_url.m_pathEnd = currentPosition(c);
2024	m_url.m_queryEnd = m_url.m_pathEnd;
2025	break;
2026	case State::UTF8Query:
2027	LOG_FINAL_STATE("UTF8Query");
2028	ASSERT(queryBegin == CodePointIterator<CharacterType>());
2029	m_url.m_queryEnd = currentPosition(c);
2030	break;
2031	case State::NonUTF8Query:
2032	LOG_FINAL_STATE("NonUTF8Query");
2033	ASSERT(queryBegin != CodePointIterator<CharacterType>());
2034	encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
2035	m_url.m_queryEnd = currentPosition(c);
2036	break;
2037	case State::Fragment:
2038	LOG_FINAL_STATE("Fragment");
2039	break;
2040	}
2041
2042	if (LIKELY(!m_didSeeSyntaxViolation)) {
2043	m_url.m_string = m_inputString;
2044	ASSERT(m_asciiBuffer.isEmpty());
2045	} else
2046	m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2047	m_url.m_isValid = true;
2048	URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2049	}
2050
2051	template<typename CharacterType>
2052	void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2053	{
2054	if (UNLIKELY(iterator.atEnd())) {
2055	syntaxViolation(iterator);
2056	m_url.m_userEnd = currentPosition(iterator);
2057	m_url.m_passwordEnd = m_url.m_userEnd;
2058	return;
2059	}
2060	for (; !iterator.atEnd(); advance(iterator)) {
2061	if (*iterator == `':'`) {
2062	m_url.m_userEnd = currentPosition(iterator);
2063	auto iteratorAtColon = iterator;
2064	++iterator;
2065	bool tabOrNewlineAfterColon = false;
2066	while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2067	tabOrNewlineAfterColon = true;
2068	++iterator;
2069	}
2070	if (UNLIKELY(iterator.atEnd())) {
2071	syntaxViolation(iteratorAtColon);
2072	m_url.m_passwordEnd = m_url.m_userEnd;
2073	if (m_url.m_userEnd > m_url.m_userStart)
2074	appendToASCIIBuffer(`'@'`);
2075	return;
2076	}
2077	if (tabOrNewlineAfterColon)
2078	syntaxViolation(iteratorAtColon);
2079	appendToASCIIBuffer(`':'`);
2080	break;
2081	}
2082	utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2083	}
2084	for (; !iterator.atEnd(); advance(iterator))
2085	utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2086	m_url.m_passwordEnd = currentPosition(iterator);
2087	if (!m_url.m_userEnd)
2088	m_url.m_userEnd = m_url.m_passwordEnd;
2089	appendToASCIIBuffer(`'@'`);
2090	}
2091
2092	template<typename UnsignedIntegerType>
2093	void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2094	{
2095	LChar buf[sizeof(UnsignedIntegerType) * `3` + `1`];
2096	LChar* end = std::end(buf);
2097	LChar* p = end;
2098	do {
2099	*--p = (number % `10`) + `'0'`;
2100	number /= `10`;
2101	} while (number);
2102	appendToASCIIBuffer(p, end - p);
2103	}
2104
2105	void URLParser::serializeIPv4(IPv4Address address)
2106	{
2107	appendNumberToASCIIBuffer<uint8_t>(address >> `24`);
2108	appendToASCIIBuffer(`'.'`);
2109	appendNumberToASCIIBuffer<uint8_t>(address >> `16`);
2110	appendToASCIIBuffer(`'.'`);
2111	appendNumberToASCIIBuffer<uint8_t>(address >> `8`);
2112	appendToASCIIBuffer(`'.'`);
2113	appendNumberToASCIIBuffer<uint8_t>(address);
2114	}
2115
// Returns the number of consecutive zero pieces in |address| starting at index |begin|.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2125
2126	static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, `8`>& address)
2127	{
2128	Optional<size_t> longest;
2129	size_t longestLength = `0`;
2130	for (size_t i = `0`; i < `8`; i++) {
2131	size_t length = zeroSequenceLength(address, i);
2132	if (length) {
2133	if (length > `1` && (!longest \|\| longestLength < length)) {
2134	longest = i;
2135	longestLength = length;
2136	}
2137	i += length;
2138	}
2139	}
2140	return longest;
2141	}
2142
2143	void URLParser::serializeIPv6Piece(uint16_t piece)
2144	{
2145	bool printed = false;
2146	if (auto nibble0 = piece >> `12`) {
2147	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2148	printed = true;
2149	}
2150	auto nibble1 = piece >> `8` & `0xF`;
2151	if (printed \|\| nibble1) {
2152	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2153	printed = true;
2154	}
2155	auto nibble2 = piece >> `4` & `0xF`;
2156	if (printed \|\| nibble2)
2157	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2158	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & `0xF`));
2159	}
2160
2161	void URLParser::serializeIPv6(URLParser::IPv6Address address)
2162	{
2163	appendToASCIIBuffer(`'['`);
2164	auto compressPointer = findLongestZeroSequence(address);
2165	for (size_t piece = `0`; piece < `8`; piece++) {
2166	if (compressPointer && compressPointer.value() == piece) {
2167	ASSERT(!address[piece]);
2168	if (piece)
2169	appendToASCIIBuffer(`':'`);
2170	else
2171	appendToASCIIBuffer("::", `2`);
2172	while (piece < `8` && !address [piece])
2173	piece++;
2174	if (piece == `8`)
2175	break;
2176	}
2177	serializeIPv6Piece(address [piece]);
2178	if (piece < `7`)
2179	appendToASCIIBuffer(`':'`);
2180	}
2181	appendToASCIIBuffer(`']'`);
2182	}
2183
2184	enum class URLParser::IPv4PieceParsingError {
2185	Failure,
2186	Overflow,
2187	};
2188
2189	template<typename CharacterType>
2190	Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2191	{
2192	enum class State : uint8_t {
2193	UnknownBase,
2194	Decimal,
2195	OctalOrHex,
2196	Octal,
2197	Hex,
2198	};
2199	State state = State::UnknownBase;
2200	Checked<uint32_t, RecordOverflow> value = `0`;
2201	if (!iterator.atEnd() && *iterator == `'.'`)
2202	return makeUnexpected(IPv4PieceParsingError::Failure);
2203	while (!iterator.atEnd()) {
2204	if (isTabOrNewline(*iterator)) {
2205	didSeeSyntaxViolation = true;
2206	++iterator;
2207	continue;
2208	}
2209	if (*iterator == `'.'`) {
2210	ASSERT(!value.hasOverflowed());
2211	return value.unsafeGet();
2212	}
2213	switch (state) {
2214	case State::UnknownBase:
2215	if (UNLIKELY(*iterator == `'0'`)) {
2216	++iterator;
2217	state = State::OctalOrHex;
2218	break;
2219	}
2220	state = State::Decimal;
2221	break;
2222	case State::OctalOrHex:
2223	didSeeSyntaxViolation = true;
2224	if (iterator == `'x'` \|\| iterator == `'X'`) {
2225	++iterator;
2226	state = State::Hex;
2227	break;
2228	}
2229	state = State::Octal;
2230	break;
2231	case State::Decimal:
2232	if (!isASCIIDigit(*iterator))
2233	return makeUnexpected(IPv4PieceParsingError::Failure);
2234	value *= `10`;
2235	value += *iterator - `'0'`;
2236	if (UNLIKELY(value.hasOverflowed()))
2237	return makeUnexpected(IPv4PieceParsingError::Overflow);
2238	++iterator;
2239	break;
2240	case State::Octal:
2241	ASSERT(didSeeSyntaxViolation);
2242	if (iterator < `'0'` \|\| iterator > `'7'`)
2243	return makeUnexpected(IPv4PieceParsingError::Failure);
2244	value *= `8`;
2245	value += *iterator - `'0'`;
2246	if (UNLIKELY(value.hasOverflowed()))
2247	return makeUnexpected(IPv4PieceParsingError::Overflow);
2248	++iterator;
2249	break;
2250	case State::Hex:
2251	ASSERT(didSeeSyntaxViolation);
2252	if (!isASCIIHexDigit(*iterator))
2253	return makeUnexpected(IPv4PieceParsingError::Failure);
2254	value *= `16`;
2255	value += toASCIIHexValue(*iterator);
2256	if (UNLIKELY(value.hasOverflowed()))
2257	return makeUnexpected(IPv4PieceParsingError::Overflow);
2258	++iterator;
2259	break;
2260	}
2261	}
2262	ASSERT(!value.hasOverflowed());
2263	return value.unsafeGet();
2264	}
2265
2266	ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2267	{
2268	RELEASE_ASSERT(exponent <= `4`);
2269	uint64_t values[`5`] = {`1`, `256`, `256` * `256`, `256` * `256` * `256`, `256ull` * `256` * `256` * `256` };
2270	return values[exponent];
2271	}
2272
2273	enum class URLParser::IPv4ParsingError {
2274	Failure,
2275	NotIPv4,
2276	};
2277
2278	template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2279	Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2280	{
2281	Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, `4`> items;
2282	bool didSeeSyntaxViolation = false;
2283	if (!iterator.atEnd() && *iterator == `'.'`)
2284	return makeUnexpected(IPv4ParsingError::NotIPv4);
2285	while (!iterator.atEnd()) {
2286	if (isTabOrNewline(*iterator)) {
2287	didSeeSyntaxViolation = true;
2288	++iterator;
2289	continue;
2290	}
2291	if (items.size() >= `4`)
2292	return makeUnexpected(IPv4ParsingError::NotIPv4);
2293	items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2294	if (!iterator.atEnd() && *iterator == `'.'`) {
2295	++iterator;
2296	if (iterator.atEnd())
2297	syntaxViolation(iteratorForSyntaxViolationPosition);
2298	else if (*iterator == `'.'`)
2299	return makeUnexpected(IPv4ParsingError::NotIPv4);
2300	}
2301	}
2302	if (!iterator.atEnd() \|\| !items.size() \|\| items.size() > `4`)
2303	return makeUnexpected(IPv4ParsingError::NotIPv4);
2304	for (const auto& item : items) {
2305	if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2306	return makeUnexpected(IPv4ParsingError::NotIPv4);
2307	}
2308	for (const auto& item : items) {
2309	if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2310	return makeUnexpected(IPv4ParsingError::Failure);
2311	}
2312	if (items.size() > `1`) {
2313	for (size_t i = `0`; i < items.size() - `1`; i++) {
2314	if (items [i].value() > `255`)
2315	return makeUnexpected(IPv4ParsingError::Failure);
2316	}
2317	}
2318	if (items [items.size() - `1`].value() >= pow256(`5` - items.size()))
2319	return makeUnexpected(IPv4ParsingError::Failure);
2320
2321	if (didSeeSyntaxViolation)
2322	syntaxViolation(iteratorForSyntaxViolationPosition);
2323	for (const auto& item : items) {
2324	if (item.value() > `255`)
2325	syntaxViolation(iteratorForSyntaxViolationPosition);
2326	}
2327
2328	if (UNLIKELY(items.size() != `4`))
2329	syntaxViolation(iteratorForSyntaxViolationPosition);
2330
2331	IPv4Address ipv4 = items.takeLast().value();
2332	for (size_t counter = `0`; counter < items.size(); ++counter)
2333	ipv4 += items [counter].value() * pow256(`3` - counter);
2334	return ipv4;
2335	}
2336
2337	template<typename CharacterType>
2338	Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2339	{
2340	if (iterator.atEnd())
2341	return WTF::nullopt;
2342	uint32_t piece = `0`;
2343	bool leadingZeros = false;
2344	size_t digitCount = `0`;
2345	while (!iterator.atEnd()) {
2346	if (!isASCIIDigit(*iterator))
2347	return WTF::nullopt;
2348	++digitCount;
2349	if (!piece && *iterator == `'0'`) {
2350	if (leadingZeros)
2351	return WTF::nullopt;
2352	leadingZeros = true;
2353	}
2354	if (!piece && *iterator == `'0'`)
2355	leadingZeros = true;
2356	piece = piece * `10` + *iterator - `'0'`;
2357	if (piece > `255`)
2358	return WTF::nullopt;
2359	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2360	if (iterator.atEnd())
2361	break;
2362	if (*iterator == `'.'`)
2363	break;
2364	}
2365	if (piece && leadingZeros)
2366	return WTF::nullopt;
2367	return piece;
2368	}
2369
2370	template<typename CharacterType>
2371	Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2372	{
2373	IPv4Address address = `0`;
2374	for (size_t i = `0`; i < `4`; ++i) {
2375	if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2376	address = (address << `8`) + piece.value();
2377	else
2378	return WTF::nullopt;
2379	if (i < `3`) {
2380	if (iterator.atEnd())
2381	return WTF::nullopt;
2382	if (*iterator != `'.'`)
2383	return WTF::nullopt;
2384	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2385	} else if (!iterator.atEnd())
2386	return WTF::nullopt;
2387	}
2388	ASSERT(iterator.atEnd());
2389	return address;
2390	}
2391
2392	template<typename CharacterType>
2393	Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2394	{
2395	ASSERT(*c == `'['`);
2396	const auto hostBegin = c;
2397	advance(c, hostBegin);
2398	if (c.atEnd())
2399	return WTF::nullopt;
2400
2401	IPv6Address address = {{`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}};
2402	size_t piecePointer = `0`;
2403	Optional<size_t> compressPointer;
2404
2405	if (*c == `':'`) {
2406	advance(c, hostBegin);
2407	if (c.atEnd())
2408	return WTF::nullopt;
2409	if (*c != `':'`)
2410	return WTF::nullopt;
2411	advance(c, hostBegin);
2412	++piecePointer;
2413	compressPointer = piecePointer;
2414	}
2415
2416	while (!c.atEnd()) {
2417	if (piecePointer == `8`)
2418	return WTF::nullopt;
2419	if (*c == `':'`) {
2420	if (compressPointer)
2421	return WTF::nullopt;
2422	advance(c, hostBegin);
2423	++piecePointer;
2424	compressPointer = piecePointer;
2425	continue;
2426	}
2427	if (piecePointer == `6` \|\| (compressPointer && piecePointer < `6`)) {
2428	if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2429	if (compressPointer && piecePointer == `5`)
2430	return WTF::nullopt;
2431	syntaxViolation(hostBegin);
2432	address [piecePointer++] = ipv4Address.value() >> `16`;
2433	address [piecePointer++] = ipv4Address.value() & `0xFFFF`;
2434	c = { };
2435	break;
2436	}
2437	}
2438	uint16_t value = `0`;
2439	size_t length = `0`;
2440	bool leadingZeros = false;
2441	for (; length < `4`; length++) {
2442	if (c.atEnd())
2443	break;
2444	if (!isASCIIHexDigit(*c))
2445	break;
2446	if (isASCIIUpper(*c))
2447	syntaxViolation(hostBegin);
2448	if (*c == `'0'` && !length)
2449	leadingZeros = true;
2450	value = value * `0x10` + toASCIIHexValue(*c);
2451	advance(c, hostBegin);
2452	}
2453
2454	if (UNLIKELY((value && leadingZeros) \|\| (!value && length > `1`)))
2455	syntaxViolation(hostBegin);
2456
2457	address [piecePointer++] = value;
2458	if (c.atEnd())
2459	break;
2460	if (piecePointer == `8` \|\| *c != `':'`)
2461	return WTF::nullopt;
2462	advance(c, hostBegin);
2463	}
2464
2465	if (!c.atEnd())
2466	return WTF::nullopt;
2467
2468	if (compressPointer) {
2469	size_t swaps = piecePointer - compressPointer.value();
2470	piecePointer = `7`;
2471	while (swaps)
2472	std::swap(address [piecePointer--], address [compressPointer.value() + swaps-- - `1`]);
2473	} else if (piecePointer != `8`)
2474	return WTF::nullopt;
2475
2476	Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2477	if (possibleCompressPointer)
2478	possibleCompressPointer.value()++;
2479	if (UNLIKELY(compressPointer != possibleCompressPointer))
2480	syntaxViolation(hostBegin);
2481
2482	return address;
2483	}
2484
2485	template<typename CharacterType>
2486	URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2487	{
2488	LCharBuffer output;
2489	output.reserveInitialCapacity(length);
2490
2491	for (size_t i = `0`; i < length; ++i) {
2492	uint8_t byte = input[i];
2493	if (byte != `'%'`)
2494	output.uncheckedAppend(byte);
2495	else if (length > `2` && i < length - `2`) {
2496	if (isASCIIHexDigit(input[i + `1`]) && isASCIIHexDigit(input[i + `2`])) {
2497	syntaxViolation(iteratorForSyntaxViolationPosition);
2498	output.uncheckedAppend(toASCIIHexValue(input[i + `1`], input[i + `2`]));
2499	i += `2`;
2500	} else
2501	output.uncheckedAppend(byte);
2502	} else
2503	output.uncheckedAppend(byte);
2504	}
2505	return output;
2506	}
2507
2508	URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2509	{
2510	LCharBuffer output;
2511	output.reserveInitialCapacity(length);
2512
2513	for (size_t i = `0`; i < length; ++i) {
2514	uint8_t byte = input[i];
2515	if (byte != `'%'`)
2516	output.uncheckedAppend(byte);
2517	else if (length > `2` && i < length - `2`) {
2518	if (isASCIIHexDigit(input[i + `1`]) && isASCIIHexDigit(input[i + `2`])) {
2519	output.uncheckedAppend(toASCIIHexValue(input[i + `1`], input[i + `2`]));
2520	i += `2`;
2521	} else
2522	output.uncheckedAppend(byte);
2523	} else
2524	output.uncheckedAppend(byte);
2525	}
2526	return output;
2527	}
2528
2529	template<typename CharacterType> Optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2530	{
2531	LCharBuffer ascii;
2532	if (domain.isAllASCII()) {
2533	size_t length = domain.length();
2534	if (domain.is8Bit()) {
2535	const LChar* characters = domain.characters8();
2536	ascii.reserveInitialCapacity(length);
2537	for (size_t i = `0`; i < length; ++i) {
2538	if (UNLIKELY(isASCIIUpper(characters[i])))
2539	syntaxViolation(iteratorForSyntaxViolationPosition);
2540	ascii.uncheckedAppend(toASCIILower(characters[i]));
2541	}
2542	} else {
2543	const UChar* characters = domain.characters16();
2544	ascii.reserveInitialCapacity(length);
2545	for (size_t i = `0`; i < length; ++i) {
2546	if (UNLIKELY(isASCIIUpper(characters[i])))
2547	syntaxViolation(iteratorForSyntaxViolationPosition);
2548	ascii.uncheckedAppend(toASCIILower(characters[i]));
2549	}
2550	}
2551	return ascii;
2552	}
2553
2554	const size_t maxDomainLength = `64`;
2555	UChar hostnameBuffer[maxDomainLength];
2556	UErrorCode error = U_ZERO_ERROR;
2557	UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2558	int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView (domain).upconvertedCharacters(), domain.length(), hostnameBuffer, maxDomainLength, &processingDetails, &error);
2559	ASSERT(numCharactersConverted <= static_cast<int32_t>(maxDomainLength));
2560
2561	if (U_SUCCESS(error) && !processingDetails.errors) {
2562	for (int32_t i = `0`; i < numCharactersConverted; ++i) {
2563	ASSERT(isASCII(hostnameBuffer[i]));
2564	ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2565	}
2566	ascii.append(hostnameBuffer, numCharactersConverted);
2567	if (domain != StringView (ascii.data(), ascii.size()))
2568	syntaxViolation(iteratorForSyntaxViolationPosition);
2569	return ascii;
2570	}
2571	return WTF::nullopt;
2572	}
2573
2574	bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2575	{
2576	for (size_t i = `0`; i < asciiDomain.size(); ++i) {
2577	if (isForbiddenHostCodePoint(asciiDomain [i]))
2578	return true;
2579	}
2580	return false;
2581	}
2582
2583	template<typename CharacterType>
2584	bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2585	{
2586	ASSERT(*iterator == `':'`);
2587	auto colonIterator = iterator;
2588	advance(iterator, colonIterator);
2589	uint32_t port = `0`;
2590	if (UNLIKELY(iterator.atEnd())) {
2591	unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2592	RELEASE_ASSERT(portLength <= URL::maxPortLength);
2593	m_url.m_portLength = portLength;
2594	syntaxViolation(colonIterator);
2595	return true;
2596	}
2597	size_t digitCount = `0`;
2598	bool leadingZeros = false;
2599	for (; !iterator.atEnd(); ++iterator) {
2600	if (UNLIKELY(isTabOrNewline(*iterator))) {
2601	syntaxViolation(colonIterator);
2602	continue;
2603	}
2604	if (isASCIIDigit(*iterator)) {
2605	if (*iterator == `'0'` && !digitCount)
2606	leadingZeros = true;
2607	++digitCount;
2608	port = port * `10` + *iterator - `'0'`;
2609	if (port > std::numeric_limits<uint16_t>::max())
2610	return false;
2611	} else
2612	return false;
2613	}
2614
2615	if (port && leadingZeros)
2616	syntaxViolation(colonIterator);
2617
2618	if (!port && digitCount > `1`)
2619	syntaxViolation(colonIterator);
2620
2621	ASSERT(port == static_cast<uint16_t>(port));
2622	if (UNLIKELY(defaultPortForProtocol(parsedDataView(`0`, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2623	syntaxViolation(colonIterator);
2624	else {
2625	appendToASCIIBuffer(`':'`);
2626	ASSERT(port <= std::numeric_limits<uint16_t>::max());
2627	appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2628	}
2629
2630	unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2631	RELEASE_ASSERT(portLength <= URL::maxPortLength);
2632	m_url.m_portLength = portLength;
2633	return true;
2634	}
2635
2636	template<typename CharacterType>
2637	bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2638	{
2639	if (iterator.atEnd())
2640	return false;
2641	if (*iterator == `':'`)
2642	return false;
2643	if (*iterator == `'['`) {
2644	auto ipv6End = iterator;
2645	while (!ipv6End.atEnd() && *ipv6End != `']'`)
2646	++ipv6End;
2647	if (ipv6End.atEnd())
2648	return false;
2649	if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2650	serializeIPv6(address.value());
2651	if (!ipv6End.atEnd()) {
2652	advance(ipv6End);
2653	if (!ipv6End.atEnd() && *ipv6End == `':'`) {
2654	m_url.m_hostEnd = currentPosition(ipv6End);
2655	return parsePort(ipv6End);
2656	}
2657	m_url.m_hostEnd = currentPosition(ipv6End);
2658	m_url.m_portLength = `0`;
2659	return true;
2660	}
2661	m_url.m_hostEnd = currentPosition(ipv6End);
2662	return true;
2663	}
2664	return false;
2665	}
2666
2667	if (!m_urlIsSpecial) {
2668	for (; !iterator.atEnd(); ++iterator) {
2669	if (UNLIKELY(isTabOrNewline(*iterator))) {
2670	syntaxViolation(iterator);
2671	continue;
2672	}
2673	if (*iterator == `':'`)
2674	break;
2675	if (UNLIKELY(isForbiddenHostCodePoint(iterator) && iterator != `'%'`))
2676	return false;
2677	utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2678	}
2679	m_url.m_hostEnd = currentPosition(iterator);
2680	if (iterator.atEnd()) {
2681	m_url.m_portLength = `0`;
2682	return true;
2683	}
2684	return parsePort(iterator);
2685	}
2686
2687	if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2688	auto hostIterator = iterator;
2689	for (; !iterator.atEnd(); ++iterator) {
2690	if (isTabOrNewline(*iterator))
2691	continue;
2692	if (*iterator == `':'`)
2693	break;
2694	if (isForbiddenHostCodePoint(*iterator))
2695	return false;
2696	}
2697	auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2698	if (address) {
2699	serializeIPv4(address.value());
2700	m_url.m_hostEnd = currentPosition(iterator);
2701	if (iterator.atEnd()) {
2702	m_url.m_portLength = `0`;
2703	return true;
2704	}
2705	return parsePort(iterator);
2706	}
2707	if (address.error() == IPv4ParsingError::Failure)
2708	return false;
2709	for (; hostIterator != iterator; ++hostIterator) {
2710	if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2711	syntaxViolation(hostIterator);
2712	continue;
2713	}
2714	if (UNLIKELY(isASCIIUpper(*hostIterator)))
2715	syntaxViolation(hostIterator);
2716	appendToASCIIBuffer(toASCIILower(*hostIterator));
2717	}
2718	m_url.m_hostEnd = currentPosition(iterator);
2719	if (!hostIterator.atEnd())
2720	return parsePort(hostIterator);
2721	unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2722	RELEASE_ASSERT(portLength <= URL::maxPortLength);
2723	m_url.m_portLength = portLength;
2724	return true;
2725	}
2726
2727	const auto hostBegin = iterator;
2728
2729	LCharBuffer utf8Encoded;
2730	for (; !iterator.atEnd(); ++iterator) {
2731	if (UNLIKELY(isTabOrNewline(*iterator))) {
2732	syntaxViolation(hostBegin);
2733	continue;
2734	}
2735	if (*iterator == `':'`)
2736	break;
2737	if (UNLIKELY(!isASCII(*iterator)))
2738	syntaxViolation(hostBegin);
2739
2740	if (!U_IS_UNICODE_CHAR(*iterator))
2741	return false;
2742	uint8_t buffer[U8_MAX_LENGTH];
2743	int32_t offset = `0`;
2744	U8_APPEND_UNSAFE(buffer, offset, *iterator);
2745	utf8Encoded.append(buffer, offset);
2746	}
2747	LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2748	String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2749	if (domain.isNull())
2750	return false;
2751	if (domain != StringView (percentDecoded.data(), percentDecoded.size()))
2752	syntaxViolation(hostBegin);
2753	auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2754	if (!asciiDomain \|\| hasForbiddenHostCodePoint(asciiDomain.value()))
2755	return false;
2756	LCharBuffer& asciiDomainValue = asciiDomain.value();
2757	const LChar* asciiDomainCharacters = asciiDomainValue.data();
2758
2759	auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2760	if (address) {
2761	serializeIPv4(address.value());
2762	m_url.m_hostEnd = currentPosition(iterator);
2763	if (iterator.atEnd()) {
2764	m_url.m_portLength = `0`;
2765	return true;
2766	}
2767	return parsePort(iterator);
2768	}
2769	if (address.error() == IPv4ParsingError::Failure)
2770	return false;
2771
2772	appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2773	m_url.m_hostEnd = currentPosition(iterator);
2774	if (!iterator.atEnd())
2775	return parsePort(iterator);
2776	m_url.m_portLength = `0`;
2777	return true;
2778	}
2779
2780	Optional<String> URLParser::formURLDecode(StringView input)
2781	{
2782	auto utf8 = input.utf8(StrictConversion);
2783	if (utf8.isNull())
2784	return WTF::nullopt;
2785	auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2786	return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2787	}
2788
2789	// https://url.spec.whatwg.org/#concept-urlencoded-parser
2790	auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2791	{
2792	URLEncodedForm output;
2793	for (StringView bytes : input.split(`'&'`)) {
2794	auto equalIndex = bytes.find(`'='`);
2795	if (equalIndex == notFound) {
2796	auto name = formURLDecode(bytes.toString().replace(`'+'`, `0x20`));
2797	if (name)
2798	output.append({ name.value(), emptyString() });
2799	} else {
2800	auto name = formURLDecode(bytes.substring(`0`, equalIndex).toString().replace(`'+'`, `0x20`));
2801	auto value = formURLDecode(bytes.substring(equalIndex + `1`).toString().replace(`'+'`, `0x20`));
2802	if (name && value)
2803	output.append({ name.value(), value.value() });
2804	}
2805	}
2806	return output;
2807	}
2808
2809	static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2810	{
2811	auto utf8 = input.utf8(StrictConversion);
2812	const char* data = utf8.data();
2813	for (size_t i = `0`; i < utf8.length(); ++i) {
2814	const char byte = data[i];
2815	if (byte == `0x20`)
2816	output.append(`0x2B`);
2817	else if (byte == `0x2A`
2818	\|\| byte == `0x2D`
2819	\|\| byte == `0x2E`
2820	\|\| (byte >= `0x30` && byte <= `0x39`)
2821	\|\| (byte >= `0x41` && byte <= `0x5A`)
2822	\|\| byte == `0x5F`
2823	\|\| (byte >= `0x61` && byte <= `0x7A`)) // FIXME: Put these in the characterClassTable to avoid branches.
2824	output.append(byte);
2825	else
2826	percentEncodeByte(byte, output);
2827	}
2828	}
2829
2830	String URLParser::serialize(const URLEncodedForm& tuples)
2831	{
2832	if (tuples.isEmpty())
2833	return { };
2834
2835	Vector<LChar> output;
2836	for (auto& tuple : tuples) {
2837	if (!output.isEmpty())
2838	output.append(`'&'`);
2839	serializeURLEncodedForm(tuple.key, output);
2840	output.append(`'='`);
2841	serializeURLEncodedForm(tuple.value, output);
2842	}
2843	return String::adopt(WTFMove(output));
2844	}
2845
2846	const UIDNA& URLParser::internationalDomainNameTranscoder()
2847	{
2848	static UIDNA* encoder;
2849	static std::once_flag onceFlag;
2850	std::call_once(onceFlag, [] {
2851	UErrorCode error = U_ZERO_ERROR;
2852	// Warning: Please contact a WebKitGTK+ developer if changing these flags.
2853	// They should be synced with ephy_uri_decode() in ephy-uri-helpers.c.
2854	encoder = uidna_openUTS46(UIDNA_CHECK_BIDI \| UIDNA_CHECK_CONTEXTJ \| UIDNA_NONTRANSITIONAL_TO_UNICODE \| UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2855	RELEASE_ASSERT(U_SUCCESS(error));
2856	RELEASE_ASSERT(encoder);
2857	});
2858	return *encoder;
2859	}
2860
2861	bool URLParser::allValuesEqual(const URL& a, const URL& b)
2862	{
2863	URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2864	a.m_isValid,
2865	a.m_cannotBeABaseURL,
2866	a.m_protocolIsInHTTPFamily,
2867	a.m_schemeEnd,
2868	a.m_userStart,
2869	a.m_userEnd,
2870	a.m_passwordEnd,
2871	a.m_hostEnd,
2872	a.m_hostEnd + a.m_portLength,
2873	a.m_pathAfterLastSlash,
2874	a.m_pathEnd,
2875	a.m_queryEnd,
2876	a.m_string.utf8().data(),
2877	b.m_isValid,
2878	b.m_cannotBeABaseURL,
2879	b.m_protocolIsInHTTPFamily,
2880	b.m_schemeEnd,
2881	b.m_userStart,
2882	b.m_userEnd,
2883	b.m_passwordEnd,
2884	b.m_hostEnd,
2885	b.m_hostEnd + b.m_portLength,
2886	b.m_pathAfterLastSlash,
2887	b.m_pathEnd,
2888	b.m_queryEnd,
2889	b.m_string.utf8().data());
2890
2891	return a.m_string == b.m_string
2892	&& a.m_isValid == b.m_isValid
2893	&& a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2894	&& a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2895	&& a.m_schemeEnd == b.m_schemeEnd
2896	&& a.m_userStart == b.m_userStart
2897	&& a.m_userEnd == b.m_userEnd
2898	&& a.m_passwordEnd == b.m_passwordEnd
2899	&& a.m_hostEnd == b.m_hostEnd
2900	&& a.m_portLength == b.m_portLength
2901	&& a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2902	&& a.m_pathEnd == b.m_pathEnd
2903	&& a.m_queryEnd == b.m_queryEnd;
2904	}
2905
2906	bool URLParser::internalValuesConsistent(const URL& url)
2907	{
2908	return url.m_schemeEnd <= url.m_userStart
2909	&& url.m_userStart <= url.m_userEnd
2910	&& url.m_userEnd <= url.m_passwordEnd
2911	&& url.m_passwordEnd <= url.m_hostEnd
2912	&& url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2913	&& url.m_pathAfterLastSlash <= url.m_pathEnd
2914	&& url.m_pathEnd <= url.m_queryEnd
2915	&& url.m_queryEnd <= url.m_string.length();
2916	}
2917
2918	} // namespace WTF
2919

Browse the source code of webkit/Source/WTF/wtf/URLParser.cpp