HTMLToken.h source code [webkit/Source/WebCore/html/parser/HTMLToken.h]

1	/*
2	* Copyright (C) 2013 Google, Inc. All Rights Reserved.
3	* Copyright (C) 2015 Apple Inc. All Rights Reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#pragma once
28
29	#include "Attribute.h"
30
31	namespace WebCore {
32
33	struct DoctypeData {
34	WTF_MAKE_FAST_ALLOCATED;
35	public:
36	bool hasPublicIdentifier { false };
37	bool hasSystemIdentifier { false };
38	Vector<UChar> publicIdentifier;
39	Vector<UChar> systemIdentifier;
40	bool forceQuirks { false };
41	};
42
43	class HTMLToken {
44	WTF_MAKE_FAST_ALLOCATED;
45	public:
46	enum Type {
47	Uninitialized,
48	DOCTYPE,
49	StartTag,
50	EndTag,
51	Comment,
52	Character,
53	EndOfFile,
54	};
55
56	struct Attribute {
57	Vector<UChar, `32`> name;
58	Vector<UChar, `32`> value;
59
60	// Used by HTMLSourceTracker.
61	unsigned startOffset;
62	unsigned endOffset;
63	};
64
65	typedef Vector<Attribute, `10`> AttributeList;
66	typedef Vector<UChar, `256`> DataVector;
67
68	HTMLToken();
69
70	void clear();
71
72	Type type() const;
73
74	// EndOfFile
75
76	void makeEndOfFile();
77
78	// StartTag, EndTag, DOCTYPE.
79
80	const DataVector& name() const;
81
82	void appendToName(UChar);
83
84	// DOCTYPE.
85
86	void beginDOCTYPE();
87	void beginDOCTYPE(UChar);
88
89	void setForceQuirks();
90
91	void setPublicIdentifierToEmptyString();
92	void setSystemIdentifierToEmptyString();
93
94	void appendToPublicIdentifier(UChar);
95	void appendToSystemIdentifier(UChar);
96
97	std::unique_ptr<DoctypeData> releaseDoctypeData();
98
99	// StartTag, EndTag.
100
101	bool selfClosing() const;
102	const AttributeList& attributes() const;
103
104	void beginStartTag(UChar);
105
106	void beginEndTag(LChar);
107	void beginEndTag(const Vector<LChar, `32`>&);
108
109	void beginAttribute(unsigned offset);
110	void appendToAttributeName(UChar);
111	void appendToAttributeValue(UChar);
112	void endAttribute(unsigned offset);
113
114	void setSelfClosing();
115
116	// Used by HTMLTokenizer on behalf of HTMLSourceTracker.
117	void setAttributeBaseOffset(unsigned attributeBaseOffset) { m_attributeBaseOffset = attributeBaseOffset; }
118
119	public:
120	// Used by the XSSAuditor to nuke XSS-laden attributes.
121	void eraseValueOfAttribute(unsigned index);
122	void appendToAttributeValue(unsigned index, StringView value);
123
124	// Character.
125
126	// Starting a character token works slightly differently than starting
127	// other types of tokens because we want to save a per-character branch.
128	// There is no beginCharacters, and appending a character sets the type.
129
130	const DataVector& characters() const;
131	bool charactersIsAll8BitData() const;
132
133	void appendToCharacter(LChar);
134	void appendToCharacter(UChar);
135	void appendToCharacter(const Vector<LChar, `32`>&);
136
137	// Comment.
138
139	const DataVector& comment() const;
140	bool commentIsAll8BitData() const;
141
142	void beginComment();
143	void appendToComment(UChar);
144
145	private:
146	Type m_type;
147
148	DataVector m_data;
149	UChar m_data8BitCheck;
150
151	// For StartTag and EndTag
152	bool m_selfClosing;
153	AttributeList m_attributes;
154	Attribute* m_currentAttribute;
155
156	// For DOCTYPE
157	std::unique_ptr<DoctypeData> m_doctypeData;
158
159	unsigned m_attributeBaseOffset { `0` }; // Changes across document.write() boundaries.
160	};
161
162	const HTMLToken::Attribute* findAttribute(const Vector<HTMLToken::Attribute>&, StringView name);
163
164	inline HTMLToken::HTMLToken()
165	: m_type(Uninitialized)
166	, m_data8BitCheck(`0`)
167	{
168	}
169
170	inline void HTMLToken::clear()
171	{
172	m_type = Uninitialized;
173	m_data.clear();
174	m_data8BitCheck = `0`;
175	}
176
177	inline HTMLToken::Type HTMLToken::type() const
178	{
179	return m_type;
180	}
181
182	inline void HTMLToken::makeEndOfFile()
183	{
184	ASSERT(m_type == Uninitialized);
185	m_type = EndOfFile;
186	}
187
188	inline const HTMLToken::DataVector& HTMLToken::name() const
189	{
190	ASSERT(m_type == StartTag \|\| m_type == EndTag \|\| m_type == DOCTYPE);
191	return m_data;
192	}
193
194	inline void HTMLToken::appendToName(UChar character)
195	{
196	ASSERT(m_type == StartTag \|\| m_type == EndTag \|\| m_type == DOCTYPE);
197	ASSERT(character);
198	m_data.append(character);
199	m_data8BitCheck \|= character;
200	}
201
202	inline void HTMLToken::setForceQuirks()
203	{
204	ASSERT(m_type == DOCTYPE);
205	m_doctypeData ->forceQuirks = true;
206	}
207
208	inline void HTMLToken::beginDOCTYPE()
209	{
210	ASSERT(m_type == Uninitialized);
211	m_type = DOCTYPE;
212	m_doctypeData = std::make_unique<DoctypeData>();
213	}
214
215	inline void HTMLToken::beginDOCTYPE(UChar character)
216	{
217	ASSERT(character);
218	beginDOCTYPE();
219	m_data.append(character);
220	m_data8BitCheck \|= character;
221	}
222
223	inline void HTMLToken::setPublicIdentifierToEmptyString()
224	{
225	ASSERT(m_type == DOCTYPE);
226	m_doctypeData ->hasPublicIdentifier = true;
227	m_doctypeData ->publicIdentifier.clear();
228	}
229
230	inline void HTMLToken::setSystemIdentifierToEmptyString()
231	{
232	ASSERT(m_type == DOCTYPE);
233	m_doctypeData ->hasSystemIdentifier = true;
234	m_doctypeData ->systemIdentifier.clear();
235	}
236
237	inline void HTMLToken::appendToPublicIdentifier(UChar character)
238	{
239	ASSERT(character);
240	ASSERT(m_type == DOCTYPE);
241	ASSERT(m_doctypeData ->hasPublicIdentifier);
242	m_doctypeData ->publicIdentifier.append(character);
243	}
244
245	inline void HTMLToken::appendToSystemIdentifier(UChar character)
246	{
247	ASSERT(character);
248	ASSERT(m_type == DOCTYPE);
249	ASSERT(m_doctypeData ->hasSystemIdentifier);
250	m_doctypeData ->systemIdentifier.append(character);
251	}
252
253	inline std::unique_ptr<DoctypeData> HTMLToken::releaseDoctypeData()
254	{
255	return WTFMove(m_doctypeData);
256	}
257
258	inline bool HTMLToken::selfClosing() const
259	{
260	ASSERT(m_type == StartTag \|\| m_type == EndTag);
261	return m_selfClosing;
262	}
263
264	inline void HTMLToken::setSelfClosing()
265	{
266	ASSERT(m_type == StartTag \|\| m_type == EndTag);
267	m_selfClosing = true;
268	}
269
270	inline void HTMLToken::beginStartTag(UChar character)
271	{
272	ASSERT(character);
273	ASSERT(m_type == Uninitialized);
274	m_type = StartTag;
275	m_selfClosing = false;
276	m_attributes.clear();
277
278	#if !ASSERT_DISABLED
279	m_currentAttribute = nullptr;
280	#endif
281
282	m_data.append(character);
283	m_data8BitCheck = character;
284	}
285
286	inline void HTMLToken::beginEndTag(LChar character)
287	{
288	ASSERT(m_type == Uninitialized);
289	m_type = EndTag;
290	m_selfClosing = false;
291	m_attributes.clear();
292
293	#if !ASSERT_DISABLED
294	m_currentAttribute = nullptr;
295	#endif
296
297	m_data.append(character);
298	}
299
300	inline void HTMLToken::beginEndTag(const Vector<LChar, `32`>& characters)
301	{
302	ASSERT(m_type == Uninitialized);
303	m_type = EndTag;
304	m_selfClosing = false;
305	m_attributes.clear();
306
307	#if !ASSERT_DISABLED
308	m_currentAttribute = nullptr;
309	#endif
310
311	m_data.appendVector(characters);
312	}
313
314	inline void HTMLToken::beginAttribute(unsigned offset)
315	{
316	ASSERT(m_type == StartTag \|\| m_type == EndTag);
317	ASSERT(offset);
318
319	m_attributes.grow(m_attributes.size() + `1`);
320	m_currentAttribute = &m_attributes.last();
321
322	m_currentAttribute->startOffset = offset - m_attributeBaseOffset;
323	}
324
325	inline void HTMLToken::endAttribute(unsigned offset)
326	{
327	ASSERT(offset);
328	ASSERT(m_currentAttribute);
329	m_currentAttribute->endOffset = offset - m_attributeBaseOffset;
330	#if !ASSERT_DISABLED
331	m_currentAttribute = nullptr;
332	#endif
333	}
334
335	inline void HTMLToken::appendToAttributeName(UChar character)
336	{
337	ASSERT(character);
338	ASSERT(m_type == StartTag \|\| m_type == EndTag);
339	ASSERT(m_currentAttribute);
340	m_currentAttribute->name.append(character);
341	}
342
343	inline void HTMLToken::appendToAttributeValue(UChar character)
344	{
345	ASSERT(character);
346	ASSERT(m_type == StartTag \|\| m_type == EndTag);
347	ASSERT(m_currentAttribute);
348	m_currentAttribute->value.append(character);
349	}
350
351	inline void HTMLToken::appendToAttributeValue(unsigned i, StringView value)
352	{
353	ASSERT(!value.isEmpty());
354	ASSERT(m_type == StartTag \|\| m_type == EndTag);
355	append(m_attributes [i].value, value);
356	}
357
358	inline const HTMLToken::AttributeList& HTMLToken::attributes() const
359	{
360	ASSERT(m_type == StartTag \|\| m_type == EndTag);
361	return m_attributes;
362	}
363
364	// Used by the XSSAuditor to nuke XSS-laden attributes.
365	inline void HTMLToken::eraseValueOfAttribute(unsigned i)
366	{
367	ASSERT(m_type == StartTag \|\| m_type == EndTag);
368	ASSERT(i < m_attributes.size());
369	m_attributes [i].value.clear();
370	}
371
372	inline const HTMLToken::DataVector& HTMLToken::characters() const
373	{
374	ASSERT(m_type == Character);
375	return m_data;
376	}
377
378	inline bool HTMLToken::charactersIsAll8BitData() const
379	{
380	ASSERT(m_type == Character);
381	return m_data8BitCheck <= `0xFF`;
382	}
383
384	inline void HTMLToken::appendToCharacter(LChar character)
385	{
386	ASSERT(m_type == Uninitialized \|\| m_type == Character);
387	m_type = Character;
388	m_data.append(character);
389	}
390
391	inline void HTMLToken::appendToCharacter(UChar character)
392	{
393	ASSERT(m_type == Uninitialized \|\| m_type == Character);
394	m_type = Character;
395	m_data.append(character);
396	m_data8BitCheck \|= character;
397	}
398
399	inline void HTMLToken::appendToCharacter(const Vector<LChar, `32`>& characters)
400	{
401	ASSERT(m_type == Uninitialized \|\| m_type == Character);
402	m_type = Character;
403	m_data.appendVector(characters);
404	}
405
406	inline const HTMLToken::DataVector& HTMLToken::comment() const
407	{
408	ASSERT(m_type == Comment);
409	return m_data;
410	}
411
412	inline bool HTMLToken::commentIsAll8BitData() const
413	{
414	ASSERT(m_type == Comment);
415	return m_data8BitCheck <= `0xFF`;
416	}
417
418	inline void HTMLToken::beginComment()
419	{
420	ASSERT(m_type == Uninitialized);
421	m_type = Comment;
422	}
423
424	inline void HTMLToken::appendToComment(UChar character)
425	{
426	ASSERT(character);
427	ASSERT(m_type == Comment);
428	m_data.append(character);
429	m_data8BitCheck \|= character;
430	}
431
432	inline bool nameMatches(const HTMLToken::Attribute& attribute, StringView name)
433	{
434	unsigned size = name.length();
435	if (attribute.name.size() != size)
436	return false;
437	for (unsigned i = `0`; i < size; ++i) {
438	// FIXME: The one caller that uses this probably wants to ignore letter case.
439	if (attribute.name [i] != name [i])
440	return false;
441	}
442	return true;
443	}
444
445	inline const HTMLToken::Attribute* findAttribute(const HTMLToken::AttributeList& attributes, StringView name)
446	{
447	for (auto& attribute : attributes) {
448	if (nameMatches(attribute, name))
449	return &attribute;
450	}
451	return nullptr;
452	}
453
454	} // namespace WebCore
455

Browse the source code of webkit/Source/WebCore/html/parser/HTMLToken.h