1/*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 * Copyright (C) 2015 Apple Inc. All Rights Reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#pragma once
28
29#include "Attribute.h"
30
31namespace WebCore {
32
33struct DoctypeData {
34 WTF_MAKE_FAST_ALLOCATED;
35public:
36 bool hasPublicIdentifier { false };
37 bool hasSystemIdentifier { false };
38 Vector<UChar> publicIdentifier;
39 Vector<UChar> systemIdentifier;
40 bool forceQuirks { false };
41};
42
43class HTMLToken {
44 WTF_MAKE_FAST_ALLOCATED;
45public:
46 enum Type {
47 Uninitialized,
48 DOCTYPE,
49 StartTag,
50 EndTag,
51 Comment,
52 Character,
53 EndOfFile,
54 };
55
56 struct Attribute {
57 Vector<UChar, 32> name;
58 Vector<UChar, 32> value;
59
60 // Used by HTMLSourceTracker.
61 unsigned startOffset;
62 unsigned endOffset;
63 };
64
65 typedef Vector<Attribute, 10> AttributeList;
66 typedef Vector<UChar, 256> DataVector;
67
68 HTMLToken();
69
70 void clear();
71
72 Type type() const;
73
74 // EndOfFile
75
76 void makeEndOfFile();
77
78 // StartTag, EndTag, DOCTYPE.
79
80 const DataVector& name() const;
81
82 void appendToName(UChar);
83
84 // DOCTYPE.
85
86 void beginDOCTYPE();
87 void beginDOCTYPE(UChar);
88
89 void setForceQuirks();
90
91 void setPublicIdentifierToEmptyString();
92 void setSystemIdentifierToEmptyString();
93
94 void appendToPublicIdentifier(UChar);
95 void appendToSystemIdentifier(UChar);
96
97 std::unique_ptr<DoctypeData> releaseDoctypeData();
98
99 // StartTag, EndTag.
100
101 bool selfClosing() const;
102 const AttributeList& attributes() const;
103
104 void beginStartTag(UChar);
105
106 void beginEndTag(LChar);
107 void beginEndTag(const Vector<LChar, 32>&);
108
109 void beginAttribute(unsigned offset);
110 void appendToAttributeName(UChar);
111 void appendToAttributeValue(UChar);
112 void endAttribute(unsigned offset);
113
114 void setSelfClosing();
115
116 // Used by HTMLTokenizer on behalf of HTMLSourceTracker.
117 void setAttributeBaseOffset(unsigned attributeBaseOffset) { m_attributeBaseOffset = attributeBaseOffset; }
118
119public:
120 // Used by the XSSAuditor to nuke XSS-laden attributes.
121 void eraseValueOfAttribute(unsigned index);
122 void appendToAttributeValue(unsigned index, StringView value);
123
124 // Character.
125
126 // Starting a character token works slightly differently than starting
127 // other types of tokens because we want to save a per-character branch.
128 // There is no beginCharacters, and appending a character sets the type.
129
130 const DataVector& characters() const;
131 bool charactersIsAll8BitData() const;
132
133 void appendToCharacter(LChar);
134 void appendToCharacter(UChar);
135 void appendToCharacter(const Vector<LChar, 32>&);
136
137 // Comment.
138
139 const DataVector& comment() const;
140 bool commentIsAll8BitData() const;
141
142 void beginComment();
143 void appendToComment(UChar);
144
145private:
146 Type m_type;
147
148 DataVector m_data;
149 UChar m_data8BitCheck;
150
151 // For StartTag and EndTag
152 bool m_selfClosing;
153 AttributeList m_attributes;
154 Attribute* m_currentAttribute;
155
156 // For DOCTYPE
157 std::unique_ptr<DoctypeData> m_doctypeData;
158
159 unsigned m_attributeBaseOffset { 0 }; // Changes across document.write() boundaries.
160};
161
162const HTMLToken::Attribute* findAttribute(const Vector<HTMLToken::Attribute>&, StringView name);
163
164inline HTMLToken::HTMLToken()
165 : m_type(Uninitialized)
166 , m_data8BitCheck(0)
167{
168}
169
170inline void HTMLToken::clear()
171{
172 m_type = Uninitialized;
173 m_data.clear();
174 m_data8BitCheck = 0;
175}
176
177inline HTMLToken::Type HTMLToken::type() const
178{
179 return m_type;
180}
181
182inline void HTMLToken::makeEndOfFile()
183{
184 ASSERT(m_type == Uninitialized);
185 m_type = EndOfFile;
186}
187
188inline const HTMLToken::DataVector& HTMLToken::name() const
189{
190 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
191 return m_data;
192}
193
194inline void HTMLToken::appendToName(UChar character)
195{
196 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
197 ASSERT(character);
198 m_data.append(character);
199 m_data8BitCheck |= character;
200}
201
202inline void HTMLToken::setForceQuirks()
203{
204 ASSERT(m_type == DOCTYPE);
205 m_doctypeData->forceQuirks = true;
206}
207
208inline void HTMLToken::beginDOCTYPE()
209{
210 ASSERT(m_type == Uninitialized);
211 m_type = DOCTYPE;
212 m_doctypeData = std::make_unique<DoctypeData>();
213}
214
215inline void HTMLToken::beginDOCTYPE(UChar character)
216{
217 ASSERT(character);
218 beginDOCTYPE();
219 m_data.append(character);
220 m_data8BitCheck |= character;
221}
222
223inline void HTMLToken::setPublicIdentifierToEmptyString()
224{
225 ASSERT(m_type == DOCTYPE);
226 m_doctypeData->hasPublicIdentifier = true;
227 m_doctypeData->publicIdentifier.clear();
228}
229
230inline void HTMLToken::setSystemIdentifierToEmptyString()
231{
232 ASSERT(m_type == DOCTYPE);
233 m_doctypeData->hasSystemIdentifier = true;
234 m_doctypeData->systemIdentifier.clear();
235}
236
237inline void HTMLToken::appendToPublicIdentifier(UChar character)
238{
239 ASSERT(character);
240 ASSERT(m_type == DOCTYPE);
241 ASSERT(m_doctypeData->hasPublicIdentifier);
242 m_doctypeData->publicIdentifier.append(character);
243}
244
245inline void HTMLToken::appendToSystemIdentifier(UChar character)
246{
247 ASSERT(character);
248 ASSERT(m_type == DOCTYPE);
249 ASSERT(m_doctypeData->hasSystemIdentifier);
250 m_doctypeData->systemIdentifier.append(character);
251}
252
253inline std::unique_ptr<DoctypeData> HTMLToken::releaseDoctypeData()
254{
255 return WTFMove(m_doctypeData);
256}
257
258inline bool HTMLToken::selfClosing() const
259{
260 ASSERT(m_type == StartTag || m_type == EndTag);
261 return m_selfClosing;
262}
263
264inline void HTMLToken::setSelfClosing()
265{
266 ASSERT(m_type == StartTag || m_type == EndTag);
267 m_selfClosing = true;
268}
269
270inline void HTMLToken::beginStartTag(UChar character)
271{
272 ASSERT(character);
273 ASSERT(m_type == Uninitialized);
274 m_type = StartTag;
275 m_selfClosing = false;
276 m_attributes.clear();
277
278#if !ASSERT_DISABLED
279 m_currentAttribute = nullptr;
280#endif
281
282 m_data.append(character);
283 m_data8BitCheck = character;
284}
285
286inline void HTMLToken::beginEndTag(LChar character)
287{
288 ASSERT(m_type == Uninitialized);
289 m_type = EndTag;
290 m_selfClosing = false;
291 m_attributes.clear();
292
293#if !ASSERT_DISABLED
294 m_currentAttribute = nullptr;
295#endif
296
297 m_data.append(character);
298}
299
300inline void HTMLToken::beginEndTag(const Vector<LChar, 32>& characters)
301{
302 ASSERT(m_type == Uninitialized);
303 m_type = EndTag;
304 m_selfClosing = false;
305 m_attributes.clear();
306
307#if !ASSERT_DISABLED
308 m_currentAttribute = nullptr;
309#endif
310
311 m_data.appendVector(characters);
312}
313
314inline void HTMLToken::beginAttribute(unsigned offset)
315{
316 ASSERT(m_type == StartTag || m_type == EndTag);
317 ASSERT(offset);
318
319 m_attributes.grow(m_attributes.size() + 1);
320 m_currentAttribute = &m_attributes.last();
321
322 m_currentAttribute->startOffset = offset - m_attributeBaseOffset;
323}
324
325inline void HTMLToken::endAttribute(unsigned offset)
326{
327 ASSERT(offset);
328 ASSERT(m_currentAttribute);
329 m_currentAttribute->endOffset = offset - m_attributeBaseOffset;
330#if !ASSERT_DISABLED
331 m_currentAttribute = nullptr;
332#endif
333}
334
335inline void HTMLToken::appendToAttributeName(UChar character)
336{
337 ASSERT(character);
338 ASSERT(m_type == StartTag || m_type == EndTag);
339 ASSERT(m_currentAttribute);
340 m_currentAttribute->name.append(character);
341}
342
343inline void HTMLToken::appendToAttributeValue(UChar character)
344{
345 ASSERT(character);
346 ASSERT(m_type == StartTag || m_type == EndTag);
347 ASSERT(m_currentAttribute);
348 m_currentAttribute->value.append(character);
349}
350
351inline void HTMLToken::appendToAttributeValue(unsigned i, StringView value)
352{
353 ASSERT(!value.isEmpty());
354 ASSERT(m_type == StartTag || m_type == EndTag);
355 append(m_attributes[i].value, value);
356}
357
358inline const HTMLToken::AttributeList& HTMLToken::attributes() const
359{
360 ASSERT(m_type == StartTag || m_type == EndTag);
361 return m_attributes;
362}
363
364// Used by the XSSAuditor to nuke XSS-laden attributes.
365inline void HTMLToken::eraseValueOfAttribute(unsigned i)
366{
367 ASSERT(m_type == StartTag || m_type == EndTag);
368 ASSERT(i < m_attributes.size());
369 m_attributes[i].value.clear();
370}
371
372inline const HTMLToken::DataVector& HTMLToken::characters() const
373{
374 ASSERT(m_type == Character);
375 return m_data;
376}
377
378inline bool HTMLToken::charactersIsAll8BitData() const
379{
380 ASSERT(m_type == Character);
381 return m_data8BitCheck <= 0xFF;
382}
383
384inline void HTMLToken::appendToCharacter(LChar character)
385{
386 ASSERT(m_type == Uninitialized || m_type == Character);
387 m_type = Character;
388 m_data.append(character);
389}
390
391inline void HTMLToken::appendToCharacter(UChar character)
392{
393 ASSERT(m_type == Uninitialized || m_type == Character);
394 m_type = Character;
395 m_data.append(character);
396 m_data8BitCheck |= character;
397}
398
399inline void HTMLToken::appendToCharacter(const Vector<LChar, 32>& characters)
400{
401 ASSERT(m_type == Uninitialized || m_type == Character);
402 m_type = Character;
403 m_data.appendVector(characters);
404}
405
406inline const HTMLToken::DataVector& HTMLToken::comment() const
407{
408 ASSERT(m_type == Comment);
409 return m_data;
410}
411
412inline bool HTMLToken::commentIsAll8BitData() const
413{
414 ASSERT(m_type == Comment);
415 return m_data8BitCheck <= 0xFF;
416}
417
418inline void HTMLToken::beginComment()
419{
420 ASSERT(m_type == Uninitialized);
421 m_type = Comment;
422}
423
424inline void HTMLToken::appendToComment(UChar character)
425{
426 ASSERT(character);
427 ASSERT(m_type == Comment);
428 m_data.append(character);
429 m_data8BitCheck |= character;
430}
431
432inline bool nameMatches(const HTMLToken::Attribute& attribute, StringView name)
433{
434 unsigned size = name.length();
435 if (attribute.name.size() != size)
436 return false;
437 for (unsigned i = 0; i < size; ++i) {
438 // FIXME: The one caller that uses this probably wants to ignore letter case.
439 if (attribute.name[i] != name[i])
440 return false;
441 }
442 return true;
443}
444
445inline const HTMLToken::Attribute* findAttribute(const HTMLToken::AttributeList& attributes, StringView name)
446{
447 for (auto& attribute : attributes) {
448 if (nameMatches(attribute, name))
449 return &attribute;
450 }
451 return nullptr;
452}
453
454} // namespace WebCore
455