1 | /* |
2 | * Copyright (C) 2013 Google, Inc. All Rights Reserved. |
3 | * Copyright (C) 2015 Apple Inc. All Rights Reserved. |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * 1. Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * 2. Redistributions in binary form must reproduce the above copyright |
11 | * notice, this list of conditions and the following disclaimer in the |
12 | * documentation and/or other materials provided with the distribution. |
13 | * |
14 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
15 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | */ |
26 | |
27 | #pragma once |
28 | |
29 | #include "Attribute.h" |
30 | |
31 | namespace WebCore { |
32 | |
33 | struct DoctypeData { |
34 | WTF_MAKE_FAST_ALLOCATED; |
35 | public: |
36 | bool hasPublicIdentifier { false }; |
37 | bool hasSystemIdentifier { false }; |
38 | Vector<UChar> publicIdentifier; |
39 | Vector<UChar> systemIdentifier; |
40 | bool forceQuirks { false }; |
41 | }; |
42 | |
43 | class HTMLToken { |
44 | WTF_MAKE_FAST_ALLOCATED; |
45 | public: |
46 | enum Type { |
47 | Uninitialized, |
48 | DOCTYPE, |
49 | StartTag, |
50 | EndTag, |
51 | , |
52 | Character, |
53 | EndOfFile, |
54 | }; |
55 | |
56 | struct Attribute { |
57 | Vector<UChar, 32> name; |
58 | Vector<UChar, 32> value; |
59 | |
60 | // Used by HTMLSourceTracker. |
61 | unsigned startOffset; |
62 | unsigned endOffset; |
63 | }; |
64 | |
65 | typedef Vector<Attribute, 10> AttributeList; |
66 | typedef Vector<UChar, 256> DataVector; |
67 | |
68 | HTMLToken(); |
69 | |
70 | void clear(); |
71 | |
72 | Type type() const; |
73 | |
74 | // EndOfFile |
75 | |
76 | void makeEndOfFile(); |
77 | |
78 | // StartTag, EndTag, DOCTYPE. |
79 | |
80 | const DataVector& name() const; |
81 | |
82 | void appendToName(UChar); |
83 | |
84 | // DOCTYPE. |
85 | |
86 | void beginDOCTYPE(); |
87 | void beginDOCTYPE(UChar); |
88 | |
89 | void setForceQuirks(); |
90 | |
91 | void setPublicIdentifierToEmptyString(); |
92 | void setSystemIdentifierToEmptyString(); |
93 | |
94 | void appendToPublicIdentifier(UChar); |
95 | void appendToSystemIdentifier(UChar); |
96 | |
97 | std::unique_ptr<DoctypeData> releaseDoctypeData(); |
98 | |
99 | // StartTag, EndTag. |
100 | |
101 | bool selfClosing() const; |
102 | const AttributeList& attributes() const; |
103 | |
104 | void beginStartTag(UChar); |
105 | |
106 | void beginEndTag(LChar); |
107 | void beginEndTag(const Vector<LChar, 32>&); |
108 | |
109 | void beginAttribute(unsigned offset); |
110 | void appendToAttributeName(UChar); |
111 | void appendToAttributeValue(UChar); |
112 | void endAttribute(unsigned offset); |
113 | |
114 | void setSelfClosing(); |
115 | |
116 | // Used by HTMLTokenizer on behalf of HTMLSourceTracker. |
117 | void setAttributeBaseOffset(unsigned attributeBaseOffset) { m_attributeBaseOffset = attributeBaseOffset; } |
118 | |
119 | public: |
120 | // Used by the XSSAuditor to nuke XSS-laden attributes. |
121 | void eraseValueOfAttribute(unsigned index); |
122 | void appendToAttributeValue(unsigned index, StringView value); |
123 | |
124 | // Character. |
125 | |
126 | // Starting a character token works slightly differently than starting |
127 | // other types of tokens because we want to save a per-character branch. |
128 | // There is no beginCharacters, and appending a character sets the type. |
129 | |
130 | const DataVector& characters() const; |
131 | bool charactersIsAll8BitData() const; |
132 | |
133 | void appendToCharacter(LChar); |
134 | void appendToCharacter(UChar); |
135 | void appendToCharacter(const Vector<LChar, 32>&); |
136 | |
137 | // Comment. |
138 | |
139 | const DataVector& comment() const; |
140 | bool commentIsAll8BitData() const; |
141 | |
142 | void beginComment(); |
143 | void appendToComment(UChar); |
144 | |
145 | private: |
146 | Type m_type; |
147 | |
148 | DataVector m_data; |
149 | UChar m_data8BitCheck; |
150 | |
151 | // For StartTag and EndTag |
152 | bool m_selfClosing; |
153 | AttributeList m_attributes; |
154 | Attribute* m_currentAttribute; |
155 | |
156 | // For DOCTYPE |
157 | std::unique_ptr<DoctypeData> m_doctypeData; |
158 | |
159 | unsigned m_attributeBaseOffset { 0 }; // Changes across document.write() boundaries. |
160 | }; |
161 | |
// Forward declaration of the attribute lookup helper.
// NOTE(review): this signature takes Vector<HTMLToken::Attribute>, while the inline definition
// later in this header takes HTMLToken::AttributeList (Vector<Attribute, 10>). These look like
// two distinct overloads; confirm whether this one is defined elsewhere or should be changed
// to match the inline definition.
const HTMLToken::Attribute* findAttribute(const Vector<HTMLToken::Attribute>&, StringView name);
163 | |
164 | inline HTMLToken::HTMLToken() |
165 | : m_type(Uninitialized) |
166 | , m_data8BitCheck(0) |
167 | { |
168 | } |
169 | |
170 | inline void HTMLToken::clear() |
171 | { |
172 | m_type = Uninitialized; |
173 | m_data.clear(); |
174 | m_data8BitCheck = 0; |
175 | } |
176 | |
177 | inline HTMLToken::Type HTMLToken::type() const |
178 | { |
179 | return m_type; |
180 | } |
181 | |
182 | inline void HTMLToken::makeEndOfFile() |
183 | { |
184 | ASSERT(m_type == Uninitialized); |
185 | m_type = EndOfFile; |
186 | } |
187 | |
188 | inline const HTMLToken::DataVector& HTMLToken::name() const |
189 | { |
190 | ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); |
191 | return m_data; |
192 | } |
193 | |
194 | inline void HTMLToken::appendToName(UChar character) |
195 | { |
196 | ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); |
197 | ASSERT(character); |
198 | m_data.append(character); |
199 | m_data8BitCheck |= character; |
200 | } |
201 | |
202 | inline void HTMLToken::setForceQuirks() |
203 | { |
204 | ASSERT(m_type == DOCTYPE); |
205 | m_doctypeData->forceQuirks = true; |
206 | } |
207 | |
208 | inline void HTMLToken::beginDOCTYPE() |
209 | { |
210 | ASSERT(m_type == Uninitialized); |
211 | m_type = DOCTYPE; |
212 | m_doctypeData = std::make_unique<DoctypeData>(); |
213 | } |
214 | |
215 | inline void HTMLToken::beginDOCTYPE(UChar character) |
216 | { |
217 | ASSERT(character); |
218 | beginDOCTYPE(); |
219 | m_data.append(character); |
220 | m_data8BitCheck |= character; |
221 | } |
222 | |
223 | inline void HTMLToken::setPublicIdentifierToEmptyString() |
224 | { |
225 | ASSERT(m_type == DOCTYPE); |
226 | m_doctypeData->hasPublicIdentifier = true; |
227 | m_doctypeData->publicIdentifier.clear(); |
228 | } |
229 | |
230 | inline void HTMLToken::setSystemIdentifierToEmptyString() |
231 | { |
232 | ASSERT(m_type == DOCTYPE); |
233 | m_doctypeData->hasSystemIdentifier = true; |
234 | m_doctypeData->systemIdentifier.clear(); |
235 | } |
236 | |
237 | inline void HTMLToken::appendToPublicIdentifier(UChar character) |
238 | { |
239 | ASSERT(character); |
240 | ASSERT(m_type == DOCTYPE); |
241 | ASSERT(m_doctypeData->hasPublicIdentifier); |
242 | m_doctypeData->publicIdentifier.append(character); |
243 | } |
244 | |
245 | inline void HTMLToken::appendToSystemIdentifier(UChar character) |
246 | { |
247 | ASSERT(character); |
248 | ASSERT(m_type == DOCTYPE); |
249 | ASSERT(m_doctypeData->hasSystemIdentifier); |
250 | m_doctypeData->systemIdentifier.append(character); |
251 | } |
252 | |
253 | inline std::unique_ptr<DoctypeData> HTMLToken::releaseDoctypeData() |
254 | { |
255 | return WTFMove(m_doctypeData); |
256 | } |
257 | |
258 | inline bool HTMLToken::selfClosing() const |
259 | { |
260 | ASSERT(m_type == StartTag || m_type == EndTag); |
261 | return m_selfClosing; |
262 | } |
263 | |
264 | inline void HTMLToken::setSelfClosing() |
265 | { |
266 | ASSERT(m_type == StartTag || m_type == EndTag); |
267 | m_selfClosing = true; |
268 | } |
269 | |
270 | inline void HTMLToken::beginStartTag(UChar character) |
271 | { |
272 | ASSERT(character); |
273 | ASSERT(m_type == Uninitialized); |
274 | m_type = StartTag; |
275 | m_selfClosing = false; |
276 | m_attributes.clear(); |
277 | |
278 | #if !ASSERT_DISABLED |
279 | m_currentAttribute = nullptr; |
280 | #endif |
281 | |
282 | m_data.append(character); |
283 | m_data8BitCheck = character; |
284 | } |
285 | |
286 | inline void HTMLToken::beginEndTag(LChar character) |
287 | { |
288 | ASSERT(m_type == Uninitialized); |
289 | m_type = EndTag; |
290 | m_selfClosing = false; |
291 | m_attributes.clear(); |
292 | |
293 | #if !ASSERT_DISABLED |
294 | m_currentAttribute = nullptr; |
295 | #endif |
296 | |
297 | m_data.append(character); |
298 | } |
299 | |
300 | inline void HTMLToken::beginEndTag(const Vector<LChar, 32>& characters) |
301 | { |
302 | ASSERT(m_type == Uninitialized); |
303 | m_type = EndTag; |
304 | m_selfClosing = false; |
305 | m_attributes.clear(); |
306 | |
307 | #if !ASSERT_DISABLED |
308 | m_currentAttribute = nullptr; |
309 | #endif |
310 | |
311 | m_data.appendVector(characters); |
312 | } |
313 | |
314 | inline void HTMLToken::beginAttribute(unsigned offset) |
315 | { |
316 | ASSERT(m_type == StartTag || m_type == EndTag); |
317 | ASSERT(offset); |
318 | |
319 | m_attributes.grow(m_attributes.size() + 1); |
320 | m_currentAttribute = &m_attributes.last(); |
321 | |
322 | m_currentAttribute->startOffset = offset - m_attributeBaseOffset; |
323 | } |
324 | |
325 | inline void HTMLToken::endAttribute(unsigned offset) |
326 | { |
327 | ASSERT(offset); |
328 | ASSERT(m_currentAttribute); |
329 | m_currentAttribute->endOffset = offset - m_attributeBaseOffset; |
330 | #if !ASSERT_DISABLED |
331 | m_currentAttribute = nullptr; |
332 | #endif |
333 | } |
334 | |
335 | inline void HTMLToken::appendToAttributeName(UChar character) |
336 | { |
337 | ASSERT(character); |
338 | ASSERT(m_type == StartTag || m_type == EndTag); |
339 | ASSERT(m_currentAttribute); |
340 | m_currentAttribute->name.append(character); |
341 | } |
342 | |
343 | inline void HTMLToken::appendToAttributeValue(UChar character) |
344 | { |
345 | ASSERT(character); |
346 | ASSERT(m_type == StartTag || m_type == EndTag); |
347 | ASSERT(m_currentAttribute); |
348 | m_currentAttribute->value.append(character); |
349 | } |
350 | |
351 | inline void HTMLToken::appendToAttributeValue(unsigned i, StringView value) |
352 | { |
353 | ASSERT(!value.isEmpty()); |
354 | ASSERT(m_type == StartTag || m_type == EndTag); |
355 | append(m_attributes[i].value, value); |
356 | } |
357 | |
358 | inline const HTMLToken::AttributeList& HTMLToken::attributes() const |
359 | { |
360 | ASSERT(m_type == StartTag || m_type == EndTag); |
361 | return m_attributes; |
362 | } |
363 | |
364 | // Used by the XSSAuditor to nuke XSS-laden attributes. |
365 | inline void HTMLToken::eraseValueOfAttribute(unsigned i) |
366 | { |
367 | ASSERT(m_type == StartTag || m_type == EndTag); |
368 | ASSERT(i < m_attributes.size()); |
369 | m_attributes[i].value.clear(); |
370 | } |
371 | |
372 | inline const HTMLToken::DataVector& HTMLToken::characters() const |
373 | { |
374 | ASSERT(m_type == Character); |
375 | return m_data; |
376 | } |
377 | |
378 | inline bool HTMLToken::charactersIsAll8BitData() const |
379 | { |
380 | ASSERT(m_type == Character); |
381 | return m_data8BitCheck <= 0xFF; |
382 | } |
383 | |
384 | inline void HTMLToken::appendToCharacter(LChar character) |
385 | { |
386 | ASSERT(m_type == Uninitialized || m_type == Character); |
387 | m_type = Character; |
388 | m_data.append(character); |
389 | } |
390 | |
391 | inline void HTMLToken::appendToCharacter(UChar character) |
392 | { |
393 | ASSERT(m_type == Uninitialized || m_type == Character); |
394 | m_type = Character; |
395 | m_data.append(character); |
396 | m_data8BitCheck |= character; |
397 | } |
398 | |
399 | inline void HTMLToken::appendToCharacter(const Vector<LChar, 32>& characters) |
400 | { |
401 | ASSERT(m_type == Uninitialized || m_type == Character); |
402 | m_type = Character; |
403 | m_data.appendVector(characters); |
404 | } |
405 | |
406 | inline const HTMLToken::DataVector& HTMLToken::() const |
407 | { |
408 | ASSERT(m_type == Comment); |
409 | return m_data; |
410 | } |
411 | |
412 | inline bool HTMLToken::() const |
413 | { |
414 | ASSERT(m_type == Comment); |
415 | return m_data8BitCheck <= 0xFF; |
416 | } |
417 | |
418 | inline void HTMLToken::() |
419 | { |
420 | ASSERT(m_type == Uninitialized); |
421 | m_type = Comment; |
422 | } |
423 | |
424 | inline void HTMLToken::(UChar character) |
425 | { |
426 | ASSERT(character); |
427 | ASSERT(m_type == Comment); |
428 | m_data.append(character); |
429 | m_data8BitCheck |= character; |
430 | } |
431 | |
432 | inline bool nameMatches(const HTMLToken::Attribute& attribute, StringView name) |
433 | { |
434 | unsigned size = name.length(); |
435 | if (attribute.name.size() != size) |
436 | return false; |
437 | for (unsigned i = 0; i < size; ++i) { |
438 | // FIXME: The one caller that uses this probably wants to ignore letter case. |
439 | if (attribute.name[i] != name[i]) |
440 | return false; |
441 | } |
442 | return true; |
443 | } |
444 | |
445 | inline const HTMLToken::Attribute* findAttribute(const HTMLToken::AttributeList& attributes, StringView name) |
446 | { |
447 | for (auto& attribute : attributes) { |
448 | if (nameMatches(attribute, name)) |
449 | return &attribute; |
450 | } |
451 | return nullptr; |
452 | } |
453 | |
454 | } // namespace WebCore |
455 | |