1/*
2 * Copyright (C) 2011, 2013 Google Inc. All rights reserved.
3 * Copyright (C) 2014-2015 Apple Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met:
8 *
9 * * Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following disclaimer
13 * in the documentation and/or other materials provided with the
14 * distribution.
15 * * Neither the name of Google Inc. nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "config.h"
33#include "WebVTTTokenizer.h"
34
35#if ENABLE(VIDEO_TRACK)
36
37#include "MarkupTokenizerInlines.h"
38#include <wtf/text/StringBuilder.h>
39#include <wtf/unicode/CharacterNames.h>
40
41namespace WebCore {
42
// State-transition helper for the tokenizer's goto-based state machine.
// Advances the preprocessor over m_input, reloads the local variable
// `character` with the next input character, and jumps to the label given as
// targetState. It can only be expanded where `character`, m_input and
// m_preprocessor are in scope, and it asserts that input is still available.
#define WEBVTT_ADVANCE_TO(targetState) \
    do { \
        ASSERT(!m_input.isEmpty()); \
        m_preprocessor.advance(m_input); \
        character = m_preprocessor.nextInputCharacter(); \
        goto targetState; \
    } while (false)
50
51template<unsigned charactersCount> ALWAYS_INLINE bool equalLiteral(const StringBuilder& s, const char (&characters)[charactersCount])
52{
53 return WTF::equal(s, reinterpret_cast<const LChar*>(characters), charactersCount - 1);
54}
55
56static void addNewClass(StringBuilder& classes, const StringBuilder& newClass)
57{
58 if (!classes.isEmpty())
59 classes.append(' ');
60 classes.append(newClass);
61}
62
63inline bool emitToken(WebVTTToken& resultToken, const WebVTTToken& token)
64{
65 resultToken = token;
66 return true;
67}
68
69inline bool advanceAndEmitToken(SegmentedString& source, WebVTTToken& resultToken, const WebVTTToken& token)
70{
71 source.advance();
72 return emitToken(resultToken, token);
73}
74
75WebVTTTokenizer::WebVTTTokenizer(const String& input)
76 : m_input(input)
77 , m_preprocessor(*this)
78{
79 // Append an EOF marker and close the input "stream".
80 ASSERT(!m_input.isClosed());
81 m_input.append(String { &kEndOfFileMarker, 1 });
82 m_input.close();
83}
84
85bool WebVTTTokenizer::nextToken(WebVTTToken& token)
86{
87 if (m_input.isEmpty() || !m_preprocessor.peek(m_input))
88 return false;
89
90 UChar character = m_preprocessor.nextInputCharacter();
91 if (character == kEndOfFileMarker) {
92 m_preprocessor.advance(m_input);
93 return false;
94 }
95
96 StringBuilder buffer;
97 StringBuilder result;
98 StringBuilder classes;
99
100// 4.8.10.13.4 WebVTT cue text tokenizer
101DataState:
102 if (character == '&') {
103 buffer.append('&');
104 WEBVTT_ADVANCE_TO(EscapeState);
105 } else if (character == '<') {
106 if (result.isEmpty())
107 WEBVTT_ADVANCE_TO(TagState);
108 else {
109 // We don't want to advance input or perform a state transition - just return a (new) token.
110 // (On the next call to nextToken we will see '<' again, but take the other branch in this if instead.)
111 return emitToken(token, WebVTTToken::StringToken(result.toString()));
112 }
113 } else if (character == kEndOfFileMarker)
114 return advanceAndEmitToken(m_input, token, WebVTTToken::StringToken(result.toString()));
115 else {
116 result.append(character);
117 WEBVTT_ADVANCE_TO(DataState);
118 }
119
120EscapeState:
121 if (character == ';') {
122 if (equalLiteral(buffer, "&amp"))
123 result.append('&');
124 else if (equalLiteral(buffer, "&lt"))
125 result.append('<');
126 else if (equalLiteral(buffer, "&gt"))
127 result.append('>');
128 else if (equalLiteral(buffer, "&lrm"))
129 result.append(leftToRightMark);
130 else if (equalLiteral(buffer, "&rlm"))
131 result.append(rightToLeftMark);
132 else if (equalLiteral(buffer, "&nbsp"))
133 result.append(noBreakSpace);
134 else {
135 buffer.append(character);
136 result.append(buffer);
137 }
138 buffer.clear();
139 WEBVTT_ADVANCE_TO(DataState);
140 } else if (isASCIIAlphanumeric(character)) {
141 buffer.append(character);
142 WEBVTT_ADVANCE_TO(EscapeState);
143 } else if (character == '<') {
144 result.append(buffer);
145 return emitToken(token, WebVTTToken::StringToken(result.toString()));
146 } else if (character == kEndOfFileMarker) {
147 result.append(buffer);
148 return advanceAndEmitToken(m_input, token, WebVTTToken::StringToken(result.toString()));
149 } else {
150 result.append(buffer);
151 buffer.clear();
152
153 if (character == '&') {
154 buffer.append('&');
155 WEBVTT_ADVANCE_TO(EscapeState);
156 }
157 result.append(character);
158 WEBVTT_ADVANCE_TO(DataState);
159 }
160
161TagState:
162 if (isTokenizerWhitespace(character)) {
163 ASSERT(result.isEmpty());
164 WEBVTT_ADVANCE_TO(StartTagAnnotationState);
165 } else if (character == '.') {
166 ASSERT(result.isEmpty());
167 WEBVTT_ADVANCE_TO(StartTagClassState);
168 } else if (character == '/') {
169 WEBVTT_ADVANCE_TO(EndTagState);
170 } else if (WTF::isASCIIDigit(character)) {
171 result.append(character);
172 WEBVTT_ADVANCE_TO(TimestampTagState);
173 } else if (character == '>' || character == kEndOfFileMarker) {
174 ASSERT(result.isEmpty());
175 return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString()));
176 } else {
177 result.append(character);
178 WEBVTT_ADVANCE_TO(StartTagState);
179 }
180
181StartTagState:
182 if (isTokenizerWhitespace(character))
183 WEBVTT_ADVANCE_TO(StartTagAnnotationState);
184 else if (character == '.')
185 WEBVTT_ADVANCE_TO(StartTagClassState);
186 else if (character == '>' || character == kEndOfFileMarker)
187 return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString()));
188 else {
189 result.append(character);
190 WEBVTT_ADVANCE_TO(StartTagState);
191 }
192
193StartTagClassState:
194 if (isTokenizerWhitespace(character)) {
195 addNewClass(classes, buffer);
196 buffer.clear();
197 WEBVTT_ADVANCE_TO(StartTagAnnotationState);
198 } else if (character == '.') {
199 addNewClass(classes, buffer);
200 buffer.clear();
201 WEBVTT_ADVANCE_TO(StartTagClassState);
202 } else if (character == '>' || character == kEndOfFileMarker) {
203 addNewClass(classes, buffer);
204 buffer.clear();
205 return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString(), classes.toAtomicString()));
206 } else {
207 buffer.append(character);
208 WEBVTT_ADVANCE_TO(StartTagClassState);
209 }
210
211StartTagAnnotationState:
212 if (character == '>' || character == kEndOfFileMarker)
213 return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString(), classes.toAtomicString(), buffer.toAtomicString()));
214 buffer.append(character);
215 WEBVTT_ADVANCE_TO(StartTagAnnotationState);
216
217EndTagState:
218 if (character == '>' || character == kEndOfFileMarker)
219 return advanceAndEmitToken(m_input, token, WebVTTToken::EndTag(result.toString()));
220 result.append(character);
221 WEBVTT_ADVANCE_TO(EndTagState);
222
223TimestampTagState:
224 if (character == '>' || character == kEndOfFileMarker)
225 return advanceAndEmitToken(m_input, token, WebVTTToken::TimestampTag(result.toString()));
226 result.append(character);
227 WEBVTT_ADVANCE_TO(TimestampTagState);
228}
229
230}
231
232#endif
233