1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLEntityParser.h"
30
31#include "CharacterReferenceParserInlines.h"
32#include "HTMLEntitySearch.h"
33#include "HTMLEntityTable.h"
34#include <wtf/text/StringBuilder.h>
35#include <wtf/unicode/CharacterNames.h>
36
37namespace WebCore {
38
39static const UChar windowsLatin1ExtensionArray[32] = {
40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
44};
45
46class HTMLEntityParser {
47public:
48 static UChar32 legalEntityFor(UChar32 value)
49 {
50 if (value <= 0 || value > UCHAR_MAX_VALUE || U_IS_SURROGATE(value))
51 return replacementCharacter;
52 if ((value & ~0x1F) != 0x80)
53 return value;
54 return windowsLatin1ExtensionArray[value - 0x80];
55 }
56
57 static bool acceptMalformed() { return true; }
58
59 static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
60 {
61 StringBuilder consumedCharacters;
62 HTMLEntitySearch entitySearch;
63 while (!source.isEmpty()) {
64 cc = source.currentCharacter();
65 entitySearch.advance(cc);
66 if (!entitySearch.isEntityPrefix())
67 break;
68 consumedCharacters.append(cc);
69 source.advancePastNonNewline();
70 }
71 notEnoughCharacters = source.isEmpty();
72 if (notEnoughCharacters) {
73 // We can't an entity because there might be a longer entity
74 // that we could match if we had more data.
75 unconsumeCharacters(source, consumedCharacters);
76 return false;
77 }
78 if (!entitySearch.mostRecentMatch()) {
79 unconsumeCharacters(source, consumedCharacters);
80 return false;
81 }
82 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
83 // We've consumed too many characters. We need to walk the
84 // source back to the point at which we had consumed an
85 // actual entity.
86 unconsumeCharacters(source, consumedCharacters);
87 consumedCharacters.clear();
88 const int length = entitySearch.mostRecentMatch()->length;
89 const LChar* reference = entitySearch.mostRecentMatch()->entity;
90 for (int i = 0; i < length; ++i) {
91 cc = source.currentCharacter();
92 ASSERT_UNUSED(reference, cc == *reference++);
93 consumedCharacters.append(cc);
94 source.advancePastNonNewline();
95 ASSERT(!source.isEmpty());
96 }
97 cc = source.currentCharacter();
98 }
99 if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
100 || !additionalAllowedCharacter
101 || !(isASCIIAlphanumeric(cc) || cc == '=')) {
102 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
103 if (entitySearch.mostRecentMatch()->secondValue)
104 decodedEntity.append(entitySearch.mostRecentMatch()->secondValue);
105 return true;
106 }
107 unconsumeCharacters(source, consumedCharacters);
108 return false;
109 }
110};
111
112bool consumeHTMLEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
113{
114 return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
115}
116
117static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
118{
119 if (U_IS_BMP(value)) {
120 UChar character = static_cast<UChar>(value);
121 ASSERT(character == value);
122 result[0] = character;
123 return 1;
124 }
125
126 result[0] = U16_LEAD(value);
127 result[1] = U16_TRAIL(value);
128 return 2;
129}
130
131size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
132{
133 HTMLEntitySearch search;
134 while (*name) {
135 search.advance(*name++);
136 if (!search.isEntityPrefix())
137 return 0;
138 }
139 search.advance(';');
140 if (!search.isEntityPrefix())
141 return 0;
142
143 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
144 if (!search.mostRecentMatch()->secondValue)
145 return numberOfCodePoints;
146 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
147}
148
149} // namespace WebCore
150