1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33#if ENABLE(MHTML)
34#include "MHTMLParser.h"
35
36#include "MHTMLArchive.h"
37#include "MIMEHeader.h"
38#include "MIMETypeRegistry.h"
39#include "QuotedPrintable.h"
40#include <wtf/text/Base64.h>
41
42namespace WebCore {
43
44static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
45{
46 String line;
47 while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
48 if (line == boundary)
49 return true;
50 }
51 return false;
52}
53
54MHTMLParser::MHTMLParser(SharedBuffer* data)
55 : m_lineReader(data, "\r\n")
56{
57}
58
59RefPtr<MHTMLArchive> MHTMLParser::parseArchive()
60{
61 return parseArchiveWithHeader(MIMEHeader::parseHeader(m_lineReader).get());
62}
63
64RefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
65{
66 if (!header) {
67 LOG_ERROR("Failed to parse MHTML part: no header.");
68 return nullptr;
69 }
70
71 auto archive = MHTMLArchive::create();
72 if (!header->isMultipart()) {
73 // With IE a page with no resource is not multi-part.
74 bool endOfArchiveReached = false;
75 RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
76 if (!resource)
77 return nullptr;
78 archive->setMainResource(resource.releaseNonNull());
79 return archive;
80 }
81
82 // Skip the message content (it's a generic browser specific message).
83 skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
84
85 bool endOfArchive = false;
86 while (!endOfArchive) {
87 RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(m_lineReader);
88 if (!resourceHeader) {
89 LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
90 return nullptr;
91 }
92 if (resourceHeader->contentType() == "multipart/alternative") {
93 // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
94 RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
95 if (!subframeArchive) {
96 LOG_ERROR("Failed to parse MHTML subframe.");
97 return nullptr;
98 }
99 bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
100 ASSERT_UNUSED(endOfPartReached, endOfPartReached);
101 // The top-frame is the first frame found, regardless of the nesting level.
102 if (subframeArchive->mainResource())
103 addResourceToArchive(subframeArchive->mainResource(), archive.ptr());
104 archive->addSubframeArchive(subframeArchive.releaseNonNull());
105 continue;
106 }
107
108 RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
109 if (!resource) {
110 LOG_ERROR("Failed to parse MHTML part.");
111 return nullptr;
112 }
113 addResourceToArchive(resource.get(), archive.ptr());
114 }
115
116 return archive;
117}
118
119void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
120{
121 const String& mimeType = resource->mimeType();
122 if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
123 m_resources.append(resource);
124 return;
125 }
126
127 // The first document suitable resource is the main frame.
128 if (!archive->mainResource()) {
129 archive->setMainResource(*resource);
130 m_frames.append(archive);
131 return;
132 }
133
134 auto subframe = MHTMLArchive::create();
135 subframe->setMainResource(*resource);
136 m_frames.append(WTFMove(subframe));
137}
138
139RefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
140{
141 ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
142
143 auto content = SharedBuffer::create();
144 const bool checkBoundary = !endOfPartBoundary.isEmpty();
145 bool endOfPartReached = false;
146 if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) {
147 if (!checkBoundary) {
148 LOG_ERROR("Binary contents requires end of part");
149 return nullptr;
150 }
151 m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
152 Vector<char> part;
153 if (!m_lineReader.nextChunk(part)) {
154 LOG_ERROR("Binary contents requires end of part");
155 return nullptr;
156 }
157 content->append(WTFMove(part));
158 m_lineReader.setSeparator("\r\n");
159 Vector<char> nextChars;
160 if (m_lineReader.peek(nextChars, 2) != 2) {
161 LOG_ERROR("Invalid seperator.");
162 return nullptr;
163 }
164 endOfPartReached = true;
165 ASSERT(nextChars.size() == 2);
166 endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
167 if (!endOfArchiveReached) {
168 String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
169 if (!line.isEmpty()) {
170 LOG_ERROR("No CRLF at end of binary section.");
171 return nullptr;
172 }
173 }
174 } else {
175 String line;
176 while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
177 endOfArchiveReached = (line == endOfDocumentBoundary);
178 if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
179 endOfPartReached = true;
180 break;
181 }
182 // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
183 content->append(line.utf8().data(), line.length());
184 if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) {
185 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
186 content->append("\r\n", 2);
187 }
188 }
189 }
190 if (!endOfPartReached && checkBoundary) {
191 LOG_ERROR("No bounday found for MHTML part.");
192 return nullptr;
193 }
194
195 Vector<char> data;
196 switch (mimeHeader.contentTransferEncoding()) {
197 case MIMEHeader::Base64:
198 if (!base64Decode(content->data(), content->size(), data)) {
199 LOG_ERROR("Invalid base64 content for MHTML part.");
200 return nullptr;
201 }
202 break;
203 case MIMEHeader::QuotedPrintable:
204 quotedPrintableDecode(content->data(), content->size(), data);
205 break;
206 case MIMEHeader::SevenBit:
207 case MIMEHeader::Binary:
208 data.append(content->data(), content->size());
209 break;
210 default:
211 LOG_ERROR("Invalid encoding for MHTML part.");
212 return nullptr;
213 }
214 auto contentBuffer = SharedBuffer::create(WTFMove(data));
215 // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
216 // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
217 // IE and Firefox (UNMht) seem to generate only absolute URLs.
218 URL location = URL(URL(), mimeHeader.contentLocation());
219 return ArchiveResource::create(WTFMove(contentBuffer), location, mimeHeader.contentType(), mimeHeader.charset(), String());
220}
221
222size_t MHTMLParser::frameCount() const
223{
224 return m_frames.size();
225}
226
227MHTMLArchive* MHTMLParser::frameAt(size_t index) const
228{
229 return m_frames[index].get();
230}
231
232size_t MHTMLParser::subResourceCount() const
233{
234 return m_resources.size();
235}
236
237ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
238{
239 return m_resources[index].get();
240}
241
242}
243#endif
244