MHTMLParser.cpp source code [webkit/Source/WebCore/loader/archive/mhtml/MHTMLParser.cpp]

1	/*
2	* Copyright (C) 2011 Google Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are
6	* met:
7	*
8	* * Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* * Redistributions in binary form must reproduce the above
11	* copyright notice, this list of conditions and the following disclaimer
12	* in the documentation and/or other materials provided with the
13	* distribution.
14	* * Neither the name of Google Inc. nor the names of its
15	* contributors may be used to endorse or promote products derived from
16	* this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29	*/
30
31	#include "config.h"
32
33	#if ENABLE(MHTML)
34	#include "MHTMLParser.h"
35
36	#include "MHTMLArchive.h"
37	#include "MIMEHeader.h"
38	#include "MIMETypeRegistry.h"
39	#include "QuotedPrintable.h"
40	#include <wtf/text/Base64.h>
41
42	namespace WebCore {
43
44	static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
45	{
46	String line;
47	while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
48	if (line == boundary)
49	return true;
50	}
51	return false;
52	}
53
54	MHTMLParser::MHTMLParser(SharedBuffer* data)
55	: m_lineReader (data, "\r\n")
56	{
57	}
58
59	RefPtr<MHTMLArchive> MHTMLParser::parseArchive()
60	{
61	return parseArchiveWithHeader(MIMEHeader::parseHeader(m_lineReader).get());
62	}
63
64	RefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
65	{
66	if (!header) {
67	LOG_ERROR("Failed to parse MHTML part: no header.");
68	return nullptr;
69	}
70
71	auto archive = MHTMLArchive::create();
72	if (!header->isMultipart()) {
73	// With IE a page with no resource is not multi-part.
74	bool endOfArchiveReached = false;
75	RefPtr<ArchiveResource> resource = parseNextPart(*header, String (), String (), endOfArchiveReached);
76	if (!resource)
77	return nullptr;
78	archive ->setMainResource(resource.releaseNonNull());
79	return archive;
80	}
81
82	// Skip the message content (it's a generic browser specific message).
83	skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
84
85	bool endOfArchive = false;
86	while (!endOfArchive) {
87	RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(m_lineReader);
88	if (!resourceHeader) {
89	LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
90	return nullptr;
91	}
92	if (resourceHeader ->contentType() == "multipart/alternative") {
93	// Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
94	RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
95	if (!subframeArchive) {
96	LOG_ERROR("Failed to parse MHTML subframe.");
97	return nullptr;
98	}
99	bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
100	ASSERT_UNUSED(endOfPartReached, endOfPartReached);
101	// The top-frame is the first frame found, regardless of the nesting level.
102	if (subframeArchive ->mainResource())
103	addResourceToArchive(subframeArchive ->mainResource(), archive.ptr());
104	archive ->addSubframeArchive(subframeArchive.releaseNonNull());
105	continue;
106	}
107
108	RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
109	if (!resource) {
110	LOG_ERROR("Failed to parse MHTML part.");
111	return nullptr;
112	}
113	addResourceToArchive(resource.get(), archive.ptr());
114	}
115
116	return archive;
117	}
118
119	void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
120	{
121	const String& mimeType = resource->mimeType();
122	if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) \|\| MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) \|\| mimeType == "text/css") {
123	m_resources.append(resource);
124	return;
125	}
126
127	// The first document suitable resource is the main frame.
128	if (!archive->mainResource()) {
129	archive->setMainResource(*resource);
130	m_frames.append(archive);
131	return;
132	}
133
134	auto subframe = MHTMLArchive::create();
135	subframe ->setMainResource(*resource);
136	m_frames.append(WTFMove(subframe));
137	}
138
139	RefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
140	{
141	ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
142
143	auto content = SharedBuffer::create();
144	const bool checkBoundary = !endOfPartBoundary.isEmpty();
145	bool endOfPartReached = false;
146	if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) {
147	if (!checkBoundary) {
148	LOG_ERROR("Binary contents requires end of part");
149	return nullptr;
150	}
151	m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
152	Vector<char> part;
153	if (!m_lineReader.nextChunk(part)) {
154	LOG_ERROR("Binary contents requires end of part");
155	return nullptr;
156	}
157	content ->append(WTFMove(part));
158	m_lineReader.setSeparator("\r\n");
159	Vector<char> nextChars;
160	if (m_lineReader.peek(nextChars, `2`) != `2`) {
161	LOG_ERROR("Invalid seperator.");
162	return nullptr;
163	}
164	endOfPartReached = true;
165	ASSERT(nextChars.size() == `2`);
166	endOfArchiveReached = (nextChars [`0`] == `'-'` && nextChars [`1`] == `'-'`);
167	if (!endOfArchiveReached) {
168	String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
169	if (!line.isEmpty()) {
170	LOG_ERROR("No CRLF at end of binary section.");
171	return nullptr;
172	}
173	}
174	} else {
175	String line;
176	while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
177	endOfArchiveReached = (line == endOfDocumentBoundary);
178	if (checkBoundary && (line == endOfPartBoundary \|\| endOfArchiveReached)) {
179	endOfPartReached = true;
180	break;
181	}
182	// Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
183	content ->append(line.utf8().data(), line.length());
184	if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) {
185	// The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
186	content ->append("\r\n", `2`);
187	}
188	}
189	}
190	if (!endOfPartReached && checkBoundary) {
191	LOG_ERROR("No bounday found for MHTML part.");
192	return nullptr;
193	}
194
195	Vector<char> data;
196	switch (mimeHeader.contentTransferEncoding()) {
197	case MIMEHeader::Base64:
198	if (!base64Decode(content ->data(), content ->size(), data)) {
199	LOG_ERROR("Invalid base64 content for MHTML part.");
200	return nullptr;
201	}
202	break;
203	case MIMEHeader::QuotedPrintable:
204	quotedPrintableDecode(content ->data(), content ->size(), data);
205	break;
206	case MIMEHeader::SevenBit:
207	case MIMEHeader::Binary:
208	data.append(content ->data(), content ->size());
209	break;
210	default:
211	LOG_ERROR("Invalid encoding for MHTML part.");
212	return nullptr;
213	}
214	auto contentBuffer = SharedBuffer::create(WTFMove(data));
215	// FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
216	// The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
217	// IE and Firefox (UNMht) seem to generate only absolute URLs.
218	URL location = URL (URL (), mimeHeader.contentLocation());
219	return ArchiveResource::create(WTFMove(contentBuffer), location, mimeHeader.contentType(), mimeHeader.charset(), String ());
220	}
221
222	size_t MHTMLParser::frameCount() const
223	{
224	return m_frames.size();
225	}
226
227	MHTMLArchive* MHTMLParser::frameAt(size_t index) const
228	{
229	return m_frames [index].get();
230	}
231
232	size_t MHTMLParser::subResourceCount() const
233	{
234	return m_resources.size();
235	}
236
237	ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
238	{
239	return m_resources [index].get();
240	}
241
242	}
243	#endif
244

Browse the source code of webkit/Source/WebCore/loader/archive/mhtml/MHTMLParser.cpp