1 | /* |
2 | * Copyright (C) 2011 Google Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are |
6 | * met: |
7 | * |
8 | * * Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * * Redistributions in binary form must reproduce the above |
11 | * copyright notice, this list of conditions and the following disclaimer |
12 | * in the documentation and/or other materials provided with the |
13 | * distribution. |
14 | * * Neither the name of Google Inc. nor the names of its |
15 | * contributors may be used to endorse or promote products derived from |
16 | * this software without specific prior written permission. |
17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | */ |
30 | |
31 | #include "config.h" |
32 | |
33 | #if ENABLE(MHTML) |
34 | #include "MHTMLParser.h" |
35 | |
36 | #include "MHTMLArchive.h" |
37 | #include "MIMEHeader.h" |
38 | #include "MIMETypeRegistry.h" |
39 | #include "QuotedPrintable.h" |
40 | #include <wtf/text/Base64.h> |
41 | |
42 | namespace WebCore { |
43 | |
44 | static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) |
45 | { |
46 | String line; |
47 | while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { |
48 | if (line == boundary) |
49 | return true; |
50 | } |
51 | return false; |
52 | } |
53 | |
54 | MHTMLParser::MHTMLParser(SharedBuffer* data) |
55 | : m_lineReader(data, "\r\n" ) |
56 | { |
57 | } |
58 | |
59 | RefPtr<MHTMLArchive> MHTMLParser::parseArchive() |
60 | { |
61 | return parseArchiveWithHeader(MIMEHeader::parseHeader(m_lineReader).get()); |
62 | } |
63 | |
64 | RefPtr<MHTMLArchive> MHTMLParser::(MIMEHeader* ) |
65 | { |
66 | if (!header) { |
67 | LOG_ERROR("Failed to parse MHTML part: no header." ); |
68 | return nullptr; |
69 | } |
70 | |
71 | auto archive = MHTMLArchive::create(); |
72 | if (!header->isMultipart()) { |
73 | // With IE a page with no resource is not multi-part. |
74 | bool endOfArchiveReached = false; |
75 | RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached); |
76 | if (!resource) |
77 | return nullptr; |
78 | archive->setMainResource(resource.releaseNonNull()); |
79 | return archive; |
80 | } |
81 | |
82 | // Skip the message content (it's a generic browser specific message). |
83 | skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); |
84 | |
85 | bool endOfArchive = false; |
86 | while (!endOfArchive) { |
87 | RefPtr<MIMEHeader> = MIMEHeader::parseHeader(m_lineReader); |
88 | if (!resourceHeader) { |
89 | LOG_ERROR("Failed to parse MHTML, invalid MIME header." ); |
90 | return nullptr; |
91 | } |
92 | if (resourceHeader->contentType() == "multipart/alternative" ) { |
93 | // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). |
94 | RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get()); |
95 | if (!subframeArchive) { |
96 | LOG_ERROR("Failed to parse MHTML subframe." ); |
97 | return nullptr; |
98 | } |
99 | bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); |
100 | ASSERT_UNUSED(endOfPartReached, endOfPartReached); |
101 | // The top-frame is the first frame found, regardless of the nesting level. |
102 | if (subframeArchive->mainResource()) |
103 | addResourceToArchive(subframeArchive->mainResource(), archive.ptr()); |
104 | archive->addSubframeArchive(subframeArchive.releaseNonNull()); |
105 | continue; |
106 | } |
107 | |
108 | RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); |
109 | if (!resource) { |
110 | LOG_ERROR("Failed to parse MHTML part." ); |
111 | return nullptr; |
112 | } |
113 | addResourceToArchive(resource.get(), archive.ptr()); |
114 | } |
115 | |
116 | return archive; |
117 | } |
118 | |
119 | void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) |
120 | { |
121 | const String& mimeType = resource->mimeType(); |
122 | if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css" ) { |
123 | m_resources.append(resource); |
124 | return; |
125 | } |
126 | |
127 | // The first document suitable resource is the main frame. |
128 | if (!archive->mainResource()) { |
129 | archive->setMainResource(*resource); |
130 | m_frames.append(archive); |
131 | return; |
132 | } |
133 | |
134 | auto subframe = MHTMLArchive::create(); |
135 | subframe->setMainResource(*resource); |
136 | m_frames.append(WTFMove(subframe)); |
137 | } |
138 | |
139 | RefPtr<ArchiveResource> MHTMLParser::(const MIMEHeader& , const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) |
140 | { |
141 | ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); |
142 | |
143 | auto content = SharedBuffer::create(); |
144 | const bool checkBoundary = !endOfPartBoundary.isEmpty(); |
145 | bool endOfPartReached = false; |
146 | if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) { |
147 | if (!checkBoundary) { |
148 | LOG_ERROR("Binary contents requires end of part" ); |
149 | return nullptr; |
150 | } |
151 | m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); |
152 | Vector<char> part; |
153 | if (!m_lineReader.nextChunk(part)) { |
154 | LOG_ERROR("Binary contents requires end of part" ); |
155 | return nullptr; |
156 | } |
157 | content->append(WTFMove(part)); |
158 | m_lineReader.setSeparator("\r\n" ); |
159 | Vector<char> nextChars; |
160 | if (m_lineReader.peek(nextChars, 2) != 2) { |
161 | LOG_ERROR("Invalid seperator." ); |
162 | return nullptr; |
163 | } |
164 | endOfPartReached = true; |
165 | ASSERT(nextChars.size() == 2); |
166 | endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); |
167 | if (!endOfArchiveReached) { |
168 | String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); |
169 | if (!line.isEmpty()) { |
170 | LOG_ERROR("No CRLF at end of binary section." ); |
171 | return nullptr; |
172 | } |
173 | } |
174 | } else { |
175 | String line; |
176 | while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { |
177 | endOfArchiveReached = (line == endOfDocumentBoundary); |
178 | if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { |
179 | endOfPartReached = true; |
180 | break; |
181 | } |
182 | // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. |
183 | content->append(line.utf8().data(), line.length()); |
184 | if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) { |
185 | // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. |
186 | content->append("\r\n" , 2); |
187 | } |
188 | } |
189 | } |
190 | if (!endOfPartReached && checkBoundary) { |
191 | LOG_ERROR("No bounday found for MHTML part." ); |
192 | return nullptr; |
193 | } |
194 | |
195 | Vector<char> data; |
196 | switch (mimeHeader.contentTransferEncoding()) { |
197 | case MIMEHeader::Base64: |
198 | if (!base64Decode(content->data(), content->size(), data)) { |
199 | LOG_ERROR("Invalid base64 content for MHTML part." ); |
200 | return nullptr; |
201 | } |
202 | break; |
203 | case MIMEHeader::QuotedPrintable: |
204 | quotedPrintableDecode(content->data(), content->size(), data); |
205 | break; |
206 | case MIMEHeader::SevenBit: |
207 | case MIMEHeader::Binary: |
208 | data.append(content->data(), content->size()); |
209 | break; |
210 | default: |
211 | LOG_ERROR("Invalid encoding for MHTML part." ); |
212 | return nullptr; |
213 | } |
214 | auto contentBuffer = SharedBuffer::create(WTFMove(data)); |
215 | // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. |
216 | // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 |
217 | // IE and Firefox (UNMht) seem to generate only absolute URLs. |
218 | URL location = URL(URL(), mimeHeader.contentLocation()); |
219 | return ArchiveResource::create(WTFMove(contentBuffer), location, mimeHeader.contentType(), mimeHeader.charset(), String()); |
220 | } |
221 | |
222 | size_t MHTMLParser::frameCount() const |
223 | { |
224 | return m_frames.size(); |
225 | } |
226 | |
227 | MHTMLArchive* MHTMLParser::frameAt(size_t index) const |
228 | { |
229 | return m_frames[index].get(); |
230 | } |
231 | |
232 | size_t MHTMLParser::subResourceCount() const |
233 | { |
234 | return m_resources.size(); |
235 | } |
236 | |
237 | ArchiveResource* MHTMLParser::subResourceAt(size_t index) const |
238 | { |
239 | return m_resources[index].get(); |
240 | } |
241 | |
242 | } |
243 | #endif |
244 | |