| 1 | /* |
| 2 | * Copyright (C) 2011 Google Inc. All rights reserved. |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are |
| 6 | * met: |
| 7 | * |
| 8 | * * Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * * Redistributions in binary form must reproduce the above |
| 11 | * copyright notice, this list of conditions and the following disclaimer |
| 12 | * in the documentation and/or other materials provided with the |
| 13 | * distribution. |
| 14 | * * Neither the name of Google Inc. nor the names of its |
| 15 | * contributors may be used to endorse or promote products derived from |
| 16 | * this software without specific prior written permission. |
| 17 | * |
| 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | */ |
| 30 | |
| 31 | #include "config.h" |
| 32 | |
| 33 | #if ENABLE(MHTML) |
| 34 | #include "MHTMLParser.h" |
| 35 | |
| 36 | #include "MHTMLArchive.h" |
| 37 | #include "MIMEHeader.h" |
| 38 | #include "MIMETypeRegistry.h" |
| 39 | #include "QuotedPrintable.h" |
| 40 | #include <wtf/text/Base64.h> |
| 41 | |
| 42 | namespace WebCore { |
| 43 | |
| 44 | static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) |
| 45 | { |
| 46 | String line; |
| 47 | while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { |
| 48 | if (line == boundary) |
| 49 | return true; |
| 50 | } |
| 51 | return false; |
| 52 | } |
| 53 | |
| 54 | MHTMLParser::MHTMLParser(SharedBuffer* data) |
| 55 | : m_lineReader(data, "\r\n" ) |
| 56 | { |
| 57 | } |
| 58 | |
| 59 | RefPtr<MHTMLArchive> MHTMLParser::parseArchive() |
| 60 | { |
| 61 | return parseArchiveWithHeader(MIMEHeader::parseHeader(m_lineReader).get()); |
| 62 | } |
| 63 | |
| 64 | RefPtr<MHTMLArchive> MHTMLParser::(MIMEHeader* ) |
| 65 | { |
| 66 | if (!header) { |
| 67 | LOG_ERROR("Failed to parse MHTML part: no header." ); |
| 68 | return nullptr; |
| 69 | } |
| 70 | |
| 71 | auto archive = MHTMLArchive::create(); |
| 72 | if (!header->isMultipart()) { |
| 73 | // With IE a page with no resource is not multi-part. |
| 74 | bool endOfArchiveReached = false; |
| 75 | RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached); |
| 76 | if (!resource) |
| 77 | return nullptr; |
| 78 | archive->setMainResource(resource.releaseNonNull()); |
| 79 | return archive; |
| 80 | } |
| 81 | |
| 82 | // Skip the message content (it's a generic browser specific message). |
| 83 | skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); |
| 84 | |
| 85 | bool endOfArchive = false; |
| 86 | while (!endOfArchive) { |
| 87 | RefPtr<MIMEHeader> = MIMEHeader::parseHeader(m_lineReader); |
| 88 | if (!resourceHeader) { |
| 89 | LOG_ERROR("Failed to parse MHTML, invalid MIME header." ); |
| 90 | return nullptr; |
| 91 | } |
| 92 | if (resourceHeader->contentType() == "multipart/alternative" ) { |
| 93 | // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). |
| 94 | RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get()); |
| 95 | if (!subframeArchive) { |
| 96 | LOG_ERROR("Failed to parse MHTML subframe." ); |
| 97 | return nullptr; |
| 98 | } |
| 99 | bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); |
| 100 | ASSERT_UNUSED(endOfPartReached, endOfPartReached); |
| 101 | // The top-frame is the first frame found, regardless of the nesting level. |
| 102 | if (subframeArchive->mainResource()) |
| 103 | addResourceToArchive(subframeArchive->mainResource(), archive.ptr()); |
| 104 | archive->addSubframeArchive(subframeArchive.releaseNonNull()); |
| 105 | continue; |
| 106 | } |
| 107 | |
| 108 | RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); |
| 109 | if (!resource) { |
| 110 | LOG_ERROR("Failed to parse MHTML part." ); |
| 111 | return nullptr; |
| 112 | } |
| 113 | addResourceToArchive(resource.get(), archive.ptr()); |
| 114 | } |
| 115 | |
| 116 | return archive; |
| 117 | } |
| 118 | |
| 119 | void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) |
| 120 | { |
| 121 | const String& mimeType = resource->mimeType(); |
| 122 | if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css" ) { |
| 123 | m_resources.append(resource); |
| 124 | return; |
| 125 | } |
| 126 | |
| 127 | // The first document suitable resource is the main frame. |
| 128 | if (!archive->mainResource()) { |
| 129 | archive->setMainResource(*resource); |
| 130 | m_frames.append(archive); |
| 131 | return; |
| 132 | } |
| 133 | |
| 134 | auto subframe = MHTMLArchive::create(); |
| 135 | subframe->setMainResource(*resource); |
| 136 | m_frames.append(WTFMove(subframe)); |
| 137 | } |
| 138 | |
| 139 | RefPtr<ArchiveResource> MHTMLParser::(const MIMEHeader& , const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) |
| 140 | { |
| 141 | ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); |
| 142 | |
| 143 | auto content = SharedBuffer::create(); |
| 144 | const bool checkBoundary = !endOfPartBoundary.isEmpty(); |
| 145 | bool endOfPartReached = false; |
| 146 | if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) { |
| 147 | if (!checkBoundary) { |
| 148 | LOG_ERROR("Binary contents requires end of part" ); |
| 149 | return nullptr; |
| 150 | } |
| 151 | m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); |
| 152 | Vector<char> part; |
| 153 | if (!m_lineReader.nextChunk(part)) { |
| 154 | LOG_ERROR("Binary contents requires end of part" ); |
| 155 | return nullptr; |
| 156 | } |
| 157 | content->append(WTFMove(part)); |
| 158 | m_lineReader.setSeparator("\r\n" ); |
| 159 | Vector<char> nextChars; |
| 160 | if (m_lineReader.peek(nextChars, 2) != 2) { |
| 161 | LOG_ERROR("Invalid seperator." ); |
| 162 | return nullptr; |
| 163 | } |
| 164 | endOfPartReached = true; |
| 165 | ASSERT(nextChars.size() == 2); |
| 166 | endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); |
| 167 | if (!endOfArchiveReached) { |
| 168 | String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); |
| 169 | if (!line.isEmpty()) { |
| 170 | LOG_ERROR("No CRLF at end of binary section." ); |
| 171 | return nullptr; |
| 172 | } |
| 173 | } |
| 174 | } else { |
| 175 | String line; |
| 176 | while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { |
| 177 | endOfArchiveReached = (line == endOfDocumentBoundary); |
| 178 | if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { |
| 179 | endOfPartReached = true; |
| 180 | break; |
| 181 | } |
| 182 | // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. |
| 183 | content->append(line.utf8().data(), line.length()); |
| 184 | if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) { |
| 185 | // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. |
| 186 | content->append("\r\n" , 2); |
| 187 | } |
| 188 | } |
| 189 | } |
| 190 | if (!endOfPartReached && checkBoundary) { |
| 191 | LOG_ERROR("No bounday found for MHTML part." ); |
| 192 | return nullptr; |
| 193 | } |
| 194 | |
| 195 | Vector<char> data; |
| 196 | switch (mimeHeader.contentTransferEncoding()) { |
| 197 | case MIMEHeader::Base64: |
| 198 | if (!base64Decode(content->data(), content->size(), data)) { |
| 199 | LOG_ERROR("Invalid base64 content for MHTML part." ); |
| 200 | return nullptr; |
| 201 | } |
| 202 | break; |
| 203 | case MIMEHeader::QuotedPrintable: |
| 204 | quotedPrintableDecode(content->data(), content->size(), data); |
| 205 | break; |
| 206 | case MIMEHeader::SevenBit: |
| 207 | case MIMEHeader::Binary: |
| 208 | data.append(content->data(), content->size()); |
| 209 | break; |
| 210 | default: |
| 211 | LOG_ERROR("Invalid encoding for MHTML part." ); |
| 212 | return nullptr; |
| 213 | } |
| 214 | auto contentBuffer = SharedBuffer::create(WTFMove(data)); |
| 215 | // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. |
| 216 | // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 |
| 217 | // IE and Firefox (UNMht) seem to generate only absolute URLs. |
| 218 | URL location = URL(URL(), mimeHeader.contentLocation()); |
| 219 | return ArchiveResource::create(WTFMove(contentBuffer), location, mimeHeader.contentType(), mimeHeader.charset(), String()); |
| 220 | } |
| 221 | |
| 222 | size_t MHTMLParser::frameCount() const |
| 223 | { |
| 224 | return m_frames.size(); |
| 225 | } |
| 226 | |
| 227 | MHTMLArchive* MHTMLParser::frameAt(size_t index) const |
| 228 | { |
| 229 | return m_frames[index].get(); |
| 230 | } |
| 231 | |
| 232 | size_t MHTMLParser::subResourceCount() const |
| 233 | { |
| 234 | return m_resources.size(); |
| 235 | } |
| 236 | |
| 237 | ArchiveResource* MHTMLParser::subResourceAt(size_t index) const |
| 238 | { |
| 239 | return m_resources[index].get(); |
| 240 | } |
| 241 | |
| 242 | } |
| 243 | #endif |
| 244 | |