| 1 | /* |
| 2 | * Copyright (C) 2011 Google Inc. All rights reserved. |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are |
| 6 | * met: |
| 7 | * |
| 8 | * * Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * * Redistributions in binary form must reproduce the above |
| 11 | * copyright notice, this list of conditions and the following disclaimer |
| 12 | * in the documentation and/or other materials provided with the |
| 13 | * distribution. |
| 14 | * * Neither the name of Google Inc. nor the names of its |
| 15 | * contributors may be used to endorse or promote products derived from |
| 16 | * this software without specific prior written permission. |
| 17 | * |
| 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | */ |
| 30 | |
| 31 | #include "config.h" |
| 32 | |
| 33 | #if ENABLE(MHTML) |
| 34 | |
| 35 | #include "MHTMLArchive.h" |
| 36 | |
| 37 | #include "Document.h" |
| 38 | #include "Frame.h" |
| 39 | #include "MHTMLParser.h" |
| 40 | #include "MIMETypeRegistry.h" |
| 41 | #include "Page.h" |
| 42 | #include "PageSerializer.h" |
| 43 | #include "QuotedPrintable.h" |
| 44 | #include "SchemeRegistry.h" |
| 45 | #include "SharedBuffer.h" |
| 46 | #include <time.h> |
| 47 | #include <wtf/CryptographicallyRandomNumber.h> |
| 48 | #include <wtf/DateMath.h> |
| 49 | #include <wtf/GregorianDateTime.h> |
| 50 | #include <wtf/StdLibExtras.h> |
| 51 | #include <wtf/text/Base64.h> |
| 52 | #include <wtf/text/StringBuilder.h> |
| 53 | |
| 54 | #if HAVE(SYS_TIME_H) |
| 55 | #include <sys/time.h> |
| 56 | #endif |
| 57 | |
| 58 | namespace WebCore { |
| 59 | |
// Content-Transfer-Encoding tokens written into the header of each MHTML part.
const char* const quotedPrintable { "quoted-printable" };
const char* const base64 { "base64" };
| 62 | |
| 63 | static String generateRandomBoundary() |
| 64 | { |
| 65 | // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0). |
| 66 | const size_t randomValuesLength = 10; |
| 67 | char randomValues[randomValuesLength]; |
| 68 | cryptographicallyRandomValues(&randomValues, randomValuesLength); |
| 69 | StringBuilder stringBuilder; |
| 70 | stringBuilder.append("----=_NextPart_000_" ); |
| 71 | for (size_t i = 0; i < randomValuesLength; ++i) { |
| 72 | if (i == 2) |
| 73 | stringBuilder.append('_'); |
| 74 | else if (i == 6) |
| 75 | stringBuilder.append('.'); |
| 76 | stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i])); |
| 77 | stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i])); |
| 78 | } |
| 79 | return stringBuilder.toString(); |
| 80 | } |
| 81 | |
| 82 | static String replaceNonPrintableCharacters(const String& text) |
| 83 | { |
| 84 | StringBuilder stringBuilder; |
| 85 | for (size_t i = 0; i < text.length(); ++i) { |
| 86 | if (isASCIIPrintable(text[i])) |
| 87 | stringBuilder.append(text[i]); |
| 88 | else |
| 89 | stringBuilder.append('?'); |
| 90 | } |
| 91 | return stringBuilder.toString(); |
| 92 | } |
| 93 | |
| 94 | MHTMLArchive::MHTMLArchive() |
| 95 | { |
| 96 | } |
| 97 | |
| 98 | MHTMLArchive::~MHTMLArchive() |
| 99 | { |
| 100 | // Because all frames know about each other we need to perform a deep clearing of the archives graph. |
| 101 | clearAllSubframeArchives(); |
| 102 | } |
| 103 | |
| 104 | Ref<MHTMLArchive> MHTMLArchive::create() |
| 105 | { |
| 106 | return adoptRef(*new MHTMLArchive); |
| 107 | } |
| 108 | |
| 109 | RefPtr<MHTMLArchive> MHTMLArchive::create(const URL& url, SharedBuffer& data) |
| 110 | { |
| 111 | // For security reasons we only load MHTML pages from local URLs. |
| 112 | if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol().toString())) |
| 113 | return nullptr; |
| 114 | |
| 115 | MHTMLParser parser(&data); |
| 116 | RefPtr<MHTMLArchive> mainArchive = parser.parseArchive(); |
| 117 | if (!mainArchive) |
| 118 | return nullptr; // Invalid MHTML file. |
| 119 | |
| 120 | // Since MHTML is a flat format, we need to make all frames aware of all resources. |
| 121 | for (size_t i = 0; i < parser.frameCount(); ++i) { |
| 122 | RefPtr<MHTMLArchive> archive = parser.frameAt(i); |
| 123 | for (size_t j = 1; j < parser.frameCount(); ++j) { |
| 124 | if (i != j) |
| 125 | archive->addSubframeArchive(*parser.frameAt(j)); |
| 126 | } |
| 127 | for (size_t j = 0; j < parser.subResourceCount(); ++j) |
| 128 | archive->addSubresource(*parser.subResourceAt(j)); |
| 129 | } |
| 130 | return mainArchive; |
| 131 | } |
| 132 | |
| 133 | Ref<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page) |
| 134 | { |
| 135 | Vector<PageSerializer::Resource> resources; |
| 136 | PageSerializer pageSerializer(resources); |
| 137 | pageSerializer.serialize(*page); |
| 138 | |
| 139 | String boundary = generateRandomBoundary(); |
| 140 | String endOfResourceBoundary = makeString("--" , boundary, "\r\n" ); |
| 141 | |
| 142 | GregorianDateTime now; |
| 143 | now.setToCurrentLocalTime(); |
| 144 | String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60); |
| 145 | |
| 146 | StringBuilder stringBuilder; |
| 147 | stringBuilder.append("From: <Saved by WebKit>\r\n" ); |
| 148 | stringBuilder.append("Subject: " ); |
| 149 | // We replace non ASCII characters with '?' characters to match IE's behavior. |
| 150 | stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame().document()->title())); |
| 151 | stringBuilder.append("\r\nDate: " ); |
| 152 | stringBuilder.append(dateString); |
| 153 | stringBuilder.append("\r\nMIME-Version: 1.0\r\n" ); |
| 154 | stringBuilder.append("Content-Type: multipart/related;\r\n" ); |
| 155 | stringBuilder.append("\ttype=\"" ); |
| 156 | stringBuilder.append(page->mainFrame().document()->suggestedMIMEType()); |
| 157 | stringBuilder.append("\";\r\n" ); |
| 158 | stringBuilder.append("\tboundary=\"" ); |
| 159 | stringBuilder.append(boundary); |
| 160 | stringBuilder.append("\"\r\n\r\n" ); |
| 161 | |
| 162 | // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it). |
| 163 | ASSERT(stringBuilder.toString().isAllASCII()); |
| 164 | CString asciiString = stringBuilder.toString().utf8(); |
| 165 | auto mhtmlData = SharedBuffer::create(); |
| 166 | mhtmlData->append(asciiString.data(), asciiString.length()); |
| 167 | |
| 168 | for (auto& resource : resources) { |
| 169 | stringBuilder.clear(); |
| 170 | stringBuilder.append(endOfResourceBoundary); |
| 171 | stringBuilder.append("Content-Type: " ); |
| 172 | stringBuilder.append(resource.mimeType); |
| 173 | |
| 174 | const char* contentEncoding = nullptr; |
| 175 | if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType)) |
| 176 | contentEncoding = quotedPrintable; |
| 177 | else |
| 178 | contentEncoding = base64; |
| 179 | |
| 180 | stringBuilder.append("\r\nContent-Transfer-Encoding: " ); |
| 181 | stringBuilder.append(contentEncoding); |
| 182 | stringBuilder.append("\r\nContent-Location: " ); |
| 183 | stringBuilder.append(resource.url); |
| 184 | stringBuilder.append("\r\n\r\n" ); |
| 185 | |
| 186 | asciiString = stringBuilder.toString().utf8(); |
| 187 | mhtmlData->append(asciiString.data(), asciiString.length()); |
| 188 | |
| 189 | // FIXME: ideally we would encode the content as a stream without having to fetch it all. |
| 190 | const char* data = resource.data->data(); |
| 191 | size_t dataLength = resource.data->size(); |
| 192 | Vector<char> encodedData; |
| 193 | if (!strcmp(contentEncoding, quotedPrintable)) { |
| 194 | quotedPrintableEncode(data, dataLength, encodedData); |
| 195 | mhtmlData->append(encodedData.data(), encodedData.size()); |
| 196 | mhtmlData->append("\r\n" , 2); |
| 197 | } else { |
| 198 | ASSERT(!strcmp(contentEncoding, base64)); |
| 199 | // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs. |
| 200 | base64Encode(data, dataLength, encodedData); |
| 201 | const size_t maximumLineLength = 76; |
| 202 | size_t index = 0; |
| 203 | size_t encodedDataLength = encodedData.size(); |
| 204 | do { |
| 205 | size_t lineLength = std::min(encodedDataLength - index, maximumLineLength); |
| 206 | mhtmlData->append(encodedData.data() + index, lineLength); |
| 207 | mhtmlData->append("\r\n" , 2); |
| 208 | index += maximumLineLength; |
| 209 | } while (index < encodedDataLength); |
| 210 | } |
| 211 | } |
| 212 | |
| 213 | asciiString = makeString("--" , boundary, "--\r\n" ).utf8(); |
| 214 | mhtmlData->append(asciiString.data(), asciiString.length()); |
| 215 | |
| 216 | return mhtmlData; |
| 217 | } |
| 218 | |
| 219 | } |
| 220 | |
| 221 | #endif |
| 222 | |