diff --git a/plugins/webarchiver/archivedialog.cpp b/plugins/webarchiver/archivedialog.cpp index e297251e0..4a734445b 100644 --- a/plugins/webarchiver/archivedialog.cpp +++ b/plugins/webarchiver/archivedialog.cpp @@ -1,1375 +1,1374 @@ /* Copyright (C) 2001 Andreas Schlapbach Copyright (C) 2003 Antonio Larrosa Copyright (C) 2008 Matthias Grimrath This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ // The DOM-tree is recursed twice. The first run gathers all URLs while the second // run writes out all HTML frames and CSS stylesheets. These two distinct runs are // necessary, because some frames and/or stylesheets may be dropped (for example // a frame currently not displayed or deemed insecure). In that case an URL that // points to such a frame/stylesheet has to be removed. Since the URL may be mentioned // earlier before recursing to the to-be-removed frame, two runs are necessary to get // a complete list of URLs that should be archived. // Changelog // * replace dynamic_cast<> and ->inherits() with qobject_cast<> // * use QHash instead of QMap; get rid of Ordered<> class // * fixed crash / assertion on Konqueror exit after a webpage was archived // See comment about KHTMLView parent widget in plugin_webarchiver.cpp // * Using KDE4/Qt4 QUrl::equals() and QUrl::fragment() to compare Urls // * KHTML stores comment with a trailing '-'. Looks like some off-by-one bug. // * Add mimetype indicating suffix to downloaded files. // DONE CSS mentioned in elements that are not parsed by Konqueror did not get their // href='' resolved/removed // TODO if href= etc links in a frameset refer to frames currently displayed, make links relative // to archived page instead of absolute // TODO KDE4 webarchiver: look at m_bPreserveWS // TODO KDE4 webarchiver: look at closing tags // TODO check if PartFrameData::framesWithName get a 'KHTMLPart *' if any // TODO KHTMLPart::frames(): Is it possible to have NULL pointers in returned list? // TODO If downloaded object need no data conversion, use KIO::file_copy or signal data() // TODO KDE4 check what KHTMLPart is doing on job->addMetaData() // TODO KDE4 use HTMLScriptElementImpl::charset() to get charset="" attribute of elements #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "archivedialog.h" #include "webarchiverdebug.h" //KDELibs4Support #include // Set to true if you have a patched http-io-slave that has // improved offline-browsing functionality. static const bool patchedHttpSlave = false; #define CONTENT_TYPE "" // // Qt 4.x offers a @c foreach pseudo keyword. This is however slightly slower than FOR_ITER // because @c foreach makes a shared copy of the container. // #define FOR_ITER(type,var,it) for (type::iterator it(var.begin()), it##end(var.end()); it != it##end; ++it) #define FOR_CONST_ITER(type,var,it) for (type::const_iterator it(var.begin()), it##end(var.end()); it != it##end; ++it) #define FOR_ITER_TEMPLATE(type,var,it) for (typename type::iterator it(var.begin()), it##end(var.end()); it != it##end; ++it) static const mode_t archivePerms = S_IFREG | 0644; typedef QList ROPartList; // // functions needed for storing certain DOM elements in a QHash<> // namespace DOM { inline uint qHash(const CSSStyleSheet &a) { return ::qHash(static_cast(a.handle())); } inline bool operator==(const DOM::CSSStyleSheet &a, const DOM::CSSStyleSheet &b) { return a.handle() == b.handle(); } inline uint qHash(const Node &a) { return ::qHash(static_cast(a.handle())); } }// namespace DOM // // elems with 'type' attr: object, param, link, script, style // // TODO convert to bsearch? probably more time and memory efficient ArchiveDialog::NonCDataAttr::NonCDataAttr() { static const char *const non_cdata[] = { "id", "dir", "shape", "tabindex", "align", "nohref", "clear" // Unfinished... }; for (int i = 0; i != (sizeof(non_cdata) / sizeof(non_cdata[0])); ++i) { insert(non_cdata[i]); } } // TODO lazy init? ArchiveDialog::NonCDataAttr ArchiveDialog::non_cdata_attr; ArchiveDialog::RecurseData::RecurseData(KHTMLPart *_part, QTextStream *_textStream, PartFrameData *pfd) : part(_part), textStream(_textStream), partFrameData(pfd), document(_part->htmlDocument()), baseSeen(false) { Q_ASSERT(!document.isNull()); } static KHTMLPart *isArchivablePart(KParts::ReadOnlyPart *part) { KHTMLPart *cp = qobject_cast(part); if (! cp) { return NULL; } DOM::HTMLDocument domdoc(cp->htmlDocument()); if (domdoc.isNull()) { return NULL; } return cp; } ArchiveDialog::ArchiveDialog(QWidget *parent, const QString &filename, KHTMLPart *part) : KDialog(parent), m_top(part), m_job(NULL), m_uniqId(2), m_tarBall(NULL), m_filename(filename), m_widget(NULL) { setCaption(i18nc("@title:window", "Web Archiver")); setButtons(KDialog::Ok | KDialog::Cancel); setButtonGuiItem(KDialog::Ok, KStandardGuiItem::close()); setModal(false); enableButtonOk(false); setDefaultButton(KDialog::NoDefault); m_widget = new ArchiveViewBase(this); { QTreeWidgetItem *twi = m_widget->progressView->headerItem(); twi->setText(0, i18n("Status")); twi->setText(1, i18n("Url")); } setMainWidget(m_widget); QUrl srcURL = part->url(); m_widget->urlLabel->setText(QStringLiteral("" + KStringHandler::csqueeze(srcURL.toDisplayString(), 80) + ""); m_widget->targetLabel->setText(QStringLiteral("" + KStringHandler::csqueeze(filename, 80) + ""); //if(part->document().ownerDocument().isNull()) // m_document = part->document(); //else // m_document = part->document().ownerDocument(); m_tarBall = new KTar(filename, QStringLiteral("application/x-gzip")); m_archiveTime = QDateTime::currentDateTime(); } ArchiveDialog::~ArchiveDialog() { // TODO cancel outstanding download jobs? qCDebug(WEBARCHIVERPLUGIN_LOG) << "destroying"; if (m_job) { m_job->kill(); m_job = NULL; } delete m_tarBall; m_tarBall = NULL; } void ArchiveDialog::archive() { if (m_tarBall->open(QIODevice::WriteOnly)) { obtainURLs(); // Assign unique tarname to URLs // Split m_url2tar into Stylesheets / non stylesheets m_objects.clear(); assert(static_cast(m_url2tar.size()) - static_cast(m_cssURLs.size()) >= 0); // m_objects.reserve(m_url2tar.size() - m_cssURLs.size()); FOR_ITER(UrlTarMap, m_url2tar, u2t_it) { const QUrl &url = u2t_it.key(); DownloadInfo &info = u2t_it.value(); assert(info.tarName.isNull()); // info.tarName = uniqTarName( url.fileName(), 0 ); // To able to append mimetype hinting suffixes to tarnames, for instance adding '.gif' to a // webbug '87626734' adding the name to the url-to-tarname map is deferred. // This cannot be done with CSS because CSS may reference each other so when URLS // of the first CSS are changed all tarnames need to be there. // if (m_cssURLs.find(url) == m_cssURLs.end()) { m_objects.append(u2t_it); } else { info.tarName = uniqTarName(url.fileName(), 0); } } QProgressBar *pb = m_widget->progressBar; pb->setMaximum(m_url2tar.count() + 1); pb->setValue(0); m_objects_it = m_objects.begin(); downloadObjects(); } else { const QString title = i18nc("@title:window", "Unable to Open Web-Archive"); const QString text = i18n("Unable to open \n %1 \n for writing.", m_tarBall->fileName()); KMessageBox::sorry(NULL, text, title); } } void ArchiveDialog::downloadObjects() { if (m_objects_it == m_objects.end()) { m_styleSheets_it = m_cssURLs.begin(); downloadStyleSheets(); } else { m_dlurl2tar_it = (*m_objects_it); const QUrl &url = m_dlurl2tar_it.key(); DownloadInfo &info = m_dlurl2tar_it.value(); assert(m_dlurl2tar_it != m_url2tar.end()); Q_ASSERT(m_job == NULL); m_job = startDownload(url, info.part); connect(m_job, SIGNAL(result(KJob*)), SLOT(slotObjectFinished(KJob*))); } } void ArchiveDialog::slotObjectFinished(KJob *_job) { KIO::StoredTransferJob *job = qobject_cast(_job); Q_ASSERT(job == m_job); m_job = NULL; const QUrl &url = m_dlurl2tar_it.key(); DownloadInfo &info = m_dlurl2tar_it.value(); assert(info.tarName.isNull()); bool error = job->error(); if (!error) { const QString &mimetype(job->mimetype()); info.tarName = uniqTarName(appendMimeTypeSuffix(url.fileName(), mimetype), 0); QByteArray data(job->data()); const QString &tarName = info.tarName; // qCDebug(WEBARCHIVERPLUGIN_LOG) << "downloaded " << url.toDisplayString() << "size=" << data.size() << "mimetype" << mimetype; error = ! m_tarBall->writeFile(tarName, data, archivePerms, QString::null, QString::null, m_archiveTime, m_archiveTime, m_archiveTime); if (error) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "Error writing to archive file"; finishedArchiving(true); return; } } else { info.tarName.clear(); qCDebug(WEBARCHIVERPLUGIN_LOG) << "download error for url='" << url; } endProgressInfo(error); ++m_objects_it; downloadObjects(); } void ArchiveDialog::downloadStyleSheets() { if (m_styleSheets_it == m_cssURLs.end()) { saveWebpages(); } else { // QTimer::singleShot(3000, this, SLOT(slotDownloadStyleSheetsDelay())); const QUrl &url = m_styleSheets_it.key(); m_dlurl2tar_it = m_url2tar.find(url); assert(m_dlurl2tar_it != m_url2tar.end()); DownloadInfo &info = m_dlurl2tar_it.value(); Q_ASSERT(m_job == NULL); m_job = startDownload(url, info.part); connect(m_job, SIGNAL(result(KJob*)), SLOT(slotStyleSheetFinished(KJob*))); } } void ArchiveDialog::slotStyleSheetFinished(KJob *_job) { KIO::StoredTransferJob *job = qobject_cast(_job); Q_ASSERT(job == m_job); m_job = NULL; const QUrl &url = m_dlurl2tar_it.key(); DownloadInfo &info = m_dlurl2tar_it.value(); bool error = job->error(); if (! error) { QByteArray data(job->data()); const QString &tarName = info.tarName; URLsInStyleSheet::Iterator uss_it = m_URLsInStyleSheet.find(m_styleSheets_it.value()); assert(uss_it != m_URLsInStyleSheet.end()); DOM::DOMString ds(uss_it.key().charset()); QString cssCharSet(ds.string()); bool ok; QTextCodec *codec = KCharsets::charsets()->codecForName(cssCharSet, ok); qCDebug(WEBARCHIVERPLUGIN_LOG) << "translating URLs in CSS" << url << "charset=" << cssCharSet << " found=" << ok; assert(codec); QString css_text = codec->toUnicode(data); data.clear(); // Do *NOT* delete 'codec'! These are allocated by Qt changeCSSURLs(css_text, uss_it.value()); data = codec->fromUnicode(css_text); css_text.clear(); error = ! m_tarBall->writeFile(tarName, data, archivePerms, QString::null, QString::null, m_archiveTime, m_archiveTime, m_archiveTime); if (error) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "Error writing to archive file"; finishedArchiving(true); return; } } else { info.tarName.clear(); qCDebug(WEBARCHIVERPLUGIN_LOG) << "download error for css url='" << url; } endProgressInfo(error); ++m_styleSheets_it; downloadStyleSheets(); } KIO::Job *ArchiveDialog::startDownload(const QUrl &url, KHTMLPart *part) { QTreeWidgetItem *twi = new QTreeWidgetItem; twi->setText(0, i18n("Downloading")); twi->setText(1, url.toDisplayString()); QTreeWidget *tw = m_widget->progressView; tw->insertTopLevelItem(0, twi); KIO::Job *job = KIO::storedGet(url, KIO::NoReload, KIO::HideProgressInfo); // Use entry from cache only. Avoids re-downloading. Requires modified kio_http slave. job->addMetaData(QStringLiteral("cache"), patchedHttpSlave ? "cacheonly" : "cache"); // This is a duplication of the code in loader.cpp: Loader::servePendingRequests() //job->addMetaData("accept", req->object->accept()); job->addMetaData(QStringLiteral("referrer"), part->url().url()); job->addMetaData(QStringLiteral("cross-domain"), part->toplevelURL().url()); return job; } void ArchiveDialog::endProgressInfo(bool error) { QTreeWidget *tw = m_widget->progressView; tw->topLevelItem(0)->setText(0, error ? i18n("Error") : i18n("OK")); QProgressBar *pb = m_widget->progressBar; pb->setValue(pb->value() + 1); } void ArchiveDialog::saveWebpages() { bool error = saveTopFrame(); if (error) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "Error writing to archive file"; finishedArchiving(true); return; } QProgressBar *pb = m_widget->progressBar; pb->setValue(pb->value() + 1); // KMessageBox::information(0, i18n( "Archiving webpage completed." ), QString::null, QString::null, false); finishedArchiving(false); } void ArchiveDialog::finishedArchiving(bool tarerror) { if (tarerror) { KMessageBox::error(this, i18n("I/O error occurred while writing to web archive file %1.", m_tarBall->fileName())); } m_tarBall->close(); m_widget->progressView->sortItems(0, Qt::AscendingOrder); setDefaultButton(KDialog::Ok); setEscapeButton(KDialog::Ok); enableButtonOk(true); enableButtonCancel(false); } void ArchiveDialog::slotButtonClicked(int) { deleteLater(); // Keep memory consumption low } // This is the mess you get because C++ lacks a lambda generator // // The whole purpose of the Get* classes is to parametrize what // attribute of a KHTMLPart object should be fetched. // // GetName and GetURL are used for the 'class FuncObj' parameter // class in the template function filterFrameMappings below struct GetFromPart { const KHTMLPart *child; GetFromPart(const KHTMLPart *_child) : child(_child) { } }; struct GetName : public GetFromPart { GetName(const KHTMLPart *child) : GetFromPart(child) { } operator QString() { return child->objectName(); } }; struct GetURL : public GetFromPart { GetURL(const KHTMLPart *child) : GetFromPart(child) { } operator QUrl() { return child->url(); } }; template< class Id2Part, class FuncObj > static void filterFrameMappings(KHTMLPart *part, Id2Part &result) { Id2Part existing_frames; // TODO this can probably be optimized: no temp of existing, directly store to be removed parts. ROPartList childParts(part->frames()); FOR_ITER(ROPartList, childParts, child_it) { // TODO It is not clear from browsing the source code of KHTML if *child_it may be NULL Q_ASSERT(*child_it); KHTMLPart *cp = isArchivablePart(*child_it); if (cp) { existing_frames.insert(FuncObj(cp), cp); } } typedef QList< typename Id2Part::Iterator > IdRemoveList; IdRemoveList beRemoved; FOR_ITER_TEMPLATE(Id2Part, result, it) { typename Id2Part::Iterator exists_it = existing_frames.find(it.key()); if (exists_it == existing_frames.end()) { beRemoved.append(it); } else { it.value() = exists_it.value(); } } FOR_ITER_TEMPLATE(IdRemoveList, beRemoved, rem_it) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "removing insecure(?) frame='" << (*rem_it).key(); result.erase((*rem_it)); } } template void filterFrameMappings< ArchiveDialog::Name2Part, GetName >(KHTMLPart *, ArchiveDialog::Name2Part &); template void filterFrameMappings< ArchiveDialog::URL2Part, GetURL >(KHTMLPart *, ArchiveDialog::URL2Part &); /** * Recursively traverses the DOM-Tree extracting all URLs that need to be downloaded */ void ArchiveDialog::obtainURLs() { m_url2tar.clear(); m_tarName2part.clear(); m_framesInPart.clear(); m_cssURLs.clear(); m_URLsInStyleSheet.clear(); m_URLsInStyleElement.clear(); m_topStyleSheets.clear(); obtainURLsLower(m_top, 0); FOR_ITER(FramesInPart, m_framesInPart, fip_it) { KHTMLPart *part = fip_it.key(); PartFrameData &pfd = fip_it.value(); // Remove all frames obtained from the DOM tree parse // that do not have a corresponding KHTMLPart as a direct child. // Do NOT use KHTMLPart::findFrame()! This one searches recursively all subframes as well! filterFrameMappings< Name2Part, GetName >(part, pfd.framesWithName); filterFrameMappings< URL2Part, GetURL >(part, pfd.framesWithURLOnly); } assert(! m_framesInPart.empty()); #if 0 FOR_ITER(CSSURLSet, m_cssURLs, it) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "to be downloaded stylesheet='" << it.key(); } FOR_ITER(URLsInStyleSheet, m_URLsInStyleSheet, ss2u_it) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "raw URLs in sheet='" << ss2u_it.key().href(); FOR_ITER(RawHRef2FullURL, ss2u_it.data(), c2f_it) { qCDebug(WEBARCHIVERPLUGIN_LOG) << " url='" << c2f_it.key() << "' -> '" << c2f_it.data().toDisplayString(); } } FOR_ITER(URLsInStyleElement, m_URLsInStyleElement, e2u_it) { qCDebug(WEBARCHIVERPLUGIN_LOG) << "raw URLs in style-element:"; FOR_ITER(RawHRef2FullURL, e2u_it.data(), c2f_it) { qCDebug(WEBARCHIVERPLUGIN_LOG) << " url='" << c2f_it.key() << "' -> '" << c2f_it.data().toDisplayString(); } } #endif } void ArchiveDialog::obtainStyleSheetURLsLower(DOM::CSSStyleSheet css, RecurseData &data) { //qCDebug(WEBARCHIVERPLUGIN_LOG) << "stylesheet title='" << styleSheet.title().string() << "' " // "type='" << styleSheet.type().string(); RawHRef2FullURL &raw2full = m_URLsInStyleSheet.insert(css, RawHRef2FullURL()).value(); DOM::CSSRuleList crl = css.cssRules(); for (int j = 0; j != static_cast(crl.length()); ++j) { DOM::CSSRule cr = crl.item(j); switch (cr.type()) { case DOM::CSSRule::STYLE_RULE: { const DOM::CSSStyleRule &csr = static_cast(cr); //qCDebug(WEBARCHIVERPLUGIN_LOG) << "found selector '" << csr.selectorText(); parseStyleDeclaration(css.baseUrl(), csr.style(), raw2full, data); } break; case DOM::CSSRule::IMPORT_RULE: { const DOM::CSSImportRule &cir = static_cast(cr); DOM::CSSStyleSheet importSheet = cir.styleSheet(); if (importSheet.isNull()) { // Given stylesheet was not downloaded / parsed by KHTML // Remove that URL from the stylesheet qCDebug(WEBARCHIVERPLUGIN_LOG) << "stylesheet: invalid @import url('" << cir.href() << "')"; raw2full.insert(cir.href().string(), QUrl()); } else { qCDebug(WEBARCHIVERPLUGIN_LOG) << "stylesheet: @import url('" << cir.href() << "') found"; QString href = cir.href().string(); Q_ASSERT(!href.isNull()); QUrl fullURL = importSheet.baseUrl(); bool inserted = insertHRefFromStyleSheet(href, raw2full, fullURL, data); if (inserted) { m_cssURLs.insert(fullURL, importSheet); obtainStyleSheetURLsLower(importSheet, data); } } } break; default: qCDebug(WEBARCHIVERPLUGIN_LOG) << " unknown/unsupported rule=" << cr.type(); } } } void ArchiveDialog::obtainURLsLower(KHTMLPart *part, int level) { //QString indent; //indent.fill(' ', level*2); QString htmlFileName = (level == 0) ? QStringLiteral("index.html") : part->url().fileName(); // Add .html extension if not found already. This works around problems with frames, // where the frame is for example "framead.php". The http-io-slave gets the mimetype // from the webserver, but files in a tar archive do not have such metadata. The result // is that Konqueror asks "save 'adframe.php' to file?" without this measure. htmlFileName = appendMimeTypeSuffix(htmlFileName, QStringLiteral("text/html")); // If level == 0, the m_tarName2part map is empty and so uniqTarName will return "index.html" unchanged. uniqTarName(htmlFileName, part); assert(m_framesInPart.find(part) == m_framesInPart.end()); FramesInPart::Iterator fip_it = m_framesInPart.insert(part, PartFrameData()); RecurseData data(part, 0, &(fip_it.value())); data.document.documentElement(); obtainPartURLsLower(data.document.documentElement(), 1, data); { // Limit lifetime of @c childParts ROPartList childParts(part->frames()); FOR_ITER(ROPartList, childParts, child_it) { KHTMLPart *cp = isArchivablePart(*child_it); if (cp) { obtainURLsLower(cp, level + 1); } } } DOM::StyleSheetList styleSheetList = data.document.styleSheets(); //qCDebug(WEBARCHIVERPLUGIN_LOG) << "# of stylesheets=" << styleSheetList.length(); for (int i = 0; i != static_cast(styleSheetList.length()); ++i) { DOM::StyleSheet ss = styleSheetList.item(i); if (ss.isCSSStyleSheet()) { DOM::CSSStyleSheet &css = static_cast(ss); QString href = css.href().string(); if (! href.isNull()) { QString href = css.href().string(); QUrl fullUrl = css.baseUrl(); qCDebug(WEBARCHIVERPLUGIN_LOG) << "top-level stylesheet='" << href; bool inserted = insertTranslateURL(fullUrl, data); if (inserted) { m_cssURLs.insert(fullUrl, css); } } else { DOM::Node node = css.ownerNode(); if (! node.isNull()) { assert(! m_topStyleSheets.contains(node)); qCDebug(WEBARCHIVERPLUGIN_LOG) << "top-level inline stylesheet '" << node.nodeName(); // TODO I think there can be more than one