diff --git a/doc/konqueror/index.docbook b/doc/konqueror/index.docbook --- a/doc/konqueror/index.docbook +++ b/doc/konqueror/index.docbook @@ -1710,14 +1710,6 @@ Copy Image copies the &URL; of the picture to the clipboard. -To save a complete web page, including images, select -Archive Web Page... from the Tools menu. -Note that this feature is provided by a plugin - and may not have been installed on your system. The web page will be -saved as a single file with a .war -extension and can be opened by left clicking -on the filename in &konqueror; running in File Manager mode. - Printing a copy of the page you are viewing is easily done with the Menubar File Print... or Print @@ -2251,7 +2243,7 @@ Browser Mode - -Archive Web Page (KHTML view only) -Invoked with Tools -Archive Web Page, this tool creates an archive -(.war ) file containing the web page being -viewed including the images. Left click on the -archive file name to view the saved page. - - Auto Refresh (KHTML view only) diff --git a/konqueror.categories b/konqueror.categories --- a/konqueror.categories +++ b/konqueror.categories @@ -1,4 +1,3 @@ org.kde.webenginepart webenginepart IDENTIFIER [WEBENGINEPART_LOG] org.kde.konqueror konqueror IDENTIFIER [KONQUEROR_LOG] org.kde.konqueror.minitools minitools (konqueror plugin) IDENTIFIER [MINITOOLSPLUGIN_LOG] -org.kde.konqueror.webarchiver webarchiver (konqueror plugin) IDENTIFIER [WEBARCHIVERPLUGIN_LOG] diff --git a/plugins/CMakeLists.txt b/plugins/CMakeLists.txt --- a/plugins/CMakeLists.txt +++ b/plugins/CMakeLists.txt @@ -8,7 +8,6 @@ add_subdirectory( dirfilter ) # TODO add_subdirectory( uachanger ) add_subdirectory( babelfish ) -add_subdirectory( webarchiver ) if (Qt5TextToSpeech_FOUND) add_subdirectory(ttsplugin) diff --git a/plugins/webarchiver/16-actions-webarchiver.png b/plugins/webarchiver/16-actions-webarchiver.png deleted file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@> rc.cpp -$XGETTEXT *.cpp -o $podir/webarchiver.pot diff --git a/plugins/webarchiver/archivedialog.h b/plugins/webarchiver/archivedialog.h deleted file mode 100644 --- a/plugins/webarchiver/archivedialog.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - Copyright (C) 2003 Antonio Larrosa - Copyright (C) 2008 Matthias Grimrath - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; see the file COPYING. If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. -*/ - -#ifndef _ARCHIVEDIALOG_H_ -#define _ARCHIVEDIALOG_H_ - -#include -#include - -#include - -#include -#include - -#include "ui_archiveviewbase.h" - -class QWidget; -class KHTMLPart; -class ArchiveViewBase; -class QUrl; -class KTar; -class QTextStream; - -class ArchiveViewBase : public QWidget, public Ui::ArchiveViewBase -{ -public: - ArchiveViewBase(QWidget *parent) : QWidget(parent) - { - setupUi(this); - } -}; - -/// Does all the hard work of downloading, manipulating and storing of -/// HTML files and inlined images, stylesheets ... -class ArchiveDialog : public KDialog -{ - Q_OBJECT -public: - ArchiveDialog(QWidget *parent, const QString &targetFilename, KHTMLPart *part); - ~ArchiveDialog() override; - - void archive(); - -protected: - /// Holds attributes that are not #CDATA - class NonCDataAttr : public QSet - { - public: - NonCDataAttr(); - }; - - static NonCDataAttr non_cdata_attr; - - KIO::Job *startDownload(const QUrl &url, KHTMLPart *part); - -public: - - // Frame handling - - typedef QHash Name2Part; -private: - typedef QHash URL2Part; - - struct PartFrameData { - Name2Part framesWithName; - URL2Part framesWithURLOnly; - }; - - typedef QHash< KHTMLPart *, PartFrameData > FramesInPart; - typedef QHash< QString, KHTMLPart * > TarName2Part; - typedef QHash< KHTMLPart *, QString > Part2TarName; - - // Stylesheets - - typedef QHash< QUrl, DOM::CSSStyleSheet > CSSURLSet; - typedef QHash< QString, QUrl > RawHRef2FullURL; - typedef QHash< DOM::CSSStyleSheet, RawHRef2FullURL > URLsInStyleSheet; - typedef QHash< DOM::Element, RawHRef2FullURL > URLsInStyleElement; - typedef QHash< DOM::Node, DOM::CSSStyleSheet > Node2StyleSheet; - - // Recursive parsing and processing - - /// Databag to hold information that is gathered during recursive traversal of the DOM tree - struct RecurseData { - KHTMLPart *const part; - QTextStream *const textStream; - PartFrameData *const partFrameData; - DOM::HTMLDocument document; - bool baseSeen; - - RecurseData(KHTMLPart *_part, QTextStream *_textStream, PartFrameData *pfd); - }; - - struct DownloadInfo { - QString tarName; - KHTMLPart *part; - - DownloadInfo(const QString &_tarName = QString::null, KHTMLPart *_part = nullptr) - : tarName(_tarName), part(_part) { } - }; - - typedef QMap< QUrl, DownloadInfo > UrlTarMap; - typedef QList< UrlTarMap::Iterator > DownloadList; - - struct AttrElem { - QString name; - QString value; - - AttrElem() { } - AttrElem(const QString &_n, const QString &_v) : name(_n), value(_v) { } - }; - typedef QLinkedList< AttrElem > AttrList; - - /** - * Looks for URL contained in attributes. - */ - struct ExtractURLs { - ExtractURLs(const QString &nodeName, const DOM::Element &element); - - AttrList attrList; /// copy of the attribute of @p element - AttrList::iterator absURL; /// for links ala <a href= ... > - AttrList::iterator transURL; /// for embedded objects like <img src=...>, favicons, background-images... - AttrList::iterator frameURL; /// if @p element contains a frameURL - AttrList::iterator frameName; /// if it is frame tag with a name element - AttrList::iterator cssURL; /// for URLs that specify CSS - }; - -private: - void downloadObjects(); - void downloadStyleSheets(); - void saveWebpages(); - void finishedArchiving(bool tarerror); - - void endProgressInfo(bool error); - - void obtainURLs(); - void obtainURLsLower(KHTMLPart *part, int level); - void obtainPartURLsLower(const DOM::Node &pNode, int level, RecurseData &data); - void obtainStyleSheetURLsLower(DOM::CSSStyleSheet styleSheet, RecurseData &data); - - bool insertTranslateURL(const QUrl &fullURL, RecurseData &data); - bool insertHRefFromStyleSheet(const QString &hrefRaw, RawHRef2FullURL &raw2full, - const QUrl &fullURL, RecurseData &data); - void parseStyleDeclaration(const QUrl &baseURL, DOM::CSSStyleDeclaration decl, - RawHRef2FullURL &urls, RecurseData &data /*, bool verbose = false*/); - - bool saveTopFrame(); - bool saveFrame(KHTMLPart *part, int level); - void saveHTMLPart(RecurseData &data); - void saveHTMLPartLower(const DOM::Node &pNode, int indent, RecurseData &data); - - QString extractCSSURL(const QString &text); - QString &changeCSSURLs(QString &text, const RawHRef2FullURL &raw2full); - - static bool hasAttrWithValue(const DOM::Element &elem, const QString &attrName, const QString &attrValue); - static bool hasChildNode(const DOM::Node &pNode, const QString &nodeName); - static AttrList::Iterator getAttribute(AttrList &attrList, const QString &attr); - - static bool hasSubUrl(const QUrl &url); - - /** - * completes a potentially partial URL in a HTML document (like <img href="...") - * to a fully qualified one. - * - * It uses the URL of the document or the URL given in the <base ...> - * element, depending on if and where a <base ...> appears on the document. - * - * Always use this method to get full URLs from href's or similar. - * - * Suppose the URL of the webpage is http://host.nowhere/. The head looks like this - *
-     * <head>
-     *   <link rel="stylesheet" href="style1.css" type="text/css" />
-     *   <base href="http://some.place/" />
-     *   <link rel="stylesheet" href="style2.css" type="text/css" />
-     * </head>
-     * 
- * - * The full URL of "style1.css" is http://host.nowhere/style1.css, whereas - * "style2.css" will become http://some.place/style2.css - * - * @return fully qualified URL of @p partURL relative to the HTML document in @c data.part - */ - static QUrl absoluteURL(const QString &partURL, RecurseData &data); - - /** - * TODO KDE4 is this in KHTML function available now? - * Functionality taken from khtml/css/csshelper.cpp:parseURL - * - * Filters a href in an element inside the HTML body. This handles - * quirks in browsers that filter out \\n, \\r in URLs. - */ - static QString parseURL(const QString &rawurl); - - /** - * Creates unique filenames to be used in the tar archive - */ - QString uniqTarName(const QString &suggestion, KHTMLPart *part); - - /** - * Taken from khtml/misc/loader.cpp DOCLOAD_SECCHECK - * - * Would be better on the public interface of KHTMLPart (or similar) - * - * Checks if an embedded link like <img src="..." should be loaded - */ - static bool urlCheckFailed(KHTMLPart *part, const QUrl &fullURL); - - /** - * Escapes HTML characters. Does not forget " as @ref Qt::escape() does. - */ - QString escapeHTML(const QString &in); - - /** - * Adds a suffix that hints at the mimetypes if such a suffix is not - * present already. If there is no such mimetype in the KDE database - * @p filename is returned unchanged. - * 'filename' -> 'filename.gif' - * 'picture.jpg' -> 'picture.jpg' - * - * NOTE This function is rather slow - */ - QString appendMimeTypeSuffix(QString filename, const QString &mimetype); - -private: - KHTMLPart *m_top; - - FramesInPart m_framesInPart; - - UrlTarMap m_url2tar; - TarName2Part m_tarName2part; - Part2TarName m_part2tarName; - CSSURLSet m_cssURLs; - URLsInStyleSheet m_URLsInStyleSheet; - URLsInStyleElement m_URLsInStyleElement; - Node2StyleSheet m_topStyleSheets; - - KIO::Job *m_job; - CSSURLSet::Iterator m_styleSheets_it; - DownloadList m_objects; - DownloadList::Iterator m_objects_it; - UrlTarMap::Iterator m_dlurl2tar_it; - - int m_uniqId; - KTar *m_tarBall; - QDateTime m_archiveTime; - QString m_filename; - - ArchiveViewBase *m_widget; - -private slots: - void slotObjectFinished(KJob *job); - void slotStyleSheetFinished(KJob *job); - void slotButtonClicked(int button) override; -}; - -#endif // _ARCHIVEDIALOG_H_ diff --git a/plugins/webarchiver/archivedialog.cpp b/plugins/webarchiver/archivedialog.cpp deleted file mode 100644 --- a/plugins/webarchiver/archivedialog.cpp +++ /dev/null @@ -1,1372 +0,0 @@ -/* - Copyright (C) 2001 Andreas Schlapbach - Copyright (C) 2003 Antonio Larrosa - Copyright (C) 2008 Matthias Grimrath - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; see the file COPYING. If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. -*/ - -// The DOM-tree is recursed twice. The first run gathers all URLs while the second -// run writes out all HTML frames and CSS stylesheets. These two distinct runs are -// necessary, because some frames and/or stylesheets may be dropped (for example -// a frame currently not displayed or deemed insecure). In that case an URL that -// points to such a frame/stylesheet has to be removed. Since the URL may be mentioned -// earlier before recursing to the to-be-removed frame, two runs are necessary to get -// a complete list of URLs that should be archived. - -// Changelog -// * replace dynamic_cast<> and ->inherits() with qobject_cast<> -// * use QHash instead of QMap; get rid of Ordered<> class -// * fixed crash / assertion on Konqueror exit after a webpage was archived -// See comment about KHTMLView parent widget in plugin_webarchiver.cpp -// * Using KDE4/Qt4 QUrl::equals() and QUrl::fragment() to compare Urls -// * KHTML stores comment with a trailing '-'. Looks like some off-by-one bug. -// * Add mimetype indicating suffix to downloaded files. - -// DONE CSS mentioned in elements that are not parsed by Konqueror did not get their -// href='' resolved/removed - -// TODO if href= etc links in a frameset refer to frames currently displayed, make links relative -// to archived page instead of absolute -// TODO KDE4 webarchiver: look at m_bPreserveWS -// TODO KDE4 webarchiver: look at closing tags -// TODO check if PartFrameData::framesWithName get a 'KHTMLPart *' if any -// TODO KHTMLPart::frames(): Is it possible to have NULL pointers in returned list? -// TODO If downloaded object need no data conversion, use KIO::file_copy or signal data() -// TODO KDE4 check what KHTMLPart is doing on job->addMetaData() -// TODO KDE4 use HTMLScriptElementImpl::charset() to get charset="" attribute of elements - -#include "archivedialog.h" - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include - -#include "webarchiverdebug.h" - -// Set to true if you have a patched http-io-slave that has -// improved offline-browsing functionality. -static const bool patchedHttpSlave = false; - -#define CONTENT_TYPE "" - -// -// Qt 4.x offers a @c foreach pseudo keyword. This is however slightly slower than FOR_ITER -// because @c foreach makes a shared copy of the container. -// -#define FOR_ITER(type,var,it) for (type::iterator it(var.begin()), it##end(var.end()); it != it##end; ++it) -#define FOR_CONST_ITER(type,var,it) for (type::const_iterator it(var.begin()), it##end(var.end()); it != it##end; ++it) -#define FOR_ITER_TEMPLATE(type,var,it) for (typename type::iterator it(var.begin()), it##end(var.end()); it != it##end; ++it) - -static const mode_t archivePerms = S_IFREG | 0644; - -typedef QList ROPartList; - -// -// functions needed for storing certain DOM elements in a QHash<> -// -namespace DOM -{ - -inline uint qHash(const CSSStyleSheet &a) -{ - return ::qHash(static_cast(a.handle())); -} - -inline bool operator==(const DOM::CSSStyleSheet &a, const DOM::CSSStyleSheet &b) -{ - return a.handle() == b.handle(); -} - -inline uint qHash(const Node &a) -{ - return ::qHash(static_cast(a.handle())); -} - -}// namespace DOM - -// -// elems with 'type' attr: object, param, link, script, style -// - -// TODO convert to bsearch? probably more time and memory efficient -ArchiveDialog::NonCDataAttr::NonCDataAttr() -{ - static const char *const non_cdata[] = { - "id", "dir", "shape", "tabindex", "align", "nohref", "clear" - // Unfinished... - }; - for (int i = 0; i != (sizeof(non_cdata) / sizeof(non_cdata[0])); ++i) { - insert(non_cdata[i]); - } -} - -// TODO lazy init? -ArchiveDialog::NonCDataAttr ArchiveDialog::non_cdata_attr; - -ArchiveDialog::RecurseData::RecurseData(KHTMLPart *_part, QTextStream *_textStream, PartFrameData *pfd) - : part(_part), textStream(_textStream), partFrameData(pfd), document(_part->htmlDocument()), - baseSeen(false) -{ - Q_ASSERT(!document.isNull()); -} - -static KHTMLPart *isArchivablePart(KParts::ReadOnlyPart *part) -{ - KHTMLPart *cp = qobject_cast(part); - if (! cp) { - return nullptr; - } - DOM::HTMLDocument domdoc(cp->htmlDocument()); - if (domdoc.isNull()) { - return nullptr; - } - return cp; -} - -ArchiveDialog::ArchiveDialog(QWidget *parent, const QString &filename, KHTMLPart *part) - : KDialog(parent), m_top(part), m_job(nullptr), m_uniqId(2), m_tarBall(nullptr), m_filename(filename), m_widget(nullptr) -{ - setCaption(i18nc("@title:window", "Web Archiver")); - setButtons(KDialog::Ok | KDialog::Cancel); - setButtonGuiItem(KDialog::Ok, KStandardGuiItem::close()); - setModal(false); - enableButtonOk(false); - setDefaultButton(KDialog::NoDefault); - - m_widget = new ArchiveViewBase(this); - { - QTreeWidgetItem *twi = m_widget->progressView->headerItem(); - twi->setText(0, i18n("Status")); - twi->setText(1, i18n("Url")); - } - setMainWidget(m_widget); - - QUrl srcURL = part->url(); - m_widget->urlLabel->setText(QStringLiteral("" + - KStringHandler::csqueeze(srcURL.toDisplayString(), 80) + ""); - m_widget->targetLabel->setText(QStringLiteral("" + - KStringHandler::csqueeze(filename, 80) + ""); - - //if(part->document().ownerDocument().isNull()) - // m_document = part->document(); - //else - // m_document = part->document().ownerDocument(); - - m_tarBall = new KTar(filename, QStringLiteral("application/x-gzip")); - m_archiveTime = QDateTime::currentDateTime(); -} - -ArchiveDialog::~ArchiveDialog() -{ - // TODO cancel outstanding download jobs? - qCDebug(WEBARCHIVERPLUGIN_LOG) << "destroying"; - if (m_job) { - m_job->kill(); - m_job = nullptr; - } - delete m_tarBall; m_tarBall = nullptr; -} - -void ArchiveDialog::archive() -{ - if (m_tarBall->open(QIODevice::WriteOnly)) { - - obtainURLs(); - - // Assign unique tarname to URLs - // Split m_url2tar into Stylesheets / non stylesheets - m_objects.clear(); - assert(static_cast(m_url2tar.size()) - static_cast(m_cssURLs.size()) >= 0); -// m_objects.reserve(m_url2tar.size() - m_cssURLs.size()); - - FOR_ITER(UrlTarMap, m_url2tar, u2t_it) { - const QUrl &url = u2t_it.key(); - DownloadInfo &info = u2t_it.value(); - - assert(info.tarName.isNull()); -// info.tarName = uniqTarName( url.fileName(), 0 ); - - // To able to append mimetype hinting suffixes to tarnames, for instance adding '.gif' to a - // webbug '87626734' adding the name to the url-to-tarname map is deferred. - // This cannot be done with CSS because CSS may reference each other so when URLS - // of the first CSS are changed all tarnames need to be there. - // - if (m_cssURLs.find(url) == m_cssURLs.end()) { - m_objects.append(u2t_it); - } else { - info.tarName = uniqTarName(url.fileName(), nullptr); - } - } - - QProgressBar *pb = m_widget->progressBar; - pb->setMaximum(m_url2tar.count() + 1); - pb->setValue(0); - - m_objects_it = m_objects.begin(); - downloadObjects(); - - } else { - const QString title = i18nc("@title:window", "Unable to Open Web-Archive"); - const QString text = i18n("Unable to open \n %1 \n for writing.", m_tarBall->fileName()); - KMessageBox::sorry(nullptr, text, title); - } -} - -void ArchiveDialog::downloadObjects() -{ - - if (m_objects_it == m_objects.end()) { - - m_styleSheets_it = m_cssURLs.begin(); - downloadStyleSheets(); - - } else { - - m_dlurl2tar_it = (*m_objects_it); - const QUrl &url = m_dlurl2tar_it.key(); - DownloadInfo &info = m_dlurl2tar_it.value(); - assert(m_dlurl2tar_it != m_url2tar.end()); - - Q_ASSERT(m_job == nullptr); - m_job = startDownload(url, info.part); - connect(m_job, SIGNAL(result(KJob*)), SLOT(slotObjectFinished(KJob*))); - } -} - -void ArchiveDialog::slotObjectFinished(KJob *_job) -{ - KIO::StoredTransferJob *job = qobject_cast(_job); - Q_ASSERT(job == m_job); - m_job = nullptr; - const QUrl &url = m_dlurl2tar_it.key(); - DownloadInfo &info = m_dlurl2tar_it.value(); - - assert(info.tarName.isNull()); - bool error = job->error(); - if (!error) { - const QString &mimetype(job->mimetype()); - info.tarName = uniqTarName(appendMimeTypeSuffix(url.fileName(), mimetype), nullptr); - - QByteArray data(job->data()); - const QString &tarName = info.tarName; - -// qCDebug(WEBARCHIVERPLUGIN_LOG) << "downloaded " << url.toDisplayString() << "size=" << data.size() << "mimetype" << mimetype; - error = ! m_tarBall->writeFile(tarName, data, archivePerms, QString::null, QString::null, - m_archiveTime, m_archiveTime, m_archiveTime); - if (error) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "Error writing to archive file"; - finishedArchiving(true); - return; - } - } else { - info.tarName.clear(); - qCDebug(WEBARCHIVERPLUGIN_LOG) << "download error for url='" << url; - } - - endProgressInfo(error); - ++m_objects_it; - downloadObjects(); -} - -void ArchiveDialog::downloadStyleSheets() -{ - if (m_styleSheets_it == m_cssURLs.end()) { - - saveWebpages(); - - } else { - -// QTimer::singleShot(3000, this, SLOT(slotDownloadStyleSheetsDelay())); - const QUrl &url = m_styleSheets_it.key(); - m_dlurl2tar_it = m_url2tar.find(url); - assert(m_dlurl2tar_it != m_url2tar.end()); - DownloadInfo &info = m_dlurl2tar_it.value(); - - Q_ASSERT(m_job == nullptr); - m_job = startDownload(url, info.part); - connect(m_job, SIGNAL(result(KJob*)), SLOT(slotStyleSheetFinished(KJob*))); - } -} - -void ArchiveDialog::slotStyleSheetFinished(KJob *_job) -{ - KIO::StoredTransferJob *job = qobject_cast(_job); - Q_ASSERT(job == m_job); - m_job = nullptr; - const QUrl &url = m_dlurl2tar_it.key(); - DownloadInfo &info = m_dlurl2tar_it.value(); - - bool error = job->error(); - if (! error) { - QByteArray data(job->data()); - const QString &tarName = info.tarName; - - URLsInStyleSheet::Iterator uss_it = m_URLsInStyleSheet.find(m_styleSheets_it.value()); - assert(uss_it != m_URLsInStyleSheet.end()); - - DOM::DOMString ds(uss_it.key().charset()); - QString cssCharSet(ds.string()); - bool ok; - QTextCodec *codec = KCharsets::charsets()->codecForName(cssCharSet, ok); - qCDebug(WEBARCHIVERPLUGIN_LOG) << "translating URLs in CSS" << url << "charset=" << cssCharSet << " found=" << ok; - assert(codec); - QString css_text = codec->toUnicode(data); - data.clear(); - // Do *NOT* delete 'codec'! These are allocated by Qt - - changeCSSURLs(css_text, uss_it.value()); - data = codec->fromUnicode(css_text); - css_text.clear(); - - error = ! m_tarBall->writeFile(tarName, data, archivePerms, QString::null, QString::null, - m_archiveTime, m_archiveTime, m_archiveTime); - if (error) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "Error writing to archive file"; - finishedArchiving(true); - return; - } - } else { - info.tarName.clear(); - qCDebug(WEBARCHIVERPLUGIN_LOG) << "download error for css url='" << url; - } - - endProgressInfo(error); - ++m_styleSheets_it; - downloadStyleSheets(); -} - -KIO::Job *ArchiveDialog::startDownload(const QUrl &url, KHTMLPart *part) -{ - QTreeWidgetItem *twi = new QTreeWidgetItem; - twi->setText(0, i18n("Downloading")); - twi->setText(1, url.toDisplayString()); - QTreeWidget *tw = m_widget->progressView; - tw->insertTopLevelItem(0, twi); - - KIO::Job *job = KIO::storedGet(url, KIO::NoReload, KIO::HideProgressInfo); - - // Use entry from cache only. Avoids re-downloading. Requires modified kio_http slave. - job->addMetaData(QStringLiteral("cache"), patchedHttpSlave ? "cacheonly" : "cache"); - - // This is a duplication of the code in loader.cpp: Loader::servePendingRequests() - - //job->addMetaData("accept", req->object->accept()); - job->addMetaData(QStringLiteral("referrer"), part->url().url()); - job->addMetaData(QStringLiteral("cross-domain"), part->toplevelURL().url()); - - return job; -} - -void ArchiveDialog::endProgressInfo(bool error) -{ - QTreeWidget *tw = m_widget->progressView; - tw->topLevelItem(0)->setText(0, error ? i18n("Error") : i18n("OK")); - QProgressBar *pb = m_widget->progressBar; - pb->setValue(pb->value() + 1); -} - -void ArchiveDialog::saveWebpages() -{ - bool error = saveTopFrame(); - if (error) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "Error writing to archive file"; - finishedArchiving(true); - return; - } - QProgressBar *pb = m_widget->progressBar; - pb->setValue(pb->value() + 1); - -// KMessageBox::information(0, i18n( "Archiving webpage completed." ), QString::null, QString::null, false); - finishedArchiving(false); -} - -void ArchiveDialog::finishedArchiving(bool tarerror) -{ - if (tarerror) { - KMessageBox::error(this, i18n("I/O error occurred while writing to web archive file %1.", m_tarBall->fileName())); - } - m_tarBall->close(); - - m_widget->progressView->sortItems(0, Qt::AscendingOrder); - setDefaultButton(KDialog::Ok); - setEscapeButton(KDialog::Ok); - enableButtonOk(true); - enableButtonCancel(false); -} - -void ArchiveDialog::slotButtonClicked(int) -{ - deleteLater(); // Keep memory consumption low -} - -// This is the mess you get because C++ lacks a lambda generator -// -// The whole purpose of the Get* classes is to parametrize what -// attribute of a KHTMLPart object should be fetched. -// -// GetName and GetURL are used for the 'class FuncObj' parameter -// class in the template function filterFrameMappings below -struct GetFromPart { - const KHTMLPart *child; - - GetFromPart(const KHTMLPart *_child) : child(_child) { } -}; - -struct GetName : public GetFromPart { - GetName(const KHTMLPart *child) : GetFromPart(child) { } - - operator QString() - { - return child->objectName(); - } -}; -struct GetURL : public GetFromPart { - GetURL(const KHTMLPart *child) : GetFromPart(child) { } - - operator QUrl() - { - return child->url(); - } -}; - -template< class Id2Part, class FuncObj > -static void filterFrameMappings(KHTMLPart *part, Id2Part &result) -{ - Id2Part existing_frames; - - // TODO this can probably be optimized: no temp of existing, directly store to be removed parts. - ROPartList childParts(part->frames()); - FOR_ITER(ROPartList, childParts, child_it) { - // TODO It is not clear from browsing the source code of KHTML if *child_it may be NULL - Q_ASSERT(*child_it); - KHTMLPart *cp = isArchivablePart(*child_it); - if (cp) { - existing_frames.insert(FuncObj(cp), cp); - } - } - - typedef QList< typename Id2Part::Iterator > IdRemoveList; - IdRemoveList beRemoved; - - FOR_ITER_TEMPLATE(Id2Part, result, it) { - typename Id2Part::Iterator exists_it = existing_frames.find(it.key()); - if (exists_it == existing_frames.end()) { - beRemoved.append(it); - } else { - it.value() = exists_it.value(); - } - } - FOR_ITER_TEMPLATE(IdRemoveList, beRemoved, rem_it) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "removing insecure(?) frame='" << (*rem_it).key(); - result.erase((*rem_it)); - } -} - -template void filterFrameMappings< ArchiveDialog::Name2Part, GetName >(KHTMLPart *, ArchiveDialog::Name2Part &); -template void filterFrameMappings< ArchiveDialog::URL2Part, GetURL >(KHTMLPart *, ArchiveDialog::URL2Part &); - -/** - * Recursively traverses the DOM-Tree extracting all URLs that need to be downloaded - */ -void ArchiveDialog::obtainURLs() -{ - m_url2tar.clear(); - m_tarName2part.clear(); - m_framesInPart.clear(); - m_cssURLs.clear(); - m_URLsInStyleSheet.clear(); - m_URLsInStyleElement.clear(); - m_topStyleSheets.clear(); - - obtainURLsLower(m_top, 0); - - FOR_ITER(FramesInPart, m_framesInPart, fip_it) { - KHTMLPart *part = fip_it.key(); - PartFrameData &pfd = fip_it.value(); - - // Remove all frames obtained from the DOM tree parse - // that do not have a corresponding KHTMLPart as a direct child. - - // Do NOT use KHTMLPart::findFrame()! This one searches recursively all subframes as well! - filterFrameMappings< Name2Part, GetName >(part, pfd.framesWithName); - filterFrameMappings< URL2Part, GetURL >(part, pfd.framesWithURLOnly); - } - assert(! m_framesInPart.empty()); -#if 0 - FOR_ITER(CSSURLSet, m_cssURLs, it) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "to be downloaded stylesheet='" << it.key(); - } - FOR_ITER(URLsInStyleSheet, m_URLsInStyleSheet, ss2u_it) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "raw URLs in sheet='" << ss2u_it.key().href(); - FOR_ITER(RawHRef2FullURL, ss2u_it.data(), c2f_it) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << " url='" << c2f_it.key() << "' -> '" << c2f_it.data().toDisplayString(); - } - } - FOR_ITER(URLsInStyleElement, m_URLsInStyleElement, e2u_it) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << "raw URLs in style-element:"; - FOR_ITER(RawHRef2FullURL, e2u_it.data(), c2f_it) { - qCDebug(WEBARCHIVERPLUGIN_LOG) << " url='" << c2f_it.key() << "' -> '" << c2f_it.data().toDisplayString(); - } - } -#endif -} - -void ArchiveDialog::obtainStyleSheetURLsLower(DOM::CSSStyleSheet css, RecurseData &data) -{ - - //qCDebug(WEBARCHIVERPLUGIN_LOG) << "stylesheet title='" << styleSheet.title().string() << "' " - // "type='" << styleSheet.type().string(); - - RawHRef2FullURL &raw2full = m_URLsInStyleSheet.insert(css, RawHRef2FullURL()).value(); - - DOM::CSSRuleList crl = css.cssRules(); - for (int j = 0; j != static_cast(crl.length()); ++j) { - - DOM::CSSRule cr = crl.item(j); - switch (cr.type()) { - - case DOM::CSSRule::STYLE_RULE: { - const DOM::CSSStyleRule &csr = static_cast(cr); - - //qCDebug(WEBARCHIVERPLUGIN_LOG) << "found selector '" << csr.selectorText(); - parseStyleDeclaration(css.baseUrl(), csr.style(), raw2full, data); - } break; - - case DOM::CSSRule::IMPORT_RULE: { - const DOM::CSSImportRule &cir = static_cast(cr); - - DOM::CSSStyleSheet importSheet = cir.styleSheet(); - if (importSheet.isNull()) { - - // Given stylesheet was not downloaded / parsed by KHTML - // Remove that URL from the stylesheet - qCDebug(WEBARCHIVERPLUGIN_LOG) << "stylesheet: invalid @import url('" << cir.href() << "')"; - - raw2full.insert(cir.href().string(), QUrl()); - - } else { - - qCDebug(WEBARCHIVERPLUGIN_LOG) << "stylesheet: @import url('" << cir.href() << "') found"; - - QString href = cir.href().string(); - Q_ASSERT(!href.isNull()); - - QUrl fullURL = importSheet.baseUrl(); - bool inserted = insertHRefFromStyleSheet(href, raw2full, fullURL, data); - if (inserted) { - m_cssURLs.insert(fullURL, importSheet); - obtainStyleSheetURLsLower(importSheet, data); - } - } - } break; - - default: - qCDebug(WEBARCHIVERPLUGIN_LOG) << " unknown/unsupported rule=" << cr.type(); - } - } -} - -void ArchiveDialog::obtainURLsLower(KHTMLPart *part, int level) -{ - //QString indent; - //indent.fill(' ', level*2); - - QString htmlFileName = (level == 0) ? QStringLiteral("index.html") : part->url().fileName(); - - // Add .html extension if not found already. This works around problems with frames, - // where the frame is for example "framead.php". The http-io-slave gets the mimetype - // from the webserver, but files in a tar archive do not have such metadata. The result - // is that Konqueror asks "save 'adframe.php' to file?" without this measure. - htmlFileName = appendMimeTypeSuffix(htmlFileName, QStringLiteral("text/html")); - - // If level == 0, the m_tarName2part map is empty and so uniqTarName will return "index.html" unchanged. - uniqTarName(htmlFileName, part); - - assert(m_framesInPart.find(part) == m_framesInPart.end()); - FramesInPart::Iterator fip_it = m_framesInPart.insert(part, PartFrameData()); - - RecurseData data(part, nullptr, &(fip_it.value())); - data.document.documentElement(); - obtainPartURLsLower(data.document.documentElement(), 1, data); - { - // Limit lifetime of @c childParts - ROPartList childParts(part->frames()); - FOR_ITER(ROPartList, childParts, child_it) { - KHTMLPart *cp = isArchivablePart(*child_it); - if (cp) { - obtainURLsLower(cp, level + 1); - } - } - } - - DOM::StyleSheetList styleSheetList = data.document.styleSheets(); - //qCDebug(WEBARCHIVERPLUGIN_LOG) << "# of stylesheets=" << styleSheetList.length(); - for (int i = 0; i != static_cast(styleSheetList.length()); ++i) { - DOM::StyleSheet ss = styleSheetList.item(i); - if (ss.isCSSStyleSheet()) { - DOM::CSSStyleSheet &css = static_cast(ss); - - QString href = css.href().string(); - if (! href.isNull()) { - QString href = css.href().string(); - QUrl fullUrl = css.baseUrl(); - qCDebug(WEBARCHIVERPLUGIN_LOG) << "top-level stylesheet='" << href; - bool inserted = insertTranslateURL(fullUrl, data); - if (inserted) { - m_cssURLs.insert(fullUrl, css); - } - } else { - DOM::Node node = css.ownerNode(); - if (! node.isNull()) { - assert(! m_topStyleSheets.contains(node)); - qCDebug(WEBARCHIVERPLUGIN_LOG) << "top-level inline stylesheet '" << node.nodeName(); - // TODO I think there can be more than one