diff --git a/src/file/CMakeLists.txt b/src/file/CMakeLists.txt --- a/src/file/CMakeLists.txt +++ b/src/file/CMakeLists.txt @@ -33,6 +33,7 @@ unindexedfileiterator.cpp migrator.cpp fileinfo.cpp + indexer.cpp # File Watcher filewatch.cpp diff --git a/src/file/extractor/CMakeLists.txt b/src/file/extractor/CMakeLists.txt --- a/src/file/extractor/CMakeLists.txt +++ b/src/file/extractor/CMakeLists.txt @@ -6,12 +6,6 @@ app.cpp result.cpp idlestatemonitor.cpp - ../priority.cpp - ../basicindexingjob.cpp - ../fileindexerconfig.cpp - ../storagedevices.cpp - ../regexpcache.cpp - ../fileexcludefilters.cpp ) ecm_qt_declare_logging_category(EXTRACTOR_SRCS HEADER baloodebug.h IDENTIFIER BALOO CATEGORY_NAME org.kde.baloo) @@ -21,6 +15,7 @@ target_compile_definitions(baloo_file_extractor PRIVATE -DPROJECT_VERSION="${PROJECT_VERSION}") target_link_libraries(baloo_file_extractor + baloofilecommon Qt5::DBus Qt5::Widgets KF5::FileMetaData diff --git a/src/file/extractor/app.h b/src/file/extractor/app.h --- a/src/file/extractor/app.h +++ b/src/file/extractor/app.h @@ -25,16 +25,15 @@ #include #include -#include #include #include #include -#include #include #include #include "database.h" +#include "indexer.h" #include "../fileindexerconfig.h" #include "idlestatemonitor.h" @@ -54,13 +53,8 @@ void processNextFile(); private: - void index(Transaction* tr, const QString& filePath, quint64 id); - - QMimeDatabase m_mimeDb; - - KFileMetaData::ExtractorCollection m_extractorCollection; - FileIndexerConfig m_config; + Indexer m_indexer; QSocketNotifier m_notifyNewData; QFile m_input; diff --git a/src/file/extractor/app.cpp b/src/file/extractor/app.cpp --- a/src/file/extractor/app.cpp +++ b/src/file/extractor/app.cpp @@ -31,7 +31,6 @@ #include #include -#include #include #include @@ -46,6 +45,8 @@ App::App(QObject* parent) : QObject(parent) + , m_config() + , m_indexer(&m_config) , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read) , m_input() , m_inputStream(&m_input) @@ -109,7 +110,7 @@ } 
m_outputStream << "S " << url << endl; - index(m_tr, url, id); + m_indexer.index(m_tr, url, id); m_outputStream << "F " << url << endl; m_updatedFiles << url; @@ -142,67 +143,3 @@ m_outputStream << "B" << endl; } } - -void App::index(Transaction* tr, const QString& url, quint64 id) -{ - QString mimetype = KFileMetaData::MimeUtils::strictMimeType(url, m_mimeDb).name(); - qCDebug(BALOO) << "Indexing" << id << url << mimetype; - - if (!m_config.shouldBeIndexed(url)) { - // This apparently happens when the config has changed after the document - // was added to the content indexing db - qCWarning(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped"; - tr->removeDocument(id); - return; - } - - // The initial BasicIndexingJob run has been supplied with the file extension - // mimetype only, skip based on the "real" mimetype - if (!m_config.shouldMimeTypeBeIndexed(mimetype)) { - qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype; - tr->removePhaseOne(id); - return; - } - - // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we - // have trouble processing them - // - if (mimetype.startsWith(QStringLiteral("text/"))) { - QFileInfo fileInfo(url); - if (fileInfo.size() >= 10 * 1024 * 1024) { - tr->removePhaseOne(id); - return; - } - } - - // We always run the basic indexing again. This is mostly so that the proper - // mimetype is set and we get proper type information. 
- // The mimetype fetched in the BasicIndexingJob is fast but not accurate - BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel); - basicIndexer.index(); - - Baloo::Document doc = basicIndexer.document(); - - Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractEverything); - result.setDocument(doc); - - const QList exList = m_extractorCollection.fetchExtractors(mimetype); - - for (KFileMetaData::Extractor* ex : exList) { - ex->extract(&result); - } - - result.finish(); - if (doc.id() != id) { - qWarning() << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created"; - tr->removeDocument(id); - if (!tr->hasDocument(doc.id())) { - tr->addDocument(result.document()); - } else { - tr->replaceDocument(result.document(), DocumentTerms | DocumentData); - } - } else { - tr->replaceDocument(result.document(), DocumentTerms | DocumentData); - } - tr->removePhaseOne(doc.id()); -} diff --git a/src/tools/balooctl/indexer.h b/src/file/indexer.h rename from src/tools/balooctl/indexer.h rename to src/file/indexer.h --- a/src/tools/balooctl/indexer.h +++ b/src/file/indexer.h @@ -25,24 +25,23 @@ #include #include + #include #include "transaction.h" +#include "fileindexerconfig.h" namespace Baloo { class Indexer { public: - Indexer(const QString& url, Transaction* tr); - - void index(); + explicit Indexer(FileIndexerConfig* config); /* config is borrowed, not owned; it must outlive this Indexer */ + bool index(Transaction* tr, const QString& url, quint64 id); private: - QString m_url; - QMimeDatabase m_mimeDB; + FileIndexerConfig* m_config; + QMimeDatabase m_mimeDb; KFileMetaData::ExtractorCollection m_extractorCollection; - - Transaction* m_tr; }; } diff --git a/src/file/indexer.cpp b/src/file/indexer.cpp new file mode 100644 --- /dev/null +++ b/src/file/indexer.cpp @@ -0,0 +1,104 @@ +/* + * This file is part of the KDE Baloo Project + * Copyright (C) 2015 Pinak Ahuja + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the 
GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) version 3, or any + * later version accepted by the membership of KDE e.V. (or its + * successor approved by the membership of KDE e.V.), which shall + * act as a proxy defined in Section 6 of version 3 of the license. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "baloodebug.h" +#include "indexer.h" +#include "basicindexingjob.h" +#include "database.h" +#include "extractor/result.h" + +#include +#include +#include + +#include + +using namespace Baloo; + +Indexer::Indexer(FileIndexerConfig* config) + : m_config(config) +{ +} + +bool Indexer::index(Transaction* tr, const QString& url, quint64 id) +{ + QString mimetype = KFileMetaData::MimeUtils::strictMimeType(url, m_mimeDb).name(); + qCDebug(BALOO) << "Indexing" << id << url << mimetype; + + if (!m_config->shouldBeIndexed(url)) { + // This apparently happens when the config has changed after the document + // was added to the content indexing db + qCWarning(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped"; + tr->removeDocument(id); + return false; + } + + // The initial BasicIndexingJob run has been supplied with the file extension + // mimetype only, skip based on the "real" mimetype + if (!m_config->shouldMimeTypeBeIndexed(mimetype)) { + qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype; + tr->removePhaseOne(id); + return false; + } + + // HACK: Also, we're ignoring text files which are greater than 10 MB as we + // have trouble processing them + // + if 
(mimetype.startsWith(QStringLiteral("text/"))) { + QFileInfo fileInfo(url); + if (fileInfo.size() >= 10 * 1024 * 1024) { + tr->removePhaseOne(id); + return false; + } + } + + // We always run the basic indexing again. This is mostly so that the proper + // mimetype is set and we get proper type information. + // The mimetype fetched in the BasicIndexingJob is fast but not accurate + BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel); + basicIndexer.index(); + + Baloo::Document doc = basicIndexer.document(); + + Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractEverything); + result.setDocument(doc); + + const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype); + + for (KFileMetaData::Extractor* ex : exList) { + ex->extract(&result); + } + + result.finish(); + if (doc.id() != id) { + qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created"; + tr->removeDocument(id); + } + + if (!tr->hasDocument(doc.id())) { + tr->addDocument(result.document()); + } else { + tr->replaceDocument(result.document(), Everything); + } + tr->removePhaseOne(doc.id()); + return true; +} diff --git a/src/tools/balooctl/CMakeLists.txt b/src/tools/balooctl/CMakeLists.txt --- a/src/tools/balooctl/CMakeLists.txt +++ b/src/tools/balooctl/CMakeLists.txt @@ -2,7 +2,6 @@ set(SRCS main.cpp - indexer.cpp command.cpp configcommand.cpp statuscommand.cpp diff --git a/src/tools/balooctl/indexer.cpp b/src/tools/balooctl/indexer.cpp deleted file mode 100644 --- a/src/tools/balooctl/indexer.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * This file is part of the KDE Baloo Project - * Copyright (C) 2015 Pinak Ahuja - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) version 3, or any - * later version 
accepted by the membership of KDE e.V. (or its - * successor approved by the membership of KDE e.V.), which shall - * act as a proxy defined in Section 6 of version 3 of the license. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see . - * - */ - -#include "indexer.h" -#include "basicindexingjob.h" -#include "database.h" -#include "./extractor/result.h" - -#include -#include - -using namespace Baloo; - -Indexer::Indexer(const QString& url, Transaction* tr) - : m_url(url) - , m_tr(tr) -{ -} - -void Indexer::index() -{ - const QString mimetype = m_mimeDB.mimeTypeForFile(m_url).name(); - BasicIndexingJob basicIJ(m_url, mimetype, BasicIndexingJob::NoLevel); - basicIJ.index(); - Baloo::Document doc = basicIJ.document(); - - Result result(m_url, mimetype, KFileMetaData::ExtractionResult::ExtractEverything); - result.setDocument(doc); - - const QList exList = m_extractorCollection.fetchExtractors(mimetype); - - for (KFileMetaData::Extractor* ex : exList) { - ex->extract(&result); - } - - result.finish(); - if (m_tr->hasDocument(doc.id())) { - m_tr->replaceDocument(doc, Everything); - } else { - m_tr->addDocument(result.document()); - } -} diff --git a/src/tools/balooctl/main.cpp b/src/tools/balooctl/main.cpp --- a/src/tools/balooctl/main.cpp +++ b/src/tools/balooctl/main.cpp @@ -221,6 +221,8 @@ return 1; } + FileIndexerConfig config; + Indexer indexer(&config); Transaction tr(db, Transaction::ReadWrite); for (int i = 1; i < parser.positionalArguments().size(); ++i) { @@ -230,20 +232,13 @@ out << "Could not stat file: " << url << endl; continue; } - if (tr.inPhaseOne(id)) { - out << "Skipping: " << url << " Reason: Already scheduled for 
indexing\n"; - continue; - } - if (!tr.documentData(id).isEmpty()) { - out << "Skipping: " << url << " Reason: Already indexed\n"; - continue; + if (indexer.index(&tr, url, id)) { + out << i18nc("file url", "Indexed %1", url) << endl; + } else { + out << i18nc("file url", "Failed to index %1", url) << endl; } - Indexer indexer(url, &tr); - out << "Indexing " << url << endl; - indexer.index(); } tr.commit(); - out << "File(s) indexed\n"; return 0; }