diff --git a/src/engine/postingdb.cpp b/src/engine/postingdb.cpp index a687f50b..55fa10ae 100644 --- a/src/engine/postingdb.cpp +++ b/src/engine/postingdb.cpp @@ -1,321 +1,333 @@ /* This file is part of the KDE Baloo project. * Copyright (C) 2015 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "enginedebug.h" #include "postingdb.h" #include "orpostingiterator.h" #include "postingcodec.h" using namespace Baloo; PostingDB::PostingDB(MDB_dbi dbi, MDB_txn* txn) : m_txn(txn) , m_dbi(dbi) { Q_ASSERT(txn != nullptr); Q_ASSERT(dbi != 0); } PostingDB::~PostingDB() { } MDB_dbi PostingDB::create(MDB_txn* txn) { MDB_dbi dbi = 0; int rc = mdb_dbi_open(txn, "postingdb", MDB_CREATE, &dbi); if (rc) { qCWarning(ENGINE) << "PostingDB::create" << mdb_strerror(rc); return 0; } return dbi; } MDB_dbi PostingDB::open(MDB_txn* txn) { MDB_dbi dbi = 0; int rc = mdb_dbi_open(txn, "postingdb", 0, &dbi); if (rc) { qCWarning(ENGINE) << "PostingDB::open" << mdb_strerror(rc); return 0; } return dbi; } void PostingDB::put(const QByteArray& term, const PostingList& list) { Q_ASSERT(!term.isEmpty()); Q_ASSERT(!list.isEmpty()); MDB_val key; key.mv_size = term.size(); key.mv_data = static_cast(const_cast(term.constData())); PostingCodec codec; QByteArray arr = codec.encode(list); MDB_val val; val.mv_size = arr.size(); val.mv_data = static_cast(arr.data()); int rc = mdb_put(m_txn, m_dbi, &key, &val, 0); if (rc) { qCWarning(ENGINE) << "PostingDB::put" << mdb_strerror(rc); } } PostingList PostingDB::get(const QByteArray& term) { Q_ASSERT(!term.isEmpty()); MDB_val key; key.mv_size = term.size(); key.mv_data = static_cast(const_cast(term.constData())); MDB_val val{0, nullptr}; int rc = mdb_get(m_txn, m_dbi, &key, &val); if (rc) { if (rc != MDB_NOTFOUND) { qCDebug(ENGINE) << "PostingDB::get" << term << mdb_strerror(rc); } return PostingList(); } QByteArray arr = QByteArray::fromRawData(static_cast(val.mv_data), val.mv_size); PostingCodec codec; return codec.decode(arr); } void PostingDB::del(const QByteArray& term) { Q_ASSERT(!term.isEmpty()); MDB_val key; key.mv_size = term.size(); key.mv_data = static_cast(const_cast(term.constData())); int rc = mdb_del(m_txn, m_dbi, &key, nullptr); if (rc != 0 && rc != MDB_NOTFOUND) { qCDebug(ENGINE) << "PostingDB::del" << term << mdb_strerror(rc); } } QVector< QByteArray > PostingDB::fetchTermsStartingWith(const QByteArray& term) { MDB_val key; key.mv_size = term.size(); key.mv_data = static_cast(const_cast(term.constData())); MDB_cursor* cursor; int rc = mdb_cursor_open(m_txn, m_dbi, &cursor); if (rc) { qCWarning(ENGINE) << "PostingDB::fetchTermsStartingWith" << mdb_strerror(rc); return {}; } QVector terms; rc = mdb_cursor_get(cursor, &key, nullptr, MDB_SET_RANGE); while (rc == 0) { const QByteArray arr(static_cast(key.mv_data), key.mv_size); if (!arr.startsWith(term)) { break; } terms << arr; rc = mdb_cursor_get(cursor, &key, nullptr, MDB_NEXT); } if (rc != MDB_NOTFOUND) { qCDebug(ENGINE) << "PostingDB::fetchTermsStartingWith" << mdb_strerror(rc); } mdb_cursor_close(cursor); return terms; } class DBPostingIterator : public PostingIterator { public: DBPostingIterator(void* data, uint size); quint64 docId() const override; quint64 next() override; private: const QVector m_vec; int m_pos; }; PostingIterator* PostingDB::iter(const QByteArray& term) { MDB_val key; key.mv_size = term.size(); key.mv_data = static_cast(const_cast(term.constData())); MDB_val val; int rc = mdb_get(m_txn, m_dbi, &key, &val); if (rc) { qCDebug(ENGINE) << "PostingDB::iter" << term << mdb_strerror(rc); return nullptr; } return new DBPostingIterator(val.mv_data, val.mv_size); } // // Posting Iterator // DBPostingIterator::DBPostingIterator(void* data, uint size) : m_vec(PostingCodec().decode(QByteArray(static_cast(data), size))) , m_pos(-1) { } quint64 DBPostingIterator::docId() const { if (m_pos < 0 || m_pos >= m_vec.size()) { return 0; } return m_vec[m_pos]; } quint64 DBPostingIterator::next() { if (m_pos >= m_vec.size() - 1) { m_pos = m_vec.size(); return 0; } m_pos++; return m_vec[m_pos]; } template PostingIterator* PostingDB::iter(const QByteArray& prefix, Validator validate) { Q_ASSERT(!prefix.isEmpty()); MDB_val key; key.mv_size = prefix.size(); key.mv_data = static_cast(const_cast(prefix.constData())); MDB_cursor* cursor; int rc = mdb_cursor_open(m_txn, m_dbi, &cursor); if (rc) { qCWarning(ENGINE) << "PostingDB::regexpIter" << mdb_strerror(rc); return nullptr; } QVector termIterators; MDB_val val; rc = mdb_cursor_get(cursor, &key, &val, MDB_SET_RANGE); while (rc == 0) { const QByteArray arr(static_cast(key.mv_data), key.mv_size); if (!arr.startsWith(prefix)) { break; } if (validate(arr)) { termIterators << new DBPostingIterator(val.mv_data, val.mv_size); } rc = mdb_cursor_get(cursor, &key, &val, MDB_NEXT); } if (rc != 0 && rc != MDB_NOTFOUND) { qCWarning(ENGINE) << "PostingDB::regexpIter" << mdb_strerror(rc); } mdb_cursor_close(cursor); if (termIterators.isEmpty()) { return nullptr; } return new OrPostingIterator(termIterators); } PostingIterator* PostingDB::prefixIter(const QByteArray& prefix) { auto validate = [] (const QByteArray& arr) { Q_UNUSED(arr); return true; }; return iter(prefix, validate); } PostingIterator* PostingDB::regexpIter(const QRegularExpression& regexp, const QByteArray& prefix) { int prefixLen = prefix.length(); auto validate = [®exp, prefixLen] (const QByteArray& arr) { QString term = QString::fromUtf8(arr.mid(prefixLen)); return regexp.match(term).hasMatch(); }; return iter(prefix, validate); } PostingIterator* PostingDB::compIter(const QByteArray& prefix, qlonglong comVal, PostingDB::Comparator com) { int prefixLen = prefix.length(); auto validate = [prefixLen, comVal, com] (const QByteArray& arr) { bool ok = false; auto val = QByteArray::fromRawData(arr.constData() + prefixLen, arr.length() - prefixLen).toLongLong(&ok); return ok && ((com == LessEqual && val <= comVal) || (com == GreaterEqual && val >= comVal)); }; return iter(prefix, validate); } +PostingIterator* PostingDB::compIter(const QByteArray& prefix, double comVal, PostingDB::Comparator com) +{ + int prefixLen = prefix.length(); + auto validate = [prefixLen, comVal, com] (const QByteArray& arr) { + bool ok = false; + auto val = QByteArray::fromRawData(arr.constData() + prefixLen, arr.length() - prefixLen).toDouble(&ok); + return ok && ((com == LessEqual && val <= comVal) || + (com == GreaterEqual && val >= comVal)); + }; + return iter(prefix, validate); +} + PostingIterator* PostingDB::compIter(const QByteArray& prefix, const QByteArray& comVal, PostingDB::Comparator com) { int prefixLen = prefix.length(); auto validate = [prefixLen, comVal, com] (const QByteArray& arr) { auto val = QByteArray::fromRawData(arr.constData() + prefixLen, arr.length() - prefixLen); return ((com == LessEqual && val <= comVal) || (com == GreaterEqual && val >= comVal)); }; return iter(prefix, validate); } QMap PostingDB::toTestMap() const { MDB_cursor* cursor; mdb_cursor_open(m_txn, m_dbi, &cursor); MDB_val key = {0, nullptr}; MDB_val val; QMap map; while (1) { int rc = mdb_cursor_get(cursor, &key, &val, MDB_NEXT); if (rc == MDB_NOTFOUND) { break; } if (rc) { qCDebug(ENGINE) << "PostingDB::toTestMap" << mdb_strerror(rc); break; } const QByteArray ba(static_cast(key.mv_data), key.mv_size); const PostingList plist = PostingCodec().decode(QByteArray(static_cast(val.mv_data), val.mv_size)); map.insert(ba, plist); } mdb_cursor_close(cursor); return map; } diff --git a/src/engine/postingdb.h b/src/engine/postingdb.h index 41b96f2c..6aa982b8 100644 --- a/src/engine/postingdb.h +++ b/src/engine/postingdb.h @@ -1,78 +1,86 @@ /* This file is part of the KDE Baloo project. * Copyright (C) 2015 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef BALOO_POSTINGDB_H #define BALOO_POSTINGDB_H #include "postingiterator.h" #include #include #include #include namespace Baloo { typedef QVector PostingList; /** * The PostingDB is the main database that maps -> ... * This is used to do to lookup ids when searching for a . */ class BALOO_ENGINE_EXPORT PostingDB { public: PostingDB(MDB_dbi, MDB_txn* txn); ~PostingDB(); static MDB_dbi create(MDB_txn* txn); static MDB_dbi open(MDB_txn* txn); void put(const QByteArray& term, const PostingList& list); PostingList get(const QByteArray& term); void del(const QByteArray& term); PostingIterator* iter(const QByteArray& term); PostingIterator* prefixIter(const QByteArray& term); PostingIterator* regexpIter(const QRegularExpression& regexp, const QByteArray& prefix); enum Comparator { LessEqual, GreaterEqual }; + // For integral types only: + template + typename std::enable_if::value, PostingIterator*>::type + compIter(const QByteArray& prefix, T val, Comparator com) { + qlonglong l = val; + return compIter(prefix, l, com); + } PostingIterator* compIter(const QByteArray& prefix, qlonglong val, Comparator com); + PostingIterator* compIter(const QByteArray& prefix, double val, Comparator com); PostingIterator* compIter(const QByteArray& prefix, const QByteArray& val, Comparator com); QVector fetchTermsStartingWith(const QByteArray& term); QMap toTestMap() const; private: template PostingIterator* iter(const QByteArray& prefix, Validator validate); MDB_txn* m_txn; MDB_dbi m_dbi; }; } #endif // BALOO_POSTINGDB_H diff --git a/src/engine/transaction.cpp b/src/engine/transaction.cpp index 2986db91..b8512e8a 100644 --- a/src/engine/transaction.cpp +++ b/src/engine/transaction.cpp @@ -1,649 +1,655 @@ /* * This file is part of the KDE Baloo project. * Copyright (C) 2015 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "transaction.h" #include "postingdb.h" #include "documentdb.h" #include "documenturldb.h" #include "documentiddb.h" #include "positiondb.h" #include "documentdatadb.h" #include "mtimedb.h" #include "document.h" #include "enginequery.h" #include "andpostingiterator.h" #include "orpostingiterator.h" #include "phraseanditerator.h" #include "writetransaction.h" #include "idutils.h" #include "database.h" #include "databasesize.h" #include "enginedebug.h" #include #include #include using namespace Baloo; Transaction::Transaction(const Database& db, Transaction::TransactionType type) : m_dbis(db.m_dbis) , m_env(db.m_env) , m_writeTrans(nullptr) { uint flags = type == ReadOnly ? MDB_RDONLY : 0; int rc = mdb_txn_begin(db.m_env, nullptr, flags, &m_txn); if (rc) { qCDebug(ENGINE) << "Transaction" << mdb_strerror(rc); return; } if (type == ReadWrite) { m_writeTrans = new WriteTransaction(m_dbis, m_txn); } } Transaction::Transaction(Database* db, Transaction::TransactionType type) : Transaction(*db, type) { } Transaction::~Transaction() { if (m_writeTrans) { qWarning(ENGINE) << "Closing an active WriteTransaction without calling abort/commit"; } if (m_txn) { abort(); } } bool Transaction::hasDocument(quint64 id) const { Q_ASSERT(id > 0); IdFilenameDB idFilenameDb(m_dbis.idFilenameDbi, m_txn); return idFilenameDb.contains(id); } bool Transaction::inPhaseOne(quint64 id) const { Q_ASSERT(id > 0); DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); return contentIndexingDb.contains(id); } bool Transaction::hasFailed(quint64 id) const { Q_ASSERT(id > 0); DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); return failedIdDb.contains(id); } QVector Transaction::failedIds(quint64 limit) const { DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); return failedIdDb.fetchItems(limit); } QByteArray Transaction::documentUrl(quint64 id) const { Q_ASSERT(m_txn); Q_ASSERT(id > 0); DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); return docUrlDb.get(id); } quint64 Transaction::documentId(const QByteArray& path) const { Q_ASSERT(m_txn); Q_ASSERT(!path.isEmpty()); DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); QList li = path.split('/'); quint64 parentId = 0; for (const QByteArray& fileName : li) { if (fileName.isEmpty()) { continue; } parentId = docUrlDb.getId(parentId, fileName); if (!parentId) { return 0; } } return parentId; } QVector Transaction::childrenDocumentId(quint64 parentId) const { DocumentUrlDB docUrlDB(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); return docUrlDB.getChildren(parentId); } DocumentTimeDB::TimeInfo Transaction::documentTimeInfo(quint64 id) const { Q_ASSERT(m_txn); DocumentTimeDB docTimeDb(m_dbis.docTimeDbi, m_txn); return docTimeDb.get(id); } QByteArray Transaction::documentData(quint64 id) const { Q_ASSERT(m_txn); Q_ASSERT(id > 0); DocumentDataDB docDataDb(m_dbis.docDataDbi, m_txn); return docDataDb.get(id); } bool Transaction::hasChanges() const { Q_ASSERT(m_txn); if (!m_writeTrans) { qCWarning(ENGINE) << "m_writeTrans is null"; return false; } return m_writeTrans->hasChanges(); } QVector Transaction::fetchPhaseOneIds(int size) const { Q_ASSERT(m_txn); Q_ASSERT(size > 0); DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); return contentIndexingDb.fetchItems(size); } QVector Transaction::fetchTermsStartingWith(const QByteArray& term) const { Q_ASSERT(term.size() > 0); PostingDB postingDb(m_dbis.postingDbi, m_txn); return postingDb.fetchTermsStartingWith(term); } uint Transaction::phaseOneSize() const { Q_ASSERT(m_txn); DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); return contentIndexingDb.size(); } uint Transaction::size() const { Q_ASSERT(m_txn); DocumentDB docTermsDb(m_dbis.docTermsDbi, m_txn); return docTermsDb.size(); } // // Write Operations // void Transaction::setPhaseOne(quint64 id) { Q_ASSERT(m_txn); Q_ASSERT(id > 0); Q_ASSERT(m_writeTrans); DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); contentIndexingDb.put(id); } void Transaction::removePhaseOne(quint64 id) { Q_ASSERT(m_txn); Q_ASSERT(id > 0); Q_ASSERT(m_writeTrans); DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); contentIndexingDb.del(id); } void Transaction::addFailed(quint64 id) { Q_ASSERT(m_txn); Q_ASSERT(id > 0); Q_ASSERT(m_writeTrans); DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); failedIdDb.put(id); } void Transaction::addDocument(const Document& doc) { Q_ASSERT(m_txn); Q_ASSERT(doc.id() > 0); if (!m_writeTrans) { qCWarning(ENGINE) << "m_writeTrans is null"; return; } m_writeTrans->addDocument(doc); } void Transaction::removeDocument(quint64 id) { Q_ASSERT(m_txn); Q_ASSERT(id > 0); if (!m_writeTrans) { qCWarning(ENGINE) << "m_writeTrans is null"; return; } m_writeTrans->removeDocument(id); } void Transaction::removeRecursively(quint64 id) { Q_ASSERT(m_txn); Q_ASSERT(id > 0); if (!m_writeTrans) { qCWarning(ENGINE) << "m_writeTrans is null"; return; } m_writeTrans->removeRecursively(id); } void Transaction::replaceDocument(const Document& doc, DocumentOperations operations) { Q_ASSERT(m_txn); Q_ASSERT(doc.id() > 0); Q_ASSERT(m_writeTrans); if (!hasDocument(doc.id())) { qCDebug(ENGINE) << "Transaction::replaceDocument" << "Document does not exist"; } if (!m_writeTrans) { qCWarning(ENGINE) << "m_writeTrans is null"; return; } m_writeTrans->replaceDocument(doc, operations); } void Transaction::commit() { Q_ASSERT(m_txn); if (!m_writeTrans) { qCWarning(ENGINE) << "m_writeTrans is null"; return; } m_writeTrans->commit(); delete m_writeTrans; m_writeTrans = nullptr; int rc = mdb_txn_commit(m_txn); if (rc) { qCWarning(ENGINE) << "Transaction::commit" << mdb_strerror(rc); } m_txn = nullptr; } void Transaction::abort() { Q_ASSERT(m_txn); mdb_txn_abort(m_txn); m_txn = nullptr; delete m_writeTrans; m_writeTrans = nullptr; } // // Queries // PostingIterator* Transaction::postingIterator(const EngineQuery& query) const { PostingDB postingDb(m_dbis.postingDbi, m_txn); PositionDB positionDb(m_dbis.positionDBi, m_txn); if (query.leaf()) { if (query.op() == EngineQuery::Equal) { return postingDb.iter(query.term()); } else if (query.op() == EngineQuery::StartsWith) { return postingDb.prefixIter(query.term()); } else { Q_ASSERT(0); } } const auto subQueries = query.subQueries(); if (subQueries.isEmpty()) { return nullptr; } if (query.op() == EngineQuery::Phrase) { if (subQueries.size() == 1) { qCDebug(ENGINE) << "Degenerated Phrase with 1 Term:" << query; return postingIterator(subQueries[0]); } QVector vec; vec.reserve(subQueries.size()); for (const EngineQuery& q : subQueries) { if (!q.leaf()) { qCDebug(ENGINE) << "Transaction::toPostingIterator" << "Phrase queries must contain leaf queries"; continue; } vec << positionDb.iter(q.term()); } return new PhraseAndIterator(vec); } QVector vec; vec.reserve(subQueries.size()); for (const EngineQuery& q : subQueries) { auto iterator = postingIterator(q); if (iterator) { vec << iterator; } else if (query.op() == EngineQuery::And) { return nullptr; } } if (vec.empty()) { return nullptr; } else if (vec.size() == 1) { return vec.takeFirst(); } if (query.op() == EngineQuery::And) { return new AndPostingIterator(vec); } else if (query.op() == EngineQuery::Or) { return new OrPostingIterator(vec); } Q_ASSERT(0); return nullptr; } PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const { PostingDB postingDb(m_dbis.postingDbi, m_txn); return postingDb.compIter(prefix, value, com); } +PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const +{ + PostingDB postingDb(m_dbis.postingDbi, m_txn); + return postingDb.compIter(prefix, value, com); +} + PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const { PostingDB postingDb(m_dbis.postingDbi, m_txn); return postingDb.compIter(prefix, value, com); } PostingIterator* Transaction::mTimeRangeIter(quint32 beginTime, quint32 endTime) const { MTimeDB mTimeDb(m_dbis.mtimeDbi, m_txn); return mTimeDb.iterRange(beginTime, endTime); } PostingIterator* Transaction::docUrlIter(quint64 id) const { DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); return docUrlDb.iter(id); } QVector Transaction::exec(const EngineQuery& query, int limit) const { Q_ASSERT(m_txn); QVector results; PostingIterator* it = postingIterator(query); if (!it) { return results; } while (it->next() && limit) { results << it->docId(); limit--; } return results; } // // Introspection // QVector Transaction::documentTerms(quint64 docId) const { Q_ASSERT(docId); DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); return documentTermsDB.get(docId); } QVector Transaction::documentFileNameTerms(quint64 docId) const { Q_ASSERT(docId); DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); return documentFileNameTermsDB.get(docId); } QVector Transaction::documentXattrTerms(quint64 docId) const { Q_ASSERT(docId); DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); return documentXattrTermsDB.get(docId); } // // File Size // static size_t dbiSize(MDB_txn* txn, MDB_dbi dbi) { MDB_stat stat; mdb_stat(txn, dbi, &stat); return (stat.ms_branch_pages + stat.ms_leaf_pages + stat.ms_overflow_pages) * stat.ms_psize; } DatabaseSize Transaction::dbSize() { DatabaseSize dbSize; dbSize.postingDb = dbiSize(m_txn, m_dbis.postingDbi); dbSize.positionDb = dbiSize(m_txn, m_dbis.positionDBi); dbSize.docTerms = dbiSize(m_txn, m_dbis.docTermsDbi); dbSize.docFilenameTerms = dbiSize(m_txn, m_dbis.docFilenameTermsDbi); dbSize.docXattrTerms = dbiSize(m_txn, m_dbis.docXattrTermsDbi); dbSize.idTree = dbiSize(m_txn, m_dbis.idTreeDbi); dbSize.idFilename = dbiSize(m_txn, m_dbis.idFilenameDbi); dbSize.docTime = dbiSize(m_txn, m_dbis.docTimeDbi); dbSize.docData = dbiSize(m_txn, m_dbis.docDataDbi); dbSize.contentIndexingIds = dbiSize(m_txn, m_dbis.contentIndexingDbi); dbSize.failedIds = dbiSize(m_txn, m_dbis.failedIdDbi); dbSize.mtimeDb = dbiSize(m_txn, m_dbis.mtimeDbi); dbSize.expectedSize = dbSize.postingDb + dbSize.positionDb + dbSize.docTerms + dbSize.docFilenameTerms + dbSize.docXattrTerms + dbSize.idTree + dbSize.idFilename + dbSize.docTime + dbSize.docData + dbSize.contentIndexingIds + dbSize.failedIds + dbSize.mtimeDb; MDB_envinfo info; mdb_env_info(m_env, &info); dbSize.actualSize = info.me_last_pgno * 4096; // TODO: separate page size return dbSize; } // // Debugging // void Transaction::checkFsTree() { DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); PostingDB postingDb(m_dbis.postingDbi, m_txn); const auto map = postingDb.toTestMap(); QSet allIds; for (const auto& list : map) { for (quint64 id : list) { allIds << id; } } std::cout << "Total Document IDs: " << allIds.size() << std::endl; int count = 0; for (quint64 id: qAsConst(allIds)) { QByteArray url = docUrlDb.get(id); if (url.isEmpty()) { auto terms = documentTermsDB.get(id); auto fileNameTerms = documentFileNameTermsDB.get(id); auto xAttrTerms = documentXattrTermsDB.get(id); // Lets reverse engineer the terms QList newTerms; QMapIterator it(map); while (it.hasNext()) { it.next(); if (it.value().contains(id)) { newTerms << it.key(); } } std::cout << "Missing filePath for " << id << std::endl; std::cout << "\tPostingDB Terms: "; for (const QByteArray& term : qAsConst(newTerms)) { std::cout << qPrintable(term) << " "; } std::cout << std::endl; std::cout << "\tDocumentTermsDB: "; for (const QByteArray& term : terms) { std::cout << qPrintable(term) << " "; } std::cout << std::endl; std::cout << "\tFileNameTermsDB: "; for (const QByteArray& term : fileNameTerms) { std::cout << qPrintable(term) << " "; } std::cout << std::endl; std::cout << "\tXAttrTermsDB: "; for (const QByteArray& term : xAttrTerms) { std::cout << qPrintable(term) << " "; } std::cout << std::endl; count++; } else if (!QFileInfo::exists(QString::fromUtf8(url))) { std::cout << "FilePath " << qPrintable(url) << " for " << id << " does not exist"<< std::endl; count++; } } std::cout << "Invalid Entries: " << count << " (" << count * 100.0 / allIds.size() << "%)" << std::endl; } void Transaction::checkTermsDbinPostingDb() { DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); PostingDB postingDb(m_dbis.postingDbi, m_txn); // Iterate over each document, and fetch all terms // check if each term maps to its own id in the posting db const auto map = postingDb.toTestMap(); QSet allIds; for (const auto& list : map) { for (quint64 id : list) { allIds << id; } } std::cout << "PostingDB check .." << std::endl; for (quint64 id : qAsConst(allIds)) { QVector terms = documentTermsDB.get(id); terms += documentXattrTermsDB.get(id); terms += documentFileNameTermsDB.get(id); for (const QByteArray& term : qAsConst(terms)) { PostingList plist = postingDb.get(term); if (!plist.contains(id)) { std::cout << id << " is missing term " << qPrintable(term) << std::endl; } } } } void Transaction::checkPostingDbinTermsDb() { DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); PostingDB postingDb(m_dbis.postingDbi, m_txn); QMap map = postingDb.toTestMap(); QMapIterator it(map); std::cout << "DocumentTermsDB check .." << std::endl; while (it.hasNext()) { it.next(); const QByteArray& term = it.key(); const PostingList& list = it.value(); for (quint64 id : list) { if (documentTermsDB.get(id).contains(term)) { continue; } if (documentFileNameTermsDB.get(id).contains(term)) { continue; } if (documentXattrTermsDB.get(id).contains(term)) { continue; } std::cout << id << " is missing " << qPrintable(QString::fromUtf8(term)) << " from document terms db" << std::endl; } } } diff --git a/src/engine/transaction.h b/src/engine/transaction.h index 7c15b740..6c5a8e38 100644 --- a/src/engine/transaction.h +++ b/src/engine/transaction.h @@ -1,140 +1,141 @@ /* * This file is part of the KDE Baloo project. * Copyright (C) 2015 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef BALOO_TRANSACTION_H #define BALOO_TRANSACTION_H #include "databasedbis.h" #include "mtimedb.h" #include "postingdb.h" #include "writetransaction.h" #include "documenttimedb.h" #include #include namespace Baloo { class Database; class Document; class PostingIterator; class EngineQuery; class DatabaseSize; class DBState; class BALOO_ENGINE_EXPORT Transaction { public: enum TransactionType { ReadOnly, ReadWrite }; Transaction(const Database& db, TransactionType type); Transaction(Database* db, TransactionType type); ~Transaction(); // // Getters // bool hasDocument(quint64 id) const; bool inPhaseOne(quint64 id) const; bool hasFailed(quint64 id) const; QVector failedIds(quint64 limit) const; QByteArray documentUrl(quint64 id) const; /** * This method is not cheap, and does not stat the filesystem in order to convert the path * \p path into an id. */ quint64 documentId(const QByteArray& path) const; QVector childrenDocumentId(quint64 parentId) const; QByteArray documentData(quint64 id) const; DocumentTimeDB::TimeInfo documentTimeInfo(quint64 id) const; QVector exec(const EngineQuery& query, int limit = -1) const; PostingIterator* postingIterator(const EngineQuery& query) const; PostingIterator* postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const; + PostingIterator* postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const; PostingIterator* postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const; PostingIterator* mTimeRangeIter(quint32 beginTime, quint32 endTime) const; PostingIterator* docUrlIter(quint64 id) const; QVector fetchPhaseOneIds(int size) const; uint phaseOneSize() const; uint size() const; QVector fetchTermsStartingWith(const QByteArray& term) const; // // Introspecing document data // QVector documentTerms(quint64 docId) const; QVector documentFileNameTerms(quint64 docId) const; QVector documentXattrTerms(quint64 docId) const; DatabaseSize dbSize(); // // Transaction handling // void commit(); void abort(); bool hasChanges() const; // // Write Methods // void addDocument(const Document& doc); void removeDocument(quint64 id); void removeRecursively(quint64 parentId); void addFailed(quint64 id); bool removeRecursively(quint64 parentId, std::function shouldDelete) { Q_ASSERT(m_txn); Q_ASSERT(m_writeTrans); return m_writeTrans->removeRecursively(parentId, shouldDelete); } void replaceDocument(const Document& doc, DocumentOperations operations); void setPhaseOne(quint64 id); void removePhaseOne(quint64 id); // Debugging void checkFsTree(); void checkTermsDbinPostingDb(); void checkPostingDbinTermsDb(); private: Transaction(const Transaction& rhs) = delete; const DatabaseDbis& m_dbis; MDB_txn *m_txn = nullptr; MDB_env *m_env = nullptr; WriteTransaction *m_writeTrans = nullptr; friend class DatabaseSanitizerImpl; friend class DBState; // for testing }; } #endif // BALOO_TRANSACTION_H diff --git a/src/lib/searchstore.cpp b/src/lib/searchstore.cpp index 9b77667b..20b0e9eb 100644 --- a/src/lib/searchstore.cpp +++ b/src/lib/searchstore.cpp @@ -1,414 +1,418 @@ /* * This file is part of the KDE Baloo Project * Copyright (C) 2013-2015 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) version 3, or any * later version accepted by the membership of KDE e.V. (or its * successor approved by the membership of KDE e.V.), which shall * act as a proxy defined in Section 6 of version 3 of the license. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see . * */ #include "baloodebug.h" #include "searchstore.h" #include "term.h" #include "global.h" #include "baloodebug.h" #include "database.h" #include "transaction.h" #include "enginequery.h" #include "queryparser.h" #include "termgenerator.h" #include "andpostingiterator.h" #include "orpostingiterator.h" #include "idutils.h" #include #include #include #include #include #include #include namespace Baloo { namespace { QPair calculateTimeRange(const QDateTime& dt, Term::Comparator com) { Q_ASSERT(dt.isValid()); quint32 timet = dt.toSecsSinceEpoch(); if (com == Term::LessEqual) { return {0, timet}; } if (com == Term::Less) { return {0, timet - 1}; } if (com == Term::GreaterEqual) { return {timet, std::numeric_limits::max()}; } if (com == Term::Greater) { return {timet + 1, std::numeric_limits::max()}; } if (com == Term::Equal) { timet = QDateTime(dt.date()).toSecsSinceEpoch(); return {timet, timet + 24 * 60 * 60 - 1}; } Q_ASSERT_X(0, __func__, "mtime query must contain a valid comparator"); return {0, 0}; } struct InternalProperty { const char* propertyName; const char* prefix; QVariant::Type valueType; }; constexpr std::array internalProperties {{ { "filename", "F", QVariant::String }, { "mimetype", "M", QVariant::String }, { "rating", "R", QVariant::Int }, { "tag", "TAG-", QVariant::String }, { "tags", "TA", QVariant::String }, { "usercomment", "C", QVariant::String } }}; std::pair propertyInfo(const QByteArray& property) { auto it = std::find_if(std::begin(internalProperties), std::end(internalProperties), [&property] (const InternalProperty& entry) { return property == entry.propertyName; }); if (it != std::end(internalProperties)) { return { (*it).prefix, (*it).valueType }; } else { KFileMetaData::PropertyInfo pi = KFileMetaData::PropertyInfo::fromName(property); if (pi.property() == KFileMetaData::Property::Empty) { return { QByteArray(), QVariant::Invalid }; } int propPrefix = static_cast(pi.property()); return { 'X' + QByteArray::number(propPrefix) + '-', pi.valueType() }; } } } SearchStore::SearchStore() : m_db(nullptr) { m_db = globalDatabaseInstance(); if (!m_db->open(Database::ReadOnlyDatabase)) { m_db = nullptr; } } SearchStore::~SearchStore() { } // Return the result with-in [offset, offset + limit) QStringList SearchStore::exec(const Term& term, uint offset, int limit, bool sortResults) { if (!m_db || !m_db->isOpen()) { return QStringList(); } Transaction tr(m_db, Transaction::ReadOnly); QScopedPointer it(constructQuery(&tr, term)); if (!it) { return QStringList(); } if (sortResults) { QVector> resultIds; while (it->next()) { quint64 id = it->docId(); quint32 mtime = tr.documentTimeInfo(id).mTime; resultIds << std::pair{id, mtime}; Q_ASSERT(id > 0); } // Not enough results within range, no need to sort. if (offset >= static_cast(resultIds.size())) { return QStringList(); } auto compFunc = [](const std::pair& lhs, const std::pair& rhs) { return lhs.second > rhs.second; }; std::sort(resultIds.begin(), resultIds.end(), compFunc); if (limit < 0) { limit = resultIds.size(); } QStringList results; const uint end = qMin(static_cast(resultIds.size()), offset + static_cast(limit)); results.reserve(end - offset); for (uint i = offset; i < end; i++) { const quint64 id = resultIds[i].first; const QString filePath = tr.documentUrl(id); results << filePath; } return results; } else { QStringList results; uint ulimit = limit < 0 ? UINT_MAX : limit; while (offset && it->next()) { offset--; } while (ulimit && it->next()) { quint64 id = it->docId(); Q_ASSERT(id > 0); results << tr.documentUrl(it->docId()); Q_ASSERT(!results.last().isEmpty()); ulimit--; } return results; } } PostingIterator* SearchStore::constructQuery(Transaction* tr, const Term& term) { Q_ASSERT(tr); if (term.operation() == Term::And || term.operation() == Term::Or) { const QList subTerms = term.subTerms(); QVector vec; vec.reserve(subTerms.size()); for (const Term& t : subTerms) { auto iterator = constructQuery(tr, t); // constructQuery returns a nullptr to signal an empty list if (iterator) { vec << iterator; } else if (term.operation() == Term::And) { return nullptr; } } if (vec.isEmpty()) { return nullptr; } else if (vec.size() == 1) { return vec.takeFirst(); } if (term.operation() == Term::And) { return new AndPostingIterator(vec); } else { return new OrPostingIterator(vec); } } if (term.value().isNull()) { return nullptr; } Q_ASSERT(term.value().isValid()); Q_ASSERT(term.comparator() != Term::Auto); Q_ASSERT(term.comparator() == Term::Contains ? term.value().type() == QVariant::String : true); const QVariant value = term.value(); const QByteArray property = term.property().toLower().toUtf8(); if (property == "type" || property == "kind") { EngineQuery q = constructTypeQuery(value.toString()); return tr->postingIterator(q); } else if (property == "includefolder") { const QFileInfo fi(value.toString()); const QByteArray folder = QFile::encodeName(fi.canonicalFilePath()); if (folder.isEmpty()) { return nullptr; } if (!folder.startsWith('/')) { return nullptr; } quint64 id = filePathToId(folder); if (!id) { qCDebug(BALOO) << "Folder" << value.toString() << "does not exist"; return nullptr; } return tr->docUrlIter(id); } else if (property == "modified" || property == "mtime") { if (value.type() == QVariant::ByteArray) { // Used by Baloo::Query QByteArray ba = value.toByteArray(); Q_ASSERT(ba.size() >= 4); int year = ba.mid(0, 4).toInt(); int month = ba.mid(4, 2).toInt(); int day = ba.mid(6, 2).toInt(); Q_ASSERT(year); // uses 0 to represent whole month or whole year month = month >= 0 && month <= 12 ? month : 0; day = day >= 0 && day <= 31 ? day : 0; QDate startDate(year, month ? month : 1, day ? day : 1); QDate endDate(startDate); if (month == 0) { endDate.setDate(endDate.year(), 12, 31); } else if (day == 0) { endDate.setDate(endDate.year(), endDate.month(), endDate.daysInMonth()); } return tr->mTimeRangeIter(QDateTime(startDate).toSecsSinceEpoch(), QDateTime(endDate, QTime(23, 59, 59)).toSecsSinceEpoch()); } else if (value.type() == QVariant::String) { const QDateTime dt = value.toDateTime(); QPair timerange = calculateTimeRange(dt, term.comparator()); if ((timerange.first == 0) && (timerange.second == 0)) { return nullptr; } return tr->mTimeRangeIter(timerange.first, timerange.second); } else { Q_ASSERT_X(0, "SearchStore::constructQuery", "modified property must contain date/datetime values"); return nullptr; } } else if (property == "tag") { if (term.comparator() == Term::Equal) { const QByteArray prefix = "TAG-"; EngineQuery q = EngineQuery(prefix + value.toByteArray()); return tr->postingIterator(q); } else if (term.comparator() == Term::Contains) { const QByteArray prefix = "TA"; EngineQuery q = constructEqualsQuery(prefix, value.toString()); return tr->postingIterator(q); } else { Q_ASSERT(0); return nullptr; } } QByteArray prefix; QVariant::Type valueType = QVariant::String; if (!property.isEmpty()) { std::tie(prefix, valueType) = propertyInfo(property); if (prefix.isEmpty()) { return nullptr; } } auto com = term.comparator(); if (com == Term::Contains && valueType == QVariant::Int) { com = Term::Equal; } if (com == Term::Contains) { EngineQuery q = constructContainsQuery(prefix, value.toString()); return tr->postingIterator(q); } if (com == Term::Equal) { EngineQuery q = constructEqualsQuery(prefix, value.toString()); return tr->postingIterator(q); } PostingDB::Comparator pcom; if (com == Term::Greater || com == Term::GreaterEqual) { pcom = PostingDB::GreaterEqual; } else if (com == Term::Less || com == Term::LessEqual) { pcom = PostingDB::LessEqual; } // FIXME -- has to be kept in sync with the code from // Baloo::Result::add if (valueType == QVariant::Int) { qlonglong intVal = value.toLongLong(); if (term.comparator() == Term::Greater) { intVal++; } else if (term.comparator() == Term::Less) { intVal--; } return tr->postingCompIterator(prefix, intVal, pcom); + } else if (valueType == QVariant::Double) { + double dVal = value.toDouble(); + return tr->postingCompIterator(prefix, dVal, pcom); + } else if (valueType == QVariant::DateTime) { QDateTime dt = value.toDateTime(); const QByteArray ba = dt.toString(Qt::ISODate).toUtf8(); return tr->postingCompIterator(prefix, ba, pcom); } else { qCDebug(BALOO) << "Comparison must be with an integer"; } return nullptr; } EngineQuery SearchStore::constructContainsQuery(const QByteArray& prefix, const QString& value) { QueryParser parser; return parser.parseQuery(value, prefix); } EngineQuery SearchStore::constructEqualsQuery(const QByteArray& prefix, const QString& value) { // We use the TermGenerator to normalize the words in the value and to // split it into other words. If we split the words, we then add them as a // phrase query. const QByteArrayList terms = TermGenerator::termList(value); QVector queries; for (const QByteArray& term : terms) { QByteArray arr = prefix + term; // FIXME - compatibility hack, to find truncated terms with old // DBs, remove on next DB bump if (arr.size() > 25) { queries << EngineQuery(arr.left(25), EngineQuery::StartsWith); } else { queries << EngineQuery(arr); } } if (queries.isEmpty()) { return EngineQuery(); } else if (queries.size() == 1) { return queries.first(); } else { return EngineQuery(queries, EngineQuery::Phrase); } } EngineQuery SearchStore::constructTypeQuery(const QString& value) { Q_ASSERT(!value.isEmpty()); KFileMetaData::TypeInfo ti = KFileMetaData::TypeInfo::fromName(value); if (ti == KFileMetaData::Type::Empty) { qCDebug(BALOO) << "Type" << value << "does not exist"; return EngineQuery(); } int num = static_cast(ti.type()); return EngineQuery('T' + QByteArray::number(num)); } } // namespace Baloo