diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -15,6 +15,7 @@ phraseanditerator.cpp positiondb.cpp postingdb.cpp + fuzzydb.cpp postingiterator.cpp queryparser.cpp termgenerator.cpp diff --git a/src/engine/database.cpp b/src/engine/database.cpp --- a/src/engine/database.cpp +++ b/src/engine/database.cpp @@ -22,6 +22,7 @@ #include "database.h" #include "transaction.h" #include "postingdb.h" +#include "fuzzydb.h" #include "documentdb.h" #include "documenturldb.h" #include "documentiddb.h" @@ -104,7 +105,7 @@ * maximal number of allowed named databases, must match number of databases we create below * each additional one leads to overhead */ - mdb_env_set_maxdbs(m_env, 12); + mdb_env_set_maxdbs(m_env, 13); /** * size limit for database == size limit of mmap @@ -145,6 +146,7 @@ m_dbis.postingDbi = PostingDB::open(txn); m_dbis.positionDBi = PositionDB::open(txn); + m_dbis.fuzzyDbi = FuzzyDB::open(txn); m_dbis.docTermsDbi = DocumentDB::open("docterms", txn); m_dbis.docFilenameTermsDbi = DocumentDB::open("docfilenameterms", txn); @@ -187,6 +189,7 @@ m_dbis.postingDbi = PostingDB::create(txn); m_dbis.positionDBi = PositionDB::create(txn); + m_dbis.fuzzyDbi = FuzzyDB::create(txn); m_dbis.docTermsDbi = DocumentDB::create("docterms", txn); m_dbis.docFilenameTermsDbi = DocumentDB::create("docfilenameterms", txn); diff --git a/src/engine/databasedbis.h b/src/engine/databasedbis.h --- a/src/engine/databasedbis.h +++ b/src/engine/databasedbis.h @@ -29,6 +29,7 @@ public: MDB_dbi postingDbi; MDB_dbi positionDBi; + MDB_dbi fuzzyDbi; MDB_dbi docTermsDbi; MDB_dbi docFilenameTermsDbi; @@ -47,6 +48,7 @@ DatabaseDbis() : postingDbi(0) , positionDBi(0) + , fuzzyDbi(0) , docTermsDbi(0) , docFilenameTermsDbi(0) , docXattrTermsDbi(0) @@ -62,7 +64,7 @@ bool isValid() { return postingDbi && positionDBi && docTermsDbi && docFilenameTermsDbi && docXattrTermsDbi && idTreeDbi && idFilenameDbi && docTimeDbi && docDataDbi && contentIndexingDbi && mtimeDbi - && failedIdDbi; + && failedIdDbi && fuzzyDbi; } }; diff --git a/src/engine/fuzzydb.h b/src/engine/fuzzydb.h new file mode 100644 --- /dev/null +++ b/src/engine/fuzzydb.h @@ -0,0 +1,43 @@ +#ifndef BALOO_FUZZYDB_H +#define BALOO_FUZZYDB_H + +#include "postingiterator.h" +#include "documentdb.h" +#include "postingdb.h" + +#include "lmdb.h" + +namespace Baloo { + + /** + * The FuzzyDB is used to implement fuzzy search on filenames. + * It does so by computing the bi-grams of each term and matching + * them to the inverted index (this database). + */ + class BALOO_ENGINE_EXPORT FuzzyDB + { + public: + FuzzyDB(MDB_dbi, MDB_txn* txn); + ~FuzzyDB(); + + static MDB_dbi create(MDB_txn* txn); + static MDB_dbi open(MDB_txn* txn); + + void sync_terms(const DocumentDB& filename_terms); + + void put(const QByteArray& bigram, const PostingList& list); + PostingList get(const QByteArray& bigram); + + PostingIterator* iter(const QByteArray& term); + + private: + MDB_txn* m_txn; + MDB_dbi m_dbi; + + QStringList into_bigrams(const QString& term); + + QByteArray raw_get(const QByteArray& bigram); + }; +} + +#endif // BALOO_FUZZYDB_H diff --git a/src/engine/fuzzydb.cpp b/src/engine/fuzzydb.cpp new file mode 100644 --- /dev/null +++ b/src/engine/fuzzydb.cpp @@ -0,0 +1,206 @@ +#include "fuzzydb.h" +#include "documentdb.h" +#include "postingdb.h" +#include "postingcodec.h" +#include + +#include + +using namespace Baloo; + +class OwningPostingIterator : public PostingIterator { +public: + OwningPostingIterator(); + quint64 docId() const override; + quint64 next() override; + + void push(quint64 id); + +private: + QVector m_vec; + int m_pos; +}; + + +FuzzyDB::FuzzyDB(MDB_dbi dbi, MDB_txn* txn) + : m_txn(txn) + , m_dbi(dbi) +{ + Q_ASSERT(txn != nullptr); + Q_ASSERT(dbi != 0); +} + +FuzzyDB::~FuzzyDB() +{ +} + +MDB_dbi FuzzyDB::create(MDB_txn* txn) +{ + MDB_dbi dbi; + int rc = mdb_dbi_open(txn, "fuzzydb", MDB_CREATE, &dbi); + Q_ASSERT_X(rc == 0, "FuzzyDB::create", mdb_strerror(rc)); + + return dbi; +} + +MDB_dbi FuzzyDB::open(MDB_txn* txn) +{ + MDB_dbi dbi; + int rc = mdb_dbi_open(txn, "fuzzydb", 0, &dbi); + if (rc == MDB_NOTFOUND) { + qDebug() << "fuzzy open failed"; + return 0; + } + Q_ASSERT_X(rc == 0, "FuzzyDB::open", mdb_strerror(rc)); + + return dbi; +} + +void FuzzyDB::put(const QByteArray& bigram, const PostingList& list) +{ + Q_ASSERT(!bigram.isEmpty()); + Q_ASSERT(!list.isEmpty()); + + MDB_val key; + key.mv_size = bigram.size(); + key.mv_data = static_cast(const_cast(bigram.constData())); + + PostingCodec codec; + QByteArray arr = codec.encode(list); + + MDB_val val; + val.mv_size = arr.size(); + val.mv_data = static_cast(arr.data()); + + int rc = mdb_put(m_txn, m_dbi, &key, &val, 0); + Q_ASSERT_X(rc == 0, "FuzzyDB::put", mdb_strerror(rc)); +} + +QByteArray FuzzyDB::raw_get(const QByteArray& bigram) +{ + Q_ASSERT(!bigram.isEmpty()); + + MDB_val key; + key.mv_size = bigram.size(); + key.mv_data = static_cast(const_cast(bigram.constData())); + + MDB_val val; + int rc = mdb_get(m_txn, m_dbi, &key, &val); + if (rc == MDB_NOTFOUND) { + return QByteArray(); + } + Q_ASSERT_X(rc == 0, "FuzzyDB::get", mdb_strerror(rc)); + + return QByteArray::fromRawData(static_cast(val.mv_data), val.mv_size); +} + +PostingList FuzzyDB::get(const QByteArray& bigram) +{ + QByteArray arr = this->raw_get(bigram); + PostingCodec codec; + return codec.decode(arr); +} + +void FuzzyDB::sync_terms(const DocumentDB& filename_terms) +{ + // get all words (inefficient just for the proof-of-concept) + QMap> terms = filename_terms.toTestMap(); + + // create an inverted index for bigram -> list of ids + for (auto i = terms.constBegin(); i != terms.constEnd(); ++i) { + + for (const QByteArray& term : i.value()) { + QString sterm = QString::fromUtf8(term); + if (sterm.startsWith('F')) continue; + + QStringList bigrams = this->into_bigrams(sterm); + + for (const QString& bigram : bigrams) { + QByteArray bytes = bigram.toUtf8(); + PostingList ids = this->get(bytes); + + if (!ids.contains(i.key())) { + ids.append(i.key()); + this->put(bytes, ids); + } + } + } + } +} + +PostingIterator* FuzzyDB::iter(const QByteArray& term) +{ + qDebug() << "FuzzyDB::iter" << term; + QMap scores; + QStringList bigrams = this->into_bigrams(term); + + // Score all possible results + for (const QString& bigram : bigrams) { + const QByteArray ids = this->raw_get(bigram.toUtf8()); + + for (int i = 0; i < ids.size(); i += sizeof(quint64)) { + quint64 id = *((quint64*)(ids.constData() + i)); + int bump = scores.value(id, 0) + 1; + scores[id] = bump; + } + } + + OwningPostingIterator* found = new OwningPostingIterator(); + + // Filter away bad matches + for (auto i = scores.constBegin(); i != scores.constEnd(); ++i) { + if (i.value() + 2 >= bigrams.size()) { + found->push(i.key()); + } + } + + return found; +} + +QStringList FuzzyDB::into_bigrams(const QString& term) +{ + QStringList grams; + grams.reserve(term.size() - 1); + + // Get all the individual characters into `list` + // TODO: use QTextBoundaryFinder to get bigrams + + // Generate bigrams here + for (int i = 1; i < term.size(); i += 1) { + grams.append(term.mid(i - 1, 2)); + } + return grams; +} + +// OwningPostingIterator + +OwningPostingIterator::OwningPostingIterator() + : m_vec() + , m_pos(0) +{ +} + +quint64 OwningPostingIterator::docId() const +{ + if (m_pos < 0 || m_pos >= m_vec.size()) { + return 0; + } + + return m_vec[m_pos]; +} + +quint64 OwningPostingIterator::next() +{ + if (m_pos >= m_vec.size() - 1) { + m_pos = m_vec.size(); + return 0; + } + + m_pos++; + return m_vec[m_pos]; +} + +void OwningPostingIterator::push(quint64 id) +{ + this->m_vec.append(id); +} diff --git a/src/engine/transaction.cpp b/src/engine/transaction.cpp --- a/src/engine/transaction.cpp +++ b/src/engine/transaction.cpp @@ -20,6 +20,7 @@ #include "transaction.h" #include "postingdb.h" +#include "fuzzydb.h" #include "documentdb.h" #include "documenturldb.h" #include "documentiddb.h" @@ -295,8 +296,12 @@ { PostingDB postingDb(m_dbis.postingDbi, m_txn); PositionDB positionDb(m_dbis.positionDBi, m_txn); + FuzzyDB fuzzy_db(m_dbis.fuzzyDbi, m_txn); if (query.leaf()) { + // TODO: support fuzzy matching through config + qDebug() << "returning fuzzy match on" << query.term(); + return fuzzy_db.iter(query.term()); if (query.op() == EngineQuery::Equal) { return postingDb.iter(query.term()); } else if (query.op() == EngineQuery::StartsWith) {