diff --git a/autotests/unit/engine/CMakeLists.txt b/autotests/unit/engine/CMakeLists.txt --- a/autotests/unit/engine/CMakeLists.txt +++ b/autotests/unit/engine/CMakeLists.txt @@ -21,6 +21,7 @@ termgeneratortest queryparsertest + fuzzysearchtest # Query andpostingiteratortest diff --git a/autotests/unit/engine/fuzzysearchtest.cpp b/autotests/unit/engine/fuzzysearchtest.cpp new file mode 100644 --- /dev/null +++ b/autotests/unit/engine/fuzzysearchtest.cpp @@ -0,0 +1,166 @@ +#include +#include +#include +#include "fuzzysearch.h" + +using namespace Baloo; + +class FuzzySearchTest : public QObject +{ + Q_OBJECT +private Q_SLOTS: + + void testWordFeatures1(); + void testWordFeatures2(); + void testWordFeatures3(); + void testWordFeatures4(); + void testWordFeatures5(); + + void testFeatures(); + + void testSearch(); +}; + +void FuzzySearchTest::testWordFeatures1() +{ + QString word("repetition"); + QList feats({"re", "ep", "pe", "et", "ti", "it", "ti", "io", "on"}); + QList computed = FuzzySearch::word_to_features(word); + QCOMPARE(computed, feats); +} + +void FuzzySearchTest::testWordFeatures2() +{ + QString word("odd"); + QList feats({"od", "dd"}); + QList computed = FuzzySearch::word_to_features(word); + QCOMPARE(computed, feats); +} + +void FuzzySearchTest::testWordFeatures3() +{ + QString word("hi"); + QList feats({"hi"}); + QList computed = FuzzySearch::word_to_features(word); + QCOMPARE(computed, feats); +} + +void FuzzySearchTest::testWordFeatures4() +{ + QString word(""); + QList computed = FuzzySearch::word_to_features(word); + QList feats; + QCOMPARE(computed, feats); +} + +void FuzzySearchTest::testWordFeatures5() +{ + QString word("cAsE"); + QList feats({"ca", "as", "se"}); + QList computed = FuzzySearch::word_to_features(word); + QCOMPARE(computed, feats); +} + +void FuzzySearchTest::testFeatures() +{ + QStringList terms({"notes", "april8", "2018", "md"}); + QMap exported = FuzzySearch::features(1010, terms); + QMap correct; + + auto make = [](quint8 wid, quint8 len) -> FuzzyDataList { + FuzzyData fdata; + fdata.id = 1010; + fdata.wid = wid; + fdata.len = len; + QList ids({fdata}); + FuzzyDataList dlist; + dlist.m_datalist = ids; + return dlist; + }; + + correct.insert("no", make(0, 5)); + correct.insert("ot", make(0, 5)); + correct.insert("te", make(0, 5)); + correct.insert("es", make(0, 5)); + + correct.insert("ap", make(1, 6)); + correct.insert("pr", make(1, 6)); + correct.insert("ri", make(1, 6)); + correct.insert("il", make(1, 6)); + correct.insert("l8", make(1, 6)); + + correct.insert("20", make(2, 4)); + correct.insert("01", make(2, 4)); + correct.insert("18", make(2, 4)); + + correct.insert("md", make(3, 2)); + + QCOMPARE(exported, correct); +} + +void FuzzySearchTest::testSearch() +{ + QStringList file1({"notes", "april8", "2018", "md"}); + QStringList file2({"notes", "wednesday", "04092018", "md"}); + QStringList file3({"thisissomepoorlydelimitedfile"}); + QStringList file4({"bathsroom", "png"}); + QStringList file5({"livingroom", "png"}); + QStringList file6({"washroom", "png"}); + QStringList file7({"sittingrooms", "png"}); + QStringList file8({"CS4540"}); + QStringList file9({"CS4641"}); + QStringList file10({"CS4476"}); + QStringList file11({"CS2200"}); + QStringList file12({"LMC2200"}); + + QMap db; + db.unite(FuzzySearch::features(1, file1)); + db.unite(FuzzySearch::features(2, file2)); + db.unite(FuzzySearch::features(3, file3)); + db.unite(FuzzySearch::features(4, file4)); + db.unite(FuzzySearch::features(5, file5)); + db.unite(FuzzySearch::features(6, file6)); + db.unite(FuzzySearch::features(7, file7)); + db.unite(FuzzySearch::features(8, file8)); + db.unite(FuzzySearch::features(9, file9)); + db.unite(FuzzySearch::features(10, file10)); + db.unite(FuzzySearch::features(11, file11)); + db.unite(FuzzySearch::features(12, file12)); + + FuzzyDataList list; + FuzzyDataGetter getter = [&](const FuzzyFeature& feat) -> const FuzzyDataList& { + list.m_datalist.clear(); + for (auto i = db.find(feat); i != db.end() && i.key() == feat; ++i) { + list.m_datalist.append(i->m_datalist); + } + return list; + }; + + FuzzySearch fuzzy(2); + QList results; + + results = fuzzy.search(QString("wensday"), getter); + QCOMPARE(results, QList({ 2 })); + + results = fuzzy.search(QString("poorly"), getter); + QCOMPARE(results, QList({ 3 })); + + results = fuzzy.search(QString("4540"), getter); + QCOMPARE(results, QList({ 8, 2 })); + + results = fuzzy.search(QString("2200"), getter); + QCOMPARE(results, QList({ 11, 12, 1, 2 })); + + results = fuzzy.search(QString("room"), getter); + QCOMPARE(results, QList({ 6, 4, 5, 7, 3 })); + + results = fuzzy.search(QString("noots"), getter); + QCOMPARE(results, QList({ 1, 2 })); + + results = fuzzy.search(QString("delemeted"), getter); + QCOMPARE(results, QList({ 3 })); +} + +QTEST_MAIN(FuzzySearchTest) + +#include "fuzzysearchtest.moc" diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -15,6 +15,7 @@ phraseanditerator.cpp positiondb.cpp postingdb.cpp + fuzzydb.cpp postingiterator.cpp queryparser.cpp termgenerator.cpp @@ -24,6 +25,7 @@ writetransaction.cpp global.cpp fsutils.cpp + fuzzysearch.cpp ) if(${BUILD_EXPERIMENTAL}) diff --git a/src/engine/database.cpp b/src/engine/database.cpp --- a/src/engine/database.cpp +++ b/src/engine/database.cpp @@ -22,6 +22,7 @@ #include "database.h" #include "transaction.h" #include "postingdb.h" +#include "fuzzydb.h" #include "documentdb.h" #include "documenturldb.h" #include "documentiddb.h" @@ -104,7 +105,7 @@ * maximal number of allowed named databases, must match number of databases we create below * each additional one leads to overhead */ - mdb_env_set_maxdbs(m_env, 12); + mdb_env_set_maxdbs(m_env, 13); /** * size limit for database == size limit of mmap @@ -145,6 +146,7 @@ m_dbis.postingDbi = PostingDB::open(txn); m_dbis.positionDBi = PositionDB::open(txn); + m_dbis.fuzzyDbi = FuzzyDB::open(txn); m_dbis.docTermsDbi = DocumentDB::open("docterms", txn); m_dbis.docFilenameTermsDbi = DocumentDB::open("docfilenameterms", txn); @@ -187,6 +189,7 @@ m_dbis.postingDbi = PostingDB::create(txn); m_dbis.positionDBi = PositionDB::create(txn); + m_dbis.fuzzyDbi = FuzzyDB::create(txn); m_dbis.docTermsDbi = DocumentDB::create("docterms", txn); m_dbis.docFilenameTermsDbi = DocumentDB::create("docfilenameterms", txn); diff --git a/src/engine/databasedbis.h b/src/engine/databasedbis.h --- a/src/engine/databasedbis.h +++ b/src/engine/databasedbis.h @@ -29,6 +29,7 @@ public: MDB_dbi postingDbi; MDB_dbi positionDBi; + MDB_dbi fuzzyDbi; MDB_dbi docTermsDbi; MDB_dbi docFilenameTermsDbi; @@ -47,6 +48,7 @@ DatabaseDbis() : postingDbi(0) , positionDBi(0) + , fuzzyDbi(0) , docTermsDbi(0) , docFilenameTermsDbi(0) , docXattrTermsDbi(0) @@ -62,7 +64,7 @@ bool isValid() { return postingDbi && positionDBi && docTermsDbi && docFilenameTermsDbi && docXattrTermsDbi && idTreeDbi && idFilenameDbi && docTimeDbi && docDataDbi && contentIndexingDbi && mtimeDbi - && failedIdDbi; + && failedIdDbi && fuzzyDbi; } }; diff --git a/src/engine/fuzzydb.h b/src/engine/fuzzydb.h new file mode 100644 --- /dev/null +++ b/src/engine/fuzzydb.h @@ -0,0 +1,43 @@ +#ifndef BALOO_FUZZYDB_H +#define BALOO_FUZZYDB_H + +#include "postingiterator.h" +#include "documentdb.h" +#include "postingdb.h" + +#include "lmdb.h" + +namespace Baloo { + + /** + * The FuzzyDB is used to implement fuzzy search on filenames. + * It does so by computing the bi-grams of each term and matching + * them to the inverted index (this database). + */ + class BALOO_ENGINE_EXPORT FuzzyDB + { + public: + FuzzyDB(MDB_dbi, MDB_txn* txn); + ~FuzzyDB(); + + static MDB_dbi create(MDB_txn* txn); + static MDB_dbi open(MDB_txn* txn); + + void sync_terms(const DocumentDB& filename_terms); + + void put(const QByteArray& bigram, const PostingList& list); + PostingList get(const QByteArray& bigram); + + PostingIterator* iter(const QByteArray& term); + + private: + MDB_txn* m_txn; + MDB_dbi m_dbi; + + QStringList into_bigrams(const QString& term); + + QByteArray raw_get(const QByteArray& bigram); + }; +} + +#endif // BALOO_FUZZYDB_H diff --git a/src/engine/fuzzydb.cpp b/src/engine/fuzzydb.cpp new file mode 100644 --- /dev/null +++ b/src/engine/fuzzydb.cpp @@ -0,0 +1,206 @@ +#include "fuzzydb.h" +#include "documentdb.h" +#include "postingdb.h" +#include "postingcodec.h" +#include + +#include + +using namespace Baloo; + +class OwningPostingIterator : public PostingIterator { +public: + OwningPostingIterator(); + quint64 docId() const override; + quint64 next() override; + + void push(quint64 id); + +private: + QVector m_vec; + int m_pos; +}; + + +FuzzyDB::FuzzyDB(MDB_dbi dbi, MDB_txn* txn) + : m_txn(txn) + , m_dbi(dbi) +{ + Q_ASSERT(txn != nullptr); + Q_ASSERT(dbi != 0); +} + +FuzzyDB::~FuzzyDB() +{ +} + +MDB_dbi FuzzyDB::create(MDB_txn* txn) +{ + MDB_dbi dbi; + int rc = mdb_dbi_open(txn, "fuzzydb", MDB_CREATE, &dbi); + Q_ASSERT_X(rc == 0, "FuzzyDB::create", mdb_strerror(rc)); + + return dbi; +} + +MDB_dbi FuzzyDB::open(MDB_txn* txn) +{ + MDB_dbi dbi; + int rc = mdb_dbi_open(txn, "fuzzydb", 0, &dbi); + if (rc == MDB_NOTFOUND) { + qDebug() << "fuzzy open failed"; + return 0; + } + Q_ASSERT_X(rc == 0, "FuzzyDB::open", mdb_strerror(rc)); + + return dbi; +} + +void FuzzyDB::put(const QByteArray& bigram, const PostingList& list) +{ + Q_ASSERT(!bigram.isEmpty()); + Q_ASSERT(!list.isEmpty()); + + MDB_val key; + key.mv_size = bigram.size(); + key.mv_data = static_cast(const_cast(bigram.constData())); + + PostingCodec codec; + QByteArray arr = codec.encode(list); + + MDB_val val; + val.mv_size = arr.size(); + val.mv_data = static_cast(arr.data()); + + int rc = mdb_put(m_txn, m_dbi, &key, &val, 0); + Q_ASSERT_X(rc == 0, "FuzzyDB::put", mdb_strerror(rc)); +} + +QByteArray FuzzyDB::raw_get(const QByteArray& bigram) +{ + Q_ASSERT(!bigram.isEmpty()); + + MDB_val key; + key.mv_size = bigram.size(); + key.mv_data = static_cast(const_cast(bigram.constData())); + + MDB_val val; + int rc = mdb_get(m_txn, m_dbi, &key, &val); + if (rc == MDB_NOTFOUND) { + return QByteArray(); + } + Q_ASSERT_X(rc == 0, "FuzzyDB::get", mdb_strerror(rc)); + + return QByteArray::fromRawData(static_cast(val.mv_data), val.mv_size); +} + +PostingList FuzzyDB::get(const QByteArray& bigram) +{ + QByteArray arr = this->raw_get(bigram); + PostingCodec codec; + return codec.decode(arr); +} + +void FuzzyDB::sync_terms(const DocumentDB& filename_terms) +{ + // get all words (inefficient just for the proof-of-concept) + QMap> terms = filename_terms.toTestMap(); + + // create an inverted index for bigram -> list of ids + for (auto i = terms.constBegin(); i != terms.constEnd(); ++i) { + + for (const QByteArray& term : i.value()) { + QString sterm = QString::fromUtf8(term); + if (sterm.startsWith('F')) continue; + + QStringList bigrams = this->into_bigrams(sterm); + + for (const QString& bigram : bigrams) { + QByteArray bytes = bigram.toUtf8(); + PostingList ids = this->get(bytes); + + if (!ids.contains(i.key())) { + ids.append(i.key()); + this->put(bytes, ids); + } + } + } + } +} + +PostingIterator* FuzzyDB::iter(const QByteArray& term) +{ + qDebug() << "FuzzyDB::iter" << term; + QMap scores; + QStringList bigrams = this->into_bigrams(term); + + // Score all possible results + for (const QString& bigram : bigrams) { + const QByteArray ids = this->raw_get(bigram.toUtf8()); + + for (int i = 0; i < ids.size(); i += sizeof(quint64)) { + quint64 id = *((quint64*)(ids.constData() + i)); + int bump = scores.value(id, 0) + 1; + scores[id] = bump; + } + } + + OwningPostingIterator* found = new OwningPostingIterator(); + + // Filter away bad matches + for (auto i = scores.constBegin(); i != scores.constEnd(); ++i) { + if (i.value() + 2 >= bigrams.size()) { + found->push(i.key()); + } + } + + return found; +} + +QStringList FuzzyDB::into_bigrams(const QString& term) +{ + QStringList grams; + grams.reserve(term.size() - 1); + + // Get all the individual characters into `list` + // TODO: use QTextBoundaryFinder to get bigrams + + // Generate bigrams here + for (int i = 1; i < term.size(); i += 1) { + grams.append(term.mid(i - 1, 2)); + } + return grams; +} + +// OwningPostingIterator + +OwningPostingIterator::OwningPostingIterator() + : m_vec() + , m_pos(0) +{ +} + +quint64 OwningPostingIterator::docId() const +{ + if (m_pos < 0 || m_pos >= m_vec.size()) { + return 0; + } + + return m_vec[m_pos]; +} + +quint64 OwningPostingIterator::next() +{ + if (m_pos >= m_vec.size() - 1) { + m_pos = m_vec.size(); + return 0; + } + + m_pos++; + return m_vec[m_pos]; +} + +void OwningPostingIterator::push(quint64 id) +{ + this->m_vec.append(id); +} diff --git a/src/engine/fuzzysearch.h b/src/engine/fuzzysearch.h new file mode 100644 --- /dev/null +++ b/src/engine/fuzzysearch.h @@ -0,0 +1,57 @@ +#ifndef BALOO_FUZZYSEARCH_H +#define BALOO_FUZZYSEARCH_H + +#include "engine_export.h" + +#include +#include +#include +#include + +#include + +namespace Baloo { + class BALOO_ENGINE_EXPORT FuzzyData + { + public: + quint64 id; + quint8 wid; + quint8 len; + + bool operator==(const FuzzyData &other) const; + bool operator<(const FuzzyData& other) const; + }; + + class BALOO_ENGINE_EXPORT FuzzyDataList + { + public: + QList m_datalist; + + QByteArray into_bytes(); + static FuzzyData from_bytes(); + + bool operator==(const FuzzyDataList &other) const; + }; + + using FuzzyFeature = QByteArray; + + using FuzzyDataGetter = + std::function; + + class BALOO_ENGINE_EXPORT FuzzySearch + { + public: + FuzzySearch(int tolerance); + + static QMap features(quint64 id, QStringList terms); + + static QList word_to_features(const QString& word); + + QList search(const QString& query, const FuzzyDataGetter getter); + + private: + int m_tolerance; + }; +} + +#endif // BALOO_FUZZYSEARCH_H diff --git a/src/engine/fuzzysearch.cpp b/src/engine/fuzzysearch.cpp new file mode 100644 --- /dev/null +++ b/src/engine/fuzzysearch.cpp @@ -0,0 +1,145 @@ +#include "fuzzysearch.h" + +#include + +using namespace Baloo; + + +FuzzySearch::FuzzySearch(int tolerance) + : m_tolerance(tolerance) +{ +} + + +QList FuzzySearch::word_to_features(const QString& term) +{ + QList grams; + if (term.size() < 2) { + return grams; + } + grams.reserve(term.size() - 1); + + // TODO: use QTextBoundaryFinder to get correct graphemes + for (int i = 1; i < term.size(); i += 1) { + grams.append(term.mid(i - 1, 2).toLower().toUtf8()); + } + return grams; +} + + +QMap FuzzySearch::features(quint64 id, QStringList terms) +{ + QMap output; + quint8 wid = 0; + + for (const QString& word : terms) { + // Get the features for every word + QList features = FuzzySearch::word_to_features(word); + // Create an inverted index of the features + for (const QByteArray& feature : features) { + // make the associated data + FuzzyData data; + data.id = id; + data.wid = wid; + data.len = word.size(); + + // make this feature linked to the data it came from + auto existing = output.find(feature); + + if (existing == output.end()) { + // `feature` not used before, make a new one and insert + FuzzyDataList list; + list.m_datalist.append(data); + output.insert(feature, list); + } + else { + // found existing `feature`, add this id and word size to it + existing->m_datalist.append(data); + } + } + if (wid < 255) { + wid += 1; + } + } + return output; +} + + +QList +FuzzySearch::search(const QString& query, const FuzzyDataGetter getter) +{ + QList features = this->word_to_features(query); + QMap scores; + + // Calculate how close each term is to query, the higher score the better + for (const FuzzyFeature& feature : features) { + // Get all documents of each feature + const FuzzyDataList& documents = getter(feature); + for (const FuzzyData& doc : documents.m_datalist) { + // Keep track of the score of each document and its length + auto score = scores.find(doc); + if (score == scores.end()) { + scores.insert(doc, 1); + } + else { + *score += 1; + } + } + } + + // Remove based on threshold + QList> passing; + for (auto i = scores.begin(); i != scores.end(); ++i) { + if (i.value() + this->m_tolerance >= features.size()) { + passing.append(qMakePair(i.key(), i.value())); + } + } + + // Sort based on best score & shortest word + std::sort(passing.begin(), passing.end(), + [](const QPair& a, const QPair& b) { + if (a.second == b.second) { + return a.first.len < b.first.len; + } + return a.second > b.second; + }); + + // Shed metadata and return only IDs + QList ids; + ids.reserve(passing.size()); + for (const QPair& a : passing) { + ids.append(a.first.id); + } + + return ids; +} + + +QByteArray FuzzyDataList::into_bytes() +{ + return QByteArray(); // TODO: +} + + +FuzzyData FuzzyDataList::from_bytes() +{ + return FuzzyData(); // TODO: +} + +bool FuzzyDataList::operator==(const FuzzyDataList &other) const +{ + return other.m_datalist == m_datalist; +} + +bool FuzzyData::operator==(const FuzzyData &other) const +{ + return id == other.id && wid == other.wid && len == other.len; +} + +bool FuzzyData::operator<(const FuzzyData &other) const +{ + if (id == other.id) { + return wid < other.wid; + } + return id < other.id; +} diff --git a/src/engine/transaction.cpp b/src/engine/transaction.cpp --- a/src/engine/transaction.cpp +++ b/src/engine/transaction.cpp @@ -20,6 +20,7 @@ #include "transaction.h" #include "postingdb.h" +#include "fuzzydb.h" #include "documentdb.h" #include "documenturldb.h" #include "documentiddb.h" @@ -295,8 +296,12 @@ { PostingDB postingDb(m_dbis.postingDbi, m_txn); PositionDB positionDb(m_dbis.positionDBi, m_txn); + FuzzyDB fuzzy_db(m_dbis.fuzzyDbi, m_txn); if (query.leaf()) { + // TODO: support fuzzy matching through config + qDebug() << "returning fuzzy match on" << query.term(); + return fuzzy_db.iter(query.term()); if (query.op() == EngineQuery::Equal) { return postingDb.iter(query.term()); } else if (query.op() == EngineQuery::StartsWith) {