diff --git a/autotests/unit/engine/CMakeLists.txt b/autotests/unit/engine/CMakeLists.txt --- a/autotests/unit/engine/CMakeLists.txt +++ b/autotests/unit/engine/CMakeLists.txt @@ -20,6 +20,7 @@ mtimedbtest termgeneratortest + termgeneratortestutf queryparsertest # Query diff --git a/autotests/unit/engine/termgeneratortestutf.h b/autotests/unit/engine/termgeneratortestutf.h new file mode 100644 --- /dev/null +++ b/autotests/unit/engine/termgeneratortestutf.h @@ -0,0 +1,56 @@ +/* + * This file is part of the KDE Baloo project. + * Copyright (C) 2014-2015 Vishesh Handa + * Copyright (C) 2018 Michael Heidelbach + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef TERMGENERATORTESTUTF_H +#define TERMGENERATORTESTUTF_H + +#include "document.h" + +#include +namespace Baloo +{ + +class TermGeneratorTestUTF : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void testWordBoundaries(); + void testUnderscoreWord(); + void testUnderscore_splitting(); + void testAccetCharacters(); + void testUnicodeCompatibleComposition(); + void testUnicodeLowering(); + void testEmails(); + void testWordPositions(); + + void testTermList(); + void testTermList_data(); + +private: + const QList allWords(const Document& doc) + { + return doc.m_terms.keys(); + } +}; + +} +#endif // TERMGENERATORTESTUTF_H diff --git a/autotests/unit/engine/termgeneratortestutf.cpp b/autotests/unit/engine/termgeneratortestutf.cpp new file mode 100644 --- /dev/null +++ b/autotests/unit/engine/termgeneratortestutf.cpp @@ -0,0 +1,322 @@ +/* + * This file is part of the KDE Baloo project. + * Copyright (C) 2014-2015 Vishesh Handa + * Copyright (C) 2018 Michael Heidelbach + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "termgeneratortestutf.h" +#include "termgenerator.h" +#include "document.h" + +#include +#include + +using namespace Baloo; + +void TermGeneratorTestUTF::testWordBoundaries() +{ + QString str = QString::fromLatin1("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str); + + QList words = allWords(doc); + + QList expectedWords; + expectedWords << QByteArray("32.3") << QByteArray("brown") << QByteArray("can't") << QByteArray("feet") << QByteArray("fox") << QByteArray("jump") + << QByteArray("no") << QByteArray("quick") << QByteArray("right") << QByteArray("the") << QByteArray("txt") << QByteArray("wrong") + << QByteArray("xx"); + + QCOMPARE(words, expectedWords); +} + +void TermGeneratorTestUTF::testUnderscoreWord() +{ + QString str = QString::fromLatin1("_plant"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str); + + QList words = allWords(doc); + + QList expectedWords; + expectedWords << QByteArray("plant"); + + QCOMPARE(words, expectedWords); +} + +void TermGeneratorTestUTF::testUnderscore_splitting() +{ + QString str = QString::fromLatin1("Hello_Howdy"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str); + + QList words = allWords(doc); + + QList expectedWords; + expectedWords << QByteArray("hello") << QByteArray("howdy"); + + QCOMPARE(words, expectedWords); +} + +void TermGeneratorTestUTF::testAccetCharacters() +{ + QString str = QStringLiteral("Como está Kûg"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str); + + QList words = allWords(doc); + + QList expectedWords; + expectedWords << QByteArray("como") << QByteArray("esta") << QByteArray("kug"); + + QCOMPARE(words, expectedWords); +} + +void TermGeneratorTestUTF::testUnicodeCompatibleComposition() +{ + // The 0xfb00 corresponds to U+FB00 which is a 'ff' + QString str = QLatin1Literal("maffab"); + QString str2 = QLatin1Literal("ma") + QChar(0xfb00) + QStringLiteral("ab"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str2); + + QList words = allWords(doc); + QCOMPARE(words.size(), 1); + + QByteArray output = words.first(); + QCOMPARE(str.toUtf8(), output); +} + +void TermGeneratorTestUTF::testUnicodeLowering() +{ + // This string is unicode mathematical italic "Hedge" + QString str = QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str); + + QList words = allWords(doc); + + QCOMPARE(words, {QByteArray("hedge")}); +} + +void TermGeneratorTestUTF::testEmails() +{ + QString str = QString::fromLatin1("me@vhanda.in"); + + Document doc; + TermGenerator termGen(&doc); + termGen.indexText(str); + + QList words = allWords(doc); + + QList expectedWords; + expectedWords << QByteArray("in") << QByteArray("me") << QByteArray("vhanda"); + + QCOMPARE(words, expectedWords); +} + +void TermGeneratorTestUTF::testWordPositions() +{ + Document doc; + TermGenerator termGen(&doc); + + QString str = QString::fromLatin1("Hello hi how hi"); + termGen.indexText(str); + + QList words = allWords(doc); + + QList expectedWords; + expectedWords << QByteArray("hello") << QByteArray("hi") << QByteArray("how"); + QCOMPARE(words, expectedWords); + + QVector posInfo1 = doc.m_terms.value("hello").positions; + QCOMPARE(posInfo1, QVector() << 1); + + QVector posInfo2 = doc.m_terms.value("hi").positions; + QCOMPARE(posInfo2, QVector() << 2 << 4); + + QVector posInfo3 = doc.m_terms.value("how").positions; + QCOMPARE(posInfo3, QVector() << 3); +} + +void TermGeneratorTestUTF::testTermList_data() +{ + QTest::addColumn("phrase"); + QTest::addColumn("terms"); + QTest::addColumn("failmessage"); + const QString nofail = QString(); + + QTest::addRow("wordboundaries") + << QStringLiteral("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt") + << QStringList{ + QStringLiteral("the"), + QStringLiteral("quick"), + QStringLiteral("brown"), + QStringLiteral("fox"), + QStringLiteral("can't"), + QStringLiteral("jump"), + QStringLiteral("32.3"), + QStringLiteral("feet"), + QStringLiteral("right"), + QStringLiteral("no"), + QStringLiteral("wrong"), + QStringLiteral("xx"), + QStringLiteral("txt"), + } + << nofail; + /* TODO use later + QTest::addRow("wordboundaries-doubles") + << QStringLiteral("The quick (\"brown\") 'fox' can't jump the 32.3 feet, right or wrong? No-Wrong;xx.txt") + << QStringList{ + QStringLiteral("the"), + QStringLiteral("quick"), + QStringLiteral("brown"), + QStringLiteral("fox"), + QStringLiteral("can't"), + QStringLiteral("jump"), + QStringLiteral("32.3"), + QStringLiteral("feet"), + QStringLiteral("right"), + QStringLiteral("no"), + QStringLiteral("wrong"), + QStringLiteral("xx"), + QStringLiteral("txt"), + } + << nofail; + */ + QTest::addRow("_prefix") + << QString::fromLatin1("_plant") + << QStringList{ + QStringLiteral("plant") + } + << nofail; + QTest::addRow("underscore") + << QString::fromLatin1("Hello_Howdy") + << QStringList{ + QStringLiteral("hello"), + QStringLiteral("howdy") + } + << nofail; + QTest::addRow("accent") + << QStringLiteral("Como está Kûg") + << QStringList{ + QStringLiteral("como"), + QStringLiteral("esta"), + QStringLiteral("kug") + } + << nofail; +//FIXME: Make it compile +/* + QString str2 = QLatin1Literal("ma") + QChar(0xfb00) + QStringLiteral("ab"); + QTest::addRow("toUtf8") + << QLatin1Literal("maffab") + // The 0xfb00 corresponds to U+FB00 which is a 'ff' + << QStringList{str2} + << false; +*/ + QTest::addRow("fromUtf8") + // This string is unicode mathematical italic "Hedge" + << QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92") + << QStringList{QStringLiteral("hedge")} + << nofail; + QTest::addRow("email") + << QString::fromLatin1("me@vhanda.in") + << QStringList{ + QStringLiteral("me"), + QStringLiteral("vhanda"), + QStringLiteral("in") + } + << nofail; + + // Taken from https://www.coscom.co.jp/hiragana-katakana/kana_readsentences/hiragana201.html + QTest::addRow("hiragana") + << QStringLiteral("やましたさんは いま うちに います。") + << QStringList{ + QStringLiteral("やましたさんは"), + QStringLiteral("いま"), + QStringLiteral("うちに"), + QStringLiteral("います") + } + << QStringLiteral("hiragana not implemented"); + QTest::addRow("chinese") + << QStringLiteral("苦橙可用作提煉精油") + << QStringList{ + QStringLiteral("苦"), + QStringLiteral("橙"), + QStringLiteral("可"), + QStringLiteral("用"), + QStringLiteral("作"), + QStringLiteral("提"), + QStringLiteral("煉"), + QStringLiteral("精"), + QStringLiteral("油") + } + << QStringLiteral("Chinese not implemented"); + QTest::addRow("arabic") + << QStringLiteral("شجيرة أو شجرة يصل ارتفاعها إلى عشرة أمتار") + << QStringList{ + QStringLiteral("شجيرة"), + QStringLiteral("او"), // hamza removed from elif + QStringLiteral("شجرة"), + QStringLiteral("يصل"), + QStringLiteral("ارتفاعها"), + QStringLiteral("الى"), //hamza removed from elif + QStringLiteral("عشرة"), + QStringLiteral("امتار"), //hamza removed from elif + } + << nofail; +} + +void TermGeneratorTestUTF::testTermList() +{ + QFETCH(QString, phrase); + QFETCH(QStringList, terms); + QFETCH(QString, failmessage); + + Document doc; + TermGenerator termGen(&doc); + + const QStringList result = termGen.termList(phrase); + qDebug() << "phrase" << phrase; + qDebug() << "terms " << terms.join(QLatin1Literal(" / ")); + qDebug() << "result" << result.join(QLatin1Literal(" / ")); + + if (failmessage.isEmpty()) { + QCOMPARE(result, terms); + } else { + QEXPECT_FAIL("", qPrintable(failmessage), Continue); + QCOMPARE(result, terms); + } +} + + +// other options: QTEST_GUILESS_MAIN, QTEST_APPLESS_MAIN +QTEST_GUILESS_MAIN(TermGeneratorTestUTF) + + diff --git a/src/engine/document.h b/src/engine/document.h --- a/src/engine/document.h +++ b/src/engine/document.h @@ -96,6 +96,7 @@ friend class WriteTransaction; friend class TermGeneratorTest; + friend class TermGeneratorTestUTF; }; inline QDebug operator<<(QDebug dbg, const Document &doc) {