diff --git a/autotests/unit/engine/CMakeLists.txt b/autotests/unit/engine/CMakeLists.txt --- a/autotests/unit/engine/CMakeLists.txt +++ b/autotests/unit/engine/CMakeLists.txt @@ -20,6 +20,7 @@ mtimedbtest termgeneratortest + termgeneratortestutf queryparsertest # Query diff --git a/autotests/unit/engine/termgeneratortestutf.h b/autotests/unit/engine/termgeneratortestutf.h new file mode 100644 --- /dev/null +++ b/autotests/unit/engine/termgeneratortestutf.h @@ -0,0 +1,58 @@ +/* + * This file is part of the KDE Baloo project. + * Copyright (C) 2014-2015 Vishesh Handa + * Copyright (C) 2018 Michael Heidelbach + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef TERMGENERATORTESTUTF_H
+#define TERMGENERATORTESTUTF_H
+
+#include "document.h"
+
+#include <QObject>
+namespace Baloo
+{
+/** Fixed-string and data-driven tests of TermGenerator on Latin, accented, CJK, Korean and Arabic text. */
+class TermGeneratorTestUTF : public QObject
+{
+    Q_OBJECT
+private Q_SLOTS:
+    void testWordBoundaries();
+    void testUnderscoreWord();
+    void testUnderscore_splitting();
+    void testAccetCharacters();
+    void testUnicodeCompatibleComposition();
+    void testUnicodeLowering();
+    void testEmails();
+    void testWordPositions();
+    // Data-driven tests; each *_data() slot supplies the rows for the slot of the same name.
+    void testTermList();
+    void testTermList_data();
+    void testDocumentTerms();
+    void testDocumentTerms_data();
+
+private:
+    // Returns the keys of doc.m_terms, i.e. every term that was indexed into doc.
+    const QList<QByteArray> allWords(const Document& doc)
+    {
+        return doc.m_terms.keys();
+    }
+};
+
+}
+#endif // TERMGENERATORTESTUTF_H
diff --git a/autotests/unit/engine/termgeneratortestutf.cpp b/autotests/unit/engine/termgeneratortestutf.cpp
new file mode 100644
--- /dev/null
+++ b/autotests/unit/engine/termgeneratortestutf.cpp
@@ -0,0 +1,542 @@
+/*
+ * This file is part of the KDE Baloo project.
+ * Copyright (C) 2014-2015 Vishesh Handa
+ * Copyright (C) 2018 Michael Heidelbach
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "termgeneratortestutf.h"
+#include "termgenerator.h"
+#include "document.h"
+
+#include <QTest>
+#include <QVector>
+
+using namespace Baloo;
+
+// Quotes, brackets, '-', ';', '?' and the '.' in "xx.txt" split terms; "can't" and "32.3" stay whole.
+void TermGeneratorTestUTF::testWordBoundaries()
+{
+    QString str = QString::fromLatin1("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QList<QByteArray> expectedWords;
+    expectedWords << QByteArray("32.3") << QByteArray("brown") << QByteArray("can't") << QByteArray("feet") << QByteArray("fox") << QByteArray("jump")
+                  << QByteArray("no") << QByteArray("quick") << QByteArray("right") << QByteArray("the") << QByteArray("txt") << QByteArray("wrong")
+                  << QByteArray("xx");
+
+    QCOMPARE(words, expectedWords);
+}
+
+// A leading underscore is not part of the indexed term.
+void TermGeneratorTestUTF::testUnderscoreWord()
+{
+    QString str = QString::fromLatin1("_plant");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QList<QByteArray> expectedWords;
+    expectedWords << QByteArray("plant");
+
+    QCOMPARE(words, expectedWords);
+}
+
+// An underscore between two words separates them into two lower-cased terms.
+void TermGeneratorTestUTF::testUnderscore_splitting()
+{
+    QString str = QString::fromLatin1("Hello_Howdy");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QList<QByteArray> expectedWords;
+    expectedWords << QByteArray("hello") << QByteArray("howdy");
+
+    QCOMPARE(words, expectedWords);
+}
+
+// Accented Latin letters are indexed as their unaccented, lower-cased base letters.
+void TermGeneratorTestUTF::testAccetCharacters()
+{
+    QString str = QStringLiteral("Como está Kûg");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QList<QByteArray> expectedWords;
+    expectedWords << QByteArray("como") << QByteArray("esta") << QByteArray("kug");
+
+    QCOMPARE(words, expectedWords);
+}
+
+// The single-character ligature U+FB00 must be indexed as the two plain letters "ff".
+void TermGeneratorTestUTF::testUnicodeCompatibleComposition()
+{
+    // The 0xfb00 corresponds to U+FB00 which is a 'ff'
+    QString str = QLatin1String("maffab");
+    QString str2 = QLatin1String("ma") + QChar(0xfb00) + QStringLiteral("ab");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str2);
+
+    QList<QByteArray> words = allWords(doc);
+    QCOMPARE(words.size(), 1);
+
+    QByteArray output = words.first();
+    QCOMPARE(str.toUtf8(), output);
+}
+
+// Mathematical italic letters must be indexed as plain lower-case ASCII letters.
+void TermGeneratorTestUTF::testUnicodeLowering()
+{
+    // This string is unicode mathematical italic "Hedge"
+    QString str = QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QCOMPARE(words, {QByteArray("hedge")});
+}
+
+// An e-mail address is split at '@' and '.' into separate terms.
+void TermGeneratorTestUTF::testEmails()
+{
+    QString str = QString::fromLatin1("me@vhanda.in");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QList<QByteArray> expectedWords;
+    expectedWords << QByteArray("in") << QByteArray("me") << QByteArray("vhanda");
+
+    QCOMPARE(words, expectedWords);
+}
+
+// Each term records the 1-based word position of every one of its occurrences ("hi" occurs at 2 and 4).
+void TermGeneratorTestUTF::testWordPositions()
+{
+    Document doc;
+    TermGenerator termGen(&doc);
+
+    QString str = QString::fromLatin1("Hello hi how hi");
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QList<QByteArray> expectedWords;
+    expectedWords << QByteArray("hello") << QByteArray("hi") << QByteArray("how");
+    QCOMPARE(words, expectedWords);
+
+    QVector<uint> posInfo1 = doc.m_terms.value("hello").positions;
+    QCOMPARE(posInfo1, QVector<uint>() << 1);
+
+    QVector<uint> posInfo2 = doc.m_terms.value("hi").positions;
+    QCOMPARE(posInfo2, QVector<uint>() << 2 << 4);
+
+    QVector<uint> posInfo3 = doc.m_terms.value("how").positions;
+    QCOMPARE(posInfo3, QVector<uint>() << 3);
+}
+
+// Rows: phrase, expected terms in phrase order, and an expected-failure message (empty when the row must pass).
+void TermGeneratorTestUTF::testTermList_data()
+{
+    QTest::addColumn<QString>("phrase");
+    QTest::addColumn<QStringList>("terms");
+    QTest::addColumn<QString>("failmessage");
+    const QString nofail = QString();
+
+    QTest::addRow("wordboundaries")
+        << QStringLiteral("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt")
+        << QStringList{
+            QStringLiteral("the"),
+            QStringLiteral("quick"),
+            QStringLiteral("brown"),
+            QStringLiteral("fox"),
+            QStringLiteral("can't"),
+            QStringLiteral("jump"),
+            QStringLiteral("32.3"),
+            QStringLiteral("feet"),
+            QStringLiteral("right"),
+            QStringLiteral("no"),
+            QStringLiteral("wrong"),
+            QStringLiteral("xx"),
+            QStringLiteral("txt"),
+        }
+        << nofail;
+    QTest::addRow("_prefix")
+        << QString::fromLatin1("_plant")
+        << QStringList{
+            QStringLiteral("plant")
+        }
+        << nofail;
+    QTest::addRow("underscore")
+        << QString::fromLatin1("Hello_Howdy")
+        << QStringList{
+            QStringLiteral("hello"),
+            QStringLiteral("howdy")
+        }
+        << nofail;
+    QTest::addRow("accent")
+        << QStringLiteral("Como está Kûg")
+        << QStringList{
+            QStringLiteral("como"),
+            QStringLiteral("esta"),
+            QStringLiteral("kug")
+        }
+        << nofail;
+    QTest::addRow("ligature")
+        // The 0xfb00 corresponds to U+FB00 which is ligature 'ff'
+        << QStringLiteral("ma%1ab").arg(QChar(0xfb00))
+        << QStringList{QStringLiteral("maffab")}
+        << nofail;
+    QTest::addRow("fromUtf8")
+        // This string is unicode mathematical italic "Hedge"
+        << QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92")
+        << QStringList{QStringLiteral("hedge")}
+        << nofail;
+    QTest::addRow("email")
+        << QString::fromLatin1("me@vhanda.in")
+        << QStringList{
+            QStringLiteral("me"),
+            QStringLiteral("vhanda"),
+            QStringLiteral("in")
+        }
+        << nofail;
+
+    // Taken from https://www.coscom.co.jp/hiragana-katakana/kana_readsentences/hiragana201.html
+    QTest::addRow("hiragana")
+        << QStringLiteral("やましたさんは いま うちに います。")
+        << QStringList{
+            QStringLiteral("やましたさんは"),
+            QStringLiteral("いま"),
+            QStringLiteral("うちに"),
+            QStringLiteral("います")
+        }
+        << QStringLiteral("Hiragana not implemented");
+    QTest::addRow("chinese")
+        << QStringLiteral("苦橙可用作提煉精油")
+        << QStringList{
+            QStringLiteral("苦"),
+            QStringLiteral("橙"),
+            QStringLiteral("可"),
+            QStringLiteral("用"),
+            QStringLiteral("作"),
+            QStringLiteral("提"),
+            QStringLiteral("煉"),
+            QStringLiteral("精"),
+            QStringLiteral("油")
+        }
+        << QStringLiteral("Chinese not implemented");
+    QTest::addRow("chin+lat+chin")
+        << QStringLiteral("苦橙Big brown可用")
+        << QStringList{
+            QStringLiteral("苦"),
+            QStringLiteral("橙"),
+            QStringLiteral("big"),
+            QStringLiteral("brown"),
+            QStringLiteral("可"),
+            QStringLiteral("用")
+        }
+        << QStringLiteral("Chinese not implemented");
+    QTest::addRow("chin+lat+chin1")
+        << QStringLiteral("苦橙可Big brown用")
+        << QStringList{
+            QStringLiteral("苦"),
+            QStringLiteral("橙"),
+            QStringLiteral("可"),
+            QStringLiteral("big"),
+            QStringLiteral("brown"),
+            QStringLiteral("用")
+        }
+        << QStringLiteral("Chinese not implemented");
+    // The word boundary 'Chicago' + '에' should be detected by QTextBoundaryFinder but isn't.
+
+    QTest::addRow("korean+latin")
+        // Following empty lines prevent
+        // overlapping of adjacent lines in IDE
+        << QStringLiteral("나는 Chicago에 산다") // "I live in Chicago"
+
+
+        << QStringList{
+            QStringLiteral("나"),
+
+            QStringLiteral("는"),
+
+            QStringLiteral("chicago"),
+            QStringLiteral("에"),
+
+            QStringLiteral("산다"),
+
+        }
+        << QStringLiteral("Korean not implemented");
+    QTest::addRow("arabic")
+        << QStringLiteral("شجيرة أو شجرة يصل ارتفاعها إلى عشرة أمتار")
+        << QStringList{
+            QStringLiteral("شجيرة"),
+            QStringLiteral("او"), // hamza removed from elif
+            QStringLiteral("شجرة"),
+            QStringLiteral("يصل"),
+            QStringLiteral("ارتفاعها"),
+            QStringLiteral("الى"), //hamza removed from elif
+            QStringLiteral("عشرة"),
+            QStringLiteral("امتار"), //hamza removed from elif
+        }
+        << nofail;
+
+}
+
+// Compares TermGenerator::termList(phrase) with the row's terms; a non-empty failmessage marks the comparison as an expected failure.
+void TermGeneratorTestUTF::testTermList()
+{
+    QFETCH(QString, phrase);
+    QFETCH(QStringList, terms);
+    QFETCH(QString, failmessage);
+
+    Document doc;
+    TermGenerator termGen(&doc);
+
+    const QStringList result = termGen.termList(phrase);
+    if (!failmessage.isEmpty()) {
+        QEXPECT_FAIL("", qPrintable(failmessage), Continue);
+    }
+    QCOMPARE(result, terms);
+}
+
+// Rows: phrase, expected terms as returned by allWords(), one position list per term (same order), expected-failure message.
+void TermGeneratorTestUTF::testDocumentTerms_data()
+{
+    QTest::addColumn<QString>("phrase");
+    QTest::addColumn<QList<QByteArray>>("terms");
+    QTest::addColumn<QVector<QVector<uint>>>("positions");
+    QTest::addColumn<QString>("failmessage");
+    const QString nofail = QString();
+    auto toVecVec = [](const QVector<uint> &vec) { // wraps each position in its own list: one occurrence per term
+        QVector<QVector<uint>> result;
+        for (const uint pos : vec) {
+            result << QVector<uint>{pos};
+        }
+        return result;
+    };
+
+    QTest::addRow("wordboundaries")
+        << QStringLiteral("The quick (\"brown\") 'fox' can't jump 32.3 feet, right? No-Wrong;xx.txt")
+        << QList<QByteArray>{
+            QByteArray("32.3"),
+            QByteArray("brown"),
+            QByteArray("can't"),
+            QByteArray("feet"),
+            QByteArray("fox"),
+            QByteArray("jump"),
+            QByteArray("no"),
+            QByteArray("quick"),
+            QByteArray("right"),
+            QByteArray("the"),
+            QByteArray("txt"),
+            QByteArray("wrong"),
+            QByteArray("xx"),
+        }
+        << toVecVec({7, 3, 5, 8, 4, 6, 10, 2, 9, 1, 13, 11, 12})
+        << nofail;
+    QTest::addRow("positions")
+        << QStringLiteral("Hello hi how hi")
+        << QList<QByteArray>{
+            QByteArray("hello"),
+            QByteArray("hi"),
+            QByteArray("how"),
+        }
+        << QVector<QVector<uint>>{{1}, {2, 4}, {3}}
+        << nofail;
+    QTest::addRow("_prefix")
+        << QString::fromLatin1("_plant")
+        << QList<QByteArray>{
+            QByteArray("plant")
+        }
+        << toVecVec({1})
+        << nofail;
+    QTest::addRow("underscore")
+        << QString::fromLatin1("Hello_Howdy")
+        << QList<QByteArray>{
+            QByteArray("hello"),
+            QByteArray("howdy")
+        }
+        << toVecVec({1, 2})
+        << nofail;
+    QTest::addRow("accent")
+        << QStringLiteral("Como está Kûg")
+        << QList<QByteArray>{
+            QByteArray("como"),
+            QByteArray("esta"),
+            QByteArray("kug")
+        }
+        << toVecVec({1, 2, 3})
+        << nofail;
+    QTest::addRow("ligature")
+        // The 0xfb00 corresponds to U+FB00 which is ligature 'ff'
+        << QStringLiteral("ma%1ab").arg(QChar(0xfb00))
+        << QList<QByteArray>{QByteArray("maffab")}
+        << toVecVec({1})
+        << nofail;
+    QTest::addRow("fromUtf8")
+        // This string is unicode mathematical italic "Hedge"
+        << QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92")
+        << QList<QByteArray>{QByteArray("hedge")}
+        << toVecVec({1})
+        << nofail;
+    QTest::addRow("email")
+        << QString::fromLatin1("me@vhanda.in")
+        << QList<QByteArray>{
+            QByteArray("in"),
+            QByteArray("me"),
+            QByteArray("vhanda"),
+        }
+        << toVecVec({3, 1, 2})
+
+        << nofail;
+
+    // Taken from https://www.coscom.co.jp/hiragana-katakana/kana_readsentences/hiragana201.html
+    // NOTE(review): terms are in phrase order, but the passing rows list them sorted (allWords() order) - re-sort terms and positions before enabling this row; TODO confirm.
+    QTest::addRow("hiragana")
+        << QStringLiteral("やましたさんは いま うちに います。")
+        << QList<QByteArray>{
+            QByteArray("やましたさんは"),
+            QByteArray("いま"),
+            QByteArray("うちに"),
+            QByteArray("います")
+        }
+        << toVecVec({1, 2, 3, 4})
+        << QStringLiteral("Hiragana not implemented");
+    QTest::addRow("chinese")
+        << QStringLiteral("苦橙可用作提煉精油")
+        << QList<QByteArray>{
+            QByteArray("作"),
+            QByteArray("可"),
+            QByteArray("提"),
+            QByteArray("橙"),
+            QByteArray("油"),
+            QByteArray("煉"),
+            QByteArray("用"),
+            QByteArray("精"),
+            QByteArray("苦"),
+        }
+        << toVecVec({5, 3, 6, 2, 9, 7, 4, 8, 1})
+        << QStringLiteral("Chinese not implemented");
+    QTest::addRow("chinese+latin")
+        << QStringLiteral("苦橙可Big brown用")
+        << QList<QByteArray>{
+            QByteArray("big"),
+            QByteArray("brown"),
+            QByteArray("可"),
+            QByteArray("橙"),
+            QByteArray("用"),
+            QByteArray("苦"),
+        }
+        << toVecVec({4, 5, 3, 2, 6, 1})
+        << QStringLiteral("Chinese not implemented");
+    // The word boundary 'Chicago' + '에' should be detected by QTextBoundaryFinder but isn't.
+    // NOTE(review): "산다" sorts before "에" byte-wise, so terms (and their positions) likely need re-sorting before this row is enabled; TODO confirm.
+
+    QTest::addRow("korean+latin")
+        // Following empty lines prevent
+        // overlapping of adjacent lines in IDE
+        << QStringLiteral("나는 Chicago에 산다") // "I live in Chicago"
+
+
+        << QList<QByteArray>{
+            QByteArray("chicago"),
+            QByteArray("나"),
+
+            QByteArray("는"),
+
+            QByteArray("에"),
+
+            QByteArray("산다"),
+
+        }
+        << toVecVec({3, 1, 2, 4, 5})
+        << QStringLiteral("Korean not implemented");
+    QTest::addRow("arabic")
+        << QStringLiteral("شجيرة أو شجرة يصل ارتفاعها إلى عشرة أمتار")
+        << QList<QByteArray>{
+            QByteArray("ارتفاعها"),
+            QByteArray("الى"), //hamza removed from elif
+            QByteArray("امتار"), //hamza removed from elif
+            QByteArray("او"), // hamza removed from elif
+            QByteArray("شجرة"),
+            QByteArray("شجيرة"),
+            QByteArray("عشرة"),
+            QByteArray("يصل"),
+        }
+        << toVecVec({5, 6, 8, 2, 3, 1, 7, 4})
+        << nofail;
+
+}
+
+// Indexes the phrase into a Document and checks its terms; positions are only checked for rows without a failmessage.
+void TermGeneratorTestUTF::testDocumentTerms()
+{
+    QFETCH(QString, phrase);
+    QFETCH(QList<QByteArray>, terms);
+    QFETCH(QVector<QVector<uint>>, positions);
+    QFETCH(QString, failmessage);
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(phrase);
+
+    QList<QByteArray> result = allWords(doc);
+
+    if (failmessage.isEmpty()) {
+        QCOMPARE(result, terms);
+        for (int i = 0; i < positions.count(); i++) {
+            const QVector<uint> position = positions[i];
+            QVector<uint> posInfo1 = doc.m_terms.value(terms[i]).positions;
+            QVERIFY2(posInfo1 == position, qPrintable(QStringLiteral("Position #%1: %2 != %3").arg(i)
+                .arg(posInfo1[0])
+                .arg(position[0])
+            ));
+        }
+    } else {
+        QEXPECT_FAIL("", qPrintable(failmessage), Continue);
+        QCOMPARE(result, terms);
+    }
+}
+
+QTEST_GUILESS_MAIN(TermGeneratorTestUTF)
+
+
diff --git a/src/engine/document.h b/src/engine/document.h
--- a/src/engine/document.h
+++ b/src/engine/document.h
@@ -96,6 +96,7 @@
     friend class WriteTransaction;
     friend class TermGeneratorTest;
+    friend class TermGeneratorTestUTF;
 };
 
 inline QDebug operator<<(QDebug dbg, const Document &doc)
 {