Lower-case terms after NFKD normalization instead of before it, in both the
query parser and the term generator, and add a regression test for each.

NOTE(review): line breaks, indentation and blank context lines in this patch
were reconstructed from the hunk line counts of a whitespace-collapsed copy.
Verify against the target tree, or apply with whitespace tolerance
(e.g. `git apply --ignore-whitespace`).

diff --git a/autotests/unit/engine/queryparsertest.cpp b/autotests/unit/engine/queryparsertest.cpp
--- a/autotests/unit/engine/queryparsertest.cpp
+++ b/autotests/unit/engine/queryparsertest.cpp
@@ -40,6 +40,7 @@
     void testAccentSearch();
     void testUnderscoreSplitting();
     void testAutoExpand();
+    void testUnicodeLowering();
 };
 
 void QueryParserTest::testSinglePrefixWord()
@@ -218,6 +219,19 @@
     }
 }
 
+void QueryParserTest::testUnicodeLowering()
+{
+    // This string is unicode mathematical italic "Hedge". Its capital H
+    // (U+1D43B) only becomes a plain letter after NFKD normalization, so the
+    // parser must lower-case after normalizing to produce "hedge".
+    QString str = QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92");
+
+    QueryParser parser;
+    EngineQuery query = parser.parseQuery(str);
+    EngineQuery expected = EngineQuery("hedge", EngineQuery::StartsWith, 1);
+    QCOMPARE(query, expected);
+}
+
 QTEST_MAIN(QueryParserTest)
 
 #include "queryparsertest.moc"
diff --git a/autotests/unit/engine/termgeneratortest.cpp b/autotests/unit/engine/termgeneratortest.cpp
--- a/autotests/unit/engine/termgeneratortest.cpp
+++ b/autotests/unit/engine/termgeneratortest.cpp
@@ -38,6 +38,7 @@
     void testUnderscore_splitting();
     void testAccetCharacters();
     void testUnicodeCompatibleComposition();
+    void testUnicodeLowering();
     void testEmails();
     void testWordPositions();
 
@@ -130,6 +131,22 @@
     QCOMPARE(str.toUtf8(), output);
 }
 
+void TermGeneratorTest::testUnicodeLowering()
+{
+    // This string is unicode mathematical italic "Hedge". Its capital H
+    // (U+1D43B) only becomes a plain letter after NFKD normalization, so the
+    // indexed term must be lower-cased after normalizing to yield "hedge".
+    QString str = QString::fromUtf8("\xF0\x9D\x90\xBB\xF0\x9D\x91\x92\xF0\x9D\x91\x91\xF0\x9D\x91\x94\xF0\x9D\x91\x92");
+
+    Document doc;
+    TermGenerator termGen(&doc);
+    termGen.indexText(str);
+
+    QList<QByteArray> words = allWords(doc);
+
+    QCOMPARE(words, {QByteArray("hedge")});
+}
+
 void TermGeneratorTest::testEmails()
 {
     QString str = QString::fromLatin1("me@vhanda.in");
diff --git a/src/engine/queryparser.cpp b/src/engine/queryparser.cpp
--- a/src/engine/queryparser.cpp
+++ b/src/engine/queryparser.cpp
@@ -112,11 +112,9 @@
 
             QString str = text.mid(start, end - start);
 
-            // Get the string ready for saving
-            str = str.toLower();
-
-            // Remove all accents
-            const QString denormalized = str.normalized(QString::NormalizationForm_KD);
+            // Remove all accents and lower-case the result. toLower() must run after
+            // normalization, since some exotic unicode symbols can remain uppercase otherwise
+            const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower();
             QString cleanString;
             Q_FOREACH (const QChar& ch, denormalized) {
                 auto cat = ch.category();
diff --git a/src/engine/termgenerator.cpp b/src/engine/termgenerator.cpp
--- a/src/engine/termgenerator.cpp
+++ b/src/engine/termgenerator.cpp
@@ -57,11 +57,9 @@
 
             QString str = text.mid(start, end - start);
 
-            // Get the string ready for saving
-            str = str.toLower();
-
-            // Remove all accents
-            const QString denormalized = str.normalized(QString::NormalizationForm_KD);
+            // Remove all accents. It is important to call toLower after normalization,
+            // since some exotic unicode symbols can remain uppercase
+            const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower();
 
             QString cleanString;
             cleanString.reserve(denormalized.size());