diff --git a/autotests/unit/engine/termgeneratortestutf.cpp b/autotests/unit/engine/termgeneratortestutf.cpp --- a/autotests/unit/engine/termgeneratortestutf.cpp +++ b/autotests/unit/engine/termgeneratortestutf.cpp @@ -253,7 +253,7 @@ QStringLiteral("精"), QStringLiteral("油") } - << QStringLiteral("Chinese not implemented"); + << nofail; QTest::addRow("chin+lat+chin") << QStringLiteral("苦橙Big brown可用") << QStringList{ @@ -264,7 +264,7 @@ QStringLiteral("可"), QStringLiteral("用") } - << QStringLiteral("Chinese not implemented"); + << nofail; QTest::addRow("chin+lat+chin1") << QStringLiteral("苦橙可Big brown用") << QStringList{ @@ -275,7 +275,7 @@ QStringLiteral("brown"), QStringLiteral("用") } - << QStringLiteral("Chinese not implemented"); + << nofail; // The word boundary 'Chicago' + '에' is should be detected by QTextBoundaryFinder but isn't. QTest::addRow("korean+latin") @@ -309,6 +309,38 @@ QStringLiteral("امتار"), //hamza removed from elif } << nofail; + /* + * Only to check if surrogate pairs are processed at all + */ + auto randomSurrogatePairs = QVector{ + // Random CJK Unified Ideographs Extension B + 0x00020000, + 0x0002040D, // 𠐍 + // First and last of CJK Compatibility Ideographs Supplement + 0x0002F800, + 0x0002FA1D, + }; + QStringList expect; + for (const uint cp : randomSurrogatePairs) { + expect << QString::fromUcs4(&cp, 1); + } + auto phrase = QString::fromUcs4(randomSurrogatePairs.data(), randomSurrogatePairs.count()); + QCOMPARE(randomSurrogatePairs.count() * 2, phrase.count()); + QTest::addRow("surrogates") + << phrase + << expect + << nofail; + expect.clear(); + randomSurrogatePairs.append(0x0002A6FF); // Unassigned code point + phrase = QString::fromUcs4(randomSurrogatePairs.data(), randomSurrogatePairs.count()); + for (const uint cp : randomSurrogatePairs) { + expect << QString::fromUcs4(&cp, 1); + } + QTest::addRow("unassigned cp") + << phrase + << expect + << QStringLiteral("undefined code point"); + } @@ -322,6 +354,10 @@ TermGenerator 
termGen(&doc); const QStringList result = termGen.termList(phrase); + qDebug() << "phrase" << phrase; + qDebug() << "terms " << terms.join(QLatin1Literal(" / ")); + qDebug() << "result" << result.join(QLatin1Literal(" / ")); + if (failmessage.isEmpty()) { QCOMPARE(result, terms); } else { @@ -445,7 +481,7 @@ QByteArray("苦"), } << toVecVec({5, 3, 6, 2, 9, 7, 4, 8, 1}) - << QStringLiteral("Chinese not implemented"); + << nofail; QTest::addRow("chinese+latin") << QStringLiteral("苦橙可Big brown用") << QList{ @@ -456,8 +492,8 @@ QByteArray("用"), QByteArray("苦"), } - << toVecVec({6, 4, 1, 2, 5}) - << QStringLiteral("Chinese not implemented"); + << toVecVec({4, 5, 3, 2, 6, 1}) + << nofail; // The word boundary 'Chicago' + '에' is should be detected by QTextBoundaryFinder but isn't. QTest::addRow("korean+latin") @@ -509,6 +545,9 @@ QList result = allWords(doc); + qDebug() << "phrase" << phrase; + qDebug() << "terms " << qPrintable(terms.join(" + ")); + qDebug() << "result" << qPrintable(result.join(" + ")); if (failmessage.isEmpty()) { QCOMPARE(result, terms); for (int i = 0; i < positions.count(); i++) { diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -25,6 +25,7 @@ writetransaction.cpp global.cpp fsutils.cpp + characterrangescjk.cpp ) add_library(KF5BalooEngine ${BALOO_ENGINE_SRCS}) diff --git a/src/engine/characterrangescjk.h b/src/engine/characterrangescjk.h new file mode 100644 --- /dev/null +++ b/src/engine/characterrangescjk.h @@ -0,0 +1,53 @@ +/* + * This file is part of the KDE Baloo project. + * Copyright 2018 Michael Heidelbach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License or (at your option) version 3 or any later version + * accepted by the membership of KDE e.V. 
(or its successor approved + * by the membership of KDE e.V.), which shall act as a proxy + * defined in Section 14 of version 3 of the license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef CHARACTERRANGESCJK_H +#define CHARACTERRANGESCJK_H + +#include + +namespace Baloo +{ + +/** + * Static helpers that turn CJK logographs into search terms, + * one term per logograph (see termsFromCJK()) +*/ +class CjkCharacters { +public: + struct CjkResult { + /// Search terms found, one per logograph (a single code point each), in text order + QStringList terms; + /// Number of string positions covered by \a terms. Larger than \a terms.count() when surrogate pairs occur + int positionsProcessed = 0; + }; + /** + * Collects the run of CJK logographs in \p text that begins at index \p start, stopping at the first character of another script + * \return the terms found and the positions they cover; no terms and 0 positions if no logograph is found at \p start + * */ + static const CjkResult termsFromCJK(const QString& text, const int start = 0); +private: + static const QVector m_logographs; +}; + +} + +#endif // CHARACTERRANGESCJK_H diff --git a/src/engine/characterrangescjk.cpp b/src/engine/characterrangescjk.cpp new file mode 100644 --- /dev/null +++ b/src/engine/characterrangescjk.cpp @@ -0,0 +1,62 @@ +/* + * This file is part of the KDE Baloo project. + * Copyright 2018 Michael Heidelbach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License or (at your option) version 3 or any later version + * accepted by the membership of KDE e.V. (or its successor approved + * by the membership of KDE e.V.), which shall act as a proxy + * defined in Section 14 of version 3 of the license. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include "characterrangescjk.h"
+#include 
+using namespace Baloo;
+
+// TODO: Remove this vector if QChar::Script_Han is the only
+// applicable script
+const QVector CjkCharacters::m_logographs = {
+    QChar::Script_Han
+};
+
+// Collects the run of logographs in text beginning at index start; one term per code point.
+const CjkCharacters::CjkResult CjkCharacters::termsFromCJK(const QString& text, const int start) {
+    CjkResult result;
+    const int length = text.length();
+    for (int i = start; i < length; i++) {
+        // Surrogate code units report Script_Unknown; let them through and classify the combined code point below.
+        uint ucs4 = text.at(i).unicode();
+        const QChar::Script script = QChar::script(ucs4);
+        if (script == QChar::Script_Unknown || m_logographs.contains(script)) {
+            int processed = 0;
+            // Combine well-formed pairs only: a lone surrogate stays Script_Unknown and ends the run below.
+            if (i < length - 1 && QChar::isHighSurrogate(ucs4)
+                && QChar::isLowSurrogate(text.at(i + 1).unicode())) {
+                processed++;
+                ucs4 = QChar::surrogateToUcs4(ucs4, text.at(++i).unicode());
+            }
+            if (!m_logographs.contains(QChar::script(ucs4))) {
+                break;
+            }
+            // NOTE(review): non-letters of a logographic script are skipped without being counted in positionsProcessed - confirm callers tolerate this.
+            if (QChar::isLetter(ucs4)) {
+                processed++;
+                result.terms << QString::fromUcs4(&ucs4, 1);
+                result.positionsProcessed += processed;
+            }
+        } else {
+            break;
+        }
+    }
+    return result;
+}
diff --git a/src/engine/queryparser.cpp b/src/engine/queryparser.cpp
--- a/src/engine/queryparser.cpp
+++ b/src/engine/queryparser.cpp
@@ -20,6 +20,7 @@
 
 #include "queryparser.h"
 #include "enginequery.h"
+#include "characterrangescjk.h"
 
 #include 
 #include 
@@ -159,6 +160,11 @@
         phraseQueries.clear();
     }
 
+    const auto& cjkTerms = CjkCharacters::termsFromCJK(text_);
+    for (const auto& cjkTerm : cjkTerms.terms) {
+        queries << EngineQuery(cjkTerm.toUtf8(), EngineQuery::StartsWith);
+    }
+
     if
(queries.size() == 1) {
         return queries.first();
     }
diff --git a/src/engine/termgenerator.h b/src/engine/termgenerator.h
--- a/src/engine/termgenerator.h
+++ b/src/engine/termgenerator.h
@@ -52,6 +52,7 @@
     // Trim all terms to this size
     const static int maxTermSize = 25;
 private:
+    const QString static cleanString(const QString& text);
     Document* m_doc;
     int m_position;
 };
diff --git a/src/engine/termgenerator.cpp b/src/engine/termgenerator.cpp
--- a/src/engine/termgenerator.cpp
+++ b/src/engine/termgenerator.cpp
@@ -19,6 +19,7 @@
  */
 
 #include "termgenerator.h"
+#include "characterrangescjk.h"
 #include "document.h"
 
 #include 
@@ -37,46 +38,73 @@
     indexText(text, QByteArray(), wdfInc);
 }
 
+/**
+ * Normalizes \p text for indexing: decomposes it (NFKD), lower-cases it,
+ * drops all combining marks and recomposes the remainder (NFKC).
+ */
+const QString TermGenerator::cleanString(const QString& text)
+{
+
+    // Remove all accents. It is important to call toLower after normalization,
+    // since some exotic unicode symbols can remain uppercase
+    const QString denormalized = text.normalized(QString::NormalizationForm_KD).toLower();
+
+    QString cleanedString;
+    cleanedString.reserve(denormalized.size());
+    for (const QChar& ch : denormalized) {
+        auto cat = ch.category();
+        if (cat != QChar::Mark_NonSpacing && cat != QChar::Mark_SpacingCombining && cat != QChar::Mark_Enclosing) {
+            cleanedString.append(ch);
+        }
+    }
+
+    return cleanedString.normalized(QString::NormalizationForm_KC);
+}
+
+/**
+ * Splits \p text_ into terms. Underscores count as spaces, every word item is passed through
+ * cleanString(), and CJK logographs found at break opportunities are added as one term each.
+ */
 QStringList TermGenerator::termList(const QString& text_)
 {
     QString text(text_);
     text.replace('_', ' ');
 
     int start = 0;
-    int end = 0;
-
     QStringList list;
+
     QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text);
     for (; bf.position() != -1; bf.toNextBoundary()) {
         if (bf.boundaryReasons() & QTextBoundaryFinder::StartOfItem) {
             start = bf.position();
-            continue;
-        }
-        else if (bf.boundaryReasons() & QTextBoundaryFinder::EndOfItem) {
-            end = bf.position();
-
-            QString str = text.mid(start, end - start);
-
-            // Remove all accents. It is important to call toLower after normalization,
-            // since some exotic unicode symbols can remain uppercase
-            const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower();
-
-            QString cleanString;
-            cleanString.reserve(denormalized.size());
-            Q_FOREACH (const QChar& ch, denormalized) {
-                auto cat = ch.category();
-                if (cat != QChar::Mark_NonSpacing && cat != QChar::Mark_SpacingCombining && cat != QChar::Mark_Enclosing) {
-                    cleanString.append(ch);
-                }
-            }
-
-            str = cleanString.normalized(QString::NormalizationForm_KC);
+        } else if (bf.boundaryReasons() & QTextBoundaryFinder::EndOfItem) {
+            const QString str = TermGenerator::cleanString(text.mid(start, bf.position() - start));
             if (!str.isEmpty()) {
                 list << str;
             }
+            if (bf.boundaryReasons() & QTextBoundaryFinder::BreakOpportunity) {
+                // A logograph may follow the word without a separator; only the first one is taken here.
+                // NOTE(review): the rest of the run is presumably collected at the next boundary - confirm.
+                const auto cjk = CjkCharacters::termsFromCJK(text, bf.position());
+                if (!cjk.terms.isEmpty()) {
+                    list << cjk.terms.first();
+                }
+            }
+        } else if (bf.boundaryReasons() & QTextBoundaryFinder::BreakOpportunity) {
+            const int runStart = bf.position();
+            const auto cjk = CjkCharacters::termsFromCJK(text, runStart);
+            if (!cjk.terms.isEmpty()) {
+                list += cjk.terms;
+                // Skip the boundaries inside the run just consumed so its logographs are not collected again.
+                // The run is measured from this boundary, not from the last word start, which may lie further back.
+                const int runEnd = runStart + cjk.positionsProcessed;
+                while (bf.position() != -1 && bf.position() < runEnd) {
+                    bf.toNextBoundary();
+                }
+                start = bf.position();
+            }
         }
     }
-
     return list;
 }
 