Patch: generate a search term for every CJK (Han) letter.

Review notes on this copy of the patch:
 * Line breaks and indentation were lost in the copy this was restored
   from; context-line whitespace is reconstructed and may need adjusting
   before the patch applies.
 * The header names of the unchanged "#include" context lines in
   queryparser.cpp and termgenerator.cpp were lost as well and are left
   as found.

diff --git a/autotests/unit/engine/termgeneratortestutf.cpp b/autotests/unit/engine/termgeneratortestutf.cpp
--- a/autotests/unit/engine/termgeneratortestutf.cpp
+++ b/autotests/unit/engine/termgeneratortestutf.cpp
@@ -277,7 +277,7 @@
             QStringLiteral("精"),
             QStringLiteral("油")
         }
-        << QStringLiteral("Chinese not implemented");
+        << nofail;
     QTest::addRow("arabic")
         << QStringLiteral("شجيرة أو شجرة يصل ارتفاعها إلى عشرة أمتار")
         << QStringList{
diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt
--- a/src/engine/CMakeLists.txt
+++ b/src/engine/CMakeLists.txt
@@ -25,6 +25,7 @@
     writetransaction.cpp
     global.cpp
     fsutils.cpp
+    characterrangescjk.cpp
 )
 
 add_library(KF5BalooEngine ${BALOO_ENGINE_SRCS})
diff --git a/src/engine/characterrangescjk.h b/src/engine/characterrangescjk.h
new file mode 100644
--- /dev/null
+++ b/src/engine/characterrangescjk.h
@@ -0,0 +1,56 @@
+/*
+ * This file is part of the KDE Baloo project.
+ * Copyright 2018 Michael Heidelbach
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License or (at your option) version 3 or any later version
+ * accepted by the membership of KDE e.V. (or its successor approved
+ * by the membership of KDE e.V.), which shall act as a proxy
+ * defined in Section 14 of version 3 of the license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CHARACTERRANGESCJK_H
+#define CHARACTERRANGESCJK_H
+
+#include <QChar>
+#include <QStringList>
+#include <QVector>
+
+namespace Baloo
+{
+
+/**
+ * This class provides tools to generate
+ * search terms for CJK languages.
+ */
+class CjkCharacters {
+public:
+    /**
+     * Returns one search term per CJK letter found in \p text, in
+     * order of appearance. A letter counts as CJK when its Unicode
+     * script is listed in m_graphemeWords (currently only Han).
+     *
+     * The text is examined code point by code point, so a letter
+     * stored as a UTF-16 surrogate pair yields one two-unit term.
+     *
+     * \return the list of terms; empty if \p text has no CJK letters.
+     */
+    static const QStringList termsFromCJK(const QString& text);
+private:
+    /** Scripts whose letters each become a search term of their own. */
+    static const QVector<QChar::Script> m_graphemeWords;
+};
+
+}
+
+#endif // CHARACTERRANGESCJK_H
diff --git a/src/engine/characterrangescjk.cpp b/src/engine/characterrangescjk.cpp
new file mode 100644
--- /dev/null
+++ b/src/engine/characterrangescjk.cpp
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the KDE Baloo project.
+ * Copyright 2018 Michael Heidelbach
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License or (at your option) version 3 or any later version
+ * accepted by the membership of KDE e.V. (or its successor approved
+ * by the membership of KDE e.V.), which shall act as a proxy
+ * defined in Section 14 of version 3 of the license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "characterrangescjk.h"
+
+using namespace Baloo;
+
+// TODO: Remove this vector if QChar::Script_Han is the only
+// applicable script
+const QVector<QChar::Script> CjkCharacters::m_graphemeWords = {
+    QChar::Script_Han
+};
+
+const QStringList CjkCharacters::termsFromCJK(const QString& text) {
+    QStringList result;
+    // Iterate over code points rather than UTF-16 code units: a QChar
+    // cannot hold a character outside the Basic Multilingual Plane, so
+    // storing a decoded surrogate pair in one would truncate it.
+    const auto codePoints = text.toUcs4();
+    for (const uint codePoint : codePoints) {
+        if (QChar::isLetter(codePoint) && m_graphemeWords.contains(QChar::script(codePoint))) {
+            result << QString::fromUcs4(&codePoint, 1);
+        }
+    }
+    return result;
+}
diff --git a/src/engine/queryparser.cpp b/src/engine/queryparser.cpp
--- a/src/engine/queryparser.cpp
+++ b/src/engine/queryparser.cpp
@@ -20,6 +20,7 @@
 
 #include "queryparser.h"
 #include "enginequery.h"
+#include "characterrangescjk.h"
 
 #include
 #include
@@ -158,7 +159,12 @@
         queries << phraseQueries;
         phraseQueries.clear();
     }
-    
+
+    const auto& cjkTerms = CjkCharacters::termsFromCJK(text_);
+    for (const auto& cjkTerm : cjkTerms) {
+        queries << EngineQuery(cjkTerm.toUtf8(), EngineQuery::StartsWith);
+    }
+
     if (queries.size() == 1) {
         return queries.first();
     }
diff --git a/src/engine/termgenerator.cpp b/src/engine/termgenerator.cpp
--- a/src/engine/termgenerator.cpp
+++ b/src/engine/termgenerator.cpp
@@ -19,6 +19,7 @@
  */
 
 #include "termgenerator.h"
+#include "characterrangescjk.h"
 #include "document.h"
 
 #include
@@ -76,7 +77,7 @@
             }
         }
     }
-    
+    list += CjkCharacters::termsFromCJK(text_);
     return list;
 }
 