diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 90077fa..92cc5c5 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -1,37 +1,22 @@ project(sonnetdata) include(ECMMarkNonGuiExecutable) add_executable(parsetrigrams parsetrigrams.cpp) add_executable(gentrigrams gentrigrams.cpp) # Mark it as non-gui so we won't create an app bundle on Mac OS X ecm_mark_nongui_executable(parsetrigrams) ecm_mark_nongui_executable(gentrigrams) TARGET_LINK_LIBRARIES(parsetrigrams PUBLIC Qt5::Core) TARGET_LINK_LIBRARIES(gentrigrams PUBLIC Qt5::Core KF5::SonnetCore) INSTALL(TARGETS parsetrigrams ${KF5_INSTALL_TARGETS_DEFAULT_ARGS}) INSTALL(TARGETS gentrigrams ${KF5_INSTALL_TARGETS_DEFAULT_ARGS}) if(CMAKE_CROSSCOMPILING AND PARSETRIGRAMS_EXECUTABLE) add_executable(KF5::parsetrigrams IMPORTED GLOBAL) set_target_properties(KF5::parsetrigrams PROPERTIES IMPORTED_LOCATION ${PARSETRIGRAMS_EXECUTABLE}) else() add_executable(KF5::parsetrigrams ALIAS parsetrigrams) endif() - -function(create_trigrams_map _target_name _in_DIR _out_FILE) - add_custom_target(trigrams_${_target_name} ALL) - add_custom_command(TARGET trigrams_${_target_name} - COMMAND $ "${_in_DIR}" > "${_out_FILE}" - ) - - add_dependencies(trigrams_${_target_name} parsetrigrams) -endfunction(create_trigrams_map) - -create_trigrams_map(sonnet - ${CMAKE_SOURCE_DIR}/data/trigrams - ${CMAKE_CURRENT_BINARY_DIR}/trigrams.map - ) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/trigrams.map DESTINATION ${KDE_INSTALL_DATADIR_KF5}/sonnet/) diff --git a/data/trigrams.qrc.in b/data/trigrams.qrc.in new file mode 100644 index 0000000..6ca17f1 --- /dev/null +++ b/data/trigrams.qrc.in @@ -0,0 +1,6 @@ + + + +@CMAKE_BINARY_DIR@/data/trigrams.map + + diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index e054472..316b205 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -1,80 +1,87 @@ project(sonnetcore) set(sonnetcore_SRCS loader.cpp client.cpp spellerplugin.cpp speller.cpp settings.cpp backgroundchecker.cpp guesslanguage.cpp textbreaks.cpp tokenizer.cpp languagefilter.cpp ) +# create trigrams file + add trigrams resource +add_custom_command(OUTPUT "${CMAKE_BINARY_DIR}/data/trigrams.map" + DEPENDS parsetrigrams + COMMAND $ "${CMAKE_SOURCE_DIR}/data/trigrams" > "${CMAKE_BINARY_DIR}/data/trigrams.map") +configure_file(${CMAKE_SOURCE_DIR}/data/trigrams.qrc.in ${CMAKE_BINARY_DIR}/data/trigrams.qrc @ONLY) +qt5_add_resources(sonnetcore_SRCS "${CMAKE_BINARY_DIR}/data/trigrams.qrc") + ecm_qt_declare_logging_category(sonnetcore_SRCS HEADER core_debug.h IDENTIFIER SONNET_LOG_CORE CATEGORY_NAME sonnet.core) # Dear packagers, this is just used as an extra search paths for plugins. Don't get your panties in a twist. add_definitions(-DINSTALLATION_PLUGIN_PATH="${CMAKE_INSTALL_PREFIX}/${KDE_INSTALL_PLUGINDIR}") add_library(KF5SonnetCore ${sonnetcore_SRCS}) generate_export_header(KF5SonnetCore BASE_NAME SonnetCore EXPORT_FILE_NAME sonnetcore_export.h) add_library(KF5::SonnetCore ALIAS KF5SonnetCore) ecm_generate_headers(SonnetCore_CamelCase_HEADERS HEADER_NAMES BackgroundChecker Speller GuessLanguage PREFIX Sonnet REQUIRED_HEADERS SonnetCore_HEADERS ) target_link_libraries(KF5SonnetCore PUBLIC Qt5::Core) set_target_properties(KF5SonnetCore PROPERTIES VERSION ${SONNET_VERSION_STRING} SOVERSION ${SONNET_SOVERSION} EXPORT_NAME SonnetCore ) # CMAKE_CURRENT_BINARY_DIR: for camelcase headers and lowercase forwarders target_include_directories(KF5SonnetCore INTERFACE "$") target_include_directories(KF5SonnetCore PUBLIC "$") install(TARGETS KF5SonnetCore EXPORT KF5SonnetTargets ${KF5_INSTALL_TARGETS_DEFAULT_ARGS}) install(FILES ${SonnetCore_CamelCase_HEADERS} DESTINATION ${KDE_INSTALL_INCLUDEDIR_KF5}/SonnetCore/Sonnet COMPONENT Devel) install(FILES ${SonnetCore_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/sonnetcore_export.h DESTINATION ${KDE_INSTALL_INCLUDEDIR_KF5}/SonnetCore/sonnet COMPONENT Devel) if (BUILD_QCH) ecm_add_qch( KF5SonnetCore_QCH NAME SonnetCore BASE_NAME KF5SonnetCore VERSION ${KF5_VERSION} ORG_DOMAIN org.kde SOURCES # using only public headers, to cover only public API ${SonnetCore_HEADERS} LINK_QCHS Qt5Core_QCH INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR} BLANK_MACROS SONNETCORE_EXPORT SONNETCORE_DEPRECATED_EXPORT SONNETCORE_DEPRECATED TAGFILE_INSTALL_DESTINATION ${KDE_INSTALL_QTQCHDIR} QCH_INSTALL_DESTINATION ${KDE_INSTALL_QTQCHDIR} COMPONENT Devel ) endif() include(ECMGeneratePriFile) ecm_generate_pri_file(BASE_NAME SonnetCore LIB_NAME KF5SonnetCore DEPS "core" FILENAME_VAR PRI_FILENAME INCLUDE_INSTALL_DIR ${KDE_INSTALL_INCLUDEDIR_KF5}/SonnetCore) install(FILES ${PRI_FILENAME} DESTINATION ${ECM_MKSPECS_INSTALL_DIR}) diff --git a/src/core/guesslanguage.cpp b/src/core/guesslanguage.cpp index 8ea2458..141036b 100644 --- a/src/core/guesslanguage.cpp +++ b/src/core/guesslanguage.cpp @@ -1,864 +1,853 @@ /* This file is part of the KDE libraries Copyright (c) 2006 Jacob R Rideout Copyright (c) 2009 Jakub Stachowski Copyright (c) 2013 Martin Sandsmark This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include #include "guesslanguage.h" #include "loader_p.h" #include "speller.h" #include "tokenizer_p.h" #include "core_debug.h" #include "spellerplugin_p.h" /* All language tags should be valid according to IETF BCP 47, as codified in RFC 4646. ISO 639-1 codes should be used for the language part except for cases where there exists no code, then 639-3 codes should be used. Country codes should only be used in special cases. Scripts can be differentiated by IANA subtags, available here: http://www.iana.org/assignments/language-subtag-registry The script tags correspond to ISO 15924 An overview of the best practices concerning language tagging is available here: http://www.w3.org/International/articles/language-tags/Overview.en.php lang tags should use underscores (_) rather than hyphens (-) to separate subsections. EXCEPTIONS: For cases of known differences from the above tagging scheme and major spellcheckers such aspell/hunspell/myspell, the scheme used by the spell checkers shall be used. All exception shall be noted here: BCP SPELLCHECK az-Latn az */ namespace Sonnet { class GuessLanguagePrivate { public: GuessLanguagePrivate(); // language trigram score static QHash< QString, QHash > s_knownModels; void loadModels(); QList< QChar::Script > findRuns(const QString &text); QList createOrderedModel(const QString &content); int distance(const QList &model, const QHash &knownModel); QStringList guessFromTrigrams(const QString &sample, const QStringList &langs); QStringList identify(const QString &sample, const QList< QChar::Script > &scripts); QString guessFromDictionaries(const QString &sentence, const QStringList &candidates); static QSet s_knownDictionaries; static QMultiHash s_scriptLanguages; static QMap s_dictionaryNameMap; const int MIN_LENGTH; int m_maxItems; double m_minConfidence; }; QHash< QString, QHash > GuessLanguagePrivate::s_knownModels; QSet GuessLanguagePrivate::s_knownDictionaries; QMultiHash GuessLanguagePrivate::s_scriptLanguages; QMap GuessLanguagePrivate::s_dictionaryNameMap; QStringList getNames(QLocale::Script script) { QStringList locales; const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry); locales.reserve(matchingLocales.size()); for (const QLocale &locale : matchingLocales) { locales << locale.name(); } return locales; } GuessLanguagePrivate::GuessLanguagePrivate() : MIN_LENGTH(5) , m_maxItems(1) , m_minConfidence(0) { if (!s_scriptLanguages.isEmpty()) { return; } s_knownDictionaries = Loader::openLoader()->languages().toSet(); QSet dictionaryLanguages; for (const QString &dictName : qAsConst(s_knownDictionaries)) { QString languageName = QLocale(dictName).name(); if (languageName.isEmpty()) { qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName; continue; } dictionaryLanguages.insert(languageName); } QSet allLanguages; for (int i = 0; i < int(QChar::ScriptCount); i++) { QChar::Script script = static_cast(i); QStringList names; switch (script) { case QChar::Script_Latin: names = getNames(QLocale::LatinScript); break; case QChar::Script_Greek: names = getNames(QLocale::GreekScript); break; case QChar::Script_Cyrillic: names = getNames(QLocale::CyrillicScript); break; case QChar::Script_Armenian: names = getNames(QLocale::ArmenianScript); break; case QChar::Script_Hebrew: names = getNames(QLocale::HebrewScript); break; case QChar::Script_Arabic: names = getNames(QLocale::ArabicScript); break; case QChar::Script_Syriac: names = getNames(QLocale::SyriacScript); break; case QChar::Script_Thaana: names = getNames(QLocale::ThaanaScript); break; case QChar::Script_Devanagari: names = getNames(QLocale::DevanagariScript); break; case QChar::Script_Bengali: names = getNames(QLocale::BengaliScript); break; case QChar::Script_Gurmukhi: names = getNames(QLocale::GurmukhiScript); break; case QChar::Script_Gujarati: names = getNames(QLocale::GujaratiScript); break; case QChar::Script_Oriya: names = getNames(QLocale::OriyaScript); break; case QChar::Script_Tamil: names = getNames(QLocale::TamilScript); break; case QChar::Script_Telugu: names = getNames(QLocale::TeluguScript); break; case QChar::Script_Kannada: names = getNames(QLocale::KannadaScript); break; case QChar::Script_Malayalam: names = getNames(QLocale::MalayalamScript); break; case QChar::Script_Sinhala: names = getNames(QLocale::SinhalaScript); break; case QChar::Script_Thai: names = getNames(QLocale::ThaiScript); break; case QChar::Script_Lao: names = getNames(QLocale::LaoScript); break; case QChar::Script_Tibetan: names = getNames(QLocale::TibetanScript); break; case QChar::Script_Myanmar: names = getNames(QLocale::MyanmarScript); break; case QChar::Script_Georgian: names = getNames(QLocale::GeorgianScript); break; case QChar::Script_Hangul: names = getNames(QLocale::HangulScript); break; case QChar::Script_Ethiopic: names = getNames(QLocale::EthiopicScript); break; case QChar::Script_Cherokee: names = getNames(QLocale::CherokeeScript); break; case QChar::Script_CanadianAboriginal: names = getNames(QLocale::CanadianAboriginalScript); break; case QChar::Script_Ogham: names = getNames(QLocale::OghamScript); break; case QChar::Script_Runic: names = getNames(QLocale::RunicScript); break; case QChar::Script_Khmer: names = getNames(QLocale::KhmerScript); break; case QChar::Script_Mongolian: names = getNames(QLocale::MongolianScript); break; case QChar::Script_Hiragana: names = getNames(QLocale::HiraganaScript); break; case QChar::Script_Katakana: names = getNames(QLocale::KatakanaScript); break; case QChar::Script_Bopomofo: names = getNames(QLocale::BopomofoScript); break; case QChar::Script_Han: names = getNames(QLocale::HanScript); break; case QChar::Script_Yi: names = getNames(QLocale::YiScript); break; case QChar::Script_OldItalic: names = getNames(QLocale::OldItalicScript); break; case QChar::Script_Gothic: names = getNames(QLocale::GothicScript); break; case QChar::Script_Deseret: names = getNames(QLocale::DeseretScript); break; case QChar::Script_Tagalog: names = getNames(QLocale::TagalogScript); break; case QChar::Script_Hanunoo: names = getNames(QLocale::HanunooScript); break; case QChar::Script_Buhid: names = getNames(QLocale::BuhidScript); break; case QChar::Script_Tagbanwa: names = getNames(QLocale::TagbanwaScript); break; case QChar::Script_Coptic: names = getNames(QLocale::CopticScript); break; case QChar::Script_Limbu: names = getNames(QLocale::LimbuScript); break; case QChar::Script_TaiLe: names = getNames(QLocale::TaiLeScript); break; case QChar::Script_LinearB: names = getNames(QLocale::LinearBScript); break; case QChar::Script_Ugaritic: names = getNames(QLocale::UgariticScript); break; case QChar::Script_Shavian: names = getNames(QLocale::ShavianScript); break; case QChar::Script_Osmanya: names = getNames(QLocale::OsmanyaScript); break; case QChar::Script_Cypriot: names = getNames(QLocale::CypriotScript); break; case QChar::Script_Braille: names = getNames(QLocale::BrailleScript); break; case QChar::Script_Buginese: names = getNames(QLocale::BugineseScript); break; case QChar::Script_NewTaiLue: names = getNames(QLocale::NewTaiLueScript); break; case QChar::Script_Glagolitic: names = getNames(QLocale::GlagoliticScript); break; case QChar::Script_Tifinagh: names = getNames(QLocale::TifinaghScript); break; case QChar::Script_SylotiNagri: names = getNames(QLocale::SylotiNagriScript); break; case QChar::Script_OldPersian: names = getNames(QLocale::OldPersianScript); break; case QChar::Script_Kharoshthi: names = getNames(QLocale::KharoshthiScript); break; case QChar::Script_Balinese: names = getNames(QLocale::BalineseScript); break; case QChar::Script_Cuneiform: names = getNames(QLocale::CuneiformScript); break; case QChar::Script_Phoenician: names = getNames(QLocale::PhoenicianScript); break; case QChar::Script_PhagsPa: names = getNames(QLocale::PhagsPaScript); break; case QChar::Script_Nko: names = getNames(QLocale::NkoScript); break; case QChar::Script_Sundanese: names = getNames(QLocale::SundaneseScript); break; case QChar::Script_Lepcha: names = getNames(QLocale::LepchaScript); break; case QChar::Script_OlChiki: names = getNames(QLocale::OlChikiScript); break; case QChar::Script_Vai: names = getNames(QLocale::VaiScript); break; case QChar::Script_Saurashtra: names = getNames(QLocale::SaurashtraScript); break; case QChar::Script_KayahLi: names = getNames(QLocale::KayahLiScript); break; case QChar::Script_Rejang: names = getNames(QLocale::RejangScript); break; case QChar::Script_Lycian: names = getNames(QLocale::LycianScript); break; case QChar::Script_Carian: names = getNames(QLocale::CarianScript); break; case QChar::Script_Lydian: names = getNames(QLocale::LydianScript); break; case QChar::Script_Cham: names = getNames(QLocale::ChamScript); break; case QChar::Script_TaiTham: names = getNames(QLocale::LannaScript); break; case QChar::Script_TaiViet: names = getNames(QLocale::TaiVietScript); break; case QChar::Script_Avestan: names = getNames(QLocale::AvestanScript); break; case QChar::Script_EgyptianHieroglyphs: names = getNames(QLocale::EgyptianHieroglyphsScript); break; case QChar::Script_Samaritan: names = getNames(QLocale::SamaritanScript); break; case QChar::Script_Lisu: names = getNames(QLocale::FraserScript); break; case QChar::Script_Bamum: names = getNames(QLocale::BamumScript); break; case QChar::Script_Javanese: names = getNames(QLocale::JavaneseScript); break; case QChar::Script_MeeteiMayek: names = getNames(QLocale::MeiteiMayekScript); break; case QChar::Script_ImperialAramaic: names = getNames(QLocale::ImperialAramaicScript); break; case QChar::Script_OldSouthArabian: names = getNames(QLocale::OldSouthArabianScript); break; case QChar::Script_InscriptionalParthian: names = getNames(QLocale::InscriptionalParthianScript); break; case QChar::Script_InscriptionalPahlavi: names = getNames(QLocale::InscriptionalPahlaviScript); break; case QChar::Script_Kaithi: names = getNames(QLocale::KaithiScript); break; case QChar::Script_Batak: names = getNames(QLocale::BatakScript); break; case QChar::Script_Brahmi: names = getNames(QLocale::BrahmiScript); break; case QChar::Script_Mandaic: names = getNames(QLocale::MandaeanScript); break; case QChar::Script_Chakma: names = getNames(QLocale::ChakmaScript); break; case QChar::Script_MeroiticCursive: case QChar::Script_MeroiticHieroglyphs: names = getNames(QLocale::MeroiticCursiveScript); names.append(getNames(QLocale::MeroiticScript)); break; case QChar::Script_Miao: names = getNames(QLocale::PollardPhoneticScript); break; case QChar::Script_Sharada: names = getNames(QLocale::SharadaScript); break; case QChar::Script_SoraSompeng: names = getNames(QLocale::SoraSompengScript); break; case QChar::Script_Takri: names = getNames(QLocale::TakriScript); break; case QChar::Script_CaucasianAlbanian: names = getNames(QLocale::CaucasianAlbanianScript); break; case QChar::Script_BassaVah: names = getNames(QLocale::BassaVahScript); break; case QChar::Script_Duployan: names = getNames(QLocale::DuployanScript); break; case QChar::Script_Elbasan: names = getNames(QLocale::ElbasanScript); break; case QChar::Script_Grantha: names = getNames(QLocale::GranthaScript); break; case QChar::Script_PahawhHmong: names = getNames(QLocale::PahawhHmongScript); break; case QChar::Script_Khojki: names = getNames(QLocale::KhojkiScript); break; case QChar::Script_LinearA: names = getNames(QLocale::LinearAScript); break; case QChar::Script_Mahajani: names = getNames(QLocale::MahajaniScript); break; case QChar::Script_Manichaean: names = getNames(QLocale::ManichaeanScript); break; case QChar::Script_MendeKikakui: names = getNames(QLocale::MendeKikakuiScript); break; case QChar::Script_Modi: names = getNames(QLocale::ModiScript); break; case QChar::Script_Mro: names = getNames(QLocale::MroScript); break; case QChar::Script_OldNorthArabian: names = getNames(QLocale::OldNorthArabianScript); break; case QChar::Script_Nabataean: names = getNames(QLocale::NabataeanScript); break; case QChar::Script_Palmyrene: names = getNames(QLocale::PalmyreneScript); break; case QChar::Script_PauCinHau: names = getNames(QLocale::PauCinHauScript); break; case QChar::Script_OldPermic: names = getNames(QLocale::OldPermicScript); break; case QChar::Script_PsalterPahlavi: names = getNames(QLocale::PsalterPahlaviScript); break; case QChar::Script_Siddham: names = getNames(QLocale::SiddhamScript); break; case QChar::Script_Khudawadi: names = getNames(QLocale::KhudawadiScript); break; case QChar::Script_Tirhuta: names = getNames(QLocale::TirhutaScript); break; case QChar::Script_WarangCiti: names = getNames(QLocale::VarangKshitiScript); break; case QChar::Script_Ahom: names = getNames(QLocale::AhomScript); break; case QChar::Script_AnatolianHieroglyphs: names = getNames(QLocale::AnatolianHieroglyphsScript); break; case QChar::Script_Hatran: names = getNames(QLocale::HatranScript); break; case QChar::Script_Multani: names = getNames(QLocale::MultaniScript); break; case QChar::Script_OldHungarian: names = getNames(QLocale::OldHungarianScript); break; case QChar::Script_Unknown: case QChar::Script_Inherited: case QChar::Script_Common: case QChar::Script_OldTurkic: case QChar::Script_SignWriting: break; default: qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script; break; } allLanguages.unite(names.toSet()); { // Remove unknown languages QStringList pruned; for (const QString &name : qAsConst(names)) { if (!dictionaryLanguages.contains(name)) { continue; } pruned.append(name); } names = pruned; } if (names.isEmpty()) { continue; } for (const QString &name : qAsConst(names)) { s_scriptLanguages.insert(script, name); } } // Try to handle some badly named dictionaries if (!allLanguages.contains(s_knownDictionaries)) { QSet dicts(s_knownDictionaries); dicts.subtract(allLanguages); for (const QString &dictName : dicts) { QString languageName = QLocale(dictName).name(); if (languageName.isEmpty()) { qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName; continue; } s_dictionaryNameMap[languageName] = dictName; if (!s_scriptLanguages.values().contains(languageName)) { qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName; } } } } GuessLanguage::GuessLanguage() : d(new GuessLanguagePrivate) { } GuessLanguage::~GuessLanguage() { delete d; } QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const { if (text.isEmpty()) { return QString(); } // Filter for available dictionaries QStringList suggestionsList; for (const QString &suggestion : suggestionsListIn) { if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) { suggestionsList.append(suggestion); } } // Load the model on demand if (d->s_knownModels.isEmpty()) { d->loadModels(); } QStringList candidateLanguages = d->identify(text, d->findRuns(text)); // Hack for some bad dictionary names for (int i = 0; i < candidateLanguages.count(); i++) { if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) { candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]); } } if (candidateLanguages.count() == 1) { return candidateLanguages.first(); } // Wasn't able to get a good guess with the trigrams, try checking all // dictionaries for the suggested languages. candidateLanguages.append(suggestionsList); candidateLanguages.removeDuplicates(); QString identified = d->guessFromDictionaries(text, candidateLanguages); if (!identified.isEmpty()) { return identified; } qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text; // None of our methods worked, just return the best suggestion if (!suggestionsList.isEmpty()) { return suggestionsList.first(); } qCWarning(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text; // Not even any suggestions, give up return QString(); } void GuessLanguage::setLimits(int maxItems, double minConfidence) { d->m_maxItems = maxItems; d->m_minConfidence = minConfidence; } void GuessLanguagePrivate::loadModels() { - QString triMapFile - = QStandardPaths::locate(QStandardPaths::GenericDataLocation, QStringLiteral( - "kf5/sonnet/trigrams.map")); - - if (triMapFile.isEmpty()) { -#ifdef Q_OS_WIN - triMapFile = QStringLiteral("%1/data/kf5/sonnet/trigrams.map").arg( - QCoreApplication::applicationDirPath()); -#else - triMapFile = QStringLiteral("%1/../share/kf5/sonnet/trigrams.map").arg( - QCoreApplication::applicationDirPath()); -#endif - } + // use trigrams from resource file, easy to deploy on all platforms + const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map"); qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile; QFile sin(triMapFile); if (!sin.open(QIODevice::ReadOnly)) { qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile; return; } QDataStream in(&sin); in >> s_knownModels; // Sanity check QSet availableLanguages; QHashIterator > iterator(s_knownModels); while (iterator.hasNext()) { iterator.next(); if (iterator.value().count() < MAXGRAMS) { qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS; } availableLanguages.insert(iterator.key()); } QSet knownLanguages(s_scriptLanguages.values().toSet()); knownLanguages.subtract(availableLanguages); if (!knownLanguages.isEmpty()) { qCWarning(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages; } } QList GuessLanguagePrivate::findRuns(const QString &text) { QChar::Script script = QChar::Script_Unknown; QHash scriptCounts; int totalCount = 0; for (const QChar c : text) { script = c.script(); if (script == QChar::Script_Common || script == QChar::Script_Inherited) { continue; } if (!c.isLetter()) { continue; } scriptCounts[script]++; totalCount++; } QList relevantScripts; if (totalCount == 0) { return relevantScripts; } for (const QChar::Script &script : scriptCounts.keys()) { // return run types that used for 40% or more of the string if (scriptCounts[script] * 100 / totalCount >= 40) { relevantScripts << script; // always return basic latin if found more than 15%. } else if (script == QChar::Script_Latin && scriptCounts[script] * 100 / totalCount >= 15) { relevantScripts << script; } } return relevantScripts; } QStringList GuessLanguagePrivate::identify(const QString &sample, const QList &scripts) { if (sample.size() < MIN_LENGTH) { return QStringList(); } QStringList guesses; for (const QChar::Script script : scripts) { guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script))); } return guesses; } QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages) { QStringList ret; const QList sampleTrigrams = createOrderedModel(sample); // Sort by score QMultiMap scores; for (const QString &language : languages) { if (s_knownModels.contains(language)) { scores.insert(distance(sampleTrigrams, s_knownModels[language]), language); } } // Skip if either no results or best result is completely unknown (distance >= maxdistance) if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) { qCDebug(SONNET_LOG_CORE) << "No scores for" << sample; return ret; } int counter = 0; double confidence = 0; QMapIterator it(scores); it.next(); QString prevItem = it.value(); int prevScore = it.key(); while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) { it.next(); counter++; confidence += (it.key() - prevScore)/(double)it.key(); ret += prevItem; prevItem = it.value(); prevScore = it.key(); } if (counter < m_maxItems && confidence < m_minConfidence) { ret += prevItem; } return ret; } QList GuessLanguagePrivate::createOrderedModel(const QString &content) { QHash trigramCounts; QMap orderedTrigrams; for (int i = 0; i < (content.size() - 2); ++i) { QString tri = content.mid(i, 3).toLower(); trigramCounts[tri]++; } for (const QString &key : trigramCounts.keys()) { const QChar *data = key.constData(); bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); if (!hasTwoSpaces) { orderedTrigrams.insertMulti(-trigramCounts[key], key); } } return orderedTrigrams.values(); } int GuessLanguagePrivate::distance(const QList &model, const QHash &knownModel) { int counter = -1; int dist = 0; for (const QString &trigram : model) { if (knownModel.contains(trigram)) { dist += qAbs(++counter - knownModel.value(trigram)); } else { dist += MAXGRAMS; } if (counter == (MAXGRAMS-1)) { break; } } return dist; } QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates) { // Try to see how many languages we can get spell checking for QList > spellers; for (const QString &lang : candidates) { if (!Loader::openLoader()->languages().contains(lang)) { qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang; continue; } QSharedPointer plugin = Loader::openLoader()->cachedSpeller(lang); if (!plugin.isNull()) { spellers.append(plugin); } } // If there's no spell checkers, give up if (spellers.isEmpty()) { return QString(); } QMap correctHits; WordTokenizer tokenizer(sentence); while (tokenizer.hasNext()) { QStringRef word = tokenizer.next(); if (!tokenizer.isSpellcheckable()) { continue; } for (int i = 0; i < spellers.count(); ++i) { if (spellers[i]->isCorrect(word.toString())) { correctHits[spellers[i]->language()]++; } } } if (correctHits.isEmpty()) { return QString(); } QMap::const_iterator max = correctHits.constBegin(); for (QMap::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) { if (itr.value() > max.value()) { max = itr; } } return max.key(); } }