diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 2c4b76f..100b05f 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -1,39 +1,39 @@ project(sonnetdata) include(ECMMarkNonGuiExecutable) add_executable(parsetrigrams parsetrigrams.cpp) add_executable(gentrigrams gentrigrams.cpp) # Mark it as non-gui so we won't create an app bundle on Mac OS X ecm_mark_nongui_executable(parsetrigrams) ecm_mark_nongui_executable(gentrigrams) TARGET_LINK_LIBRARIES(parsetrigrams PUBLIC Qt5::Core) -TARGET_LINK_LIBRARIES(gentrigrams PUBLIC Qt5::Core) +TARGET_LINK_LIBRARIES(gentrigrams PUBLIC Qt5::Core KF5::SonnetCore) INSTALL(TARGETS parsetrigrams EXPORT KF5SonnetTargets ${KF5_INSTALL_TARGETS_DEFAULT_ARGS}) INSTALL(TARGETS gentrigrams EXPORT KF5SonnetTargets ${KF5_INSTALL_TARGETS_DEFAULT_ARGS}) cmake_policy(SET CMP0026 OLD) # FIXME: make this work with CMP0026 set to NEW if(CMAKE_CROSSCOMPILING AND PARSETRIGRAMS_EXECUTABLE) add_executable(KF5::parsetrigrams IMPORTED GLOBAL) set_target_properties(KF5::parsetrigrams PROPERTIES IMPORTED_LOCATION ${PARSETRIGRAMS_EXECUTABLE}) else() add_executable(KF5::parsetrigrams ALIAS parsetrigrams) endif() macro(create_trigrams_map _target_name _in_DIR _out_FILE) get_target_property(PARSETRIGRAMS_EXECUTABLE parsetrigrams LOCATION) add_custom_target(trigrams_${_target_name} ALL) add_custom_command(TARGET trigrams_${_target_name} COMMAND $ "${_in_DIR}" > "${_out_FILE}" ) add_dependencies(trigrams_${_target_name} parsetrigrams) endmacro(create_trigrams_map) create_trigrams_map(sonnet ${CMAKE_SOURCE_DIR}/data/trigrams ${CMAKE_CURRENT_BINARY_DIR}/trigrams.map ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/trigrams.map DESTINATION ${KDE_INSTALL_DATADIR_KF5}/sonnet/) diff --git a/data/gentrigrams.cpp b/data/gentrigrams.cpp index 7dd7755..6badbfa 100644 --- a/data/gentrigrams.cpp +++ b/data/gentrigrams.cpp @@ -1,94 +1,94 @@ /** * gentrigrams.cpp * * Parse a corpus of data and generate trigrams * * Copyright 2013 Martin Sandsmark * * This
library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301 USA */ #include #include #include #include +#include "guesslanguage.h" int main(int argc, char *argv[]) { if (argc < 3) { qWarning() << argv[0] << "corpus.txt outfile.trigram"; return -1; } QFile file(QString::fromLocal8Bit(argv[1])); if (!file.open(QIODevice::ReadOnly | QFile::Text)) { qWarning() << "Unable to open corpus:" << argv[1]; return -1; } QTextStream stream(&file); stream.setCodec("UTF-8"); QFile outFile(QString::fromLocal8Bit(argv[2])); if (!outFile.open(QIODevice::WriteOnly)) { qWarning() << "Unable to open output file" << argv[2]; return -1; } QHash model; qDebug() << "Reading in" << file.size() << "bytes"; QString trigram = stream.read(3); QString contents = stream.readAll(); qDebug() << "finished reading!"; qDebug() << "Building model..."; for (int i=0; i orderedTrigrams; for (const QString &key : model.keys()) { const QChar* data=key.constData(); bool hasTwoSpaces=(data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); if (!hasTwoSpaces) orderedTrigrams.insertMulti(model[key], key); } qDebug() << "Sorted!"; qDebug() << "Weeding out..."; QMap::iterator i = orderedTrigrams.begin(); - while (orderedTrigrams.size() > 300) { - orderedTrigrams.erase(i); - i++; + while (orderedTrigrams.size() > Sonnet::MAXGRAMS) { + i = 
orderedTrigrams.erase(i); } qDebug() << "Weeded!"; qDebug() << "Storing..."; i = orderedTrigrams.end(); int count=0; QTextStream outStream(&outFile); outStream.setCodec("UTF-8"); while (i != orderedTrigrams.begin()) { --i; outStream << *i << "\t\t\t" << count++ << '\n'; } } diff --git a/src/core/guesslanguage.cpp b/src/core/guesslanguage.cpp index 546c514..723ad96 100644 --- a/src/core/guesslanguage.cpp +++ b/src/core/guesslanguage.cpp @@ -1,848 +1,845 @@ /* This file is part of the KDE libraries Copyright (c) 2006 Jacob R Rideout Copyright (c) 2009 Jakub Stachowski Copyright (c) 2013 Martin Sandsmark This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include #include #include #include "guesslanguage.h" #include "loader_p.h" #include "speller.h" #include "tokenizer_p.h" #include "core_debug.h" #include "spellerplugin_p.h" /* All language tags should be valid according to IETF BCP 47, as codified in RFC 4646. ISO 639-1 codes should be used for the language part except for cases where there exists no code, then 639-3 codes should be used. Country codes should only be used in special cases.
Scripts can be differentiated by IANA subtags, available here: http://www.iana.org/assignments/language-subtag-registry The script tags correspond to ISO 15924 An overview of the best practices concerning language tagging is available here: http://www.w3.org/International/articles/language-tags/Overview.en.php lang tags should use underscores (_) rather than hyphens (-) to separate subsections. EXCEPTIONS: For cases of known differences from the above tagging scheme and major spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers shall be used. All exceptions shall be noted here: BCP SPELLCHECK az-Latn az */ namespace Sonnet { -// Amount of trigrams in each file -static const int MAXGRAMS = 300; - class GuessLanguagePrivate { public: GuessLanguagePrivate(); // language trigram score static QHash< QString, QHash > s_knownModels; void loadModels( ); QList< QChar::Script > findRuns(const QString& text); QList createOrderedModel(const QString& content); int distance( const QList& model, const QHash& knownModel ); QStringList guessFromTrigrams(const QString & sample, const QStringList& langs); QStringList identify(const QString& sample, const QList< QChar::Script >& scripts); QString guessFromDictionaries(const QString& sentence, const QStringList& candidates); static QSet s_knownDictionaries; static QMultiHash s_scriptLanguages; static QMap s_dictionaryNameMap; const int MIN_LENGTH; int m_maxItems; double m_minConfidence; }; QHash< QString, QHash > GuessLanguagePrivate::s_knownModels; QSet GuessLanguagePrivate::s_knownDictionaries; QMultiHash GuessLanguagePrivate::s_scriptLanguages; QMap GuessLanguagePrivate::s_dictionaryNameMap; QStringList getNames(QLocale::Script script) { QStringList locales; for (const QLocale &locale : QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry)) { locales << locale.name(); } return locales; } GuessLanguagePrivate::GuessLanguagePrivate() : MIN_LENGTH(5), m_maxItems(1), m_minConfidence(0) {
if (!s_scriptLanguages.isEmpty()) return; s_knownDictionaries = Loader::openLoader()->languages().toSet(); QSet dictionaryLanguages; for (QString dictName : s_knownDictionaries) { QString languageName = QLocale(dictName).name(); if (languageName.isEmpty()) { qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName; continue; } dictionaryLanguages.insert(languageName); } QSet allLanguages; for (int i=0; i= QT_VERSION_CHECK(5, 7, 0) case QChar::Script_Ahom: names = getNames(QLocale::AhomScript); break; case QChar::Script_AnatolianHieroglyphs: names = getNames(QLocale::AnatolianHieroglyphsScript); break; case QChar::Script_Hatran: names = getNames(QLocale::HatranScript); break; case QChar::Script_Multani: names = getNames(QLocale::MultaniScript); break; case QChar::Script_OldHungarian: names = getNames(QLocale::OldHungarianScript); break; #endif case QChar::Script_Unknown: case QChar::Script_Inherited: case QChar::Script_Common: case QChar::Script_OldTurkic: #if QT_VERSION >= QT_VERSION_CHECK(5, 6, 0) case QChar::Script_SignWriting: #endif break; default: qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script; break; } allLanguages.unite(names.toSet()); { // Remove unknown languages QStringList pruned; for (const QString &name : names) { if (!dictionaryLanguages.contains(name)) { continue; } pruned.append(name); } names = pruned; } if (names.isEmpty()) { continue; } for (const QString &name : names) { s_scriptLanguages.insert(script, name); } } // Try to handle some badly named dictionaries if (!allLanguages.contains(s_knownDictionaries)) { QSet dicts(s_knownDictionaries); dicts.subtract(allLanguages); for (const QString &dictName : dicts) { QString languageName = QLocale(dictName).name(); if (languageName.isEmpty()) { qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName; continue; } s_dictionaryNameMap[languageName] = dictName; if (!s_scriptLanguages.values().contains(languageName)) { qCWarning(SONNET_LOG_CORE) << "Unable 
to handle language from dictionary" << dictName << languageName; } } } } GuessLanguage::GuessLanguage() : d(new GuessLanguagePrivate) { } GuessLanguage::~GuessLanguage() { delete d; } QString GuessLanguage::identify(const QString& text, const QStringList& suggestionsListIn) const { if (text.isEmpty()) return QString(); // Filter for available dictionaries QStringList suggestionsList; for (const QString &suggestion : suggestionsListIn) { if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) { suggestionsList.append(suggestion); } } // Load the model on demand if (d->s_knownModels.isEmpty()) { d->loadModels(); } QStringList candidateLanguages = d->identify(text, d->findRuns(text)); // Hack for some bad dictionary names for (int i=0; is_dictionaryNameMap.contains(candidateLanguages[i])) { candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]); } } if (candidateLanguages.count() == 1) { return candidateLanguages.first(); } // Wasn't able to get a good guess with the trigrams, try checking all // dictionaries for the suggested languages. 
candidateLanguages.append(suggestionsList); candidateLanguages.removeDuplicates(); QString identified = d->guessFromDictionaries(text, candidateLanguages); if (!identified.isEmpty()) { return identified; } qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text; // None of our methods worked, just return the best suggestion if (!suggestionsList.isEmpty()) { return suggestionsList.first(); } qCWarning(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text; // Not even any suggestions, give up return QString(); } void GuessLanguage::setLimits(int maxItems, double minConfidence) { d->m_maxItems = maxItems; d->m_minConfidence = minConfidence; } void GuessLanguagePrivate::loadModels() { QString triMapFile = QStandardPaths::locate(QStandardPaths::GenericDataLocation, QStringLiteral("kf5/sonnet/trigrams.map")); if (triMapFile.isEmpty()) { triMapFile = QStringLiteral("%1/../share/kf5/sonnet/trigrams.map").arg(QCoreApplication::applicationDirPath()); } qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile; QFile sin(triMapFile); if (!sin.open(QIODevice::ReadOnly)) { qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile; return; } QDataStream in(&sin); in >> s_knownModels; // Sanity check QSet availableLanguages; QHashIterator> iterator(s_knownModels); while (iterator.hasNext()) { iterator.next(); if (iterator.value().count() < MAXGRAMS) { qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS; } availableLanguages.insert(iterator.key()); } QSet knownLanguages(s_scriptLanguages.values().toSet()); knownLanguages.subtract(availableLanguages); if (!knownLanguages.isEmpty()) { qCWarning(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages; } } QList GuessLanguagePrivate::findRuns(const QString & text) { QChar::Script script = QChar::Script_Unknown; QHash scriptCounts; int totalCount = 0; foreach (const 
QChar c, text) { script = c.script(); if (script == QChar::Script_Common || script == QChar::Script_Inherited) { continue; } if (!c.isLetter()) { continue; } scriptCounts[script]++; totalCount++; } QList relevantScripts; if (totalCount == 0) return relevantScripts; foreach(const QChar::Script &script, scriptCounts.keys()) { // return run types that used for 40% or more of the string if (scriptCounts[script] * 100 / totalCount >= 40) { relevantScripts << script; // always return basic latin if found more than 15%. } else if (script == QChar::Script_Latin && scriptCounts[script] * 100 / totalCount >= 15) { relevantScripts << script; } } return relevantScripts; } QStringList GuessLanguagePrivate::identify(const QString& sample, const QList& scripts) { if (sample.size() < MIN_LENGTH) { return QStringList(); } QStringList guesses; for (const QChar::Script script : scripts) { guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script))); } return guesses; } QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages) { QStringList ret; const QList sampleTrigrams = createOrderedModel(sample); // Sort by score QMultiMap scores; for (const QString &language : languages) { if (s_knownModels.contains(language)) { scores.insert(distance(sampleTrigrams, s_knownModels[language]), language); } } // Skip if either no results or best result is completely unknown (distance >= maxdistance) if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) { qCDebug(SONNET_LOG_CORE) << "No scores for" << sample; return ret; } int counter = 0; double confidence = 0; QMapIterator it(scores); it.next(); QString prevItem = it.value(); int prevScore = it.key(); while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) { it.next(); counter++; confidence += (it.key() - prevScore)/(double)it.key(); ret += prevItem; prevItem=it.value(); prevScore=it.key(); } if (counter < m_maxItems && confidence < 
m_minConfidence) { ret += prevItem; } return ret; } QList GuessLanguagePrivate::createOrderedModel(const QString& content) { QHash trigramCounts; QMap orderedTrigrams; for (int i = 0; i < (content.size() - 2); ++i) { QString tri = content.mid(i, 3).toLower(); trigramCounts[tri]++; } foreach (const QString &key, trigramCounts.keys()) { const QChar* data=key.constData(); bool hasTwoSpaces=(data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); if (!hasTwoSpaces) orderedTrigrams.insertMulti( - trigramCounts[key], key); } return orderedTrigrams.values(); } int GuessLanguagePrivate::distance(const QList& model, const QHash& knownModel) { int counter = -1; int dist = 0; Q_FOREACH(const QString& trigram, model) { if (knownModel.contains(trigram)) { dist += qAbs(++counter - knownModel.value(trigram)); } else { dist += MAXGRAMS; } if (counter==(MAXGRAMS-1)) { break; } } return dist; } QString GuessLanguagePrivate::guessFromDictionaries(const QString& sentence, const QStringList& candidates) { // Try to see how many languages we can get spell checking for QList> spellers; for (const QString& lang : candidates) { if (!Loader::openLoader()->languages().contains(lang)) { qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang; continue; } QSharedPointer plugin = Loader::openLoader()->cachedSpeller(lang); if (!plugin.isNull()) spellers.append(plugin); } // If there's no spell checkers, give up if (spellers.isEmpty()) { return QString(); } QMap correctHits; WordTokenizer tokenizer(sentence); while (tokenizer.hasNext()) { QStringRef word = tokenizer.next(); if (!tokenizer.isSpellcheckable()) continue; for (int i = 0; i < spellers.count(); ++i) { if (spellers[i]->isCorrect(word.toString())) { correctHits[spellers[i]->language()]++; } } } if (correctHits.isEmpty()) return QString(); QMap::const_iterator max = correctHits.constBegin(); for (QMap::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) { if (itr.value() > 
max.value()) { max = itr; } } return max.key(); } } diff --git a/src/core/guesslanguage.h b/src/core/guesslanguage.h index e1bb609..2f224b8 100644 --- a/src/core/guesslanguage.h +++ b/src/core/guesslanguage.h @@ -1,90 +1,93 @@ /* This file is part of the KDE libraries Copyright (c) 2006 Jacob R Rideout This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef GUESSLANGUAGE_H #define GUESSLANGUAGE_H #include #include #include "sonnetcore_export.h" namespace Sonnet { +// Amount of trigrams in each file +static const int MAXGRAMS = 300; + class GuessLanguagePrivate; /** * @short GuessLanguage determines the language of a given text. * * GuessLanguage can determine the difference between ~75 languages for a given string. It is * based on a Perl script originally written by Maciej Ceglowski * called Languid. His script used a two-part heuristic to determine language. First the text * is checked for the scripts it contains, then for each set of languages using those * scripts an n-gram frequency model of a given language is compared to a model of the text. * The most similar language model is assumed to be the language. If no language is found * an empty string is returned.
* * * @author Jacob Rideout * @since 4.3 */ class SONNETCORE_EXPORT GuessLanguage { public: /** Constructor * Creates a new GuessLanguage instance. The text to be checked is * passed to identify(), not to the constructor. */ GuessLanguage(); /** Destructor */ ~GuessLanguage(); /** * Sets limits on the number of languages returned by identify(). The confidence for each language is computed * as the difference between this and the next language on the list, normalized to the 0-1 range. A reasonable value to get * a fairly sure result is 0.1. The default is returning the best guess without caring about confidence - exactly * as after a call to setLimits(1,0). * @param maxItems The list returned by identify() will never have more than maxItems items * @param minConfidence The list will have only enough items for their summed confidence to equal * or exceed minConfidence. */ void setLimits(int maxItems, double minConfidence); /** * Returns the 2 letter ISO 639-1 code for the language of @p text. * A 3 letter code is returned only in the case where a 2 letter * code does not exist. If @p text is empty, an empty string is returned. * @param text to be identified * @param suggestions languages to prefer when the text cannot be identified unambiguously * @return the presumed language of the text. An empty string means * it is impossible to determine the language */ QString identify(const QString& text, const QStringList &suggestions = QStringList()) const; private: GuessLanguagePrivate* const d; }; } #endif