diff --git a/plugins/messageviewer/bodypartformatter/autotests/unstructureddata/sncf_one-leg-single-tgv.json b/plugins/messageviewer/bodypartformatter/autotests/unstructureddata/sncf_one-leg-single-tgv.json new file mode 100644 index 00000000..6d80d543 --- /dev/null +++ b/plugins/messageviewer/bodypartformatter/autotests/unstructureddata/sncf_one-leg-single-tgv.json @@ -0,0 +1,28 @@ +[ + { + "@type": "TrainReservation", + "reservationFor": { + "@type": "TrainTrip", + "arrivalStation": { + "@type": "TrainStation", + "name": "MONTPELLIER ST-RO" + }, + "arrivalTime": "2018-07-15T19:58:00", + "departureStation": { + "@type": "TrainStation", + "name": "TOULOUSE MATABIAU" + }, + "departureTime": "2018-07-15T17:50:00", + "trainNumber": "6857" + }, + "reservationNumber": "XXX007", + "reservedTicket": { + "@type": "Ticket", + "ticketedSeat": { + "@type": "Seat", + "seatNumber": "31", + "seatSection": "13" + } + } + } +] diff --git a/plugins/messageviewer/bodypartformatter/autotests/unstructureddata/sncf_one-leg-single-tgv.txt b/plugins/messageviewer/bodypartformatter/autotests/unstructureddata/sncf_one-leg-single-tgv.txt new file mode 100644 index 00000000..0af21650 --- /dev/null +++ b/plugins/messageviewer/bodypartformatter/autotests/unstructureddata/sncf_one-leg-single-tgv.txt @@ -0,0 +1,16 @@ +VOTRE CONFIRMATION E-BILLET + TOULOUSE MATABIAU / MONTPELLIER ST-RO 35.00 EUR + CK +Nom : DOE DOSSIER VOYAGE : XXX007 +Prénom : JOHN Référence client : 0011223344556677889 +Voyageur : ADULTE N° e-billet : 123456789 + Départ / Arrivée Date / Heure TGV TGV LOISIR REDUIT-ECH/REMB PAYANT JOUR DU + DEPART + PR11 - ABC123 + TRAIN N°6857 + TOULOUSE MATABIAU 15/07 à 17h50 VOITURE 13 - PLACE 31 + 1e CLASSE / PLACE ASSISE + MONTPELLIER ST-RO 15/07 à 19h58 ISOLEE SOLO + E-Billet valable uniquement sur ce train +Présence à quai obligatoire 2 mn avant départ. +Pour connaître l'empreinte CO2 de votre voyage et accéder au détail de la méthode de calcul, rendez-vous sur SNCF.com diff --git a/plugins/messageviewer/bodypartformatter/autotests/unstructureddataextractortest.cpp b/plugins/messageviewer/bodypartformatter/autotests/unstructureddataextractortest.cpp index 47a7e71f..6600e222 100644 --- a/plugins/messageviewer/bodypartformatter/autotests/unstructureddataextractortest.cpp +++ b/plugins/messageviewer/bodypartformatter/autotests/unstructureddataextractortest.cpp @@ -1,142 +1,144 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "extractor.h" #include "extractorengine.h" #include "extractorpreprocessor.h" #include #include #include #include #include #include class UnstructuredDataExtractorTest : public QObject { Q_OBJECT private Q_SLOTS: void initTestCase() { Q_INIT_RESOURCE(extractors); // use some exotic locale to ensure the date/time parsing doesn't just work by luck QLocale::setDefault(QLocale(QStringLiteral("fr_FR"))); } void testExtractText_data() { QTest::addColumn("inputFile"); QTest::addColumn("extractorName"); QTest::addColumn("jsonFile"); QDir dir(QStringLiteral(SOURCE_DIR "/unstructureddata")); const auto lst = dir.entryList(QStringList(QStringLiteral("*.txt")), QDir::Files | QDir::Readable | QDir::NoSymLinks); for (const auto &file : lst) { const auto refFile = dir.path() + QLatin1Char('/') + file.left(file.size() - 4) + QStringLiteral(".json"); if (!QFile::exists(refFile)) { qDebug() << "reference file" << refFile << "does not exist, skipping test file" << file; continue; } const auto idx = file.indexOf(QLatin1Char('_')); QTest::newRow(file.toLatin1()) << QString(dir.path() + QLatin1Char('/') + file) << file.left(idx) << refFile; } } void testExtractText() { QFETCH(QString, inputFile); QFETCH(QString, extractorName); QFETCH(QString, jsonFile); QFile f(inputFile); QVERIFY(f.open(QFile::ReadOnly)); Extractor extractor; QVERIFY(extractor.load(QLatin1String(":/org.kde.pim/messageviewer/semantic/extractors/") + extractorName + QLatin1String(".json"))); ExtractorEngine engine; engine.setText(QString::fromUtf8(f.readAll())); + engine.setSenderDate(QDateTime(QDate(2017, 12, 29), QTime(18, 46, 2))); engine.setExtractor(&extractor); const auto data = engine.extract(); QFile ref(jsonFile); QVERIFY(ref.open(QFile::ReadOnly)); const auto doc = QJsonDocument::fromJson(ref.readAll()); QVERIFY(doc.isArray()); if (data != doc.array()) { qDebug().noquote() << QJsonDocument(data).toJson(); } QCOMPARE(data, doc.array()); } void testExtractHtml_data() { QTest::addColumn("inputFile"); QTest::addColumn("extractorName"); QTest::addColumn("jsonFile"); QDir dir(QStringLiteral(SOURCE_DIR "/unstructureddata")); const auto lst = dir.entryList(QStringList(QStringLiteral("*.html")), QDir::Files | QDir::Readable | QDir::NoSymLinks); for (const auto &file : lst) { const auto refFile = dir.path() + QLatin1Char('/') + file.left(file.size() - 5) + QStringLiteral(".json"); if (!QFile::exists(refFile)) { qDebug() << "reference file" << refFile << "does not exist, skipping test file" << file; continue; } const auto idx = file.indexOf(QLatin1Char('_')); QTest::newRow(file.toLatin1()) << QString(dir.path() + QLatin1Char('/') + file) << file.left(idx) << refFile; } } void testExtractHtml() { QFETCH(QString, inputFile); QFETCH(QString, extractorName); QFETCH(QString, jsonFile); QFile f(inputFile); QVERIFY(f.open(QFile::ReadOnly)); Extractor extractor; QVERIFY(extractor.load(QLatin1String(":/org.kde.pim/messageviewer/semantic/extractors/") + extractorName + QLatin1String(".json"))); ExtractorPreprocessor preproc; preproc.preprocessHtml(QString::fromUtf8(f.readAll())); ExtractorEngine engine; engine.setText(preproc.text()); + engine.setSenderDate(QDateTime(QDate(2017, 12, 29), QTime(18, 46, 2))); engine.setExtractor(&extractor); const auto data = engine.extract(); QFile ref(jsonFile); QVERIFY(ref.open(QFile::ReadOnly)); const auto doc = QJsonDocument::fromJson(ref.readAll()); QVERIFY(doc.isArray()); if (data != doc.array()) { qDebug().noquote() << QJsonDocument(data).toJson(); } QCOMPARE(data, doc.array()); } }; QTEST_MAIN(UnstructuredDataExtractorTest) #include "unstructureddataextractortest.moc" diff --git a/plugins/messageviewer/bodypartformatter/semantic/extractorengine.cpp b/plugins/messageviewer/bodypartformatter/semantic/extractorengine.cpp index f7e55401..6debfe0b 100644 --- a/plugins/messageviewer/bodypartformatter/semantic/extractorengine.cpp +++ b/plugins/messageviewer/bodypartformatter/semantic/extractorengine.cpp @@ -1,139 +1,153 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "extractorengine.h" #include "semantic_debug.h" #include #include #include #include class JsApi : public QObject { Q_OBJECT public: explicit JsApi(QJSEngine *engine) : QObject(engine) , m_engine(engine) { } Q_INVOKABLE QJSValue newObject(const QString &typeName) const; Q_INVOKABLE QDateTime toDateTime(const QString &dtStr, const QString &format, const QString &localeName) const; private: QJSEngine *m_engine; }; QJSValue JsApi::newObject(const QString &typeName) const { auto v = m_engine->newObject(); v.setProperty(QStringLiteral("@type"), typeName); return v; } QDateTime JsApi::toDateTime(const QString &dtStr, const QString &format, const QString &localeName) const { QLocale locale(localeName); const auto dt = locale.toDateTime(dtStr, format); if (dt.isValid()) { return dt; } // try harder for the "MMM" month format // QLocale expects the exact string in QLocale::shortMonthName(), while we often encounter a three // letter month identifier. For en_US that's the same, for Swedish it isn't though for example. So // let's try to fix up the month identifiers to the full short name. if (format.contains(QLatin1String("MMM"))) { auto dtStrFixed = dtStr; for (int i = 0; i < 12; ++i) { const auto monthName = locale.monthName(i, QLocale::ShortFormat); dtStrFixed = dtStrFixed.replace(monthName.left(3), monthName); } return locale.toDateTime(dtStrFixed, format); } return dt; } -ExtractorEngine::ExtractorEngine() = default; +class ContextObject : public QObject +{ + Q_OBJECT + Q_PROPERTY(QDateTime senderDate MEMBER m_senderDate) + +public: + QDateTime m_senderDate; +}; + +ExtractorEngine::ExtractorEngine() + : m_context(new ContextObject) // will be deleted by QJSEngine taking ownership +{ +} + ExtractorEngine::~ExtractorEngine() = default; void ExtractorEngine::setExtractor(const Extractor *extractor) { m_extractor = extractor; } -const QString &ExtractorEngine::text() const +void ExtractorEngine::setText(const QString &text) { - return m_text; + m_text = text; } -void ExtractorEngine::setText(const QString &text) +void ExtractorEngine::setSenderDate(const QDateTime &dt) { - m_text = text; + m_context->m_senderDate = dt; } QJsonArray ExtractorEngine::extract() { if (!m_extractor || m_text.isEmpty()) { return {}; } executeScript(); return m_result; } void ExtractorEngine::executeScript() { Q_ASSERT(m_extractor); QFile f(m_extractor->scriptFileName()); if (!f.open(QFile::ReadOnly)) { qCWarning(SEMANTIC_LOG) << "Failed to open extractor script" << f.fileName() << f.errorString(); return; } QJSEngine engine; engine.installExtensions(QJSEngine::ConsoleExtension); auto jsApi = new JsApi(&engine); engine.globalObject().setProperty(QStringLiteral("JsonLd"), engine.newQObject(jsApi)); + engine.globalObject().setProperty(QStringLiteral("Context"), engine.newQObject(m_context)); auto result = engine.evaluate(QString::fromUtf8(f.readAll()), f.fileName()); if (result.isError()) { qCWarning(SEMANTIC_LOG) << "Script parsing error in" << result.property(QLatin1String("fileName")).toString() << ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString(); return; } auto mainFunc = engine.globalObject().property(QLatin1String("main")); if (!mainFunc.isCallable()) { qCWarning(SEMANTIC_LOG) << "Script has no main() function!"; return; } result = mainFunc.call({m_text}); if (result.isError()) { qCWarning(SEMANTIC_LOG) << "Script execution error in" << result.property(QLatin1String("fileName")).toString() << ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString(); return; } m_result = QJsonArray::fromVariantList(result.toVariant().toList()); } #include "extractorengine.moc" diff --git a/plugins/messageviewer/bodypartformatter/semantic/extractorengine.h b/plugins/messageviewer/bodypartformatter/semantic/extractorengine.h index 6517da5a..039e17e8 100644 --- a/plugins/messageviewer/bodypartformatter/semantic/extractorengine.h +++ b/plugins/messageviewer/bodypartformatter/semantic/extractorengine.h @@ -1,51 +1,56 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef EXTRACTORENGINE_H #define EXTRACTORENGINE_H #include "extractor.h" #include #include #include +class ContextObject; +class QDateTime; + /** Code for executing an extractor rule set on a specific email part. */ class ExtractorEngine { public: ExtractorEngine(); ~ExtractorEngine(); void setExtractor(const Extractor *extractor); - const QString &text() const; void setText(const QString &text); + /** The date the email containing the processed text was sent. */ + void setSenderDate(const QDateTime &dt); QJsonArray extract(); private: void executeScript(); const Extractor *m_extractor = nullptr; + ContextObject *m_context = nullptr; QString m_text; QJsonArray m_result; }; #endif // EXTRACTORENGINE_H diff --git a/plugins/messageviewer/bodypartformatter/semantic/extractors/extractors.qrc b/plugins/messageviewer/bodypartformatter/semantic/extractors/extractors.qrc index 860568aa..75a35df3 100644 --- a/plugins/messageviewer/bodypartformatter/semantic/extractors/extractors.qrc +++ b/plugins/messageviewer/bodypartformatter/semantic/extractors/extractors.qrc @@ -1,16 +1,18 @@ amadeus.json amadeus.js brusselsairlines.json brusselsairlines.js deutschebahn.json deutschebahn.js eurowings.json eurowings.js fcmtravel.json fcmtravel.js + sncf.json + sncf.js swiss.json swiss.js diff --git a/plugins/messageviewer/bodypartformatter/semantic/extractors/sncf.js b/plugins/messageviewer/bodypartformatter/semantic/extractors/sncf.js index bdc66834..24aee596 100644 --- a/plugins/messageviewer/bodypartformatter/semantic/extractors/sncf.js +++ b/plugins/messageviewer/bodypartformatter/semantic/extractors/sncf.js @@ -1,75 +1,81 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ +function parseDate(dateStr, timeStr) { + // the text does not contain the year at all, so guess that from Context.senderDate + var date = JsonLd.toDateTime(dateStr + '/' + Context.senderDate.getFullYear() + ' ' + timeStr, "dd/MM/yyyy hh'h'mm", "fr"); + if (date < Context.senderDate) + date.setFullYear(Context.senderDate.getFullYear() + 1); + return date; +} + function main(text) { var reservations = new Array(); var bookingRef = text.match(/DOSSIER VOYAGE : ([A-Z0-9]{6})/); var pos = 0; while (true) { var header = text.substr(pos).match(/ Départ \/ Arrivée.*\n/); if (!header) break; var index = header.index + header[0].length; var res = JsonLd.newObject("TrainReservation"); res.reservationNumber = bookingRef[1]; res.reservationFor = JsonLd.newObject("TrainTrip"); var depLine = text.substr(pos + index).match(/\n ([\w -]+?) +(\d{2}\/\d{2}) à (\d{2}h\d{2})/); if (!depLine) break; index += depLine.index + depLine[0].length; res.reservationFor.departureStation = JsonLd.newObject("TrainStation"); res.reservationFor.departureStation.name = depLine[1]; - // TODO determine the year (which is nowhere in the ticket!) - res.reservationFor.departureTime = JsonLd.toDateTime(depLine[2] + '/' + 1970 + ' ' + depLine[3], "dd/MM/yyyy hh'h'mm", "fr"); + res.reservationFor.departureTime = parseDate(depLine[2], depLine[3]); var arrLine = text.substr(pos + index).match(/\n ([\w -]+?) +(\d{2}\/\d{2}) à (\d{2}h\d{2})/); if (!arrLine) break; index += arrLine.index + arrLine[0].length; res.reservationFor.arrivalStation = JsonLd.newObject("TrainStation"); res.reservationFor.arrivalStation.name = arrLine[1]; - // TODO year, see above - res.reservationFor.arrivalTime = JsonLd.toDateTime(arrLine[2] + '/' + 1970 + ' ' + arrLine[3], "dd/MM/yyyy hh'h'mm", "fr"); + res.reservationFor.arrivalTime = parseDate(arrLine[2], arrLine[3]); // parse seat, train number, etc from the text for one leg // since the stations are vertically centered, the stuff we are looking for might be at different // positions relative to them var legText = text.substring(pos + header.index + header[0].length, pos + index); var trainNumber = legText.match(/TRAIN N°(\d{3,4})/); if (trainNumber) res.reservationFor.trainNumber = trainNumber[1]; var seatRes = legText.match(/VOITURE (\d+) - PLACE (\d+)/); if (seatRes) { res.reservedTicket = JsonLd.newObject("Ticket"); res.reservedTicket.ticketedSeat = JsonLd.newObject("Seat"); res.reservedTicket.ticketedSeat.seatSection = seatRes[1]; res.reservedTicket.ticketedSeat.seatNumber = seatRes[2]; } reservations.push(res); if (index == 0) break; pos += index; } return reservations; } diff --git a/plugins/messageviewer/bodypartformatter/semantic/semanticprocessor.cpp b/plugins/messageviewer/bodypartformatter/semantic/semanticprocessor.cpp index 8de84bd6..c530d1c5 100644 --- a/plugins/messageviewer/bodypartformatter/semantic/semanticprocessor.cpp +++ b/plugins/messageviewer/bodypartformatter/semantic/semanticprocessor.cpp @@ -1,125 +1,126 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "semanticprocessor.h" #include "extractorengine.h" #include "extractorpreprocessor.h" #include "extractorpostprocessor.h" #include "jsonlddocument.h" #include "structureddataextractor.h" #include "semanticmemento.h" #include "semantic_debug.h" #include std::weak_ptr SemanticProcessor::s_repository; SemanticProcessor::SemanticProcessor() { m_repository = s_repository.lock(); if (!m_repository) { m_repository.reset(new ExtractorRepository); s_repository = m_repository; } } SemanticProcessor::~SemanticProcessor() = default; MimeTreeParser::MessagePart::Ptr SemanticProcessor::process(MimeTreeParser::Interface::BodyPart &part) const { auto nodeHelper = part.nodeHelper(); if (!nodeHelper) { return {}; } auto memento = dynamic_cast(nodeHelper->bodyPartMemento(part.topLevelContent(), "org.kde.messageviewer.semanticData")); if (!memento) { memento = new SemanticMemento; nodeHelper->setBodyPartMemento(part.topLevelContent(), "org.kde.messageviewer.semanticData", memento); } // check if we still have to do anything at all if (memento->hasStructuredData()) { return {}; } if (memento->isParsed(part.content()->index())) { return {}; } memento->setParsed(part.content()->index()); qCDebug(SEMANTIC_LOG) << "-------------------------------------------- BEGIN SEMANTIC PARSING"; qCDebug(SEMANTIC_LOG) << part.content()->contentType()->mimeType(); // look for structured data first, cheaper and better quality if (part.content()->contentType()->mimeType() == "text/html") { StructuredDataExtractor extractor; extractor.parse(part.content()->decodedText()); const auto data = extractor.data(); const auto decodedData = JsonLdDocument::fromJson(data); if (data.size() != decodedData.size()) { qCDebug(SEMANTIC_LOG).noquote() << "Unhandled content:" << QJsonDocument(data).toJson(); } if (!decodedData.isEmpty()) { memento->setData(decodedData); memento->setStructuredDataFound(true); qCDebug(SEMANTIC_LOG) << "Found structured data:" << decodedData; } } // try the unstructured data extractor as a fallback if (memento->isEmpty()) { const auto extractors = m_repository->extractorsForMessage(part.content()); if (extractors.empty()) { return {}; } qCDebug(SEMANTIC_LOG) << "Found unstructured extractor rules for message" << extractors.size(); ExtractorPreprocessor preproc; if (part.content()->contentType()->isPlainText()) { preproc.preprocessPlainText(part.content()->decodedText()); } else if (part.content()->contentType()->isHTMLText()) { preproc.preprocessHtml(part.content()->decodedText()); } else if (part.content()->contentType()->mimeType() == "application/pdf") { preproc.preprocessPdf(part.content()->decodedContent()); } else { return {}; } for (auto extractor : extractors) { ExtractorEngine engine; engine.setExtractor(extractor); + engine.setSenderDate(static_cast(part.content()->topLevel())->date()->dateTime()); engine.setText(preproc.text()); const auto data = engine.extract(); qCDebug(SEMANTIC_LOG).noquote() << QJsonDocument(data).toJson(); const auto decodedData = JsonLdDocument::fromJson(data); if (!decodedData.isEmpty()) { memento->setData(decodedData); break; } } } // postprocessor to filter incomplete/broken elements and merge duplicates ExtractorPostprocessor postproc; postproc.process(memento->data()); memento->setData(postproc.result()); qCDebug(SEMANTIC_LOG) << "-------------------------------------------- END SEMANTIC PARSING"; return {}; }