diff --git a/src/extractorengine.cpp b/src/extractorengine.cpp index a4b9155..e725be0 100644 --- a/src/extractorengine.cpp +++ b/src/extractorengine.cpp @@ -1,438 +1,505 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config-kitinerary.h" #include "extractorengine.h" #include "extractor.h" +#include "extractorrepository.h" #include "genericpdfextractor.h" #include "htmldocument.h" #include "jsonlddocument.h" #include "logging.h" #include "pdfdocument.h" #include "structureddataextractor.h" #include "jsapi/barcode.h" #include "jsapi/context.h" #include "jsapi/jsonld.h" #ifdef HAVE_KCAL #include #include #endif #include #include #include +#include + #include #include #include #include #include #include #include #include using namespace KItinerary; namespace KItinerary { class ExtractorEnginePrivate { public: void setupEngine(); + + void extractStructured(); + void extractCustom(); + void extractGeneric(); + void executeScript(const Extractor *extractor); void processScriptResult(const QJSValue &result); void extractPass(); void extractBoardingPass(QJsonObject &resFor); void extractEventTicketPass(QJsonObject &resFor); std::vector m_extractors; JsApi::Barcode *m_barcodeApi = nullptr; JsApi::Context *m_context = nullptr; JsApi::JsonLd *m_jsonLdApi = 
nullptr; QString m_text; HtmlDocument *m_htmlDoc = nullptr; PdfDocument *m_pdfDoc = nullptr; KPkPass::Pass *m_pass = nullptr; #ifdef HAVE_KCAL KCalCore::Calendar::Ptr m_calendar; #endif + KMime::Content *m_mimeContext = nullptr; GenericPdfExtractor m_genericPdfExtractor; QJsonArray m_result; QJSEngine m_engine; + ExtractorRepository m_repo; }; } void ExtractorEnginePrivate::setupEngine() { m_context = new JsApi::Context; // will be deleted by QJSEngine taking ownership m_engine.installExtensions(QJSEngine::ConsoleExtension); m_jsonLdApi = new JsApi::JsonLd(&m_engine); m_engine.globalObject().setProperty(QStringLiteral("JsonLd"), m_engine.newQObject(m_jsonLdApi)); m_barcodeApi = new JsApi::Barcode; m_engine.globalObject().setProperty(QStringLiteral("Barcode"), m_engine.newQObject(m_barcodeApi)); m_engine.globalObject().setProperty(QStringLiteral("Context"), m_engine.newQObject(m_context)); } ExtractorEngine::ExtractorEngine() : d(new ExtractorEnginePrivate) { d->setupEngine(); } ExtractorEngine::ExtractorEngine(ExtractorEngine &&) noexcept = default; ExtractorEngine::~ExtractorEngine() = default; void ExtractorEngine::clear() { d->m_text.clear(); d->m_pdfDoc = nullptr; d->m_htmlDoc = nullptr; d->m_pass = nullptr; #ifdef HAVE_KCAL d->m_calendar.reset(); #endif d->m_result = {}; + d->m_mimeContext = nullptr; d->m_context->m_senderDate = {}; } void ExtractorEngine::setExtractors(std::vector &&extractors) { d->m_extractors = extractors; } void ExtractorEngine::setText(const QString &text) { d->m_text = text; } void ExtractorEngine::setHtmlDocument(HtmlDocument *htmlDoc) { d->m_htmlDoc = htmlDoc; } void ExtractorEngine::setPdfDocument(PdfDocument *pdfDoc) { d->m_pdfDoc = pdfDoc; } void ExtractorEngine::setPass(KPkPass::Pass *pass) { d->m_pass = pass; } void ExtractorEngine::setCalendar(const QSharedPointer &calendar) { #ifdef HAVE_KCAL d->m_calendar = calendar; #else Q_UNUSED(calendar); #endif } -void ExtractorEngine::setSenderDate(const QDateTime &dt) +void 
ExtractorEngine::setContent(KMime::Content *content) +{ + // TODO for each type, create the corresponding document + setContext(content); +} + +void ExtractorEngine::setContext(KMime::Content *context) +{ + d->m_mimeContext = context; + auto dateHdr = context->header(); + while (!dateHdr && context->parent()) { + context = context->parent(); + dateHdr = context->header(); + } + if (dateHdr) { + setContextDate(dateHdr->dateTime()); + } +} + +void ExtractorEngine::setContextDate(const QDateTime &dt) { d->m_context->m_senderDate = dt; d->m_jsonLdApi->setContextDate(dt); d->m_barcodeApi->setContextDate(dt.date()); d->m_genericPdfExtractor.setContextDate(dt); } +void ExtractorEngine::setSenderDate(const QDateTime &dt) +{ + setContextDate(dt); +} + QJsonArray ExtractorEngine::extract() { - for (const auto extractor : d->m_extractors) { + // structured content + d->extractStructured(); + if (!d->m_result.isEmpty()) { + return d->m_result; + } + + // custom extractors + if (d->m_pass) { + d->m_extractors = d->m_repo.extractorsForPass(d->m_pass); + } else if (d->m_mimeContext) { + d->m_extractors = d->m_repo.extractorsForMessage(d->m_mimeContext); + } + d->extractCustom(); + + // generic extractors + d->extractGeneric(); + + // check if generic extractors identified documents we have custom extractors for + d->m_extractors = d->m_repo.extractorsForJsonLd(d->m_result); + d->extractCustom(); + + return d->m_result; +} + +void ExtractorEnginePrivate::extractStructured() +{ + if (m_htmlDoc) { + for (const auto &v : StructuredDataExtractor::extract(m_htmlDoc)) { + m_result.push_back(v); + } + } +} + +void ExtractorEnginePrivate::extractCustom() +{ + for (const auto extractor : m_extractors) { switch (extractor->type()) { case Extractor::Text: // running text extractors on PDF or HTML docs is possible, // but only extract the text when really needed - if (d->m_text.isEmpty() && d->m_pdfDoc) { - d->m_text = d->m_pdfDoc->text(); + if (m_text.isEmpty() && m_pdfDoc) { + m_text = 
m_pdfDoc->text(); } - if (d->m_text.isEmpty() && d->m_htmlDoc) { - d->m_text = d->m_htmlDoc->root().recursiveContent(); + if (m_text.isEmpty() && m_htmlDoc) { + m_text = m_htmlDoc->root().recursiveContent(); } - if (!d->m_text.isEmpty()) { - d->executeScript(extractor); + if (!m_text.isEmpty()) { + executeScript(extractor); } break; case Extractor::Html: - if (d->m_htmlDoc) { - if (extractor->scriptFileName().isEmpty()) { - for (const auto &v : StructuredDataExtractor::extract(d->m_htmlDoc)) { - d->m_result.push_back(v); - } - } - d->executeScript(extractor); + if (m_htmlDoc) { + executeScript(extractor); } break; case Extractor::Pdf: - if (d->m_pdfDoc) { - if (extractor->scriptFileName().isEmpty()) { - d->m_genericPdfExtractor.extract(d->m_pdfDoc, d->m_result); - } else { - d->executeScript(extractor); - } + if (m_pdfDoc) { + executeScript(extractor); } break; case Extractor::PkPass: - if (d->m_pass) { - d->executeScript(extractor); - d->extractPass(); + if (m_pass) { + executeScript(extractor); } break; case Extractor::ICal: #ifdef HAVE_KCAL - if (d->m_calendar) { - d->executeScript(extractor); + if (m_calendar) { + executeScript(extractor); } #endif break; } - if (!d->m_result.isEmpty()) { + if (!m_result.isEmpty()) { break; } } +} - return d->m_result; +void ExtractorEnginePrivate::extractGeneric() +{ + if (m_pass) { + extractPass(); + } else if (m_pdfDoc && m_result.isEmpty()) { + m_genericPdfExtractor.extract(m_pdfDoc, m_result); + } } void ExtractorEnginePrivate::executeScript(const Extractor *extractor) { Q_ASSERT(extractor); if (extractor->scriptFileName().isEmpty()) { return; } QFile f(extractor->scriptFileName()); if (!f.open(QFile::ReadOnly)) { qCWarning(Log) << "Failed to open extractor script" << f.fileName() << f.errorString(); return; } auto result = m_engine.evaluate(QString::fromUtf8(f.readAll()), f.fileName()); if (result.isError()) { qCWarning(Log) << "Script parsing error in" << result.property(QLatin1String("fileName")).toString() << ':' << 
result.property(QLatin1String("lineNumber")).toInt() << result.toString(); return; } auto mainFunc = m_engine.globalObject().property(extractor->scriptFunction()); if (!mainFunc.isCallable()) { qCWarning(Log) << "Script entry point not found!" << extractor->scriptFunction(); return; } QJSValueList args; switch (extractor->type()) { case Extractor::Text: args = {m_text}; break; case Extractor::Html: args = {m_engine.toScriptValue(m_htmlDoc)}; break; case Extractor::Pdf: args = {m_engine.toScriptValue(m_pdfDoc)}; break; case Extractor::PkPass: args = {m_engine.toScriptValue(m_pass)}; break; case Extractor::ICal: #ifdef HAVE_KCAL for (const auto &event : m_calendar->events()) { processScriptResult(mainFunc.call({m_engine.toScriptValue(*event.data())})); } #endif break; } if (!args.isEmpty()) { processScriptResult(mainFunc.call(args)); } } void ExtractorEnginePrivate::processScriptResult(const QJSValue &result) { if (result.isError()) { qCWarning(Log) << "Script execution error in" << result.property(QLatin1String("fileName")).toString() << ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString(); return; } if (result.isArray()) { QJSValueIterator it(result); while (it.hasNext()) { it.next(); if (it.value().isObject()) { m_result.push_back(QJsonValue::fromVariant(it.value().toVariant())); } } } else if (result.isObject()) { m_result.push_back(QJsonValue::fromVariant(result.toVariant())); } else { qCWarning(Log) << "Invalid result type from script"; } } void ExtractorEnginePrivate::extractPass() { if (m_result.size() > 1) { // a pkpass file contains exactly one boarding pass return; } if (m_result.isEmpty()) { // no script run, so we need to create the top-level element ourselves QJsonObject res; QJsonObject resFor; if (auto boardingPass = qobject_cast(m_pass)) { switch (boardingPass->transitType()) { case KPkPass::BoardingPass::Air: res.insert(QLatin1String("@type"), QLatin1String("FlightReservation")); resFor.insert(QLatin1String("@type"), 
QLatin1String("Flight")); break; // TODO expand once we have test files for train tickets default: return; } } else { switch (m_pass->type()) { case KPkPass::Pass::EventTicket: res.insert(QLatin1String("@type"), QLatin1String("EventReservation")); resFor.insert(QLatin1String("@type"), QLatin1String("Event")); break; default: return; } } res.insert(QLatin1String("reservationFor"), resFor); m_result.push_back(res); } // extract structured data from a pkpass, if the extractor script hasn't done so already auto res = m_result.at(0).toObject(); auto resFor = res.value(QLatin1String("reservationFor")).toObject(); switch (m_pass->type()) { case KPkPass::Pass::BoardingPass: extractBoardingPass(resFor); break; case KPkPass::Pass::EventTicket: extractEventTicketPass(resFor); break; default: return; } // barcode contains the ticket token if (!m_pass->barcodes().isEmpty() && !res.contains(QLatin1String("reservedTicket"))) { const auto barcode = m_pass->barcodes().at(0); QString token; switch (barcode.format()) { case KPkPass::Barcode::QR: token += QLatin1String("qrCode:"); break; case KPkPass::Barcode::Aztec: token += QLatin1String("aztecCode:"); break; default: break; } token += barcode.message(); QJsonObject ticket; ticket.insert(QLatin1String("@type"), QLatin1String("Ticket")); ticket.insert(QLatin1String("ticketToken"), token); res.insert(QLatin1String("reservedTicket"), ticket); } res.insert(QLatin1String("reservationFor"), resFor); // associate the pass with the result, so we can find the pass again for display if (!m_pass->passTypeIdentifier().isEmpty() && !m_pass->serialNumber().isEmpty()) { res.insert(QLatin1String("pkpassPassTypeIdentifier"), m_pass->passTypeIdentifier()); res.insert(QLatin1String("pkpassSerialNumber"), m_pass->serialNumber()); } m_result[0] = res; } void ExtractorEnginePrivate::extractBoardingPass(QJsonObject &resFor) { // "relevantDate" is the best guess for the boarding time if (m_pass->relevantDate().isValid() && 
!resFor.contains(QLatin1String("boardingTime"))) { resFor.insert(QLatin1String("boardingTime"), m_pass->relevantDate().toString(Qt::ISODate)); } // look for common field names containing the boarding time, if we still have no idea if (!resFor.contains(QLatin1String("boardingTime"))) { for (const auto &field : m_pass->fields()) { if (!field.key().contains(QLatin1String("boarding"), Qt::CaseInsensitive)) { continue; } const auto time = QTime::fromString(field.value().toString()); if (time.isValid()) { // this misses date, but the postprocessor will fill that in resFor.insert(QLatin1String("boardingTime"), QDateTime(QDate(1, 1, 1), time).toString(Qt::ISODate)); break; } } } // location is the best guess for the departure airport geo coordinates auto depAirport = resFor.value(QLatin1String("departureAirport")).toObject(); if (depAirport.isEmpty()) { depAirport.insert(QLatin1String("@type"), QLatin1String("Airport")); } auto depGeo = depAirport.value(QLatin1String("geo")).toObject(); if (m_pass->locations().size() == 1 && depGeo.isEmpty()) { const auto loc = m_pass->locations().at(0); depGeo.insert(QLatin1String("@type"), QLatin1String("GeoCoordinates")); depGeo.insert(QLatin1String("latitude"), loc.latitude()); depGeo.insert(QLatin1String("longitude"), loc.longitude()); depAirport.insert(QLatin1String("geo"), depGeo); resFor.insert(QLatin1String("departureAirport"), depAirport); } } void ExtractorEnginePrivate::extractEventTicketPass(QJsonObject &resFor) { if (!resFor.contains(QLatin1String("name"))) { resFor.insert(QLatin1String("name"), m_pass->description()); } // "relevantDate" is the best guess for the start time if (m_pass->relevantDate().isValid() && !resFor.contains(QLatin1String("startDate"))) { resFor.insert(QLatin1String("startDate"), m_pass->relevantDate().toString(Qt::ISODate)); } // location is the best guess for the venue auto venue = resFor.value(QLatin1String("location")).toObject(); if (venue.isEmpty()) { venue.insert(QLatin1String("@type"), 
QLatin1String("Place")); } auto geo = venue.value(QLatin1String("geo")).toObject(); if (!m_pass->locations().isEmpty() && geo.isEmpty()) { const auto loc = m_pass->locations().at(0); geo.insert(QLatin1String("@type"), QLatin1String("GeoCoordinates")); geo.insert(QLatin1String("latitude"), loc.latitude()); geo.insert(QLatin1String("longitude"), loc.longitude()); venue.insert(QLatin1String("geo"), geo); venue.insert(QLatin1String("name"), loc.relevantText()); resFor.insert(QLatin1String("location"), venue); } } diff --git a/src/extractorengine.h b/src/extractorengine.h index 3a16bf4..ea8dbc9 100644 --- a/src/extractorengine.h +++ b/src/extractorengine.h @@ -1,170 +1,195 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef EXTRACTORENGINE_H #define EXTRACTORENGINE_H #include "kitinerary_export.h" #include #include template class QSharedPointer; namespace KCalCore { class Calendar; } namespace KPkPass { class Pass; } +namespace KMime { +class Content; +} + class QDateTime; class QJsonArray; class QString; namespace KItinerary { class Extractor; class ExtractorEnginePrivate; class HtmlDocument; class PdfDocument; /** * Unstructured data extraction engine. 
* * This will apply the given Extractor instance to the given input data * (plain text, HTML text, PDF documents, etc.), and return the extracted * JSON-LD data. * * @section create_extractors Creating Extractors * * @subsection extractor_api Extractor API * * For adding custom extractors, two parts are needed: * - JSON meta-data describing the extractor and when to apply it, as described * in the Extractor documentation. * - An extractor JavaScript file, compatible with QJSEngine. * * The extractor script will have access to the API defined in the JsApi namespace: * - JsApi::Context: information about the input data being processed. * - JsApi::JsonLd: functions for generating JSON-LD data. * - JsApi::Barcode: barcode decoding functions. * * The entry point to the script is specified in the meta-data; its argument depends * on the extractor type: * - Plain text extractors are passed a string. * If input is HTML or PDF, the string will be the text of the document stripped * of all formatting etc. * - HTML extractors are passed a HtmlDocument instance allowing DOM-like access to * the document structure. * - PDF extractors are passed a PdfDocument instance allowing access to textual and * image content. * - Apple Wallet pass extractors are passed a KPkPass::Pass instance (a KPkPass::BoardingPass * for boarding passes). * - iCalendar event extractors are passed KCalCore::Event instances. * * These functions should return an object or an array of objects following the JSON-LD * format defined on schema.org. JsApi::JsonLd provides helper functions to build such * objects. If @c null or an empty array is returned, the next applicable extractor is * run. * * Returned objects are then passed through ExtractorPostprocessor which will normalize, * augment and validate the data. This can greatly simplify the extraction, as for example * the expansion of an IATA BCBP ticket token already fills most key properties of a flight * reservation automatically. 
* * @subsection extractor_tools Development Tools * * For interactive testing during development of new extractors, it is recommended to * link (or copy) the JSON meta-data and JavaScript code files to the search path for * Extractor meta-data. * * Additionally, there's an interactive testing and inspection tool called @c kitinerary-workbench * (see https://phabricator.kde.org/source/kitinerary-workbench/). * * @subsection extractor_testing Automated Testing * * There are a few unit tests for extractors in the kitinerary repository (see unstructureddataextractortest.cpp), * however the majority of real-world test data cannot be shared this way, due to privacy * and copyright issues (e.g. PDFs containing copyrighted vendor logos and user credit card details). * Therefore there is also support for testing against external data (see extractortest.cpp). * * External test data is assumed to be in a folder named @c kitinerary-tests next to the @c kitinerary * source folder. The test program searches this folder recursively for folders with the following content * and attempts to extract data from each test file in there. * * - @c context.eml: MIME message header data specifying the context in which the test data * was received. This typically only needs a @c From: and @c Date: line, but can even be * entirely empty (but existing) for structured data that does not need a custom extractor. * This context information is applied to all tests in this folder. * - @c .[txt|html|pdf|pkpass|ics]: The input test data. * - @c .json: The expected JSON-LD output. If this file doesn't * exist, it is created by the test program. * - @c .skip: If this file is present the corresponding test * is skipped. */ class KITINERARY_EXPORT ExtractorEngine { public: ExtractorEngine(); ~ExtractorEngine(); ExtractorEngine(ExtractorEngine &&) noexcept; ExtractorEngine(const ExtractorEngine &) = delete; /** Resets the internal state, call before processing new input data. 
*/ void clear(); /** Set the extractors to be run on the current data. */ - void setExtractors(std::vector &&extractors); + [[deprecated]] void setExtractors(std::vector &&extractors); /** The text to extract data from. * Only considered for text extractors. */ void setText(const QString &text); /** A HTML document to extract data from. * Only considered for HTML and text extractors. */ void setHtmlDocument(HtmlDocument *htmlDoc); /** A PDF document to extract data from. * Only considered for PDF or text extractors. */ void setPdfDocument(PdfDocument *pdfDoc); /** The pkpass boarding pass to extract data from. * Only considered for pkpass extractors. */ void setPass(KPkPass::Pass *pass); /** The iCalendar to extract data from. * Only considered for ical extractors. */ void setCalendar(const QSharedPointer &calendar); + /** A MIME part to extract from. + * This is assumed to contain one of the supported mime types. + * @p content is also set as extraction context (see setContext). + */ + void setContent(KMime::Content *content); + + /** Sets the MIME part the document we try to extract comes from. + * Use this for documents received by email, to provide additional + * hints for the extraction. + * Calling this method is not necessary when using setContent, + * only when using any of the other content setter methods directly. + */ + void setContext(KMime::Content *context); + + /** Set the date the extracted document has been issued at. + * This does not need to be perfectly accurate and is used to + * complete incomplete date information in the document (typically + * a missing year). + * This method does not need to be called when setContext is used. + */ + void setContextDate(const QDateTime &dt); /** The date the email containing the processed text was sent. 
*/ - void setSenderDate(const QDateTime &dt); + [[deprecated("Use setContextDate")]] void setSenderDate(const QDateTime &dt); /** Perform the actual extraction, and return the JSON-LD data * that has been found. */ QJsonArray extract(); private: std::unique_ptr d; }; } #endif // EXTRACTORENGINE_H