diff --git a/src/extractorengine.cpp b/src/extractorengine.cpp index 5de0a00..b07302f 100644 --- a/src/extractorengine.cpp +++ b/src/extractorengine.cpp @@ -1,666 +1,674 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config-kitinerary.h" #include "barcodedecoder.h" #include "extractorengine.h" #include "extractor.h" #include "extractorinput.h" #include "extractorrepository.h" #include "genericpdfextractor_p.h" #include "genericpkpassextractor_p.h" #include "genericuic918extractor_p.h" #include "htmldocument.h" #include "iatabcbpparser.h" #include "jsonlddocument.h" #include "logging.h" #include "pdf/pdfdocument.h" #include "structureddataextractor_p.h" #include "uic9183/uic9183parser.h" #include "jsapi/barcode.h" #include "jsapi/context.h" #include "jsapi/jsonld.h" #ifdef HAVE_KCAL #include #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace KItinerary; namespace KItinerary { class ExtractorEnginePrivate { public: void setupEngine(); void resetContent(); void openDocument(); bool shouldExtractExternally() const; void extractExternal(); void extractRecursive(KMime::Content *content); void extractDocument(); void extractStructured(); 
void extractCustom();
    void extractGeneric();
    void executeScript(const Extractor &extractor);
    void processScriptResult(const QJSValue &result);

    // NOTE(review): the template argument lists of the std::vector/std::unique_ptr
    // members below appear to have been lost in this copy of the patch - confirm
    // against the actual source file.

    // back-pointer to the public object, set by the ExtractorEngine constructor
    ExtractorEngine *q = nullptr;
    // extractors considered for the current extraction run
    std::vector m_extractors;
    // extractors handed in via ExtractorEngine::setAdditionalExtractors()
    std::vector m_additionalExtractors;
    // script API objects, created and registered with m_engine in setupEngine()
    JsApi::Barcode *m_barcodeApi = nullptr;
    JsApi::Context *m_context = nullptr;
    JsApi::JsonLd *m_jsonLdApi = nullptr;

    // input in its various forms; m_data/m_inputType hold raw data until openDocument() has run
    QString m_text;
    QByteArray m_data;
    ExtractorInput::Type m_inputType = ExtractorInput::Unknown;
    std::unique_ptr> m_htmlDoc;
    std::unique_ptr> m_pdfDoc;
    std::unique_ptr> m_pass;
#ifdef HAVE_KCAL
    KCalendarCore::Calendar::Ptr m_calendar;
#endif
    // multipart MIME node to recurse into, set by ExtractorEngine::setContent()
    KMime::Content *m_mimeContent = nullptr;
    // MIME node providing the extraction context, set by ExtractorEngine::setContext()
    KMime::Content *m_mimeContext = nullptr;
    // message created by openDocument() for ExtractorInput::Email input
    std::unique_ptr m_ownedMimeContent;

    GenericPdfExtractor m_genericPdfExtractor;
    // accumulated extraction result
    QJsonArray m_result;
    QJSEngine m_engine;
    ExtractorRepository m_repo;
    BarcodeDecoder m_barcodeDecoder;
    // path of the external extractor executable, empty if out-of-process extraction is off
    QString m_externalExtractor;
+    QString m_usedExtractor; // name of the extractor recorded by extractCustom()
};

// Wraps @p ptr with a deleter that deletes it.
template
static std::unique_ptr> make_owning_ptr(T *ptr)
{
    return std::unique_ptr>(ptr, [](T *ptr){ delete ptr; });
}

// Wraps @p ptr with a deleter that does nothing, the pointee stays with the caller.
template
static std::unique_ptr> make_nonowning_ptr(T *ptr)
{
    return std::unique_ptr>(ptr, [](T*){});
}

}

// Creates the script API objects and exposes them to extractor scripts as the
// global objects "JsonLd", "Barcode" and "Context".
void ExtractorEnginePrivate::setupEngine()
{
    m_context = new JsApi::Context;
    // will be deleted by QJSEngine taking ownership
    m_engine.installExtensions(QJSEngine::ConsoleExtension);
    m_jsonLdApi = new JsApi::JsonLd(&m_engine);
    m_engine.globalObject().setProperty(QStringLiteral("JsonLd"), m_engine.newQObject(m_jsonLdApi));
    m_barcodeApi = new JsApi::Barcode;
    m_barcodeApi->setDecoder(&m_barcodeDecoder);
    m_engine.globalObject().setProperty(QStringLiteral("Barcode"), m_engine.newQObject(m_barcodeApi));
    m_engine.globalObject().setProperty(QStringLiteral("Context"), m_engine.newQObject(m_context));
}

// External extraction needs a configured extractor executable, not yet opened
// raw data, and is only done for PDF input.
bool ExtractorEnginePrivate::shouldExtractExternally() const
{
    return !m_externalExtractor.isEmpty() && !m_data.isEmpty() && m_inputType == ExtractorInput::Pdf;
}

// Runs the external extractor executable on m_data and appends the JSON array
// it prints to m_result.
void ExtractorEnginePrivate::extractExternal()
{
    m_extractors.clear();
    if (m_mimeContext) {
m_extractors = m_repo.extractorsForMessage(m_mimeContext); } QStringList extNames; extNames.reserve(m_extractors.size()); std::transform(m_extractors.begin(), m_extractors.end(), std::back_inserter(extNames), [](const auto &ext) { return ext.name(); }); QProcess proc; proc.setProgram(m_externalExtractor); proc.setArguments({QLatin1String("--type"), ExtractorInput::typeToString(m_inputType), QLatin1String("--context-date"), m_context->m_senderDate.toString(Qt::ISODate), QLatin1String("--extractors"), extNames.join(QLatin1Char(';'))}); proc.start(QProcess::ReadWrite); proc.setProcessChannelMode(QProcess::ForwardedErrorChannel); if (!proc.waitForStarted(1000)) { qCWarning(Log) << "could not start external extractor" << m_externalExtractor << proc.errorString(); return; } proc.write(m_data); proc.closeWriteChannel(); if (!proc.waitForFinished(15000)) { qCWarning(Log) << "external extractor did not exit cleanly" << m_externalExtractor << proc.errorString(); return; } const auto res = QJsonDocument::fromJson(proc.readAllStandardOutput()).array(); std::copy(res.begin(), res.end(), std::back_inserter(m_result)); } ExtractorEngine::ExtractorEngine() : d(new ExtractorEnginePrivate) { d->q = this; d->m_genericPdfExtractor.setBarcodeDecoder(&d->m_barcodeDecoder); d->setupEngine(); } ExtractorEngine::ExtractorEngine(ExtractorEngine &&) noexcept = default; ExtractorEngine::~ExtractorEngine() = default; void ExtractorEngine::clear() { d->resetContent(); d->m_result = {}; d->m_mimeContext = nullptr; d->m_context->m_senderDate = {}; d->m_ownedMimeContent.reset(); d->m_barcodeDecoder.clearCache(); + d->m_usedExtractor.clear(); } void ExtractorEnginePrivate::resetContent() { m_text.clear(); m_data.clear(); m_inputType = ExtractorInput::Unknown; m_pdfDoc.reset(); m_htmlDoc.reset(); m_pass.reset(); #ifdef HAVE_KCAL m_calendar.reset(); #endif m_mimeContent = nullptr; } void ExtractorEngine::setText(const QString &text) { d->m_text = text; } void 
ExtractorEngine::setHtmlDocument(HtmlDocument *htmlDoc)
{
    // non-owning: the deleter created by make_nonowning_ptr() does nothing
    d->m_htmlDoc = make_nonowning_ptr(htmlDoc);
}

void ExtractorEngine::setPdfDocument(PdfDocument *pdfDoc)
{
    d->m_pdfDoc = make_nonowning_ptr(pdfDoc);
}

void ExtractorEngine::setPass(KPkPass::Pass *pass)
{
    d->m_pass = make_nonowning_ptr(pass);
}

// Without HAVE_KCAL the calendar is ignored.
void ExtractorEngine::setCalendar(const QSharedPointer &calendar)
{
#ifdef HAVE_KCAL
    d->m_calendar = calendar;
#else
    Q_UNUSED(calendar);
#endif
}

// Determines the input type from the file name and, if that yields
// ExtractorInput::Unknown, from the data itself.
void ExtractorEngine::setData(const QByteArray &data, const QString &fileName)
{
    // let's not even try to parse anything with implausible size
    if (data.size() <= 4 || data.size() > 4000000) {
        return;
    }

    const auto nameType = ExtractorInput::typeFromFileName(fileName);
    const auto contentType = ExtractorInput::typeFromContent(data);
    setData(data, nameType == ExtractorInput::Unknown ? contentType : nameType);
}

// Only stores data and type, parsing is deferred to openDocument().
void ExtractorEngine::setData(const QByteArray &data, ExtractorInput::Type type)
{
    // let's not even try to parse anything with implausible size
    if (data.size() <= 4 || data.size() > 4000000) {
        return;
    }

    d->m_data = data;
    d->m_inputType = type;
}

// Turns the raw data in m_data into the document object matching m_inputType.
// Does nothing if there is no pending raw data.
void ExtractorEnginePrivate::openDocument()
{
    if (m_data.isEmpty()) {
        return;
    }

    switch (m_inputType) {
        case ExtractorInput::PkPass:
            m_pass = make_owning_ptr(KPkPass::Pass::fromData(m_data));
            m_data.clear();
            break;
        case ExtractorInput::Pdf:
            m_pdfDoc = make_owning_ptr(PdfDocument::fromData(m_data));
            m_data.clear();
            break;
        case ExtractorInput::Html:
            m_htmlDoc = make_owning_ptr(HtmlDocument::fromData(m_data));
            m_data.clear();
            break;
        case ExtractorInput::ICal:
        {
#ifdef HAVE_KCAL
            m_calendar.reset(new KCalendarCore::MemoryCalendar(QTimeZone()));
            KCalendarCore::ICalFormat format;
            if (format.fromRawString(m_calendar, m_data)) {
                m_calendar->setProductId(format.loadedProductId());
                // NOTE(review): unlike all other cases, m_data is not cleared on this path
                break;
            }
            qCDebug(Log) << "Failed to parse iCal content.";
            m_calendar.reset();
#else
            qCDebug(Log) << "Trying to exctract ical file, but ical support is not enabled.";
#endif
            m_data.clear();
            break;
        }
        case
ExtractorInput::Text:
            m_text = QString::fromUtf8(m_data);
            m_data.clear();
            break;
        case ExtractorInput::Email:
            // parse the raw message and feed it back in through setContent(),
            // m_ownedMimeContent keeps the parsed message alive
            m_ownedMimeContent.reset(new KMime::Message);
            m_ownedMimeContent->setContent(KMime::CRLFtoLF(m_data));
            m_ownedMimeContent->parse();
            m_data.clear();
            q->setContent(m_ownedMimeContent.get());
            break;
        case ExtractorInput::JsonLd:
        {
            // pass through JSON data, so the using code can apply post-processing to that
            const auto doc = QJsonDocument::fromJson(m_data);
            if (doc.isObject()) {
                m_result.push_back(doc.object());
            } else if (doc.isArray()) {
                m_result = doc.array();
            }
            m_data.clear();
            break;
        }
        default:
            break;
    }
}

// Uses @p content as context and picks the input based on its MIME type and
// on the file name found in the content type or content disposition header.
void ExtractorEngine::setContent(KMime::Content *content)
{
    setContext(content);

    auto mtType = ExtractorInput::Unknown; // type according to the MIME type
    auto fnType = ExtractorInput::Unknown; // type according to the file name
    const auto ct = content->contentType(false);
    if (ct) {
        mtType = ExtractorInput::typeFromMimeType(QString::fromLatin1(ct->mimeType()));
        fnType = ExtractorInput::typeFromFileName(ct->name());
    }
    // the content disposition file name is only consulted if the content type name gave nothing
    const auto cd = content->contentDisposition(false);
    if (fnType == ExtractorInput::Unknown && cd) {
        fnType = ExtractorInput::typeFromFileName(cd->filename());
    }

    // HTML and plain text are selected by MIME type only, the other types also by file name
    if (mtType == ExtractorInput::PkPass || fnType == ExtractorInput::PkPass) {
        setData(content->decodedContent(), ExtractorInput::PkPass);
    } else if (mtType == ExtractorInput::ICal || fnType == ExtractorInput::ICal) {
        setData(content->decodedContent(), ExtractorInput::ICal);
    } else if (mtType == ExtractorInput::Pdf || fnType == ExtractorInput::Pdf) {
        setData(content->decodedContent(), ExtractorInput::Pdf);
    } else if (mtType == ExtractorInput::Html) {
        setData(content->decodedContent(), ExtractorInput::Html);
    } else if ( (mtType == ExtractorInput::Text) || (!ct && content->isTopLevel())) {
        d->m_text = content->decodedText();
    }

    // multipart nodes are remembered so that extractDocument() recurses into their children
    d->m_mimeContent = (ct && ct->isMultipart()) ?
content : nullptr;
}

// Stores @p context and derives the context date from the first date header
// found on @p context or one of its ancestors; resets the context date if
// there is none.
void ExtractorEngine::setContext(KMime::Content *context)
{
    d->m_mimeContext = context;
    if (context) {
        // NOTE(review): header() appears to have lost its template argument in this copy of the patch
        auto dateHdr = context->header();
        while (!dateHdr && context->parent()) {
            context = context->parent();
            dateHdr = context->header();
        }
        if (dateHdr) {
            setContextDate(dateHdr->dateTime());
            return;
        }
    }

    setContextDate({});
}

// Propagates the context date to the script APIs and the generic PDF extractor.
void ExtractorEngine::setContextDate(const QDateTime &dt)
{
    d->m_context->m_senderDate = dt;
    d->m_jsonLdApi->setContextDate(dt);
    d->m_barcodeApi->setContextDate(dt);
    d->m_genericPdfExtractor.setContextDate(dt);
}

QJsonArray ExtractorEngine::extract()
{
    d->extractDocument();
    return d->m_result;
}

// Extracts every child of @p content separately and collects all results.
void ExtractorEnginePrivate::extractRecursive(KMime::Content *content)
{
    QJsonArray aggregatedResult;
    const auto children = content->contents();
    for (const auto child : children) {
        resetContent();
        q->setContent(child);
        extractDocument();
        // the extractor takes early exits if data has been found, so make it look like that isn't the case
        std::copy(m_result.begin(), m_result.end(), std::back_inserter(aggregatedResult));
        m_result = {};
    }
    m_result = std::move(aggregatedResult);
}

// Extraction pipeline for the current input: structured data first, then
// custom extractors, then generic extractors.
void ExtractorEnginePrivate::extractDocument()
{
    // recurse into email MIME nodes if needed
    if (m_inputType == ExtractorInput::Email) {
        openDocument();
    }
    if (m_mimeContent) {
        extractRecursive(m_mimeContent);
        return;
    }

    // has to be checked before openDocument(), as that consumes the raw data
    if (shouldExtractExternally()) {
        extractExternal();
        return;
    }
    openDocument();

    // structured content
    extractStructured();
    if (!m_result.isEmpty()) {
        return;
    }

    // custom extractors
    m_extractors.clear();
    if (m_pass) {
        m_extractors = m_repo.extractorsForPass(m_pass.get());
#ifdef HAVE_KCAL
    } else if (m_calendar) {
        m_extractors = m_repo.extractorsForCalendar(m_calendar);
#endif
    }
    // nothing selected by the document itself: fall back to the context message,
    // or to the explicitly provided extractors if there is no context
    if (m_extractors.empty()) {
        if (m_mimeContext) {
            m_extractors = m_repo.extractorsForMessage(m_mimeContext);
        } else {
            m_extractors = std::move(m_additionalExtractors);
        }
    }
    extractCustom();

    // generic extractors
    extractGeneric();
}

// Looks for structured annotations in the HTML document, if there is one.
void ExtractorEnginePrivate::extractStructured()
{
if (m_htmlDoc) {
        qCDebug(Log) << "Looking for structured annotations...";
        const auto res = StructuredDataExtractor::extract(m_htmlDoc.get());
        std::copy(res.begin(), res.end(), std::back_inserter(m_result));
    }
}

// Runs the extractors in m_extractors in order, each only if the input it
// needs is available, and stops as soon as m_result is not empty.
void ExtractorEnginePrivate::extractCustom()
{
    for (const auto &extractor : m_extractors) {
        switch (extractor.type()) {
            case ExtractorInput::Text:
                // running text extractors on PDF or HTML docs is possible,
                // but only extract the text when really needed
                if (m_text.isEmpty() && m_pdfDoc) {
                    m_text = m_pdfDoc->text();
                }
                if (m_text.isEmpty() && m_htmlDoc) {
                    m_text = m_htmlDoc->root().recursiveContent();
                }
                if (m_text.isEmpty() && !m_data.isEmpty()) {
                    m_text = QString::fromUtf8(m_data);
                }

                if (!m_text.isEmpty()) {
                    executeScript(extractor);
                }
                break;
            case ExtractorInput::Html:
                if (m_htmlDoc) {
                    executeScript(extractor);
                }
                break;
            case ExtractorInput::Pdf:
                if (m_pdfDoc) {
                    executeScript(extractor);
                }
                break;
            case ExtractorInput::PkPass:
                if (m_pass) {
                    executeScript(extractor);
                }
                break;
            case ExtractorInput::ICal:
#ifdef HAVE_KCAL
                if (m_calendar) {
                    executeScript(extractor);
                }
#endif
                break;
            default:
                qCWarning(Log) << "Unexpected extractor type:" << extractor.type();
                break;
        }

        // NOTE(review): this also triggers if m_result was already non-empty before
        // this call; the recorded name is then the first extractor of the list.
        if (!m_result.isEmpty()) {
+            m_usedExtractor = extractor.name();
            break;
        }
    }
}

// Generic extraction, selected by whichever kind of input is available.
void ExtractorEnginePrivate::extractGeneric()
{
    if (m_pass) {
        if (m_result.size() > 1) { // a pkpass file contains exactly one boarding pass
            return;
        }
        // the generic pass extractor works on top of an existing (possibly empty) result object
        if (m_result.isEmpty()) {
            m_result.push_back(QJsonObject());
        }

        auto res = m_result.at(0).toObject();
        res = GenericPkPassExtractor::extract(m_pass.get(), res, m_context->m_senderDate);
        m_result[0] = res;
    } else if (m_pdfDoc && m_result.isEmpty()) {
        QJsonArray genericResult;
        m_genericPdfExtractor.extract(m_pdfDoc.get(), genericResult);

        // check if generic extractors identified documents we have custom extractors for
        m_extractors = m_repo.extractorsForJsonLd(genericResult);
        extractCustom();

        // check the unrecognized (vendor-specific) barcodes, if any
        const auto unrecognizedCodes =
m_genericPdfExtractor.unrecognizedBarcodes();
        for (const auto &code : unrecognizedCodes) {
            m_extractors = m_repo.extractorsForBarcode(code);
            extractCustom();
        }

        // if none of that found something, take the generic extractor result as-is
        if (m_result.isEmpty()) {
            m_result = genericResult;
        }
    } else if (!m_text.isEmpty() && m_result.isEmpty()) {
        if (IataBcbpParser::maybeIataBcbp(m_text)) {
            const auto res = IataBcbpParser::parse(m_text, m_context->m_senderDate.date());
            m_result = JsonLdDocument::toJson(res);
        }
    } else if (!m_data.isEmpty() && m_result.isEmpty()) {
        if (Uic9183Parser::maybeUic9183(m_data)) {
            GenericUic918Extractor::extract(m_data, m_result, m_context->m_senderDate);
            return;
        }

        // try again as text
        // (m_text is not empty after this, so the recursive call takes the text branch above,
        // unless the conversion result is empty)
        m_text = QString::fromUtf8(m_data);
        extractGeneric();
    }
}

// Loads and evaluates the script of @p extractor, calls its entry point with
// the input matching the extractor type and hands the return value to
// processScriptResult(). Failures are logged and otherwise ignored.
void ExtractorEnginePrivate::executeScript(const Extractor &extractor)
{
    if (extractor.scriptFileName().isEmpty()) {
        return;
    }

    QFile f(extractor.scriptFileName());
    if (!f.open(QFile::ReadOnly)) {
        qCWarning(Log) << "Failed to open extractor script" << f.fileName() << f.errorString();
        return;
    }

    auto result = m_engine.evaluate(QString::fromUtf8(f.readAll()), f.fileName());
    if (result.isError()) {
        qCWarning(Log) << "Script parsing error in" << result.property(QStringLiteral("fileName")).toString()
                       << ':' << result.property(QStringLiteral("lineNumber")).toInt() << result.toString();
        return;
    }

    // the entry point is looked up as a property of the global object
    auto mainFunc = m_engine.globalObject().property(extractor.scriptFunction());
    if (!mainFunc.isCallable()) {
        qCWarning(Log) << "Script entry point not found!"
<< extractor.scriptFunction(); return; } qCDebug(Log) << "Running custom extractor" << extractor.scriptFileName() << extractor.scriptFunction(); QJSValueList args; switch (extractor.type()) { case ExtractorInput::Text: args = {m_text}; break; case ExtractorInput::Html: args = {m_engine.toScriptValue(m_htmlDoc.get())}; break; case ExtractorInput::Pdf: args = {m_engine.toScriptValue(m_pdfDoc.get())}; break; case ExtractorInput::PkPass: args = {m_engine.toScriptValue(m_pass.get())}; break; case ExtractorInput::ICal: { #ifdef HAVE_KCAL const auto events = m_calendar->events(); for (const auto &event : events) { processScriptResult(mainFunc.call({m_engine.toScriptValue(*event.data())})); } #endif break; } default: qCWarning(Log) << "Unexpected extractor input type:" << extractor.type(); break; } if (!args.isEmpty()) { processScriptResult(mainFunc.call(args)); } } void ExtractorEnginePrivate::processScriptResult(const QJSValue &result) { if (result.isError()) { qCWarning(Log) << "Script execution error in" << result.property(QStringLiteral("fileName")).toString() << ':' << result.property(QStringLiteral("lineNumber")).toInt() << result.toString(); return; } if (result.isArray()) { QJSValueIterator it(result); while (it.hasNext()) { it.next(); if (it.value().isObject()) { m_result.push_back(QJsonValue::fromVariant(it.value().toVariant())); } } } else if (result.isObject()) { m_result.push_back(QJsonValue::fromVariant(result.toVariant())); } else { qCWarning(Log) << "Invalid result type from script"; } } void ExtractorEngine::setUseSeparateProcess(bool separateProcess) { if (!separateProcess) { d->m_externalExtractor.clear(); return; } // find external extractor QFileInfo fi(QLatin1String(CMAKE_INSTALL_FULL_LIBEXECDIR_KF5) + QLatin1String("/kitinerary-extractor")); if (!fi.exists() && !fi.isFile() && !fi.isExecutable()) { qCCritical(Log) << "Cannot find external extractor:" << fi.fileName(); return; } d->m_externalExtractor = fi.canonicalFilePath(); } void 
ExtractorEngine::setAdditionalExtractors(std::vector &&extractors)
{
    // taken over by extractDocument() when no other extractors have been selected
    // and there is no context message
    d->m_additionalExtractors = std::move(extractors);
}
+
+QString ExtractorEngine::usedCustomExtractor() const
+{
+    return d->m_usedExtractor; // recorded by extractCustom(), reset by clear()
+}
diff --git a/src/extractorengine.h b/src/extractorengine.h
index 64b20d4..4055e86 100644
--- a/src/extractorengine.h
+++ b/src/extractorengine.h
@@ -1,219 +1,225 @@
/*
   Copyright (c) 2017 Volker Krause

   This library is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published by
   the Free Software Foundation; either version 2 of the License, or (at your
   option) any later version.

   This library is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
   License for more details.

   You should have received a copy of the GNU Library General Public License
   along with this library; see the file COPYING.LIB. If not, write to the
   Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.
*/

#ifndef EXTRACTORENGINE_H
#define EXTRACTORENGINE_H

#include "kitinerary_export.h"
#include "extractorinput.h"

#include
#include
#include

template class QSharedPointer;

namespace KCalendarCore {
class Calendar;
}
namespace KPkPass {
class Pass;
}
namespace KMime {
class Content;
}
class QByteArray;
class QDateTime;
class QJsonArray;

namespace KItinerary {

class Extractor;
class ExtractorEnginePrivate;
class HtmlDocument;
class PdfDocument;

/**
 * Unstructured data extraction engine.
 *
 * This will apply the given Extractor instance to the given input data
 * (plain text, HTML text, PDF documents, etc), and return the extracted
 * JSON-LD data.
* * @section create_extractors Creating Extractors * * @subsection extractor_api Extractor API * * For adding custom extractors, two parts are needed: * - JSON meta-data describing the extractor and when to apply it, as described * in the Extractor documentation. * - An extractor JavaScript file, compatible with QJSEngine. * * The extractor script will have access to API defined in the JsApi namespace: * - JsApi::Context: information about the input data being processed. * - JsApi::JsonLd: functions for generating JSON-LD data. * - JsApi::Barcode: barcode decoding functions. * * The entry point to the script is specified in the meta-data, its argument depends * on the extractor type: * - Plain text extractors are passed a string. * If input is HTML or PDF, the string will be the text of the document stripped * of all formatting etc. * - HTML extractors are passed a HtmlDocument instance allowing DOM-like access to * the document structure. * - PDF extractors are passed a PdfDocument instance allowing access to textual and * image content. * - Apple Wallet pass extractors are passed a KPkPass::BoardingPass instance. * - iCalendar event extractors are passed KCalendarCore::Event instances. * * These functions should return an object or an array of objects following the JSON-LD * format defined on schema.org. JsApi::JsonLd provides helper functions to build such * objects. If @c null or an empty array is returned, the next applicable extractor is * run. * * Returned objects are then passed through ExtractorPostprocessor which will normalize, * augment and validate the data. This can greatly simplify the extraction, as for example * the expansion of an IATA BCBP ticket token already fills most key properties of a flight * reservation automatically. 
*
 * @subsection extractor_tools Development Tools
 *
 * For interactive testing during development of new extractors, it is recommended to
 * link (or copy) the JSON meta data and JavaScript code files to the search path for
 * Extractor meta data.
 *
 * Additionally, there's an interactive testing and inspection tool called @c kitinerary-workbench
 * (see https://phabricator.kde.org/source/kitinerary-workbench/).
 *
 * @subsection extractor_testing Automated Testing
 *
 * There are a few unit tests for extractors in the kitinerary repository (see autotests/extractordata),
 * however the majority of real-world test data cannot be shared this way, due to privacy
 * and copyright issues (e.g. PDFs containing copyrighted vendor logos and user credit card details).
 * Therefore there is also support for testing against external data (see extractortest.cpp).
 *
 * External test data is assumed to be in a folder named @c kitinerary-tests next to the @c kitinerary
 * source folder. The test program searches this folder recursively for folders with the following content
 * and attempts to extract data from each test file in there.
 *
 * - @c context.eml: MIME message header data specifying the context in which the test data
 *   was received. This typically only needs a @c From: and @c Date: line, but can even be
 *   entirely empty (or non-existing) for structured data that does not need a custom extractor.
 *   This context information is applied to all tests in this folder.
 * - @c \<testname\>.[txt|html|pdf|pkpass|ics|eml|mbox]: The input test data.
 * - @c \<testname.extension\>.json: The expected JSON-LD output. If this file doesn't
 *   exist it is created by the test program.
 * - @c \<testname.extension\>.skip: If this file is present the corresponding test
 *   is skipped.
 */
class KITINERARY_EXPORT ExtractorEngine
{
public:
    ExtractorEngine();
    ~ExtractorEngine();
    /** Engines can be moved, but not copied. */
    ExtractorEngine(ExtractorEngine &&) noexcept;
    ExtractorEngine(const ExtractorEngine &) = delete;

    /** Resets the internal state, call before processing new input data. */
    void clear();

    /** The text to extract data from.
     *  Only considered for text extractors.
     */
    void setText(const QString &text);
    /** A HTML document to extract data from.
     *  Only considered for HTML and text extractors.
     */
    void setHtmlDocument(HtmlDocument *htmlDoc);
    /** A PDF document to extract data from.
     *  Only considered for PDF or text extractors.
     */
    void setPdfDocument(PdfDocument *pdfDoc);
    /** The pkpass boarding pass to extract data from.
     *  Only considered for pkpass extractors.
     */
    void setPass(KPkPass::Pass *pass);
    /** The iCalendar to extract data from.
     *  Only considered for ical extractors.
     */
    void setCalendar(const QSharedPointer &calendar);
    /** A MIME part to extract from.
     *  This is assumed to contain one of the supported mime types.
     *  @p content is also set as extraction context (see setContext).
     */
    void setContent(KMime::Content *content);
    /** Any kind of data to extract from.
     *  ExtractorEngine tries to auto-detect what type of data this is
     *  and pick one of the above methods accordingly.
     *  Avoid using this if you know exactly what data you have.
     *  @param fileName Used as a hint to determine the type, optional.
     */
    void setData(const QByteArray &data, const QString &fileName = {});
    /** Raw data to extract, but with a known type.
     *  No content type detection is performed here, you should be sure about @p type.
     */
    void setData(const QByteArray &data, ExtractorInput::Type type);

    /** Sets the MIME part the document we try to extract comes from.
     *  Use this for documents received by email, to provide additional
     *  hints for the extraction.
     *  Calling this method is not necessary when using setContent,
     *  only when using any of the other content setter methods directly.
     */
    void setContext(KMime::Content *context);

    /** Set the date the extracted document has been issued at.
     *  This does not need to be perfectly accurate and is used to
     *  complete incomplete date information in the document (typically
     *  a missing year).
     *  This method does not need to be called when setContext is used.
     */
    void setContextDate(const QDateTime &dt);

    /** Perform extraction of "risky" content such as PDF files in a separate process.
     *  This is safer as it isolates the using application from crashes/hangs due to corrupt files.
     *  It is however slower, and not available on all platforms.
     *  This is off by default.
     */
    void setUseSeparateProcess(bool separateProcess);

    /** Sets additional extractors to run on the given data.
     *  Extractors are usually automatically selected, this is therefore most likely not needed to
     *  be called manually. This mainly exists for the external extractor process.
     */
    void setAdditionalExtractors(std::vector &&extractors);

    /** Perform the actual extraction, and return the JSON-LD data
     *  that has been found.
     */
    QJsonArray extract();

+    /** Returns the extractor id used to obtain the result.
+     *  Can be empty if generic extractors have been used.
+     *  Not supposed to be used for normal operations, this is only needed for tooling.
+     */
+    QString usedCustomExtractor() const;
+
private:
    // private implementation, see ExtractorEnginePrivate in extractorengine.cpp
    std::unique_ptr d;
};

}

#endif // EXTRACTORENGINE_H