diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8db7fbf..48578a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,154 +1,155 @@ set(KDE_INSTALL_INCLUDEDIR_PIM ${KDE_INSTALL_INCLUDEDIR}/KPim) add_subdirectory(knowledgedb-generator) configure_file(config-kitinerary.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config-kitinerary.h) set(kitinerary_lib_srcs datatypes/action.cpp datatypes/brand.cpp datatypes/bustrip.cpp datatypes/event.cpp datatypes/flight.cpp datatypes/organization.cpp datatypes/person.cpp datatypes/place.cpp datatypes/reservation.cpp datatypes/taxi.cpp datatypes/ticket.cpp datatypes/traintrip.cpp datatypes/rentalcar.cpp datatypes/visit.cpp jsapi/barcode.cpp jsapi/context.cpp jsapi/jsonld.cpp knowledgedb/airportdb.cpp knowledgedb/countrydb.cpp knowledgedb/knowledgedb.cpp knowledgedb/timezonedb.cpp knowledgedb/trainstationdb.cpp barcodedecoder.cpp calendarhandler.cpp extractor.cpp extractorengine.cpp extractorfilter.cpp extractorpostprocessor.cpp extractorrepository.cpp + genericpdfextractor.cpp htmldocument.cpp iatabcbpparser.cpp jsonlddocument.cpp jsonldimportfilter.cpp mergeutil.cpp pdfdocument.cpp sortutil.cpp stringutil.cpp structureddataextractor.cpp uic9183parser.cpp ) qt5_add_resources(kitinerary_lib_srcs extractors/extractors.qrc) ecm_qt_declare_logging_category(kitinerary_lib_srcs HEADER logging.h IDENTIFIER KItinerary::Log CATEGORY_NAME org.kde.kitinerary) kde_source_files_enable_exceptions(barcodedecoder.cpp) add_library(KPimItinerary ${kitinerary_lib_srcs}) add_library(KPim::Itinerary ALIAS KPimItinerary) generate_export_header(KPimItinerary BASE_NAME KItinerary) set_target_properties(KPimItinerary PROPERTIES VERSION ${KITINERARY_VERSION_STRING} SOVERSION ${KITINERARY_SOVERSION} EXPORT_NAME Itinerary ) target_include_directories(KPimItinerary INTERFACE "$") target_include_directories(KPimItinerary PUBLIC "$") target_link_libraries(KPimItinerary PUBLIC Qt5::Core KF5::Mime PRIVATE Qt5::Qml KF5::I18n KF5::Contacts KPim::PkPass 
${ZLIB_LIBRARIES} ) if (HAVE_POPPLER) target_link_libraries(KPimItinerary PRIVATE Poppler::Core) endif() if (HAVE_ZXING) target_link_libraries(KPimItinerary PRIVATE zxing::libzxing) endif() if (HAVE_KCAL) target_link_libraries(KPimItinerary PUBLIC KF5::CalendarCore) endif() if (HAVE_LIBXML2) target_compile_definitions(KPimItinerary PRIVATE ${LIBXML2_DEFINITIONS}) target_include_directories(KPimItinerary PRIVATE ${LIBXML2_INCLUDE_DIR}) target_link_libraries(KPimItinerary PRIVATE ${LIBXML2_LIBRARIES}) endif() ecm_generate_headers(KItinerary_FORWARDING_HEADERS HEADER_NAMES BarcodeDecoder CalendarHandler Extractor ExtractorEngine ExtractorPostprocessor ExtractorRepository HtmlDocument IataBcbpParser JsonLdDocument MergeUtil PdfDocument SortUtil Uic9183Parser PREFIX KItinerary REQUIRED_HEADERS KItinerary_HEADERS ) ecm_generate_headers(KItinerary_KnowledgeDb_FORWARDING_HEADERS HEADER_NAMES AirportDb CountryDb KnowledgeDb TrainStationDb PREFIX KItinerary REQUIRED_HEADERS KItinerary_KnowledgeDb_HEADERS RELATIVE knowledgedb ) ecm_generate_headers(KItinerary_Datatypes_FORWARDING_HEADERS HEADER_NAMES Action Brand BusTrip Datatypes Event Flight Organization Reservation RentalCar Person Place Taxi Ticket TrainTrip Visit PREFIX KItinerary REQUIRED_HEADERS KItinerary_Datatypes_HEADERS RELATIVE datatypes ) install(TARGETS KPimItinerary EXPORT KPimItineraryTargets ${INSTALL_TARGETS_DEFAULT_ARGS}) install(FILES ${KItinerary_FORWARDING_HEADERS} ${KItinerary_KnowledgeDb_FORWARDING_HEADERS} ${KItinerary_Datatypes_FORWARDING_HEADERS} DESTINATION ${KDE_INSTALL_INCLUDEDIR_PIM}/KItinerary ) install(FILES ${KItinerary_HEADERS} ${KItinerary_AirportDb_HEADERS} ${KItinerary_Datatypes_HEADERS} ${KItinerary_KnowledgeDb_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/kitinerary_export.h DESTINATION ${KDE_INSTALL_INCLUDEDIR_PIM}/kitinerary ) diff --git a/src/extractor.cpp b/src/extractor.cpp index 82e032d..07e3bee 100644 --- a/src/extractor.cpp +++ b/src/extractor.cpp @@ -1,107 +1,103 @@ /* Copyright (c) 
2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "extractor.h" #include "extractorfilter.h" #include "logging.h" #include #include #include #include using namespace KItinerary; namespace KItinerary { class ExtractorPrivate { public: QString m_scriptName; QString m_scriptFunction; std::vector m_filters; Extractor::Type m_type = Extractor::Text; }; } Extractor::Extractor() : d(new ExtractorPrivate) { } Extractor::Extractor(Extractor &&) = default; Extractor::~Extractor() = default; bool Extractor::load(const QJsonObject &obj, const QString &baseDir) { const auto type = obj.value(QLatin1String("type")).toString(); if (type == QLatin1String("pkpass")) { d->m_type = PkPass; } else if (type == QLatin1String("pdf")) { d->m_type = Pdf; } else if (type == QLatin1String("html")) { d->m_type = Html; } else if (type == QLatin1String("ical")) { d->m_type = ICal; } for (const auto &filterValue : obj.value(QLatin1String("filter")).toArray()) { ExtractorFilter f; if (!f.load(filterValue.toObject())) { return false; } d->m_filters.push_back(std::move(f)); } const auto scriptName = obj.value(QLatin1String("script")).toString(); if (!scriptName.isEmpty()) { d->m_scriptName = baseDir + QLatin1Char('/') + scriptName; } if (!d->m_scriptName.isEmpty() && !QFile::exists(d->m_scriptName)) { 
qCWarning(Log) << "Script file not found:" << d->m_scriptName; return false; } - if (d->m_type != PkPass && d->m_scriptName.isEmpty()) { - qCWarning(Log) << "Script file required for text, HTML or PDF extractors!"; - return false; - } d->m_scriptFunction = obj.value(QLatin1String("function")).toString(QStringLiteral("main")); return !d->m_filters.empty(); } Extractor::Type Extractor::type() const { return d->m_type; } QString Extractor::scriptFileName() const { return d->m_scriptName; } QString Extractor::scriptFunction() const { return d->m_scriptFunction; } const std::vector &Extractor::filters() const { return d->m_filters; } diff --git a/src/extractorengine.cpp b/src/extractorengine.cpp index 5c1de63..9e77422 100644 --- a/src/extractorengine.cpp +++ b/src/extractorengine.cpp @@ -1,431 +1,438 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "config-kitinerary.h" #include "extractorengine.h" #include "extractor.h" +#include "genericpdfextractor.h" #include "htmldocument.h" #include "jsonlddocument.h" #include "logging.h" #include "pdfdocument.h" #include "structureddataextractor.h" #include "jsapi/barcode.h" #include "jsapi/context.h" #include "jsapi/jsonld.h" #ifdef HAVE_KCAL #include #include #endif #include #include #include #include #include #include #include #include #include #include #include using namespace KItinerary; namespace KItinerary { class ExtractorEnginePrivate { public: void setupEngine(); void executeScript(const Extractor *extractor); void processScriptResult(const QJSValue &result); void extractPass(); void extractBoardingPass(QJsonObject &resFor); void extractEventTicketPass(QJsonObject &resFor); std::vector m_extractors; JsApi::Barcode *m_barcodeApi = nullptr; JsApi::Context *m_context = nullptr; JsApi::JsonLd *m_jsonLdApi = nullptr; QString m_text; HtmlDocument *m_htmlDoc = nullptr; PdfDocument *m_pdfDoc = nullptr; - KPkPass::Pass *m_pass; + KPkPass::Pass *m_pass = nullptr; /* extract() tests this pointer, so it must never start out indeterminate */ #ifdef HAVE_KCAL KCalCore::Calendar::Ptr m_calendar; #endif + GenericPdfExtractor m_genericPdfExtractor; QJsonArray m_result; QJSEngine m_engine; }; } void ExtractorEnginePrivate::setupEngine() { m_context = new JsApi::Context; // will be deleted by QJSEngine taking ownership m_engine.installExtensions(QJSEngine::ConsoleExtension); m_jsonLdApi = new JsApi::JsonLd(&m_engine); m_engine.globalObject().setProperty(QStringLiteral("JsonLd"), m_engine.newQObject(m_jsonLdApi)); m_barcodeApi = new JsApi::Barcode; m_engine.globalObject().setProperty(QStringLiteral("Barcode"), m_engine.newQObject(m_barcodeApi)); m_engine.globalObject().setProperty(QStringLiteral("Context"), m_engine.newQObject(m_context)); } ExtractorEngine::ExtractorEngine() : d(new ExtractorEnginePrivate) { d->setupEngine(); } ExtractorEngine::ExtractorEngine(ExtractorEngine &&) noexcept = default; ExtractorEngine::~ExtractorEngine() = default; void ExtractorEngine::clear() { 
d->m_text.clear(); d->m_pdfDoc = nullptr; d->m_htmlDoc = nullptr; d->m_pass = nullptr; #ifdef HAVE_KCAL d->m_calendar.reset(); #endif d->m_result = {}; d->m_context->m_senderDate = {}; } void ExtractorEngine::setExtractors(std::vector &&extractors) { d->m_extractors = extractors; } void ExtractorEngine::setText(const QString &text) { d->m_text = text; } void ExtractorEngine::setHtmlDocument(HtmlDocument *htmlDoc) { d->m_htmlDoc = htmlDoc; } void ExtractorEngine::setPdfDocument(PdfDocument *pdfDoc) { d->m_pdfDoc = pdfDoc; } void ExtractorEngine::setPass(KPkPass::Pass *pass) { d->m_pass = pass; } void ExtractorEngine::setCalendar(const QSharedPointer &calendar) { #ifdef HAVE_KCAL d->m_calendar = calendar; #else Q_UNUSED(calendar); #endif } void ExtractorEngine::setSenderDate(const QDateTime &dt) { d->m_context->m_senderDate = dt; d->m_jsonLdApi->setContextDate(dt); d->m_barcodeApi->setContextDate(dt.date()); + d->m_genericPdfExtractor.setContextDate(dt); } QJsonArray ExtractorEngine::extract() { for (const auto extractor : d->m_extractors) { switch (extractor->type()) { case Extractor::Text: // running text extractors on PDF or HTML docs is possible, // but only extract the text when really needed if (d->m_text.isEmpty() && d->m_pdfDoc) { d->m_text = d->m_pdfDoc->text(); } if (d->m_text.isEmpty() && d->m_htmlDoc) { d->m_text = d->m_htmlDoc->root().recursiveContent(); } if (!d->m_text.isEmpty()) { d->executeScript(extractor); } break; case Extractor::Html: if (d->m_htmlDoc) { if (extractor->scriptFileName().isEmpty()) { for (const auto &v : StructuredDataExtractor::extract(d->m_htmlDoc)) { d->m_result.push_back(v); } } d->executeScript(extractor); } break; case Extractor::Pdf: if (d->m_pdfDoc) { - d->executeScript(extractor); + if (extractor->scriptFileName().isEmpty()) { + d->m_genericPdfExtractor.extract(d->m_pdfDoc, d->m_result); + } else { + d->executeScript(extractor); + } } break; case Extractor::PkPass: if (d->m_pass) { d->executeScript(extractor); 
d->extractPass(); } break; case Extractor::ICal: #ifdef HAVE_KCAL if (d->m_calendar) { d->executeScript(extractor); } #endif break; } if (!d->m_result.isEmpty()) { break; } } return d->m_result; } void ExtractorEnginePrivate::executeScript(const Extractor *extractor) { Q_ASSERT(extractor); if (extractor->scriptFileName().isEmpty()) { return; } QFile f(extractor->scriptFileName()); if (!f.open(QFile::ReadOnly)) { qCWarning(Log) << "Failed to open extractor script" << f.fileName() << f.errorString(); return; } auto result = m_engine.evaluate(QString::fromUtf8(f.readAll()), f.fileName()); if (result.isError()) { qCWarning(Log) << "Script parsing error in" << result.property(QLatin1String("fileName")).toString() << ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString(); return; } auto mainFunc = m_engine.globalObject().property(extractor->scriptFunction()); if (!mainFunc.isCallable()) { qCWarning(Log) << "Script entry point not found!" << extractor->scriptFunction(); return; } QJSValueList args; switch (extractor->type()) { case Extractor::Text: args = {m_text}; break; case Extractor::Html: args = {m_engine.toScriptValue(m_htmlDoc)}; break; case Extractor::Pdf: args = {m_engine.toScriptValue(m_pdfDoc)}; break; case Extractor::PkPass: args = {m_engine.toScriptValue(m_pass)}; break; case Extractor::ICal: #ifdef HAVE_KCAL for (const auto &event : m_calendar->events()) { processScriptResult(mainFunc.call({m_engine.toScriptValue(*event.data())})); } #endif break; } if (!args.isEmpty()) { processScriptResult(mainFunc.call(args)); } } void ExtractorEnginePrivate::processScriptResult(const QJSValue &result) { if (result.isError()) { qCWarning(Log) << "Script execution error in" << result.property(QLatin1String("fileName")).toString() << ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString(); return; } if (result.isArray()) { QJSValueIterator it(result); while (it.hasNext()) { it.next(); if (it.value().isObject()) { 
m_result.push_back(QJsonValue::fromVariant(it.value().toVariant())); } } } else if (result.isObject()) { m_result.push_back(QJsonValue::fromVariant(result.toVariant())); } else { qCWarning(Log) << "Invalid result type from script"; } } void ExtractorEnginePrivate::extractPass() { if (m_result.size() > 1) { // a pkpass file contains exactly one boarding pass return; } if (m_result.isEmpty()) { // no script run, so we need to create the top-level element ourselves QJsonObject res; QJsonObject resFor; if (auto boardingPass = qobject_cast(m_pass)) { switch (boardingPass->transitType()) { case KPkPass::BoardingPass::Air: res.insert(QLatin1String("@type"), QLatin1String("FlightReservation")); resFor.insert(QLatin1String("@type"), QLatin1String("Flight")); break; // TODO expand once we have test files for train tickets default: return; } } else { switch (m_pass->type()) { case KPkPass::Pass::EventTicket: res.insert(QLatin1String("@type"), QLatin1String("EventReservation")); resFor.insert(QLatin1String("@type"), QLatin1String("Event")); break; default: return; } } res.insert(QLatin1String("reservationFor"), resFor); m_result.push_back(res); } // extract structured data from a pkpass, if the extractor script hasn't done so already auto res = m_result.at(0).toObject(); auto resFor = res.value(QLatin1String("reservationFor")).toObject(); switch (m_pass->type()) { case KPkPass::Pass::BoardingPass: extractBoardingPass(resFor); break; case KPkPass::Pass::EventTicket: extractEventTicketPass(resFor); break; default: return; } // barcode contains the ticket token if (!m_pass->barcodes().isEmpty() && !res.contains(QLatin1String("reservedTicket"))) { const auto barcode = m_pass->barcodes().at(0); QString token; switch (barcode.format()) { case KPkPass::Barcode::QR: token += QLatin1String("qrCode:"); break; case KPkPass::Barcode::Aztec: token += QLatin1String("aztecCode:"); break; default: break; } token += barcode.message(); QJsonObject ticket; ticket.insert(QLatin1String("@type"), 
QLatin1String("Ticket")); ticket.insert(QLatin1String("ticketToken"), token); res.insert(QLatin1String("reservedTicket"), ticket); } res.insert(QLatin1String("reservationFor"), resFor); // associate the pass with the result, so we can find the pass again for display if (!m_pass->passTypeIdentifier().isEmpty() && !m_pass->serialNumber().isEmpty()) { res.insert(QLatin1String("pkpassPassTypeIdentifier"), m_pass->passTypeIdentifier()); res.insert(QLatin1String("pkpassSerialNumber"), m_pass->serialNumber()); } m_result[0] = res; } void ExtractorEnginePrivate::extractBoardingPass(QJsonObject &resFor) { // "relevantDate" is the best guess for the boarding time if (m_pass->relevantDate().isValid() && !resFor.contains(QLatin1String("boardingTime"))) { resFor.insert(QLatin1String("boardingTime"), m_pass->relevantDate().toString(Qt::ISODate)); } // look for common field names containing the boarding time, if we still have no idea if (!resFor.contains(QLatin1String("boardingTime"))) { for (const auto &field : m_pass->fields()) { if (!field.key().contains(QLatin1String("boarding"), Qt::CaseInsensitive)) { continue; } const auto time = QTime::fromString(field.value().toString()); if (time.isValid()) { // this misses date, but the postprocessor will fill that in resFor.insert(QLatin1String("boardingTime"), QDateTime(QDate(1, 1, 1), time).toString(Qt::ISODate)); break; } } } // location is the best guess for the departure airport geo coordinates auto depAirport = resFor.value(QLatin1String("departureAirport")).toObject(); if (depAirport.isEmpty()) { depAirport.insert(QLatin1String("@type"), QLatin1String("Airport")); } auto depGeo = depAirport.value(QLatin1String("geo")).toObject(); if (m_pass->locations().size() == 1 && depGeo.isEmpty()) { const auto loc = m_pass->locations().at(0); depGeo.insert(QLatin1String("@type"), QLatin1String("GeoCoordinates")); depGeo.insert(QLatin1String("latitude"), loc.latitude()); depGeo.insert(QLatin1String("longitude"), loc.longitude()); 
depAirport.insert(QLatin1String("geo"), depGeo); resFor.insert(QLatin1String("departureAirport"), depAirport); } } void ExtractorEnginePrivate::extractEventTicketPass(QJsonObject &resFor) { if (!resFor.contains(QLatin1String("name"))) { resFor.insert(QLatin1String("name"), m_pass->description()); } // "relevantDate" is the best guess for the start time if (m_pass->relevantDate().isValid() && !resFor.contains(QLatin1String("startDate"))) { resFor.insert(QLatin1String("startDate"), m_pass->relevantDate().toString(Qt::ISODate)); } // location is the best guess for the venue auto venue = resFor.value(QLatin1String("location")).toObject(); if (venue.isEmpty()) { venue.insert(QLatin1String("@type"), QLatin1String("Place")); } auto geo = venue.value(QLatin1String("geo")).toObject(); if (!m_pass->locations().isEmpty() && geo.isEmpty()) { const auto loc = m_pass->locations().at(0); geo.insert(QLatin1String("@type"), QLatin1String("GeoCoordinates")); geo.insert(QLatin1String("latitude"), loc.latitude()); geo.insert(QLatin1String("longitude"), loc.longitude()); venue.insert(QLatin1String("geo"), geo); venue.insert(QLatin1String("name"), loc.relevantText()); resFor.insert(QLatin1String("location"), venue); } } diff --git a/src/extractorrepository.cpp b/src/extractorrepository.cpp index 8a7f17a..2cfbc98 100644 --- a/src/extractorrepository.cpp +++ b/src/extractorrepository.cpp @@ -1,176 +1,184 @@ /* Copyright (c) 2017 Volker Krause This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. 
You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "extractorrepository.h" #include "extractor.h" #include "extractorfilter.h" #include "logging.h" #include #include #include #include #include #include #include using namespace KItinerary; static void initResources() // must be outside of a namespace { Q_INIT_RESOURCE(extractors); } namespace KItinerary { class ExtractorRepositoryPrivate { public: void loadExtractors(); std::vector m_extractors; Extractor m_genericHtmlExtractor; + Extractor m_genericPdfExtractor; Extractor m_genericPkPassExtractor; }; } ExtractorRepository::ExtractorRepository() : d(new ExtractorRepositoryPrivate) { initResources(); d->loadExtractors(); } ExtractorRepository::~ExtractorRepository() = default; ExtractorRepository::ExtractorRepository(KItinerary::ExtractorRepository &&) = default; std::vector ExtractorRepository::extractorsForMessage(KMime::Content *part) const { std::vector v; if (!part) { return v; } v.push_back(&d->m_genericHtmlExtractor); for (auto it = d->m_extractors.begin(), end = d->m_extractors.end(); it != end; ++it) { if ((*it).type() == Extractor::PkPass) { continue; } for (const auto &filter : (*it).filters()) { auto header = part->headerByType(filter.headerName()); auto ancestor = part; while (!header && ancestor->parent()) { ancestor = ancestor->parent(); header = ancestor->headerByType(filter.headerName()); } if (!header) { continue; } const auto headerData = header->asUnicodeString(); if (filter.matches(headerData)) { v.push_back(&(*it)); break; } } } + // ### we probably want to check for the part mimetype here (but note the test data doesn't have that set!) 
+ if (v.size() == 1) { + v.push_back(&d->m_genericPdfExtractor); + } + return v; } std::vector ExtractorRepository::extractorsForPass(KPkPass::Pass *pass) const { std::vector v; if (pass->type() != KPkPass::Pass::BoardingPass && pass->type() != KPkPass::Pass::EventTicket) { return v; } for (auto it = d->m_extractors.begin(), end = d->m_extractors.end(); it != end; ++it) { if ((*it).type() != Extractor::PkPass) { continue; } for (const auto &filter : (*it).filters()) { QString value; if (strcmp(filter.headerName(), "passTypeIdentifier") == 0) { value = pass->passTypeIdentifier(); } else { continue; } if (filter.matches(value)) { v.push_back(&(*it)); break; } } } if (v.empty()) { v.push_back(&d->m_genericPkPassExtractor); } return v; } void ExtractorRepositoryPrivate::loadExtractors() { auto searchDirs = QStandardPaths::standardLocations(QStandardPaths::GenericDataLocation); searchDirs += QStringLiteral(":/org.kde.pim"); for (const auto &dir : qAsConst(searchDirs)) { QDirIterator it(dir + QStringLiteral("/kitinerary/extractors"), {QStringLiteral("*.json")}, QDir::Files); while (it.hasNext()) { const auto fileName = it.next(); QFile file(fileName); if (!file.open(QFile::ReadOnly)) { continue; } QJsonParseError error; const auto doc = QJsonDocument::fromJson(file.readAll(), &error); if (doc.isNull()) { qCWarning(Log) << "Extractor loading error:" << fileName << error.errorString(); continue; } QFileInfo fi(fileName); if (doc.isObject()) { const auto obj = doc.object(); Extractor e; if (e.load(obj, fi.absolutePath())) { m_extractors.push_back(std::move(e)); } } else if (doc.isArray()) { for (const auto &v : doc.array()) { Extractor e; if (e.load(v.toObject(), fi.absolutePath())) { m_extractors.push_back(std::move(e)); } } } else { qCWarning(Log) << "Invalid extractor meta-data:" << fileName; continue; } } } QJsonObject dummy; dummy.insert(QLatin1String("type"), QLatin1String("html")); m_genericHtmlExtractor.load(dummy, {}); + dummy.insert(QLatin1String("type"), 
QLatin1String("pdf")); + m_genericPdfExtractor.load(dummy, QString()); dummy.insert(QLatin1String("type"), QLatin1String("pkpass")); m_genericPkPassExtractor.load(dummy, QString()); }
diff --git a/src/genericpdfextractor.cpp b/src/genericpdfextractor.cpp
new file mode 100644
index 0000000..3cefef2
--- /dev/null
+++ b/src/genericpdfextractor.cpp
@@ -0,0 +1,107 @@
+/*
+    Copyright (C) 2018 Volker Krause
+
+    This program is free software; you can redistribute it and/or modify it
+    under the terms of the GNU Library General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or (at your
+    option) any later version.
+
+    This program is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+    License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see .
+*/
+
+#include "genericpdfextractor.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+using namespace KItinerary;
+
+enum {
+    MaxPageCount = 10, // maximum in the current test set is 6
+    MaxFileSize = 4000000, // maximum in the current test set is 980kB
+    MinImageHeight = 10,
+    MinImageWidth = 30,
+    MaxImageHeight = 1000, // TODO what's a realistic value here?
+    MaxImageWidth = 1000
+};
+
+GenericPdfExtractor::GenericPdfExtractor() = default;
+GenericPdfExtractor::~GenericPdfExtractor() = default;
+
+void GenericPdfExtractor::setContextDate(const QDateTime &dt)
+{
+    m_contextDate = dt;
+}
+
+void GenericPdfExtractor::extract(PdfDocument *doc, QJsonArray &result)
+{
+    // stay away from documents that are atypically large for what we are looking for
+    // that's just unnecessarily eating up resources
+    if (doc->pageCount() > MaxPageCount || doc->fileSize() > MaxFileSize) {
+        return;
+    }
+
+    m_imageIds.clear();
+    for (int i = 0; i < doc->pageCount(); ++i) {
+        const auto page = doc->page(i);
+
+        for (int j = 0; j < page.imageCount(); ++j) {
+            const auto img = page.image(j);
+            // image size sanity checks: each dimension is compared against its own min/max limit
+            if (img.height() < MinImageHeight || img.height() > MaxImageHeight || img.width() < MinImageWidth || img.width() > MaxImageWidth) {
+                continue;
+            }
+
+            if (m_imageIds.find(img.objectId()) != m_imageIds.end()) {
+                continue;
+            }
+
+            extractImage(img, result);
+            m_imageIds.insert(img.objectId());
+        }
+    }
+}
+
+void GenericPdfExtractor::extractImage(const PdfImage &img, QJsonArray &result)
+{
+    const auto aspectRatio = img.width() < img.height() ?
+        (float)img.height() / (float)img.width() :
+        (float)img.width() / (float)img.height();
+
+    // almost square, assume Aztec (or QR, which we don't handle here yet)
+    if (aspectRatio < 1.2f) {
+        const auto b = BarcodeDecoder::decodeAztecBinary(img.image());
+        if (Uic9183Parser::maybeUic9183(b)) {
+            // TODO
+        } else {
+            extractBarcode(QString::fromUtf8(b), result);
+        }
+    }
+
+    // rectangular with medium aspect ratio, assume PDF 417
+    if (aspectRatio > 1.5 && aspectRatio < 6) {
+        const auto s = BarcodeDecoder::decodePdf417(img.image());
+        extractBarcode(s, result);
+    }
+}
+
+void GenericPdfExtractor::extractBarcode(const QString &code, QJsonArray &result)
+{
+    if (IataBcbpParser::maybeIataBcbp(code)) {
+        const auto res = IataBcbpParser::parse(code, m_contextDate.date());
+        const auto jsonLd = JsonLdDocument::toJson(res);
+        std::copy(jsonLd.begin(), jsonLd.end(), std::back_inserter(result));
+    }
+}
diff --git a/src/genericpdfextractor.h b/src/genericpdfextractor.h
new file mode 100644
index 0000000..d39ec91
--- /dev/null
+++ b/src/genericpdfextractor.h
@@ -0,0 +1,68 @@
+/*
+    Copyright (C) 2018 Volker Krause
+
+    This program is free software; you can redistribute it and/or modify it
+    under the terms of the GNU Library General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or (at your
+    option) any later version.
+
+    This program is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+    License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see .
+*/
+
+#ifndef KITINERARY_GENERICPDFEXTRACTOR_H
+#define KITINERARY_GENERICPDFEXTRACTOR_H
+
+#include
+
+#include
+
+class QJsonArray;
+class QString;
+
+namespace KItinerary {
+
+class PdfDocument;
+class PdfImage;
+
+/** Generic extractor for PDF documents.
+ *  This is applied to all PDF documents and searches for
+ *  barcodes we can recognize.
+ *
+ *  @internal
+ */
+class GenericPdfExtractor
+{
+public:
+    GenericPdfExtractor();
+    ~GenericPdfExtractor();
+    GenericPdfExtractor(const GenericPdfExtractor&) = delete;
+    GenericPdfExtractor& operator=(const GenericPdfExtractor&) = delete;
+
+    /** Set the context date used for extraction. */
+    void setContextDate(const QDateTime &dt);
+
+    /** Try to extract the given document.
+     *  Anything found is appended to @p result; documents above the page count or file size limit are skipped.
+     */
+    void extract(PdfDocument *doc, QJsonArray &result);
+
+private:
+    /** Pick a barcode decoder based on the aspect ratio of @p img and pass on what it returns. */
+    void extractImage(const PdfImage &img, QJsonArray &result);
+    /** Parse @p code as an IATA BCBP message if it looks like one, appending the outcome to @p result. */
+    void extractBarcode(const QString &code, QJsonArray &result);
+
+    QDateTime m_contextDate;
+    /** Object ids of the images already handled during the current extract() call. */
+    std::unordered_set m_imageIds;
+};
+
+}
+
+#endif // KITINERARY_GENERICPDFEXTRACTOR_H