diff --git a/src/generic/genericextractor.cpp b/src/generic/genericextractor.cpp new file mode 100644 index 0000000..8863e03 --- /dev/null +++ b/src/generic/genericextractor.cpp @@ -0,0 +1,20 @@ +/* + Copyright (C) 2019 Volker Krause + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Library General Public License as published by + the Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "genericextractor_p.h" + +using namespace KItinerary; diff --git a/src/generic/genericextractor_p.h b/src/generic/genericextractor_p.h new file mode 100644 index 0000000..97f11fc --- /dev/null +++ b/src/generic/genericextractor_p.h @@ -0,0 +1,39 @@ +/* + Copyright (C) 2019 Volker Krause + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Library General Public License as published by + the Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef KITINERARY_GENERICEXTRACTOR_H +#define KITINERARY_GENERICEXTRACTOR_H + +#include +#include + +namespace KItinerary { + +/** Shared bits between all generic extractors. */ +namespace GenericExtractor +{ + /** Generic extraction result. */ + struct Result { + QJsonArray result; // JSON-LD data extracted from this document or page + QVariant barcode; // unrecognized barcode for further processing + int pageNum = -1; // page number, if result is from a single PDF page + }; +} + +} + +#endif // KITINERARY_GENERICEXTRACTOR_H diff --git a/src/generic/genericpdfextractor.cpp b/src/generic/genericpdfextractor.cpp index 237852d..42a1c3b 100644 --- a/src/generic/genericpdfextractor.cpp +++ b/src/generic/genericpdfextractor.cpp @@ -1,145 +1,145 @@ /* Copyright (C) 2018 Volker Krause This program is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include "genericpdfextractor_p.h" #include "genericuic918extractor_p.h" #include #include #include #include #include #include #include #include #include #include using namespace KItinerary; enum { MaxPageCount = 10, // maximum in the current test set is 6 MaxFileSize = 4000000, // maximum in the current test set is 980kB // unit is 1/72 inch, assuming landscape orientation MinTargetImageHeight = 28, MinTargetImageWidth = 36, MaxTargetImageHeight = 252, MaxTargetImageWidth = 252, }; GenericPdfExtractor::GenericPdfExtractor() = default; GenericPdfExtractor::~GenericPdfExtractor() = default; void GenericPdfExtractor::setBarcodeDecoder(BarcodeDecoder *decoder) { m_barcodeDecoder = decoder; } void GenericPdfExtractor::setContextDate(const QDateTime &dt) { m_contextDate = dt; } -std::vector GenericPdfExtractor::extract(PdfDocument *doc) +std::vector GenericPdfExtractor::extract(PdfDocument *doc) { - std::vector result; + std::vector result; // stay away from documents that are atypically large for what we are looking for // that's just unnecessarily eating up resources if (doc->pageCount() > MaxPageCount || doc->fileSize() > MaxFileSize) { return result; } m_imageIds.clear(); for (int i = 0; i < doc->pageCount(); ++i) { const auto page = doc->page(i); for (int j = 0; j < page.imageCount(); ++j) { const auto img = page.image(j); if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) { continue; } if (!maybeBarcode(img)) { continue; } auto r = extractImage(img); if (!r.barcode.isNull() || !r.result.isEmpty()) { r.pageNum = i; result.push_back(r); } if (img.hasObjectId()) { m_imageIds.insert(img.objectId()); } } } return result; } -GenericPdfExtractor::Result GenericPdfExtractor::extractImage(const PdfImage &img) +GenericExtractor::Result GenericPdfExtractor::extractImage(const PdfImage &img) { const auto b = m_barcodeDecoder->decodeBinary(img.image()); if (Uic9183Parser::maybeUic9183(b)) { QJsonArray result; GenericUic918Extractor::extract(b, result, m_contextDate); if (!result.isEmpty()) { - return Result {-1, result, b}; + return GenericExtractor::Result{result, b, -1}; } return {}; } if (b.isEmpty()) { return extractBarcode(m_barcodeDecoder->decodeString(img.image())); } else { return extractBarcode(QString::fromUtf8(b)); } } -GenericPdfExtractor::Result GenericPdfExtractor::extractBarcode(const QString &code) +GenericExtractor::Result GenericPdfExtractor::extractBarcode(const QString &code) { if (code.isEmpty()) { return {}; } if (IataBcbpParser::maybeIataBcbp(code)) { const auto res = IataBcbpParser::parse(code, m_contextDate.date()); const auto jsonLd = JsonLdDocument::toJson(res); - return {-1, jsonLd, code}; + return {jsonLd, code, -1}; } - return {-1, {}, code}; + return {{}, code, -1}; } bool GenericPdfExtractor::maybeBarcode(const PdfImage &img, BarcodeDecoder::BarcodeTypes hint) { const auto w = img.width(); const auto h = img.height(); if (!BarcodeDecoder::isPlausibleSize(img.sourceWidth(), img.sourceHeight()) || !BarcodeDecoder::isPlausibleAspectRatio(w, h, hint)) { return false; } // image target size checks if (std::min(w, h) < MinTargetImageHeight || std::max(w, h) < MinTargetImageWidth || h > MaxTargetImageHeight || w > MaxTargetImageWidth) { return false; } return true; } diff --git a/src/generic/genericpdfextractor_p.h b/src/generic/genericpdfextractor_p.h index 735773f..e7e4dea 100644 --- a/src/generic/genericpdfextractor_p.h +++ b/src/generic/genericpdfextractor_p.h @@ -1,79 +1,72 @@ /* Copyright (C) 2018 Volker Krause This program is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #ifndef KITINERARY_GENERICPDFEXTRACTOR_P_H #define KITINERARY_GENERICPDFEXTRACTOR_P_H +#include "genericextractor_p.h" + #include #include -#include #include #include -class QJsonArray; class QString; namespace KItinerary { class BarcodeDecoder; class PdfDocument; class PdfImage; /** Generic extractor for PDF documents. * This is applied to all PDF documents and searches for * barcodes we can recognize. * * @internal */ class GenericPdfExtractor { public: GenericPdfExtractor(); ~GenericPdfExtractor(); GenericPdfExtractor(const GenericPdfExtractor&) = delete; void setBarcodeDecoder(BarcodeDecoder *decoder); /** Set the context date used for extraction. */ void setContextDate(const QDateTime &dt); - /** PDF extraction result. */ - struct Result { - int pageNum = -1; // page number, if result is from a single page - QJsonArray result; // JSON-LD data extracted from this document or page - QVariant barcode; // unrecognized barcode for further processing - }; - /** Try to extract the given document. */ - std::vector extract(PdfDocument *doc); + std::vector extract(PdfDocument *doc); /** Quick pre-check without image decoding if @p img might be a barcode. */ static bool maybeBarcode(const PdfImage &img, BarcodeDecoder::BarcodeTypes hint = BarcodeDecoder::Any); private: - Result extractImage(const PdfImage &img); - Result extractBarcode(const QString &code); + GenericExtractor::Result extractImage(const PdfImage &img); + GenericExtractor::Result extractBarcode(const QString &code); QDateTime m_contextDate; std::unordered_set m_imageIds; BarcodeDecoder *m_barcodeDecoder = nullptr; }; } #endif // KITINERARY_GENERICPDFEXTRACTOR_P_H