diff --git a/src/genericpdfextractor.cpp b/src/genericpdfextractor.cpp
index 3cefef2..0c313fc 100644
--- a/src/genericpdfextractor.cpp
+++ b/src/genericpdfextractor.cpp
@@ -1,107 +1,133 @@
 /*
     Copyright (C) 2018 Volker Krause
 
     This program is free software; you can redistribute it and/or modify it
     under the terms of the GNU Library General Public License as published by
     the Free Software Foundation; either version 2 of the License, or (at your
     option) any later version.
 
     This program is distributed in the hope that it will be useful, but WITHOUT
     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
     License for more details.
 
     You should have received a copy of the GNU General Public License
     along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 
 #include "genericpdfextractor.h"
 
 #include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 
 using namespace KItinerary;
 
 enum {
     MaxPageCount = 10, // maximum in the current test set is 6
     MaxFileSize = 4000000, // maximum in the current test set is 980kB
     MinImageHeight = 10,
     MinImageWidth = 30,
-    MaxImageHeight = 1000, // TODO what's a realisitic value here?
+    MaxImageHeight = 1000, // TODO what's a realistic value here?
     MaxImageWidth = 1000
 };
 
 GenericPdfExtractor::GenericPdfExtractor() = default;
 GenericPdfExtractor::~GenericPdfExtractor() = default;
 
 void GenericPdfExtractor::setContextDate(const QDateTime &dt)
 {
     m_contextDate = dt;
 }
 
 void GenericPdfExtractor::extract(PdfDocument *doc, QJsonArray &result)
 {
     // stay away from documents that are atypically large for what we are looking for
-    // that's just unecessarily eating up resources
+    // that's just unnecessarily eating up resources
     if (doc->pageCount() > MaxPageCount || doc->fileSize() > MaxFileSize) {
         return;
     }
 
     m_imageIds.clear();
     for (int i = 0; i < doc->pageCount(); ++i) {
         const auto page = doc->page(i);
 
         for (int j = 0; j < page.imageCount(); ++j) {
             const auto img = page.image(j);
 
             // image size sanity checks
-            if (img.height() < MinImageHeight || img.height() > MaxImageHeight || img.width() < MinImageWidth || img.height() > MaxImageWidth) {
+            if (img.height() < MinImageHeight || img.height() > MaxImageHeight || img.width() < MinImageWidth || img.width() > MaxImageWidth) {
                 continue;
             }
 
             if (m_imageIds.find(img.objectId()) != m_imageIds.end()) {
                 continue;
             }
 
             extractImage(img, result);
             m_imageIds.insert(img.objectId());
         }
     }
 }
 
 void GenericPdfExtractor::extractImage(const PdfImage &img, QJsonArray &result)
 {
     const auto aspectRatio = img.width() < img.height() ?
         (float)img.height() / (float)img.width() :
         (float)img.width() / (float)img.height();
 
     // almost square, assume Aztec (or QR, which we don't handle here yet)
     if (aspectRatio < 1.2f) {
         const auto b = BarcodeDecoder::decodeAztecBinary(img.image());
         if (Uic9183Parser::maybeUic9183(b)) {
-            // TODO
+            extractUic9183(b, result);
         } else {
             extractBarcode(QString::fromUtf8(b), result);
         }
     }
 
     // rectangular with medium aspect ratio, assume PDF 417
     if (aspectRatio > 1.5 && aspectRatio < 6) {
         const auto s = BarcodeDecoder::decodePdf417(img.image());
         extractBarcode(s, result);
     }
 }
 
 void GenericPdfExtractor::extractBarcode(const QString &code, QJsonArray &result)
 {
     if (IataBcbpParser::maybeIataBcbp(code)) {
         const auto res = IataBcbpParser::parse(code, m_contextDate.date());
         const auto jsonLd = JsonLdDocument::toJson(res);
         std::copy(jsonLd.begin(), jsonLd.end(), std::back_inserter(result));
     }
 }
+
+// Parses data with Uic9183Parser and, only if the parser reports it as valid, appends a
+// minimal TrainReservation (provider identifier "uic:" + carrier id, reservation number) to result.
+void GenericPdfExtractor::extractUic9183(const QByteArray &data, QJsonArray &result)
+{
+    Uic9183Parser p;
+    p.parse(data);
+    if (!p.isValid()) {
+        return;
+    }
+
+    // TODO: add RCT2 ticket data
+    QJsonObject org;
+    org.insert(QLatin1String("@type"), QLatin1String("Organization"));
+    org.insert(QLatin1String("identifier"), QString(QLatin1String("uic:") + p.carrierId()));
+    QJsonObject trip;
+    trip.insert(QLatin1String("@type"), QLatin1String("TrainTrip"));
+    trip.insert(QLatin1String("provider"), org);
+    QJsonObject res;
+    res.insert(QLatin1String("@type"), QLatin1String("TrainReservation"));
+    res.insert(QLatin1String("reservationFor"), trip);
+    res.insert(QLatin1String("reservationNumber"), p.pnr());
+
+    result.push_back(res);
+}
diff --git a/src/genericpdfextractor.h b/src/genericpdfextractor.h
index d39ec91..3030288 100644
--- a/src/genericpdfextractor.h
+++ b/src/genericpdfextractor.h
@@ -1,62 +1,64 @@
 /*
     Copyright (C) 2018 Volker Krause
 
     This program is free software; you can redistribute it and/or modify it
     under the terms of the GNU Library General Public License as published by
     the Free Software Foundation; either version 2 of the License, or (at your
     option) any later version.
 
     This program is distributed in the hope that it will be useful, but WITHOUT
     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
     License for more details.
 
     You should have received a copy of the GNU General Public License
     along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 
 #ifndef KITINERARY_GENERICPDFEXTRACTOR_H
 #define KITINERARY_GENERICPDFEXTRACTOR_H
 
 #include
 
 #include
 
+class QByteArray;
 class QJsonArray;
 class QString;
 
 namespace KItinerary {
 
 class PdfDocument;
 class PdfImage;
 
 /** Generic extractor for PDF documents.
  * This is applied to all PDF documents and searches for
  * barcodes we can recognize.
  *
  * @internal
  */
 class GenericPdfExtractor
 {
 public:
     GenericPdfExtractor();
     ~GenericPdfExtractor();
     GenericPdfExtractor(const GenericPdfExtractor&) = delete;
 
     /** Set the context date used for extraction. */
     void setContextDate(const QDateTime &dt);
 
     /** Try to extract the given document. */
     void extract(PdfDocument *doc, QJsonArray &result);
 
 private:
     void extractImage(const PdfImage &img, QJsonArray &result);
     void extractBarcode(const QString &code, QJsonArray &result);
+    void extractUic9183(const QByteArray &data, QJsonArray &result);
 
     QDateTime m_contextDate;
     std::unordered_set m_imageIds;
 };
 
 }
 
 #endif // KITINERARY_GENERICPDFEXTRACTOR_H