diff --git a/autotests/pdfdocumenttest.cpp b/autotests/pdfdocumenttest.cpp
index fac5b6d..0843fb8 100644
--- a/autotests/pdfdocumenttest.cpp
+++ b/autotests/pdfdocumenttest.cpp
@@ -1,89 +1,98 @@
 /*
     Copyright (C) 2018 Volker Krause
 
     This program is free software; you can redistribute it and/or modify it
     under the terms of the GNU Library General Public License as published by
     the Free Software Foundation; either version 2 of the License, or (at your
     option) any later version.
 
     This program is distributed in the hope that it will be useful, but WITHOUT
     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
     License for more details.
 
     You should have received a copy of the GNU General Public License
     along with this program. If not, see .
 */
 
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 
 using namespace KItinerary;
 
 class PdfDocumentTest : public QObject
 {
     Q_OBJECT
 private Q_SLOTS:
+    void initTestCase()
+    {
+        // Pin the TZ environment variable so date/time handling does not depend on the host setup.
+        qputenv("TZ", "GMT");
+    }
+
     void testPdfDocument()
     {
         QFile f(QStringLiteral(SOURCE_DIR "/misc/test.pdf"));
         QVERIFY(f.open(QFile::ReadOnly));
 #ifdef HAVE_POPPLER
         std::unique_ptr doc(PdfDocument::fromData(f.readAll()));
         QVERIFY(doc);
         QCOMPARE(doc->text(), QStringLiteral("This is the first page.\nIt contains a PDF 417 barcode.\nThis is the second page.\nIt contains an Aztec code.\n"));
         QCOMPARE(doc->pageCount(), 2);
         QCOMPARE(doc->property("pages").toList().size(), 2);
 
         auto page = doc->page(0);
         QCOMPARE(page.text(), QStringLiteral("This is the first page.\nIt contains a PDF 417 barcode.\n"));
         QCOMPARE(page.imageCount(), 1);
         QCOMPARE(PdfPage::staticMetaObject.property(1).readOnGadget(&page).toList().size(), 1);
         QCOMPARE(page.textInRect(0, 0, 1, 0.5), QStringLiteral("This is the first page.\nIt contains a PDF 417 barcode.\n"));
         QCOMPARE(page.textInRect(0, 0.5, 1, 1), QString());
 
         auto img = page.image(0);
         QCOMPARE(img.width(), 212);
         QCOMPARE(img.height(), 92);
         QCOMPARE(img.sourceHeight(), 152);
         QCOMPARE(img.sourceWidth(), 350);
         QCOMPARE(img.image().width(), 350);
         QCOMPARE(img.image().height(), 152);
 
         page = doc->page(1);
         QCOMPARE(page.text(), QStringLiteral("This is the second page.\nIt contains an Aztec code.\n"));
         QCOMPARE(page.imageCount(), 1);
         img = page.image(0);
         QCOMPARE(img.width(), 93);
         QCOMPARE(img.height(), 93);
         QCOMPARE(img.image().width(), 276);
         QCOMPARE(img.image().height(), 276);
         QCOMPARE(img.sourceHeight(), 276);
         QCOMPARE(img.sourceWidth(), 276);
 
         QVERIFY(page.imagesInRect(0, 0, 0.5, 1).isEmpty());
         QCOMPARE(page.imagesInRect(0, 0.5, 1, 1).size(), 1);
+
+        QCOMPARE(doc->creationTime(), QDateTime({2018, 4, 29}, {11, 41, 28}, Qt::OffsetFromUTC, 7200));
+        QCOMPARE(doc->modificationTime(), QDateTime());
 #endif
     }
 
     void testInvalidPdfDocument()
     {
         QVERIFY(!PdfDocument::fromData(QByteArray()));
         QVERIFY(!PdfDocument::fromData(QByteArray("HELLO")));
         QFile f(QStringLiteral(SOURCE_DIR "/misc/test.pdf"));
         QVERIFY(f.open(QFile::ReadOnly));
         QVERIFY(!PdfDocument::fromData(f.readAll().left(f.size() / 2)));
     }
 };
 
 QTEST_GUILESS_MAIN(PdfDocumentTest)
 
 #include "pdfdocumenttest.moc"
diff --git a/src/pdf/pdfdocument.cpp b/src/pdf/pdfdocument.cpp
index 20dc901..5c6e3b9 100644
--- a/src/pdf/pdfdocument.cpp
+++ b/src/pdf/pdfdocument.cpp
@@ -1,237 +1,305 @@
 /*
     Copyright (C) 2018 Volker Krause
 
     This program is free software; you can redistribute it and/or modify it
     under the terms of the GNU Library General Public License as published by
     the Free Software Foundation; either version 2 of the License, or (at your
     option) any later version.
 
     This program is distributed in the hope that it will be useful, but WITHOUT
     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
     License for more details.
 
     You should have received a copy of the GNU General Public License
     along with this program. If not, see .
 */
 
 #include "config-kitinerary.h"
 #include "pdfdocument.h"
 #include "pdfdocument_p.h"
 #include "pdfextractoroutputdevice_p.h"
 #include "pdfimage_p.h"
 #include "popplerutils_p.h"
 #include "logging.h"
 
+#include
 #include
 #include
 #include
 
 #ifdef HAVE_POPPLER
 #include
+#include
 #include
 #include
 #endif
 
 #include
 
 using namespace KItinerary;
 
 void PdfPagePrivate::load()
 {
     if (m_loaded) {
         return;
     }
 
 #ifdef HAVE_POPPLER
     QScopedValueRollback globalParamResetter(globalParams, PopplerUtils::globalParams());
     PdfExtractorOutputDevice device;
     m_doc->m_popplerDoc->displayPageSlice(&device, m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
     device.finalize();
     const auto pageRect = m_doc->m_popplerDoc->getPage(m_pageNum + 1)->getCropBox();
     std::unique_ptr s(device.getText(pageRect->x1, pageRect->y1, pageRect->x2, pageRect->y2));
 
 #ifdef HAVE_POPPLER_0_72
     m_text = QString::fromUtf8(s->c_str());
 #else
     m_text = QString::fromUtf8(s->getCString());
 #endif
     m_images = std::move(device.m_images);
     for (auto it = m_images.begin(); it != m_images.end(); ++it) {
         (*it).d->m_page = this;
     }
 #endif
     m_loaded = true;
 }
 
 PdfPage::PdfPage()
     : d(new PdfPagePrivate)
 {
 }
 
 PdfPage::PdfPage(const PdfPage&) = default;
 PdfPage::~PdfPage() = default;
 PdfPage& PdfPage::operator=(const PdfPage&) = default;
 
 QString PdfPage::text() const
 {
     d->load();
     return d->m_text;
 }
 
 #ifdef HAVE_POPPLER
 static double ratio(double begin, double end, double ratio)
 {
     return begin + (end - begin) * ratio;
 }
 #endif
 
 QString PdfPage::textInRect(double left, double top, double right, double bottom) const
 {
 #ifdef HAVE_POPPLER
     QScopedValueRollback globalParamResetter(globalParams, PopplerUtils::globalParams());
 
     TextOutputDev device(nullptr, false, 0, false, false);
     d->m_doc->m_popplerDoc->displayPageSlice(&device, d->m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
     const auto pageRect = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1)->getCropBox();
     std::unique_ptr s(device.getText(ratio(pageRect->x1, pageRect->x2, left), ratio(pageRect->y1, pageRect->y2, top), ratio(pageRect->x1, pageRect->x2, right), ratio(pageRect->y1, pageRect->y2, bottom)));
 #ifdef HAVE_POPPLER_0_72
     return QString::fromUtf8(s->c_str());
 #else
     return QString::fromUtf8(s->getCString());
 #endif
 #else
     Q_UNUSED(left);
     Q_UNUSED(top);
     Q_UNUSED(right);
     Q_UNUSED(bottom);
     return {};
 #endif
 }
 
 int PdfPage::imageCount() const
 {
     d->load();
     return d->m_images.size();
 }
 
 PdfImage PdfPage::image(int index) const
 {
     d->load();
     return d->m_images[index];
 }
 
 QVariantList PdfPage::imagesVariant() const
 {
     d->load();
     QVariantList l;
     l.reserve(imageCount());
     std::for_each(d->m_images.begin(), d->m_images.end(), [&l](const PdfImage& img) { l.push_back(QVariant::fromValue(img)); });
     return l;
 }
 
 QVariantList PdfPage::imagesInRect(double left, double top, double right, double bottom) const
 {
     d->load();
     QVariantList l;
 #ifdef HAVE_POPPLER
     QScopedValueRollback globalParamResetter(globalParams, PopplerUtils::globalParams());
     const auto pageRect = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1)->getCropBox();
 
     for (const auto &img : d->m_images) {
         if ((img.d->m_transform.dx() >= ratio(pageRect->x1, pageRect->x2, left) && img.d->m_transform.dx() <= ratio(pageRect->x1, pageRect->x2, right)) && (img.d->m_transform.dy() >= ratio(pageRect->y1, pageRect->y2, top) && img.d->m_transform.dy() <= ratio(pageRect->y1, pageRect->y2, bottom))) {
             l.push_back(QVariant::fromValue(img));
         }
     }
 #else
     Q_UNUSED(left);
     Q_UNUSED(top);
     Q_UNUSED(right);
     Q_UNUSED(bottom);
 #endif
     return l;
 }
 
 PdfDocument::PdfDocument(QObject *parent)
     : QObject(parent)
     , d(new PdfDocumentPrivate)
 {
 }
 
 PdfDocument::~PdfDocument() = default;
 
 QString PdfDocument::text() const
 {
     QString text;
     std::for_each(d->m_pages.begin(), d->m_pages.end(), [&text](const PdfPage &p) { text += p.text(); });
     return text;
 }
 
 int PdfDocument::pageCount() const
 {
 #ifdef HAVE_POPPLER
     return d->m_popplerDoc->getNumPages();
 #else
     return 0;
 #endif
 }
 
 PdfPage PdfDocument::page(int index) const
 {
     return d->m_pages[index];
 }
 
 int PdfDocument::fileSize() const
 {
     return d->m_pdfData.size();
 }
 
+#ifdef HAVE_POPPLER
+/** Converts a PDF date string into a QDateTime.
+ *  The string is split into its components by parseDateString(); a '+' or '-' time zone
+ *  marker yields a value with that fixed UTC offset, anything else is treated as UTC.
+ *  @returns an invalid QDateTime if the string cannot be parsed or holds an invalid date or time.
+ */
+static QDateTime parsePdfDateTime(const char *str)
+{
+    int year, month, day, hour, min, sec, tzHours, tzMins;
+    char tz;
+
+    if (!parseDateString(str, &year, &month, &day, &hour, &min, &sec, &tz, &tzHours, &tzMins)) {
+        return {};
+    }
+
+    QDate date(year, month, day);
+    QTime time(hour, min, sec);
+    if (!date.isValid() || !time.isValid()) {
+        return {};
+    }
+
+    int offset = tzHours * 3600 + tzMins * 60;
+    if (tz == '+') {
+        return QDateTime(date, time, Qt::OffsetFromUTC, offset);
+    } else if (tz == '-') {
+        return QDateTime(date, time, Qt::OffsetFromUTC, -offset);
+    }
+    // neither '+' nor '-': treat the time as UTC
+    return QDateTime(date, time, Qt::UTC);
+}
+#endif
+
+QDateTime PdfDocument::creationTime() const
+{
+#ifdef HAVE_POPPLER
+    std::unique_ptr dt(d->m_popplerDoc->getDocInfoCreatDate());
+    if (!dt) {
+        return {};
+    }
+#ifdef HAVE_POPPLER_0_72
+    return parsePdfDateTime(dt->c_str());
+#else
+    return parsePdfDateTime(dt->getCString());
+#endif
+#else
+    return {};
+#endif
+}
+
+QDateTime PdfDocument::modificationTime() const
+{
+#ifdef HAVE_POPPLER
+    std::unique_ptr dt(d->m_popplerDoc->getDocInfoModDate());
+    if (!dt) {
+        return {};
+    }
+#ifdef HAVE_POPPLER_0_72
+    return parsePdfDateTime(dt->c_str());
+#else
+    return parsePdfDateTime(dt->getCString());
+#endif
+#else
+    return {};
+#endif
+}
+
 QVariantList PdfDocument::pagesVariant() const
 {
     QVariantList l;
     l.reserve(pageCount());
     std::for_each(d->m_pages.begin(), d->m_pages.end(), [&l](const PdfPage& p) { l.push_back(QVariant::fromValue(p)); });
     return l;
 }
 
 PdfDocument* PdfDocument::fromData(const QByteArray &data, QObject *parent)
 {
 #ifdef HAVE_POPPLER
     QScopedValueRollback globalParamResetter(globalParams, PopplerUtils::globalParams());
 
     std::unique_ptr doc(new PdfDocument(parent));
     doc->d->m_pdfData = data;
     // PDFDoc takes ownership of stream
 #ifdef HAVE_POPPLER_0_58
     auto stream = new MemStream(const_cast(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), Object());
 #else
     Object obj;
     obj.initNull();
     auto stream = new MemStream(const_cast(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), &obj);
 #endif
     std::unique_ptr popplerDoc(new PDFDoc(stream, nullptr, nullptr));
     if (!popplerDoc->isOk()) {
         qCWarning(Log) << "Got invalid PDF document!" << popplerDoc->getErrorCode();
         return nullptr;
     }
 
     doc->d->m_pages.reserve(popplerDoc->getNumPages());
     for (int i = 0; i < popplerDoc->getNumPages(); ++i) {
         PdfPage page;
         page.d->m_pageNum = i;
         page.d->m_doc = doc->d.get();
         doc->d->m_pages.push_back(page);
     }
 
     doc->d->m_popplerDoc = std::move(popplerDoc);
     return doc.release();
 #else
     Q_UNUSED(data);
     Q_UNUSED(parent);
     return nullptr;
 #endif
 }
diff --git a/src/pdf/pdfdocument.h b/src/pdf/pdfdocument.h
index 8ee1805..4d710df 100644
--- a/src/pdf/pdfdocument.h
+++ b/src/pdf/pdfdocument.h
@@ -1,122 +1,135 @@
 /*
     Copyright (C) 2018 Volker Krause
 
     This program is free software; you can redistribute it and/or modify it
     under the terms of the GNU Library General Public License as published by
     the Free Software Foundation; either version 2 of the License, or (at your
     option) any later version.
 
     This program is distributed in the hope that it will be useful, but WITHOUT
     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
     License for more details.
 
     You should have received a copy of the GNU General Public License
     along with this program. If not, see .
 */
 
 #ifndef KITINERARY_PDFDOCUMENT_H
 #define KITINERARY_PDFDOCUMENT_H
 
 #include "kitinerary_export.h"
 
 #include
 #include
 #include
 #include
 #include
 
+class QDateTime;
 class QImage;
 class QTransform;
 
 namespace KItinerary {
 
 class PdfPagePrivate;
 
 /** A page in a PDF document. */
 class KITINERARY_EXPORT PdfPage
 {
     Q_GADGET
     Q_PROPERTY(QString text READ text)
     Q_PROPERTY(QVariantList images READ imagesVariant)
 public:
     PdfPage();
     PdfPage(const PdfPage&);
     ~PdfPage();
     PdfPage& operator=(const PdfPage&);
 
     /** The entire text on this page. */
     QString text() const;
 
     /** Returns the text in the specified sub-rect of this page.
      *  All parameters are relative values between @c 0 and @c 1 of the entire page size.
      */
     Q_INVOKABLE QString textInRect(double left, double top, double right, double bottom) const;
 
     /** The number of images found in this document. */
     int imageCount() const;
 
     /** The n-th image found in this document. */
     PdfImage image(int index) const;
 
     /** Returns the images in the specified sub-rect of this page.
      *  All parameters are relative values between @c 0 and @c 1 of the entire page size.
      */
     Q_INVOKABLE QVariantList imagesInRect(double left, double top, double right, double bottom) const;
 
 private:
     QVariantList imagesVariant() const;
 
     friend class PdfDocument;
     QExplicitlySharedDataPointer d;
 };
 
 class PdfDocumentPrivate;
 
 /** PDF document for extraction.
  *  This is used as input for ExtractorEngine and the JS extractor scripts.
  *  @note This class is only functional if Poppler is available as a dependency,
  *  otherwise all methods return empty values.
  */
 class KITINERARY_EXPORT PdfDocument : public QObject
 {
     Q_OBJECT
     Q_PROPERTY(QString text READ text CONSTANT)
     Q_PROPERTY(int pageCount READ pageCount CONSTANT)
     Q_PROPERTY(QVariantList pages READ pagesVariant CONSTANT)
+    Q_PROPERTY(QDateTime creationTime READ creationTime CONSTANT)
+    Q_PROPERTY(QDateTime modificationTime READ modificationTime CONSTANT)
+
 public:
     explicit PdfDocument(QObject *parent = nullptr);
     ~PdfDocument();
 
     /** The entire text extracted from the PDF document. */
     QString text() const;
 
     /** The number of pages in this document. */
     int pageCount() const;
 
     /** The n-thj page in this document. */
     PdfPage page(int index) const;
 
     /** File size of the entire document in bytes. */
     int fileSize() const;
 
+    /** Creation time as specified in the PDF file.
+     *  @returns an invalid QDateTime if there is no parsable creation date or Poppler is not available.
+     */
+    QDateTime creationTime() const;
+    /** Modification time as specified in the PDF file.
+     *  @returns an invalid QDateTime if there is no parsable modification date or Poppler is not available.
+     */
+    QDateTime modificationTime() const;
+
     /** Creates a PdfDocument from the given raw data.
      *  @returns @c nullptr if loading fails or Poppler was not found.
      */
     static PdfDocument* fromData(const QByteArray &data, QObject *parent = nullptr);
 
 private:
     QVariantList pagesVariant() const;
     std::unique_ptr d;
 };
 
 }
 
 Q_DECLARE_METATYPE(KItinerary::PdfPage)
 
 #endif // KITINERARY_PDFDOCUMENT_H