diff --git a/autotests/epubextractortest.cpp b/autotests/epubextractortest.cpp index 9d46b24..3ed52ec 100644 --- a/autotests/epubextractortest.cpp +++ b/autotests/epubextractortest.cpp @@ -1,77 +1,77 @@ /* * * Copyright (C) 2014 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "epubextractortest.h" #include "simpleextractionresult.h" #include "indexerextractortestsconfig.h" #include "extractors/epubextractor.h" #include #include #include using namespace KFileMetaData; QString EPubExtractorTest::testFilePath(const QString& fileName) const { return QLatin1String(INDEXER_TESTS_SAMPLE_FILES_PATH) + QLatin1Char('/') + fileName; } void EPubExtractorTest::test() { EPubExtractor plugin{this}; SimpleExtractionResult result(testFilePath("test.epub"), "application/epub+zip"); plugin.extract(&result); QCOMPARE(result.types().size(), 1); QCOMPARE(result.types().constFirst(), Type::Document); // We're doing a contains instead of an exact check cause the epub file contains // a ton of css and other garbage. QVERIFY(result.text().contains(QStringLiteral("This is a sample PDF file for KFileMetaData."))); QCOMPARE(result.properties().value(Property::Author), QVariant(QStringLiteral("Happy Man"))); QCOMPARE(result.properties().value(Property::Publisher), QVariant(QStringLiteral("Happy Publisher"))); QCOMPARE(result.properties().value(Property::Title), QVariant(QStringLiteral("The Big Brown Bear"))); QCOMPARE(result.properties().value(Property::Subject), QVariant(QStringLiteral("Baloo KFileMetaData"))); - QCOMPARE(result.properties().value(Property::Comment), QVariant(QStringLiteral("Honey"))); + QCOMPARE(result.properties().value(Property::Description), QVariant(QStringLiteral("Honey"))); QDateTime dt(QDate(2014, 1, 1), QTime(1, 1, 1)); dt.setTimeSpec(Qt::UTC); QCOMPARE(result.properties().value(Property::CreationDate), QVariant(dt)); QCOMPARE(result.properties().value(Property::ReleaseYear), QVariant(2014)); QCOMPARE(result.properties().size(), 7); } void EPubExtractorTest::testMetaDataOnly() { EPubExtractor plugin{this}; SimpleExtractionResult result(testFilePath("test.epub"), "application/epub+zip", ExtractionResult::ExtractMetaData); plugin.extract(&result); QVERIFY(!result.types().isEmpty()); QVERIFY(!result.properties().isEmpty()); QVERIFY(result.text().isEmpty()); } QTEST_GUILESS_MAIN(EPubExtractorTest) diff --git a/autotests/odfextractortest.cpp b/autotests/odfextractortest.cpp index ab62da5..c5c8335 100644 --- a/autotests/odfextractortest.cpp +++ b/autotests/odfextractortest.cpp @@ -1,110 +1,110 @@ /* * * Copyright (C) 2014 Vishesh Handa * Copyright (C) 2016 Christoph Cullmann * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "odfextractortest.h" #include #include #include #include "simpleextractionresult.h" #include "indexerextractortestsconfig.h" #include "extractors/odfextractor.h" using namespace KFileMetaData; QString OdfExtractorTest::testFilePath(const QString& fileName) const { return QLatin1String(INDEXER_TESTS_SAMPLE_FILES_PATH) + QLatin1Char('/') + fileName; } void OdfExtractorTest::testText() { OdfExtractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test.odt")), QStringLiteral("application/vnd.oasis.opendocument.text")); plugin.extract(&result); QCOMPARE(result.types().size(), 1); QCOMPARE(result.types().at(0), Type::Document); QCOMPARE(result.properties().value(Property::Title), QVariant(QStringLiteral("KFileMetaData Title"))); QCOMPARE(result.properties().value(Property::Subject), QVariant(QStringLiteral("KFileMetaData Subject"))); QCOMPARE(result.properties().value(Property::Keywords), QVariant(QStringLiteral("KFileMetaData keyword"))); - QCOMPARE(result.properties().value(Property::Comment), QVariant(QStringLiteral("KFileMetaData comment"))); + QCOMPARE(result.properties().value(Property::Description), QVariant(QStringLiteral("KFileMetaData description"))); QVERIFY(result.properties().value(Property::Generator).toString().contains(QStringLiteral("LibreOffice"))); QDateTime dt(QDate(2014, 07, 01), QTime(17, 37, 40, 690)); QCOMPARE(result.properties().value(Property::CreationDate), QVariant(dt)); QCOMPARE(result.properties().value(Property::WordCount), QVariant(4)); QCOMPARE(result.properties().value(Property::PageCount), QVariant(1)); QCOMPARE(result.text(), QStringLiteral("Test file for KFileMetaData. ")); QCOMPARE(result.properties().size(), 8); } void OdfExtractorTest::testTextMetaDataOnly() { OdfExtractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test.odt")), QStringLiteral("application/vnd.oasis.opendocument.text"), ExtractionResult::ExtractMetaData); plugin.extract(&result); QCOMPARE(result.types().size(), 1); QCOMPARE(result.properties().size(), 8); QVERIFY(result.text().isEmpty()); } void OdfExtractorTest::testPresentation() { OdfExtractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test.odp")), QStringLiteral("application/vnd.oasis.opendocument.presentation")); plugin.extract(&result); QCOMPARE(result.types().size(), 2); QCOMPARE(result.types().at(0), Type::Document); QCOMPARE(result.types().at(1), Type::Presentation); QVERIFY(result.properties().value(Property::Generator).toString().contains(QStringLiteral("LibreOffice"))); QDateTime dt(QDate(2014, 07, 02), QTime(10, 59, 23, 434)); QCOMPARE(result.properties().value(Property::CreationDate), QVariant(dt)); QCOMPARE(result.text(), QStringLiteral("KFileMetaData Pres ")); } void OdfExtractorTest::testTextMissingMetaNoCrash() { OdfExtractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test_missing_meta.odt")), QStringLiteral("application/vnd.oasis.opendocument.text")); plugin.extract(&result); } void OdfExtractorTest::testTextMissingContentNoCrash() { OdfExtractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test_missing_content.odt")), QStringLiteral("application/vnd.oasis.opendocument.text")); plugin.extract(&result); } QTEST_GUILESS_MAIN(OdfExtractorTest) diff --git a/autotests/office2007extractortest.cpp b/autotests/office2007extractortest.cpp index fbe00df..aed28b9 100644 --- a/autotests/office2007extractortest.cpp +++ b/autotests/office2007extractortest.cpp @@ -1,77 +1,77 @@ /* * * Copyright (C) 2014 Vishesh Handa * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "office2007extractortest.h" #include "simpleextractionresult.h" #include "indexerextractortestsconfig.h" #include "extractors/office2007extractor.h" #include #include #include using namespace KFileMetaData; QString Office2007ExtractorTest::testFilePath(const QString& fileName) const { return QLatin1String(INDEXER_TESTS_SAMPLE_FILES_PATH) + QLatin1Char('/') + fileName; } void Office2007ExtractorTest::test() { Office2007Extractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test_libreoffice.docx")), QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); plugin.extract(&result); QCOMPARE(result.types().size(), 1); QCOMPARE(result.types().at(0), Type::Document); QCOMPARE(result.properties().value(Property::Title), QVariant(QStringLiteral("KFileMetaData Title"))); QCOMPARE(result.properties().value(Property::Subject), QVariant(QStringLiteral("KFileMetaData Subject"))); QCOMPARE(result.properties().value(Property::Keywords), QVariant(QStringLiteral("KFileMetaData keyword"))); - QCOMPARE(result.properties().value(Property::Comment), QVariant(QStringLiteral("KFileMetaData comment"))); + QCOMPARE(result.properties().value(Property::Description), QVariant(QStringLiteral("KFileMetaData comment"))); QCOMPARE(result.properties().value(Property::Language), QVariant(QStringLiteral("en-US"))); QVERIFY(result.properties().value(Property::Generator).toString().contains(QStringLiteral("LibreOffice"))); QDateTime dt(QDate(2014, 07, 01), QTime(17, 37, 40)); dt.setTimeSpec(Qt::UTC); QCOMPARE(result.properties().value(Property::CreationDate), QVariant(dt)); QCOMPARE(result.properties().size(), 7); QCOMPARE(result.text(), QStringLiteral("Test file for KFileMetaData. ")); } void Office2007ExtractorTest::testMetaDataOnly() { Office2007Extractor plugin{this}; SimpleExtractionResult result(testFilePath(QStringLiteral("test_libreoffice.docx")), QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), ExtractionResult::ExtractMetaData); plugin.extract(&result); QVERIFY(!result.types().isEmpty()); QVERIFY(!result.properties().isEmpty()); QVERIFY(result.text().isEmpty()); } QTEST_GUILESS_MAIN(Office2007ExtractorTest) diff --git a/autotests/samplefiles/test.odt b/autotests/samplefiles/test.odt index c51da66..41a2129 100644 Binary files a/autotests/samplefiles/test.odt and b/autotests/samplefiles/test.odt differ diff --git a/src/extractors/dublincoreextractor.cpp b/src/extractors/dublincoreextractor.cpp index 3ed4ca6..f6b5720 100644 --- a/src/extractors/dublincoreextractor.cpp +++ b/src/extractors/dublincoreextractor.cpp @@ -1,63 +1,63 @@ /* Helper class to extract XML encoded Dublin Core metadata Copyright (C) 2018 Stefan Brüns This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "dublincoreextractor.h" #include "extractionresult.h" namespace { inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); } inline QString dctermsNS() { return QStringLiteral("http://purl.org/dc/terms/"); } } namespace KFileMetaData { void DublinCoreExtractor::extract(ExtractionResult* result, const QDomNode& fragment) { QDomElement e = fragment.firstChildElement(); while (!e.isNull()) { const QString namespaceURI = e.namespaceURI(); const QString localName = e.localName(); // Dublin Core // According to http://dublincore.org/documents/dces/, the // properties should be treated the same regardless if // used in the legacy DCES or DCMI-TERMS variant if (namespaceURI == dcNS() || namespaceURI == dctermsNS()) { if (localName == QLatin1String("description")) { - result->add(Property::Comment, e.text()); + result->add(Property::Description, e.text()); } else if (localName == QLatin1String("subject")) { result->add(Property::Subject, e.text()); } else if (localName == QLatin1String("title")) { result->add(Property::Title, e.text()); } else if (localName == QLatin1String("creator")) { result->add(Property::Author, e.text()); } else if (localName == QLatin1String("language")) { result->add(Property::Language, e.text()); } } e = e.nextSiblingElement(); } } } // namespace KFileMetaData diff --git a/src/extractors/epubextractor.cpp b/src/extractors/epubextractor.cpp index f427022..395aeb5 100644 --- a/src/extractors/epubextractor.cpp +++ b/src/extractors/epubextractor.cpp @@ -1,192 +1,192 @@ /* Copyright (C) 2013 Vishesh Handa Copyright (C) 2016 Christoph Cullmann This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "epubextractor.h" #include #include #include #include using namespace KFileMetaData; EPubExtractor::EPubExtractor(QObject* parent) : ExtractorPlugin(parent) { } namespace { static const QStringList supportedMimeTypes = { QStringLiteral("application/epub+zip"), }; QString fetchMetadata(struct epub* e, const epub_metadata& type) { int size = 0; unsigned char** data = epub_get_metadata(e, type, &size); if (data) { QStringList strList; for (int i = 0; i < size; i++) { // skip nullptr entries, can happen for broken xml files if (!data[i]) continue; strList << QString::fromUtf8((char*)data[i]); free(data[i]); } free(data); return strList.join(QLatin1String(", ")); } return QString(); } } QStringList EPubExtractor::mimetypes() const { return supportedMimeTypes; } void EPubExtractor::extract(ExtractionResult* result) { // open epub, return on exit, file will be closed again at end of function auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1); if (!ePubDoc) { qWarning() << "Invalid document"; return; } result->addType(Type::Document); QString value = fetchMetadata(ePubDoc, EPUB_TITLE); if (!value.isEmpty()) { result->add(Property::Title, value); } value = fetchMetadata(ePubDoc, EPUB_SUBJECT); if (!value.isEmpty()) { result->add(Property::Subject, value); } value = fetchMetadata(ePubDoc, EPUB_CREATOR); if (!value.isEmpty()) { if (value.startsWith(QLatin1String("aut:"), Qt::CaseInsensitive)) { value = value.mid(4).simplified(); } else if (value.startsWith(QLatin1String("author:"), Qt::CaseInsensitive)) { value = value.mid(7).simplified(); } // A lot of authors have their name written in () again. We discard that part int index = value.indexOf(QLatin1Char('(')); if (index) value = value.mid(0, index); result->add(Property::Author, value); } // The Contributor just seems to be mostly Calibre aka the Generator /* value = fetchMetadata(ePubDoc, EPUB_CONTRIB); if( !value.isEmpty() ) { SimpleResource con; con.addType( NCO::Contact() ); con.addProperty( NCO::fullname(), value ); fileRes.addProperty( NCO::contributor(), con ); graph << con; }*/ value = fetchMetadata(ePubDoc, EPUB_PUBLISHER); if (!value.isEmpty()) { result->add(Property::Publisher, value); } value = fetchMetadata(ePubDoc, EPUB_DESCRIPTION); if (!value.isEmpty()) { - result->add(Property::Comment, value); + result->add(Property::Description, value); } value = fetchMetadata(ePubDoc, EPUB_DATE); if (!value.isEmpty()) { if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) { value = value.mid(QByteArray("Unspecified:").size()).simplified(); } int ind = value.indexOf(QLatin1String("publication:"), Qt::CaseInsensitive); if (ind != -1) { value = value.mid(ind + QByteArray("publication:").size()).simplified(); } QDateTime dt = ExtractorPlugin::dateTimeFromString(value); if (!dt.isNull()) { result->add(Property::CreationDate, dt); result->add(Property::ReleaseYear, dt.date().year()); } } // // Plain Text // if (result->inputFlags() & ExtractionResult::ExtractPlainText) { if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) { do { char* curr = epub_it_get_curr(iter); if (!curr) continue; QString html = QString::fromUtf8(curr); html.remove(QRegularExpression(QStringLiteral("<[^>]*>"))); result->append(html); } while (epub_it_get_next(iter)); epub_free_iterator(iter); } auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0); if (!tit) { tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0); } if (tit) { if (epub_tit_curr_valid(tit)) { do { // get link, iterator handles freeing of it char* clink = epub_tit_get_curr_link(tit); // epub_get_data returns -1 on failure char* data = nullptr; const int size = epub_get_data(ePubDoc, clink, &data); if (size >= 0 && data) { QString html = QString::fromUtf8(data, size); // strip html tags html.remove(QRegularExpression(QStringLiteral("<[^>]*>"))); result->append(html); free(data); } } while (epub_tit_next(tit)); } epub_free_titerator(tit); } } // close epub file again epub_close(ePubDoc); } diff --git a/src/extractors/odfextractor.cpp b/src/extractors/odfextractor.cpp index 3f19ac9..0e054d2 100644 --- a/src/extractors/odfextractor.cpp +++ b/src/extractors/odfextractor.cpp @@ -1,177 +1,177 @@ /* Copyright (C) 2013 Vishesh Handa Copyright (C) 2012 Jörg Ehrichs Copyright (C) 2016 Christoph Cullmann This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "odfextractor.h" #include #include #include #include namespace { inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); } inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); } inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); } QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName) { for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) { if (e.localName() == localName && e.namespaceURI() == nsURI) { return e; } } return QDomElement(); } const QStringList supportedMimeTypes = { QStringLiteral("application/vnd.oasis.opendocument.text"), QStringLiteral("application/vnd.oasis.opendocument.presentation"), QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"), }; } using namespace KFileMetaData; OdfExtractor::OdfExtractor(QObject* parent) : ExtractorPlugin(parent) { } QStringList OdfExtractor::mimetypes() const { return supportedMimeTypes; } void OdfExtractor::extract(ExtractionResult* result) { KZip zip(result->inputUrl()); if (!zip.open(QIODevice::ReadOnly)) { qWarning() << "Document is not a valid ZIP archive"; return; } const KArchiveDirectory* directory = zip.directory(); if (!directory) { qWarning() << "Invalid document structure (main directory is missing)"; return; } // we need a meta xml file in the archive! const auto metaXml = directory->entry(QStringLiteral("meta.xml")); if (!metaXml || !metaXml->isFile()) { qWarning() << "Invalid document structure (meta.xml is missing)"; return; } QDomDocument metaData(QStringLiteral("metaData")); metaData.setContent(static_cast(metaXml)->data(), true); // parse metadata ... QDomElement meta = firstChildElementNS(firstChildElementNS(metaData, officeNS(), QStringLiteral("document-meta")), officeNS(), QStringLiteral("meta")); QDomNode n = meta.firstChild(); while (!n.isNull()) { QDomElement e = n.toElement(); if (!e.isNull()) { const QString namespaceURI = e.namespaceURI(); const QString localName = e.localName(); // Dublin Core if (namespaceURI == dcNS()) { if (localName == QLatin1String("description")) { - result->add(Property::Comment, e.text()); + result->add(Property::Description, e.text()); } else if (localName == QLatin1String("subject")) { result->add(Property::Subject, e.text()); } else if (localName == QLatin1String("title")) { result->add(Property::Title, e.text()); } else if (localName == QLatin1String("creator")) { result->add(Property::Author, e.text()); } else if (localName == QLatin1String("language")) { result->add(Property::Language, e.text()); } } // Meta Properties else if (namespaceURI == metaNS()) { if (localName == QLatin1String("document-statistic")) { bool ok = false; int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok); if (ok) { result->add(Property::PageCount, pageCount); } int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok); if (ok) { result->add(Property::WordCount, wordCount); } } else if (localName == QLatin1String("keyword")) { QString keywords = e.text(); result->add(Property::Keywords, keywords); } else if (localName == QLatin1String("generator")) { result->add(Property::Generator, e.text()); } else if (localName == QLatin1String("creation-date")) { QDateTime dt = ExtractorPlugin::dateTimeFromString(e.text()); if (!dt.isNull()) result->add(Property::CreationDate, dt); } } } n = n.nextSibling(); } result->addType(Type::Document); if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) { result->addType(Type::Presentation); } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) { result->addType(Type::Spreadsheet); } if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) { return; } // for content indexing, we need content xml file const auto contentXml = directory->entry(QStringLiteral("content.xml")); if (!contentXml || !contentXml->isFile()) { qWarning() << "Invalid document structure (content.xml is missing)"; return; } QXmlStreamReader xml(static_cast(contentXml)->createDevice()); while (!xml.atEnd()) { xml.readNext(); if (xml.isCharacters()) { QString str = xml.text().toString(); result->append(str); } if (xml.hasError() || xml.isEndDocument()) break; } } diff --git a/src/extractors/office2007extractor.cpp b/src/extractors/office2007extractor.cpp index 33537b2..10ea544 100644 --- a/src/extractors/office2007extractor.cpp +++ b/src/extractors/office2007extractor.cpp @@ -1,294 +1,294 @@ /* Copyright (C) 2013 Vishesh Handa This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "office2007extractor.h" #include #include #include #include using namespace KFileMetaData; Office2007Extractor::Office2007Extractor(QObject* parent) : ExtractorPlugin(parent) { } const QStringList supportedMimeTypes = { QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"), QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), }; QStringList Office2007Extractor::mimetypes() const { return supportedMimeTypes; } void Office2007Extractor::extract(ExtractionResult* result) { KZip zip(result->inputUrl()); if (!zip.open(QIODevice::ReadOnly)) { qWarning() << "Document is not a valid ZIP archive"; return; } const KArchiveDirectory* rootDir = zip.directory(); if (!rootDir) { qWarning() << "Invalid document structure (main directory is missing)"; return; } const QStringList rootEntries = rootDir->entries(); if (!rootEntries.contains(QStringLiteral("docProps"))) { qWarning() << "Invalid document structure (docProps is missing)"; return; } const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps")); if (!docPropEntry->isDirectory()) { qWarning() << "Invalid document structure (docProps is not a directory)"; return; } const KArchiveDirectory* docPropDirectory = dynamic_cast(docPropEntry); const QStringList docPropsEntries = docPropDirectory->entries(); if (docPropsEntries.contains(QStringLiteral("core.xml"))) { QDomDocument coreDoc(QStringLiteral("core")); const KArchiveFile* file = static_cast(docPropDirectory->entry(QStringLiteral("core.xml"))); coreDoc.setContent(file->data()); QDomElement docElem = coreDoc.documentElement(); QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description")); if (!elem.isNull()) { QString str = elem.text(); if (!str.isEmpty()) { - result->add(Property::Comment, str); + result->add(Property::Description, str); } } elem = docElem.firstChildElement(QStringLiteral("dc:subject")); if (!elem.isNull()) { QString str = elem.text(); if (!str.isEmpty()) { result->add(Property::Subject, str); } } elem = docElem.firstChildElement(QStringLiteral("dc:title")); if (!elem.isNull()) { QString str = elem.text(); if (!str.isEmpty()) { result->add(Property::Title, str); } } elem = docElem.firstChildElement(QStringLiteral("dc:creator")); if (!elem.isNull()) { QString str = elem.text(); if (!str.isEmpty()) { result->add(Property::Author, str); } } elem = docElem.firstChildElement(QStringLiteral("dc:language")); if (!elem.isNull()) { QString str = elem.text(); if (!str.isEmpty()) { result->add(Property::Language, str); } } elem = docElem.firstChildElement(QStringLiteral("dcterms:created")); if (!elem.isNull()) { QString str = elem.text(); QDateTime dt = dateTimeFromString(str); if (!dt.isNull()) { result->add(Property::CreationDate, dt); } } elem = docElem.firstChildElement(QStringLiteral("cp:keywords")); if (!elem.isNull()) { QString str = elem.text(); if (!str.isEmpty()) { result->add(Property::Keywords, str); } } } if (docPropsEntries.contains(QStringLiteral("app.xml"))) { QDomDocument appDoc(QStringLiteral("app")); const KArchiveFile* file = static_cast(docPropDirectory->entry(QStringLiteral("app.xml"))); appDoc.setContent(file->data()); QDomElement docElem = appDoc.documentElement(); // According to the ontologies only Documents can have a wordCount and pageCount const QString mimeType = result->inputMimetype(); if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages")); if (!elem.isNull()) { bool ok = false; int pageCount = elem.text().toInt(&ok); if (ok) { result->add(Property::PageCount, pageCount); } } elem = docElem.firstChildElement(QStringLiteral("Words")); if (!elem.isNull()) { bool ok = false; int wordCount = elem.text().toInt(&ok); if (ok) { result->add(Property::WordCount, wordCount); } } } QDomElement elem = docElem.firstChildElement(QStringLiteral("Application")); if (!elem.isNull()) { QString app = elem.text(); if (!app.isEmpty()) { result->add(Property::Generator, app); } } } // // Plain Text // bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText); if (rootEntries.contains(QStringLiteral("word"))) { result->addType(Type::Document); if (!extractPlainText) return; const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word")); if (!wordEntry->isDirectory()) { qWarning() << "Invalid document structure (word is not a directory)"; return; } const KArchiveDirectory* wordDirectory = dynamic_cast(wordEntry); const QStringList wordEntries = wordDirectory->entries(); if (wordEntries.contains(QStringLiteral("document.xml"))) { const KArchiveFile* file = static_cast(wordDirectory->entry(QStringLiteral("document.xml"))); extractTextWithTag(file->createDevice(), QStringLiteral("w:t"), result); } } else if (rootEntries.contains(QStringLiteral("xl"))) { result->addType(Type::Document); result->addType(Type::Spreadsheet); if (!extractPlainText) return; const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl")); if (!xlEntry->isDirectory()) { qWarning() << "Invalid document structure (xl is not a directory)"; return; } const KArchiveDirectory* xlDirectory = dynamic_cast(xlEntry); extractTextFromFiles(xlDirectory, result); } else if (rootEntries.contains(QStringLiteral("ppt"))) { result->addType(Type::Document); result->addType(Type::Presentation); if (!extractPlainText) return; const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt")); if (!pptEntry->isDirectory()) { qWarning() << "Invalid document structure (ppt is not a directory)"; return; } const KArchiveDirectory* pptDirectory = dynamic_cast(pptEntry); extractTextFromFiles(pptDirectory, result); } } void Office2007Extractor::extractAllText(QIODevice* device, ExtractionResult* result) { QXmlStreamReader xml(device); while (!xml.atEnd()) { xml.readNext(); if (xml.isCharacters()) { QString str = xml.text().toString(); result->append(str); } if (xml.isEndDocument() || xml.hasError()) break; } } void Office2007Extractor::extractTextFromFiles(const KArchiveDirectory* archiveDir, ExtractionResult* result) { const QStringList entries = archiveDir->entries(); foreach(const QString & entryName, entries) { const KArchiveEntry* entry = archiveDir->entry(entryName); if (entry->isDirectory()) { const KArchiveDirectory* subDir = dynamic_cast(entry); extractTextFromFiles(subDir, result); continue; } if (!entryName.endsWith(QLatin1String(".xml"))) continue; const KArchiveFile* file = static_cast(entry); extractAllText(file->createDevice(), result); } } void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result) { QXmlStreamReader xml(device); while (!xml.atEnd()) { xml.readNext(); if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) { QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements); if (!str.isEmpty()) { result->append(str); } } if (xml.isEndDocument() || xml.hasError()) break; } }