diff --git a/autotests/CMakeLists.txt b/autotests/CMakeLists.txt
index 728844d..11b9d3a 100644
--- a/autotests/CMakeLists.txt
+++ b/autotests/CMakeLists.txt
@@ -1,11 +1,12 @@
 ecm_add_test(datatypestest.cpp LINK_LIBRARIES Qt5::Test Qt5::Qml KPim::Itinerary)
 ecm_add_test(jsonlddocumenttest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 ecm_add_test(mergeutiltest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 ecm_add_test(airportdbtest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 add_definitions(-DSOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
 ecm_add_test(bcbpparsertest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 ecm_add_test(structureddataextractortest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
+ecm_add_test(preprocessortest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 ecm_add_test(unstructureddataextractortest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 ecm_add_test(pkpassextractortest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary KPim::PkPass)
 ecm_add_test(postprocessortest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
 ecm_add_test(calendarhandlertest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
diff --git a/autotests/preprocessortest.cpp b/autotests/preprocessortest.cpp
new file mode 100644
index 0000000..a1649aa
--- /dev/null
+++ b/autotests/preprocessortest.cpp
@@ -0,0 +1,60 @@
+/*
+    Copyright (c) 2018 Volker Krause
+
+    This library is free software; you can redistribute it and/or modify it
+    under the terms of the GNU Library General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or (at your
+    option) any later version.
+
+    This library is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+    License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB. If not, write to the
+    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+    02110-1301, USA.
+*/
+
+#include <extractorpreprocessor.h>
+
+#include <QObject>
+#include <QString>
+#include <QTest>
+
+using namespace KItinerary;
+
+// Data-driven checks for ExtractorPreprocessor::preprocessHtml().
+class PreprocessorTest : public QObject
+{
+    Q_OBJECT
+private Q_SLOTS:
+    void testPreProcHtml_data()
+    {
+        // "in" is the HTML passed to preprocessHtml(), "out" the text() expected afterwards
+        QTest::addColumn<QString>("in");
+        QTest::addColumn<QString>("out");
+
+        QTest::newRow("empty") << QString() << QString();
+        QTest::newRow("nbsp removal") << QStringLiteral("abc&nbsp;def") << QStringLiteral("abc def");
+        QTest::newRow("unknown entity") << QStringLiteral("abc&kde;def") << QStringLiteral("abc&kde;def");
+        QTest::newRow("unquoted amp leading") << QStringLiteral("abc&something def") << QStringLiteral("abc&something def");
+        QTest::newRow("unquoted amp mid") << QStringLiteral("123 abc&def ghi") << QStringLiteral("123 abc&def ghi");
+        QTest::newRow("unquoted amp trailing") << QStringLiteral("abc def&ghi") << QStringLiteral("abc def&ghi");
+    }
+
+    void testPreProcHtml()
+    {
+        QFETCH(QString, in);
+        QFETCH(QString, out);
+
+        ExtractorPreprocessor preproc;
+        preproc.preprocessHtml(in);
+        QCOMPARE(preproc.text(), out);
+    }
+};
+
+QTEST_APPLESS_MAIN(PreprocessorTest)
+
+#include "preprocessortest.moc"
diff --git a/src/extractorpreprocessor.cpp b/src/extractorpreprocessor.cpp
index e939646..2b4c66b 100644
--- a/src/extractorpreprocessor.cpp
+++ b/src/extractorpreprocessor.cpp
@@ -1,116 +1,139 @@
 /*
     Copyright (c) 2017 Volker Krause
 
     This library is free software; you can redistribute it and/or modify it
     under the terms of the GNU Library General Public License as published by
     the Free Software Foundation; either version 2 of the License, or (at your
     option) any later version.
 
     This library is distributed in the hope that it will be useful, but WITHOUT
     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
     License for more details.
 
     You should have received a copy of the GNU Library General Public License
     along with this library; see the file COPYING.LIB. If not, write to the
     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     02110-1301, USA.
 */
 
 #include "config-kitinerary.h"
 #include "extractorpreprocessor.h"
 #include "logging.h"
 
 #ifdef HAVE_POPPLER
 #include <poppler-qt5.h>
 #endif
 
 #include <memory>
+#include <tuple>
 
 using namespace KItinerary;
 
 void ExtractorPreprocessor::preprocessPlainText(const QString &input)
 {
     m_buffer = input;
 }
 
 void ExtractorPreprocessor::preprocessHtml(const QString &input)
 {
     m_buffer.reserve(input.size());
     int begin = 0;
     int end = input.indexOf(QLatin1Char('<'), begin);
     while (begin < input.size() && end < input.size() && end >= 0 && begin >= 0) {
         if (end > begin) {
             replaceEntityAndAppend(input.midRef(begin, end - begin));
         }
         begin = input.indexOf(QLatin1Char('>'), end);
         if (begin < 0) {
             break;
         }
 
         // replace elements with something suitable for field separation
         const auto elementName = input.mid(end + 1, begin - end - 1);
         if (elementName.startsWith(QLatin1String("br"), Qt::CaseInsensitive)) {
             m_buffer.append(QLatin1Char('\n'));
         } else {
             m_buffer.append(QLatin1Char(' '));
         }
 
         ++begin;
         end = input.indexOf(QLatin1Char('<'), begin);
     }
     if (begin >= 0 && end < 0) {
         replaceEntityAndAppend(input.midRef(begin));
     }
 
     //qCDebug(Log) << "Preprocessed HTML content: " << m_buffer;
 }
 
 void ExtractorPreprocessor::preprocessPdf(const QByteArray &input)
 {
 #ifdef HAVE_POPPLER
     std::unique_ptr<Poppler::Document> doc(Poppler::Document::loadFromData(input));
     if (!doc || doc->isLocked()) {
         return;
     }
 
     for (int i = 0, total = doc->numPages(); i < total; ++i) {
         std::unique_ptr<Poppler::Page> page(doc->page(i));
         m_buffer += page->text({}, Poppler::Page::PhysicalLayout);
     }
 #else
     Q_UNUSED(input);
 #endif
 }
 
 QString ExtractorPreprocessor::text() const
 {
     return m_buffer;
 }
 
+/**
+ * Locates the next @p c1 ... @p c2 delimited range in @p text, searching from @p start.
+ * A further @p c1 found before the closing @p c2 restarts the range there, so only
+ * the @p c1 nearest to the @p c2 is reported.
+ * @return (index of c1, index of c2), or (-1, -1) if no such range exists.
+ */
+static std::tuple<int, int> findRange(const QStringRef &text, QChar c1, QChar c2, int start = 0)
+{
+    int begin = text.indexOf(c1, start);
+    if (begin < 0) {
+        return std::make_tuple(-1, -1);
+    }
+
+    for (int end = begin + 1; end < text.size(); ++end) {
+        if (text.at(end) == c2) {
+            return std::make_tuple(begin, end);
+        }
+        if (text.at(end) == c1) {
+            begin = end;
+        }
+    }
+
+    return std::make_tuple(-1, -1);
+}
+
 void ExtractorPreprocessor::replaceEntityAndAppend(const QStringRef &source)
 {
-    int begin = 0;
-    int end = source.indexOf(QLatin1Char('&'), begin);
+    int pos = 0;
+    int begin, end;
+    std::tie(begin, end) = findRange(source, QLatin1Char('&'), QLatin1Char(';'));
     while (begin < source.size() && end < source.size() && end >= 0 && begin >= 0) {
-        if (end > begin) {
-            m_buffer.append(source.mid(begin, end - begin));
-        }
-        begin = source.indexOf(QLatin1Char(';'), end);
-        if (begin < 0) {
-            break;
+        if (begin > pos) {
+            m_buffer.append(source.mid(pos, begin - pos));
         }
-        const auto entityName = source.mid(end + 1, begin - end - 1);
+
+        const auto entityName = source.mid(begin + 1, end - begin - 1);
         if (entityName == QLatin1String("nbsp")) {
             m_buffer.append(QLatin1Char(' '));
         } else {
             // keep unknown entities
-            m_buffer.append(source.mid(end, begin - end + 1));
+            m_buffer.append(source.mid(begin, end - begin + 1));
         }
-        ++begin;
-        end = source.indexOf(QLatin1Char('&'), begin);
-    }
-    if (begin >= 0 && end < 0) {
-        m_buffer.append(source.mid(begin));
+        pos = end + 1;
+        std::tie(begin, end) = findRange(source, QLatin1Char('&'), QLatin1Char(';'), pos);
     }
+    // append the text after the last entity (the whole input if none was found)
+    m_buffer.append(source.mid(pos));
 }