diff --git a/CMakeLists.txt b/CMakeLists.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -111,6 +111,14 @@
     PURPOSE "Needed to build the MTP kioslave"
     )
 
+find_package(libmagic)
+set_package_properties(libmagic PROPERTIES DESCRIPTION "The libmagic library"
+    URL "https://www.darwinsys.com/file/"
+    TYPE OPTIONAL
+    PURPOSE "Optional to improve the text thumbnail creator encoding detection"
+    )
+
+
 check_include_file(utime.h HAVE_UTIME_H)
 
 # ECM's KDECompilerSettings.cmake should take care of enabling supporting on
diff --git a/cmake/Findlibmagic.cmake b/cmake/Findlibmagic.cmake
new file mode 100644
--- /dev/null
+++ b/cmake/Findlibmagic.cmake
@@ -0,0 +1,65 @@
+# - Try to find libmagic
+# Once done this will define
+#
+#  LIBMAGIC_FOUND - system has libmagic
+#  LIBMAGIC_INCLUDE_DIR - the libmagic include directory
+#  LIBMAGIC_LIBRARIES - Link these to use libmagic
+#  LIBMAGIC_DEFINITIONS - Compiler switches required for using libmagic
+#
+# Copyright (c) 2020 Méven Car
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. Neither the name of the University nor the names of its contributors
+#    may be used to endorse or promote products derived from this software
+#    without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+
+
+if (LIBMAGIC_INCLUDE_DIR AND LIBMAGIC_LIBRARIES)
+
+    # in cache already
+    SET(LIBMAGIC_FOUND TRUE)
+
+else (LIBMAGIC_INCLUDE_DIR AND LIBMAGIC_LIBRARIES)
+    if(NOT WIN32)
+        # use pkg-config to get the directories and then use these values
+        # in the FIND_PATH() and FIND_LIBRARY() calls
+        INCLUDE(FindPkgConfig)
+
+        pkg_check_modules(_LIBMAGIC libmagic)
+
+        set(LIBMAGIC_DEFINITIONS ${_LIBMAGIC_CFLAGS})
+    endif(NOT WIN32)
+    FIND_PATH(LIBMAGIC_INCLUDE_DIR magic.h
+        ${_LIBMAGIC_INCLUDE_DIRS}
+    )
+
+    FIND_LIBRARY(LIBMAGIC_LIBRARIES NAMES magic libmagic
+        PATHS
+        ${_LIBMAGIC_LIBRARY_DIRS}
+    )
+
+    if (LIBMAGIC_INCLUDE_DIR AND LIBMAGIC_LIBRARIES)
+        set(LIBMAGIC_FOUND TRUE)
+    endif (LIBMAGIC_INCLUDE_DIR AND LIBMAGIC_LIBRARIES)
+
+
+endif (LIBMAGIC_INCLUDE_DIR AND LIBMAGIC_LIBRARIES)
diff --git a/thumbnail/CMakeLists.txt b/thumbnail/CMakeLists.txt
--- a/thumbnail/CMakeLists.txt
+++ b/thumbnail/CMakeLists.txt
@@ -103,6 +103,11 @@
     KF5::KIOWidgets
     KF5::SyntaxHighlighting
 )
+configure_file(config-thumbnail.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config-thumbnail.h )
+if (LIBMAGIC_FOUND)
+    target_include_directories(textthumbnail PRIVATE ${LIBMAGIC_INCLUDE_DIR})
+    target_link_libraries(textthumbnail ${LIBMAGIC_LIBRARIES})
+endif()
 
 install(TARGETS textthumbnail DESTINATION ${KDE_INSTALL_PLUGINDIR})
 
diff --git a/thumbnail/config-thumbnail.h.cmake b/thumbnail/config-thumbnail.h.cmake
new file mode 100644
--- /dev/null
+++ b/thumbnail/config-thumbnail.h.cmake
@@ -0,0 +1 @@
+#cmakedefine01 LIBMAGIC_FOUND
diff --git a/thumbnail/textcreator.cpp
b/thumbnail/textcreator.cpp
--- a/thumbnail/textcreator.cpp
+++ b/thumbnail/textcreator.cpp
@@ -34,8 +34,10 @@
 #include
 #include
 
-// TODO Fix or remove kencodingprober code
-// #include
+#include
+#if LIBMAGIC_FOUND
+    #include "magic.h"
+#endif
 
 extern "C"
 {
@@ -56,17 +58,62 @@
     delete [] m_data;
 }
 
-static QTextCodec *codecFromContent(const char *data, int dataSize)
+#if LIBMAGIC_FOUND
+// Ask libmagic for the text encoding of the file at path and map it to a Qt codec.
+// Returns nullptr if libmagic fails or Qt has no codec for the reported encoding.
+static QTextCodec *codecFromFile(const QString &path)
+{
+    magic_t magicCookie = magic_open(MAGIC_MIME_ENCODING);
+    if (magicCookie == nullptr) {
+        return nullptr;
+    }
+
+    QByteArray encoding;
+    if (magic_load(magicCookie, nullptr) == 0) {
+        // magic_file() returns a string owned by the cookie (nullptr on error):
+        // copy it before magic_close() releases it
+        encoding = QByteArray(magic_file(magicCookie, path.toLocal8Bit().constData()));
+    }
+    magic_close(magicCookie);
+
+    if (encoding.isEmpty()) {
+        return nullptr;
+    }
+
+    if (encoding == "unknown-8bit") {
+        // use latin for unknown 8bit as it is quite versatile
+        return QTextCodec::codecForName("latin-1");
+    }
+
+    // codecForName() matches case-insensitively and returns nullptr for unknown names
+    return QTextCodec::codecForName(encoding);
+}
+#endif
+
+static QTextCodec *codecFromContent(const char *data, int dataSize, const QString &path)
 {
-#if 0 // ### Use this when KEncodingProber does not return junk encoding for UTF-8 data)
-    KEncodingProber prober;
-    prober.feed(data, dataSize);
-    return QTextCodec::codecForName(prober.encoding());
-#else
     QByteArray ba = QByteArray::fromRawData(data, dataSize);
     // try to detect UTF text, fall back to locale default (which is usually UTF-8)
-    return QTextCodec::codecForUtfText(ba, QTextCodec::codecForLocale());
+    const auto codec = QTextCodec::codecForUtfText(ba, nullptr);
+
+    if (codec == nullptr) {
+        // UTF-8 BOM detection failed
+        auto localCodec = QTextCodec::codecForLocale();
+#if LIBMAGIC_FOUND
+        if (localCodec->name() == "UTF-8") {
+            // UTF-8 was already tested in QTextCodec::codecForUtfText
+            // but only using bom presence
+            // The file could be utf-8 still or something entirely different
+            // use libmagic heuristics
+            auto codecMagic = codecFromFile(path);
+            if (codecMagic != nullptr) {
+                return codecMagic;
+            }
+        }
 #endif
+        return localCodec;
+    }
+    return codec;
 }
 
 bool TextCreator::create(const QString &path, int width, int height, QImage &img)
@@ -126,7 +173,7 @@
         {
             ok = true;
             m_data[read] = '\0';
-            QString text = codecFromContent( m_data, read )->toUnicode( m_data, read ).trimmed();
+            QString text = codecFromContent(m_data, read, path)->toUnicode(m_data, read).trimmed();
             // FIXME: maybe strip whitespace and read more?
 
             // If the text contains tabs or consecutive spaces, it is probably