Changeset View
Standalone View
thumbnail/textcreator.cpp
Show All 28 Lines | |||||
29 | #include <QTextCodec> | 29 | #include <QTextCodec> | ||
30 | #include <QTextDocument> | 30 | #include <QTextDocument> | ||
31 | 31 | | |||
32 | #include <KSyntaxHighlighting/SyntaxHighlighter> | 32 | #include <KSyntaxHighlighting/SyntaxHighlighter> | ||
33 | #include <KSyntaxHighlighting/Theme> | 33 | #include <KSyntaxHighlighting/Theme> | ||
34 | #include <KSyntaxHighlighting/Definition> | 34 | #include <KSyntaxHighlighting/Definition> | ||
35 | #include <KDesktopFile> | 35 | #include <KDesktopFile> | ||
36 | 36 | | |||
37 | // TODO Fix or remove kencodingprober code | 37 | #include <config-thumbnail.h> | ||
38 | // #include <kencodingprober.h> | 38 | #if LIBMAGIC_FOUND | ||
sitter: TBH, I would make libmagic required for building the thumbnail plugin. I can't see much of a… | |||||
Without libmagic, the current state is basically UTF-8 with BOM detection, otherwise the locale codec. I did not test encodings exhaustively, so I wanted to leave the door open for users to not rely on libmagic. Also, libmagic loads a 5M file storing its heuristics each time it loads (/usr/share/misc/magic.mgc). meven: Without libmagic, the current state is basically UTF-8 with BOM detection, otherwise the locale codec. | |||||
Perhaps it'd make sense to refactor this a bit and construct some test cases around encoding detection so we get a sense of reliability? The way I am looking at this: either libmagic always does the best job at detecting encodings, in which case we'll want it as a required dep, or there's something better, in which case we don't want libmagic at all and instead use that something better ;) In the end the user isn't necessarily in charge of what a random file will be encoded with, so I don't think there's a point in letting the user (or the distro) build an inferior product by accidentally not including libmagic. The truth is neither we nor the user can with any certainty say what encodings the thumbnailer will encounter. sitter: Perhaps it'd make sense to refactor this a bit and construct some test cases around encoding… | |||||
39 | #include "magic.h" | ||||
40 | #endif | ||||
39 | 41 | | |||
40 | extern "C" | 42 | extern "C" | ||
41 | { | 43 | { | ||
42 | Q_DECL_EXPORT ThumbCreator *new_creator() | 44 | Q_DECL_EXPORT ThumbCreator *new_creator() | ||
43 | { | 45 | { | ||
44 | return new TextCreator; | 46 | return new TextCreator; | ||
45 | } | 47 | } | ||
46 | } | 48 | } | ||
47 | 49 | | |||
48 | TextCreator::TextCreator() | 50 | TextCreator::TextCreator() | ||
49 | : m_data(nullptr), | 51 | : m_data(nullptr), | ||
50 | m_dataSize(0) | 52 | m_dataSize(0) | ||
51 | { | 53 | { | ||
52 | } | 54 | } | ||
53 | 55 | | |||
54 | TextCreator::~TextCreator() | 56 | TextCreator::~TextCreator() | ||
55 | { | 57 | { | ||
56 | delete [] m_data; | 58 | delete [] m_data; | ||
57 | } | 59 | } | ||
58 | 60 | | |||
59 | static QTextCodec *codecFromContent(const char *data, int dataSize) | 61 | #if LIBMAGIC_FOUND | ||
62 | static QTextCodec *codecFromFile(const QString &path) | ||||
sitter: `*` on wrong side of space | |||||
63 | { | ||||
64 | magic_t magicCookie = magic_open(MAGIC_MIME_ENCODING); | ||||
sitter: better name than m? (: | |||||
65 | magic_load(magicCookie, nullptr); | ||||
66 | const char *ret = magic_file(magicCookie, path.toLocal8Bit()); | ||||
Excess whitespace towards the end. I also wonder if QFile::encodeName would be better. sitter: Excess whitespace towards the end. I also wonder if QFile::encodeName would be better. | |||||
67 | const auto codecName = QByteArray(ret).toUpper(); | ||||
I guess you could just call toUpper on a QBA instead of going through a temporary QString, since ret is an encoding identifier and would always be an ASCII string. sitter: I guess you could just call toUpper on a QBA instead of going through a temporary QString, since ret… | |||||
68 | magic_close(magicCookie); | ||||
69 | | ||||
70 | if (QTextCodec::availableCodecs().contains(codecName)) { | ||||
71 | return QTextCodec::codecForName(codecName); | ||||
72 | } | ||||
73 | | ||||
74 | if (strcmp(ret, "unknown-8bit")) { | ||||
75 | // use latin for unknown 8bit as it is quite versatile | ||||
sitter: unkwnwn typo | |||||
76 | return QTextCodec::codecForName("latin-1"); | ||||
77 | } | ||||
78 | | ||||
79 | return nullptr; | ||||
80 | } | ||||
81 | #endif | ||||
82 | | ||||
83 | static QTextCodec *codecFromContent(const char *data, int dataSize, const QString &path) | ||||
60 | { | 84 | { | ||
61 | #if 0 // ### Use this when KEncodingProber does not return junk encoding for UTF-8 data) | | |||
62 | KEncodingProber prober; | | |||
63 | prober.feed(data, dataSize); | | |||
64 | return QTextCodec::codecForName(prober.encoding()); | | |||
65 | #else | | |||
66 | QByteArray ba = QByteArray::fromRawData(data, dataSize); | 85 | QByteArray ba = QByteArray::fromRawData(data, dataSize); | ||
67 | // try to detect UTF text, fall back to locale default (which is usually UTF-8) | 86 | // try to detect UTF text, fall back to locale default (which is usually UTF-8) | ||
68 | return QTextCodec::codecForUtfText(ba, QTextCodec::codecForLocale()); | 87 | const auto codec = QTextCodec::codecForUtfText(ba, nullptr); | ||
88 | | ||||
89 | if (codec == nullptr) { | ||||
90 | // UTF-8 BOM detection failed | ||||
91 | auto localCodec = QTextCodec::codecForLocale(); | ||||
92 | #if LIBMAGIC_FOUND | ||||
93 | if (localCodec->name() == "UTF-8") { | ||||
94 | // UTF-8 was already tested in QTextCodec::codecForUtfText | ||||
95 | // but only using bom presence | ||||
96 | // The file could be utf-8 still or something entirely different | ||||
97 | // use libmagic heuristics | ||||
98 | auto codecMagic = codecFromFile(path); | ||||
99 | if (codecMagic != nullptr) { | ||||
100 | return codecMagic; | ||||
101 | } | ||||
102 | } | ||||
69 | #endif | 103 | #endif | ||
104 | return localCodec; | ||||
105 | } | ||||
106 | return codec; | ||||
70 | } | 107 | } | ||
71 | 108 | | |||
72 | bool TextCreator::create(const QString &path, int width, int height, QImage &img) | 109 | bool TextCreator::create(const QString &path, int width, int height, QImage &img) | ||
73 | { | 110 | { | ||
74 | // Desktop files, .directory files, and flatpakrefs aren't traditional | 111 | // Desktop files, .directory files, and flatpakrefs aren't traditional | ||
75 | // text files, so their icons should be shown instead | 112 | // text files, so their icons should be shown instead | ||
76 | if (KDesktopFile::isDesktopFile(path) | 113 | if (KDesktopFile::isDesktopFile(path) | ||
77 | || path.endsWith(QStringLiteral(".directory")) | 114 | || path.endsWith(QStringLiteral(".directory")) | ||
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Line(s) | 155 | { | |||
121 | m_dataSize = bytesToRead + 1; | 158 | m_dataSize = bytesToRead + 1; | ||
122 | } | 159 | } | ||
123 | 160 | | |||
124 | int read = file.read( m_data, bytesToRead ); | 161 | int read = file.read( m_data, bytesToRead ); | ||
125 | if ( read > 0 ) | 162 | if ( read > 0 ) | ||
126 | { | 163 | { | ||
127 | ok = true; | 164 | ok = true; | ||
128 | m_data[read] = '\0'; | 165 | m_data[read] = '\0'; | ||
129 | QString text = codecFromContent( m_data, read )->toUnicode( m_data, read ).trimmed(); | 166 | QString text = codecFromContent(m_data, read, path)->toUnicode(m_data, read).trimmed(); | ||
130 | // FIXME: maybe strip whitespace and read more? | 167 | // FIXME: maybe strip whitespace and read more? | ||
131 | 168 | | |||
132 | // If the text contains tabs or consecutive spaces, it is probably | 169 | // If the text contains tabs or consecutive spaces, it is probably | ||
133 | // formatted using white space. Use a fixed pitch font in this case. | 170 | // formatted using white space. Use a fixed pitch font in this case. | ||
134 | const auto textLines = text.splitRef(QLatin1Char('\n')); | 171 | const auto textLines = text.splitRef(QLatin1Char('\n')); | ||
135 | for (const auto& line : textLines) { | 172 | for (const auto& line : textLines) { | ||
136 | const auto trimmedLine = line.trimmed(); | 173 | const auto trimmedLine = line.trimmed(); | ||
137 | if ( trimmedLine.contains( '\t' ) || trimmedLine.contains( " " ) ) { | 174 | if ( trimmedLine.contains( '\t' ) || trimmedLine.contains( " " ) ) { | ||
▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines |
TBH, I would make libmagic required for building the thumbnail plugin. I can't see much of a rationale for why we'd want to support "broken"/insufficient encoding detection when there's code that makes it better.