Differential D29381 Diff 83055 thumbnail/textcreator.cpp

Changeset View

Standalone View

thumbnail/textcreator.cpp

Show All 28 Lines
29	#include <QTextCodec>	29		#include <QTextCodec>
30	#include <QTextDocument>	30		#include <QTextDocument>
31		31
32	#include <KSyntaxHighlighting/SyntaxHighlighter>	32		#include <KSyntaxHighlighting/SyntaxHighlighter>
33	#include <KSyntaxHighlighting/Theme>	33		#include <KSyntaxHighlighting/Theme>
34	#include <KSyntaxHighlighting/Definition>	34		#include <KSyntaxHighlighting/Definition>
35	#include <KDesktopFile>	35		#include <KDesktopFile>
36		36
37	// TODO Fix or remove kencodingprober code	37		#include <config-thumbnail.h>
38	// #include <kencodingprober.h>	38		#if LIBMAGIC_FOUND
			sitterUnsubmitted Not Done TBH, I would make libmagic required for building the thumbnail plugin. I can't see much of a rationale for why we'd want to support "broken"/insufficient encoding detection when there's code that makes it better. sitter: TBH, I would make libmagic required for building the thumbnail plugin. I can't see much of a…
			mevenAuthorUnsubmitted Not Done Without libmagic, the behavior stays as it is today: basically UTF-8 detection via BOM, otherwise the locale codec. I did not test encodings exhaustively, so I wanted to leave the door open for users not to rely on libmagic. libmagic works well from what I've tested, but I cannot be absolutely sure about the many encodings out there. Hopefully libmagic does a better job of detecting UTF-8 (which is what I saw), but for users who don't use much UTF-8... Also, libmagic loads a 5M file storing its heuristics each time it loads ( /usr/share/misc/magic.mgc ). It would be great to keep this in memory somewhere, maybe in a static. meven: Without libmagic, the behavior stays as it is today: basically UTF-8 detection via BOM, otherwise the locale codec.
			sitterUnsubmitted Not Done Perhaps it'd make sense to refactor this a bit and construct some test cases around encoding detection so we get a sense of reliability? The way I am looking at this: either libmagic always does the best job at detecting encodings, at which point we'll want it as a required dep, or there's something better in which case we don't want libmagic at all and instead use the something better ;) In the end the user isn't necessarily in charge of what a random file will be encoded with, so I don't think there's a point in letting the user (or the distro) build an inferior product by accidentally not including libmagic. The truth is neither we nor the user can with any certainty say what encodings the thumbnailer will encounter. sitter: Perhaps it'd make sense to refactor this a bit and construct some test cases around encoding…
		39		#include "magic.h"
		40		#endif
39		41
40	extern "C"	42		extern "C"
41	{	43		{
42	Q_DECL_EXPORT ThumbCreator *new_creator()	44		Q_DECL_EXPORT ThumbCreator *new_creator()
43	{	45		{
44	return new TextCreator;	46		return new TextCreator;
45	}	47		}
46	}	48		}
47		49
48	TextCreator::TextCreator()	50		TextCreator::TextCreator()
49	: m_data(nullptr),	51		: m_data(nullptr),
50	m_dataSize(0)	52		m_dataSize(0)
51	{	53		{
52	}	54		}
53		55
54	TextCreator::~TextCreator()	56		TextCreator::~TextCreator()
55	{	57		{
56	delete [] m_data;	58		delete [] m_data;
57	}	59		}
58		60
59	static QTextCodec codecFromContent(const char data, int dataSize)	61		#if LIBMAGIC_FOUND
		62		static QTextCodec *codecFromFile(const QString &path)
			sitterUnsubmitted Done `*` on wrong side of space sitter: `*` on wrong side of space
		63		{
		64		magic_t magicCookie = magic_open(MAGIC_MIME_ENCODING);
			sitterUnsubmitted Done better name than m? (: sitter: better name than m? (:
		65		magic_load(magicCookie, nullptr);
		66		const char *ret = magic_file(magicCookie, path.toLocal8Bit());
			sitterUnsubmitted Done excess whitespace towards the end. I also wonder if QFile::encodeName would be better sitter: excess whitespace towards the end. I also wonder if QFile::encodeName would be better
		67		const auto codecName = QByteArray(ret).toUpper();
			sitterUnsubmitted Done I guess you could just toUpper on a QBA instead of going through a temporary QString since ret is an encoding identifier and would be always an ascii string. Also, can be const it seems. sitter: I guess you could just toUpper on a QBA instead of going through a temporary QString since ret…
		68		magic_close(magicCookie);
		69
		70		if (QTextCodec::availableCodecs().contains(codecName)) {
		71		return QTextCodec::codecForName(codecName);
		72		}
		73
		74		if (strcmp(ret, "unknown-8bit")) {
		75		// use latin for unknown 8bit as it is quite versatile
			sitterUnsubmitted Done unkwnwn typo sitter: unkwnwn typo
		76		return QTextCodec::codecForName("latin-1");
		77		}
		78
		79		return nullptr;
		80		}
		81		#endif
		82
		83		static QTextCodec codecFromContent(const char data, int dataSize, const QString &path)
60	{	84		{
61	#if 0 // ### Use this when KEncodingProber does not return junk encoding for UTF-8 data)
62	KEncodingProber prober;
63	prober.feed(data, dataSize);
64	return QTextCodec::codecForName(prober.encoding());
65	#else
66	QByteArray ba = QByteArray::fromRawData(data, dataSize);	85		QByteArray ba = QByteArray::fromRawData(data, dataSize);
67	// try to detect UTF text, fall back to locale default (which is usually UTF-8)	86		// try to detect UTF text, fall back to locale default (which is usually UTF-8)
68	return QTextCodec::codecForUtfText(ba, QTextCodec::codecForLocale());	87		const auto codec = QTextCodec::codecForUtfText(ba, nullptr);
		88
		89		if (codec == nullptr) {
		90		// UTF-8 BOM detection failed
		91		auto localCodec = QTextCodec::codecForLocale();
		92		#if LIBMAGIC_FOUND
		93		if (localCodec->name() == "UTF-8") {
		94		// UTF-8 was already tested in QTextCodec::codecForUtfText
		95		// but only using bom presence
		96		// The file could be utf-8 still or something entirely different
		97		// use libmagic heuristics
		98		auto codecMagic = codecFromFile(path);
		99		if (codecMagic != nullptr) {
		100		return codecMagic;
		101		}
		102		}
69	#endif	103		#endif
		104		return localCodec;
		105		}
		106		return codec;
70	}	107		}
71		108
72	bool TextCreator::create(const QString &path, int width, int height, QImage &img)	109		bool TextCreator::create(const QString &path, int width, int height, QImage &img)
73	{	110		{
74	// Desktop files, .directory files, and flatpakrefs aren't traditional	111		// Desktop files, .directory files, and flatpakrefs aren't traditional
75	// text files, so their icons should be shown instead	112		// text files, so their icons should be shown instead
76	if (KDesktopFile::isDesktopFile(path)	113		if (KDesktopFile::isDesktopFile(path)
77	\|\| path.endsWith(QStringLiteral(".directory"))	114		\|\| path.endsWith(QStringLiteral(".directory"))
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Line(s)		155	{
121	m_dataSize = bytesToRead + 1;	158		m_dataSize = bytesToRead + 1;
122	}	159		}
123		160
124	int read = file.read( m_data, bytesToRead );	161		int read = file.read( m_data, bytesToRead );
125	if ( read > 0 )	162		if ( read > 0 )
126	{	163		{
127	ok = true;	164		ok = true;
128	m_data[read] = '\0';	165		m_data[read] = '\0';
129	QString text = codecFromContent( m_data, read )->toUnicode( m_data, read ).trimmed();	166		QString text = codecFromContent(m_data, read, path)->toUnicode(m_data, read).trimmed();
130	// FIXME: maybe strip whitespace and read more?	167		// FIXME: maybe strip whitespace and read more?
131		168
132	// If the text contains tabs or consecutive spaces, it is probably	169		// If the text contains tabs or consecutive spaces, it is probably
133	// formatted using white space. Use a fixed pitch font in this case.	170		// formatted using white space. Use a fixed pitch font in this case.
134	const auto textLines = text.splitRef(QLatin1Char('\n'));	171		const auto textLines = text.splitRef(QLatin1Char('\n'));
135	for (const auto& line : textLines) {	172		for (const auto& line : textLines) {
136	const auto trimmedLine = line.trimmed();	173		const auto trimmedLine = line.trimmed();
137	if ( trimmedLine.contains( '\t' ) \|\| trimmedLine.contains( " " ) ) {	174		if ( trimmedLine.contains( '\t' ) \|\| trimmedLine.contains( " " ) ) {
▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines