Differential D23787 Diff 65646 src/file/extractor/app.cpp

Changeset View

Standalone View

src/file/extractor/app.cpp

Show First 20 Lines • Show All 158 Lines • ▼ Show 20 Line(s)		147	{
159	// The initial BasicIndexingJob run has been supplied with the file extension	159		// The initial BasicIndexingJob run has been supplied with the file extension
160	// mimetype only, skip based on the "real" mimetype	160		// mimetype only, skip based on the "real" mimetype
161	if (!m_config.shouldMimeTypeBeIndexed(mimetype)) {	161		if (!m_config.shouldMimeTypeBeIndexed(mimetype)) {
162	qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;	162		qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
163	tr->removePhaseOne(id);	163		tr->removePhaseOne(id);
164	return;	164		return;
165	}	165		}
166		166
167	// HACK: Also, we're ignoring text files which are 10 MiB or larger as we
168	// have trouble processing them
169	//
170	if (mimetype.startsWith(QLatin1String("text/"))) {
171	QFileInfo fileInfo(url);
172	if (fileInfo.size() >= 10 * 1024 * 1024) {
173	tr->removePhaseOne(id);
	davidedmundsonUnsubmitted Not Done This original line seemed very very wrong. Just because we won't want to index phase 2 isn't a reason to remove the filename indexing - it'll just keep running phase 1 on itself again and again. So +1 on that davidedmundson: This original line seemed very very wrong. Just because we won't want to index phase 2 isn't a…
	brunsUnsubmitted Not Done Off by one error - phase one is content indexing already, phase zero is filename/filestat. bruns: Off by one error - phase one is content indexing already, phase zero is filename/filestat.
174	return;
175	}
176	}
177
178	// We always run the basic indexing again. This is mostly so that the proper	167		// We always run the basic indexing again. This is mostly so that the proper
179	// mimetype is set and we get proper type information.	168		// mimetype is set and we get proper type information.
180	// The mimetype fetched in the BasicIndexingJob is fast but not accurate	169		// The mimetype fetched in the BasicIndexingJob is fast but not accurate
181	BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);	170		BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
182	basicIndexer.index();	171		basicIndexer.index();
183		172
184	Baloo::Document doc = basicIndexer.document();	173		Baloo::Document doc = basicIndexer.document();
185		174
186	Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractEverything);	175		Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractEverything);
187	result.setDocument(doc);	176		result.setDocument(doc);
188		177
189	const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);	178		const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
190		179
191	for (KFileMetaData::Extractor* ex : exList) {	180		for (KFileMetaData::Extractor* ex : exList) {
		181		// HACK: We're ignoring text files which are 10 MiB or larger as we
		182		// have trouble processing them
		183		if ((ex->extractorProperties()["Name"].toString() == QLatin1String("PlaintextExtractor")) && (QFileInfo(url).size() >= 10 * 1024 * 1024)) {
			broulikUnsubmitted Done Store the size in a variable outside the loop, otherwise you end up querying it on each iteration. broulik: Store the size in a variable outside the loop, otherwise you end up querying it on each…
			brunsUnsubmitted Done You should compare for size first, as that's much cheaper than fetching the property and comparing the string. bruns: You should compare for size first, as that's much cheaper than fetching the property and…
		184		qCWarning(BALOO) << "Skipping" << url << ": large plain text file";
			brunsUnsubmitted Not Done Users will love us for spamming the logs ... bruns: Users will love us for spamming the logs ...
			poboikoAuthorUnsubmitted Not Done I thought users might actually want to know if a file was excluded (and the reasoning behind it). I can lower its severity, i.e. `qCDebug`. Or do you think it should be completely removed? poboiko: I thought users might actually want to know if a file was excluded (and the reasoning behind it).
			ngrahamUnsubmitted Not Done Users don't read log files. :) Then again if the default log level is error rather than warning, then this isn't actually a logspam problem either. ngraham: Users don't read log files. :) Then again if the default log level is error rather than…
		185		continue;
		186		}
192	ex->extract(&result);	187		ex->extract(&result);
193	}	188		}
194		189
195	result.finish();	190		result.finish();
196	if (doc.id() != id) {	191		if (doc.id() != id) {
197	qWarning() << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";	192		qWarning() << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
198	tr->removeDocument(id);	193		tr->removeDocument(id);
199	if (!tr->hasDocument(doc.id())) {	194		if (!tr->hasDocument(doc.id())) {
Show All 9 Lines