Differential D8007 Diff 33921 src/extractors/popplerextractor.cpp

Changeset View

Standalone View

src/extractors/popplerextractor.cpp

Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Line(s)		45	{
49	if (!pdfDoc \|\| pdfDoc->isLocked()) {	49		if (!pdfDoc \|\| pdfDoc->isLocked()) {
50	return;	50		return;
51	}	51		}
52		52
53	result->addType(Type::Document);	53		result->addType(Type::Document);
54		54
55	QString title = pdfDoc->info(QStringLiteral("Title")).trimmed();	55		QString title = pdfDoc->info(QStringLiteral("Title")).trimmed();
56		56
57	// The title stored in the PDF metadata is in many cases not the real title
58	// of the document, especially for research papers that were exported to PDF.
59	// Since the title is usually printed on the first page in the largest font,
60	// we fall back to that text if the metadata title is considered junk.
61	if (title.isEmpty() \|\|
62	!title.contains(QLatin1Char(' ')) \|\| // very unlikely the title of a document does only contain one word.
63	title.contains(QStringLiteral("Microsoft"), Qt::CaseInsensitive)) { // most research papers i found written with microsoft word
64	// have a garbage title of the pdf creator rather than the real document title
65	title = parseFirstPage(pdfDoc.data(), fileUrl);
66	}
67
68	if (!title.isEmpty()) {	57		if (!title.isEmpty()) {
69	result->add(Property::Title, title);	58		result->add(Property::Title, title);
70	}	59		}
71		60
72	QString subject = pdfDoc->info(QStringLiteral("Subject"));	61		QString subject = pdfDoc->info(QStringLiteral("Subject"));
73	if (!subject.isEmpty()) {	62		if (!subject.isEmpty()) {
74	result->add(Property::Subject, subject);	63		result->add(Property::Subject, subject);
75	}	64		}
Show All 22 Lines		86	for (int i = 0; i < pdfDoc->numPages(); i++) {
98	QScopedPointer<Poppler::Page> page(pdfDoc->page(i));	87		QScopedPointer<Poppler::Page> page(pdfDoc->page(i));
99	if (!page) { // broken pdf files do not return a valid page	88		if (!page) { // broken pdf files do not return a valid page
100	qWarning() << "Could not read page content from" << fileUrl;	89		qWarning() << "Could not read page content from" << fileUrl;
101	break;	90		break;
102	}	91		}
103	result->append(page->text(QRectF()));	92		result->append(page->text(QRectF()));
104	}	93		}
105	}	94		}
106
107	QString PopplerExtractor::parseFirstPage(Poppler::Document* pdfDoc, const QString& fileUrl)
108	{
109	QScopedPointer<Poppler::Page> p(pdfDoc->page(0));
110
111	if (!p) {
112	qWarning() << "Could not read page content from" << fileUrl;
113	return QString();
114	}
115
116	QList<Poppler::TextBox*> tbList = p->textList();
117	QMap<int, QString> possibleTitleMap;
118
119	int currentLargestChar = 0;
120	int skipTextboxes = 0;
121
122	// Iterate over all textboxes. Each textbox can be a single character/word or textblock
123	// Here we combine the etxtboxes back together based on the textsize
124	// Important are the words with the biggest font size
125	foreach(Poppler::TextBox * tb, tbList) {
126
127	// if we added followup words, skip the textboxes here now
128	if (skipTextboxes > 0) {
129	skipTextboxes--;
130	continue;
131	}
132
133	int height = tb->charBoundingBox(0).height();
134
135	// if the following text is smaller than the biggest we found up to now, ignore it
136	if (height >= currentLargestChar) {
137	QString possibleTitle;
138	possibleTitle.append(tb->text());
139	currentLargestChar = height;
140
141	// if the text has follow up words add them to to create the full title
142	Poppler::TextBox* next = tb->nextWord();
143	while (next) {
144	possibleTitle.append(QLatin1Char(' '));
145	possibleTitle.append(next->text());
146	next = next->nextWord();
147	skipTextboxes++;
148	}
149
150	// now combine text for each font size together, very likeley it must be connected
151	QString existingTitlePart = possibleTitleMap.value(currentLargestChar, QString());
152	existingTitlePart.append(QLatin1Char(' '));
153	existingTitlePart.append(possibleTitle);
154	possibleTitleMap.insert(currentLargestChar, existingTitlePart);
155	}
156	}
157
158	qDeleteAll(tbList);
159
160	QList<int> titleSizes = possibleTitleMap.keys();
161	qSort(titleSizes.begin(), titleSizes.end(), qGreater<int>());
162
163	QString newPossibleTitle;
164
165	// find the text with the largest font that is not just 1 character
166	foreach(int i, titleSizes) {
167	QString title = possibleTitleMap.value(i);
168
169	// sometime the biggest part is a single letter
170	// as a starting paragraph letter
171	if (title.size() < 5) {
172	continue;
173	} else {
174	newPossibleTitle = title.trimmed();
175	break;
176	}
177	}
178
179	// Sometimes the titles that are extracted are too large. This is a way of trimming them.
180	newPossibleTitle.truncate(50);
181	return newPossibleTitle;
182	}