Changeset View
Changeset View
Standalone View
Standalone View
src/extractors/popplerextractor.cpp
Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Line(s) | 45 | { | |||
---|---|---|---|---|---|
49 | if (!pdfDoc || pdfDoc->isLocked()) { | 49 | if (!pdfDoc || pdfDoc->isLocked()) { | ||
50 | return; | 50 | return; | ||
51 | } | 51 | } | ||
52 | 52 | | |||
53 | result->addType(Type::Document); | 53 | result->addType(Type::Document); | ||
54 | 54 | | |||
55 | QString title = pdfDoc->info(QStringLiteral("Title")).trimmed(); | 55 | QString title = pdfDoc->info(QStringLiteral("Title")).trimmed(); | ||
56 | 56 | | |||
57 | // The title extracted from the PDF metadata is in many cases not the real title | | |||
58 | // of the document, especially for research papers that are exported to PDF. | | |||
59 | // Since the title of a PDF document is usually written on the first page in the biggest font, | | |||
60 | // we use that text instead if the pdfDoc title is considered junk. | | |||
61 | if (title.isEmpty() || | | |||
62 | !title.contains(QLatin1Char(' ')) || // very unlikely the title of a document does only contain one word. | | |||
63 | title.contains(QStringLiteral("Microsoft"), Qt::CaseInsensitive)) { // most research papers i found written with microsoft word | | |||
64 | // have a garbage title of the pdf creator rather than the real document title | | |||
65 | title = parseFirstPage(pdfDoc.data(), fileUrl); | | |||
66 | } | | |||
67 | | ||||
68 | if (!title.isEmpty()) { | 57 | if (!title.isEmpty()) { | ||
69 | result->add(Property::Title, title); | 58 | result->add(Property::Title, title); | ||
70 | } | 59 | } | ||
71 | 60 | | |||
72 | QString subject = pdfDoc->info(QStringLiteral("Subject")); | 61 | QString subject = pdfDoc->info(QStringLiteral("Subject")); | ||
73 | if (!subject.isEmpty()) { | 62 | if (!subject.isEmpty()) { | ||
74 | result->add(Property::Subject, subject); | 63 | result->add(Property::Subject, subject); | ||
75 | } | 64 | } | ||
Show All 22 Lines | 86 | for (int i = 0; i < pdfDoc->numPages(); i++) { | |||
98 | QScopedPointer<Poppler::Page> page(pdfDoc->page(i)); | 87 | QScopedPointer<Poppler::Page> page(pdfDoc->page(i)); | ||
99 | if (!page) { // broken pdf files do not return a valid page | 88 | if (!page) { // broken pdf files do not return a valid page | ||
100 | qWarning() << "Could not read page content from" << fileUrl; | 89 | qWarning() << "Could not read page content from" << fileUrl; | ||
101 | break; | 90 | break; | ||
102 | } | 91 | } | ||
103 | result->append(page->text(QRectF())); | 92 | result->append(page->text(QRectF())); | ||
104 | } | 93 | } | ||
105 | } | 94 | } | ||
106 | | ||||
107 | QString PopplerExtractor::parseFirstPage(Poppler::Document* pdfDoc, const QString& fileUrl) | | |||
108 | { | | |||
109 | QScopedPointer<Poppler::Page> p(pdfDoc->page(0)); | | |||
110 | | ||||
111 | if (!p) { | | |||
112 | qWarning() << "Could not read page content from" << fileUrl; | | |||
113 | return QString(); | | |||
114 | } | | |||
115 | | ||||
116 | QList<Poppler::TextBox*> tbList = p->textList(); | | |||
117 | QMap<int, QString> possibleTitleMap; | | |||
118 | | ||||
119 | int currentLargestChar = 0; | | |||
120 | int skipTextboxes = 0; | | |||
121 | | ||||
122 | // Iterate over all textboxes. Each textbox can be a single character/word or textblock | | |||
123 | // Here we combine the textboxes back together based on the text size | | |||
124 | // Important are the words with the biggest font size | | |||
125 | foreach(Poppler::TextBox * tb, tbList) { | | |||
126 | | ||||
127 | // if we added followup words, skip the textboxes here now | | |||
128 | if (skipTextboxes > 0) { | | |||
129 | skipTextboxes--; | | |||
130 | continue; | | |||
131 | } | | |||
132 | | ||||
133 | int height = tb->charBoundingBox(0).height(); | | |||
134 | | ||||
135 | // if the following text is smaller than the biggest we found up to now, ignore it | | |||
136 | if (height >= currentLargestChar) { | | |||
137 | QString possibleTitle; | | |||
138 | possibleTitle.append(tb->text()); | | |||
139 | currentLargestChar = height; | | |||
140 | | ||||
141 | // if the text has follow-up words, add them to create the full title | | |||
142 | Poppler::TextBox* next = tb->nextWord(); | | |||
143 | while (next) { | | |||
144 | possibleTitle.append(QLatin1Char(' ')); | | |||
145 | possibleTitle.append(next->text()); | | |||
146 | next = next->nextWord(); | | |||
147 | skipTextboxes++; | | |||
148 | } | | |||
149 | | ||||
150 | // now combine the text for each font size together; very likely it must be connected | | |||
151 | QString existingTitlePart = possibleTitleMap.value(currentLargestChar, QString()); | | |||
152 | existingTitlePart.append(QLatin1Char(' ')); | | |||
153 | existingTitlePart.append(possibleTitle); | | |||
154 | possibleTitleMap.insert(currentLargestChar, existingTitlePart); | | |||
155 | } | | |||
156 | } | | |||
157 | | ||||
158 | qDeleteAll(tbList); | | |||
159 | | ||||
160 | QList<int> titleSizes = possibleTitleMap.keys(); | | |||
161 | qSort(titleSizes.begin(), titleSizes.end(), qGreater<int>()); | | |||
162 | | ||||
163 | QString newPossibleTitle; | | |||
164 | | ||||
165 | // find the text with the largest font that is not just 1 character | | |||
166 | foreach(int i, titleSizes) { | | |||
167 | QString title = possibleTitleMap.value(i); | | |||
168 | | ||||
169 | // sometimes the biggest part is a single letter | | |||
170 | // used as the first letter of a paragraph | | |||
171 | if (title.size() < 5) { | | |||
172 | continue; | | |||
173 | } else { | | |||
174 | newPossibleTitle = title.trimmed(); | | |||
175 | break; | | |||
176 | } | | |||
177 | } | | |||
178 | | ||||
179 | // Sometimes the titles that are extracted are too large. This is a way of trimming them. | | |||
180 | newPossibleTitle.truncate(50); | | |||
181 | return newPossibleTitle; | | |||
182 | } | |