diff --git a/generators/chm/lib/helper_search_index.cpp b/generators/chm/lib/helper_search_index.cpp
index 3d6c704f9..aff0008e0 100644
--- a/generators/chm/lib/helper_search_index.cpp
+++ b/generators/chm/lib/helper_search_index.cpp
@@ -1,493 +1,493 @@
/*
* Kchmviewer - a CHM and EPUB file viewer with broad language support
* Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
#include
#include
#include "ebook.h"
#include "ebook_search.h"
#include "helper_search_index.h"
static const int DICT_VERSION = 4;
namespace QtAs {
// Those characters are splitters (i.e. split the word), but added themselves into dictionary too.
// This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both
// search for "print" and "->print" will find it.
static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";
// Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug.
static const char WORD_CHARACTERS[] = "$_";
struct Term
{
Term() : frequency(-1) {}
Term( const QString &t, int f, QVector l ) : term( t ), frequency( f ), documents( l ) {}
QString term;
int frequency;
QVectordocuments;
bool operator<( const Term &i2 ) const { return frequency < i2.frequency; }
};
QDataStream &operator>>( QDataStream &s, Document &l )
{
s >> l.docNumber;
s >> l.frequency;
return s;
}
QDataStream &operator<<( QDataStream &s, const Document &l )
{
s << (short)l.docNumber;
s << (short)l.frequency;
return s;
}
Index::Index()
: QObject( 0 )
{
lastWindowClosed = false;
connect( qApp, SIGNAL( lastWindowClosed() ), this, SLOT( setLastWinClosed() ) );
}
void Index::setLastWinClosed()
{
lastWindowClosed = true;
}
bool Index::makeIndex(const QList< QUrl >& docs, EBook *chmFile )
{
if ( docs.isEmpty() )
return false;
docList = docs;
if ( chmFile->hasFeature( EBook::FEATURE_ENCODING ) )
entityDecoder.changeEncoding( QTextCodec::codecForName( chmFile->currentEncoding().toUtf8() ) );
- QList< QUrl >::ConstIterator it = docList.begin();
+ QList< QUrl >::ConstIterator it = docList.constBegin();
int steps = docList.count() / 100;
if ( !steps )
steps++;
int prog = 0;
- for ( int i = 0; it != docList.end(); ++it, ++i )
+ for ( int i = 0; it != docList.constEnd(); ++it, ++i )
{
if ( lastWindowClosed )
return false;
QUrl filename = *it;
QStringList terms;
if ( parseDocumentToStringlist( chmFile, filename, terms ) )
{
- for ( QStringList::ConstIterator tit = terms.begin(); tit != terms.end(); ++tit )
+ for ( QStringList::ConstIterator tit = terms.constBegin(); tit != terms.constEnd(); ++tit )
insertInDict( *tit, i );
}
if ( i%steps == 0 )
{
prog++;
prog = qMin( prog, 99 );
emit indexingProgress( prog, tr("Processing document %1") .arg( (*it).path() ) );
}
}
emit indexingProgress( 100, tr("Processing completed") );
return true;
}
void Index::insertInDict( const QString &str, int docNum )
{
Entry *e = 0;
if ( dict.count() )
e = dict[ str ];
if ( e )
{
if ( e->documents.last().docNumber != docNum )
e->documents.append( Document(docNum, 1 ) );
else
e->documents.last().frequency++;
}
else
{
dict.insert( str, new Entry( docNum ) );
}
}
bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl& filename, QStringList& tokenlist )
{
QString parsedbuf, parseentity, text;
if ( !chmFile->getFileContentAsString( text, filename )
|| text.isEmpty() )
{
qWarning( "Search index generator: could not retrieve the document content for %s", qPrintable( filename.toString() ) );
return false;
}
m_charssplit = SPLIT_CHARACTERS;
m_charsword = WORD_CHARACTERS;
tokenlist.clear();
// State machine states
enum state_t
{
STATE_OUTSIDE_TAGS, // outside HTML tags; parse text
STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag
STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity
};
state_t state = STATE_OUTSIDE_TAGS;
QChar QuoteChar; // used in STATE_IN_QUOTES
for ( int j = 0; j < text.length(); j++ )
{
QChar ch = text[j];
if ( (j % 20000) == 0 )
qApp->processEvents( QEventLoop::ExcludeUserInputEvents );
if ( state == STATE_IN_HTML_TAG )
{
// We are inside HTML tag.
// Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
if ( ch == '"' || ch == '\'' )
{
state = STATE_IN_QUOTES;
QuoteChar = ch;
}
else if ( ch == '>' )
state = STATE_OUTSIDE_TAGS;
continue;
}
else if ( state == STATE_IN_QUOTES )
{
// We are inside quoted text inside HTML tag.
// Ignore everything until we see the quote character again
if ( ch == QuoteChar )
state = STATE_IN_HTML_TAG;
continue;
}
else if ( state == STATE_IN_HTML_ENTITY )
{
// We are inside encoded HTML entity (like ).
// Collect to parsedbuf everything until we see ;
if ( ch.isLetterOrNumber() )
{
// get next character of this entity
parseentity.append( ch );
continue;
}
// The entity ended
state = STATE_OUTSIDE_TAGS;
// Some shitty HTML does not terminate entities correctly. Screw it.
if ( ch != ';' && ch != '<' )
{
if ( parseentity.isEmpty() )
{
// straight '&' symbol. Add and continue.
parsedbuf += "&";
}
else
qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable( parseentity ), ch.toLatin1() );
j--; // parse this character again, but in different state
continue;
}
// Don't we have a space?
if ( parseentity.toLower() != "nbsp" )
{
QString entity = entityDecoder.decode( parseentity );
if ( entity.isNull() )
{
// decodeEntity() already printed error message
//qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
continue;
}
parsedbuf += entity;
continue;
}
else
ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf
}
//
// Now process STATE_OUTSIDE_TAGS
//
// Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
if ( ch == '<' )
{
state = STATE_IN_HTML_TAG;
goto tokenize_buf;
}
// Check for start of HTML entity
if ( ch == '&' )
{
state = STATE_IN_HTML_ENTITY;
parseentity = QString::null;
continue;
}
// Replace quote by ' - quotes are used in search window to set the phrase
if ( ch == '"' )
ch = '\'';
// Ok, we have a valid character outside HTML tags, and probably some in buffer already.
// If it is char or letter, add it and continue
if ( ch.isLetterOrNumber() || m_charsword.indexOf( ch ) != -1 )
{
parsedbuf.append( ch );
continue;
}
// If it is a split char, add the word to the dictionary, and then add the char itself.
if ( m_charssplit.indexOf( ch ) != -1 )
{
if ( !parsedbuf.isEmpty() )
tokenlist.push_back( parsedbuf.toLower() );
tokenlist.push_back( ch.toLower() );
parsedbuf = QString::null;
continue;
}
tokenize_buf:
// Just add the word; it is most likely a space or terminated by tokenizer.
if ( !parsedbuf.isEmpty() )
{
tokenlist.push_back( parsedbuf.toLower() );
parsedbuf = QString::null;
}
}
// Add the last word if still here - for broken htmls.
if ( !parsedbuf.isEmpty() )
tokenlist.push_back( parsedbuf.toLower() );
return true;
}
void Index::writeDict( QDataStream& stream )
{
stream << DICT_VERSION;
stream << m_charssplit;
stream << m_charsword;
// Document list
stream << docList;
// Dictionary
- for( QHash::ConstIterator it = dict.begin(); it != dict.end(); ++it )
+ for( QHash::ConstIterator it = dict.constBegin(); it != dict.constEnd(); ++it )
{
stream << it.key();
stream << (int) it.value()->documents.count();
stream << it.value()->documents;
}
}
bool Index::readDict( QDataStream& stream )
{
dict.clear();
docList.clear();
QString key;
int version, numOfDocs;
stream >> version;
if ( version < 2 )
return false;
stream >> m_charssplit;
stream >> m_charsword;
// Read the document list
stream >> docList;
while ( !stream.atEnd() )
{
stream >> key;
stream >> numOfDocs;
QVector docs( numOfDocs );
stream >> docs;
dict.insert( key, new Entry( docs ) );
}
return dict.size() > 0;
}
QList< QUrl > Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile )
{
QList termList;
QStringList::ConstIterator it = terms.begin();
for ( it = terms.begin(); it != terms.end(); ++it )
{
Entry *e = 0;
if ( dict[ *it ] )
{
e = dict[ *it ];
termList.append( Term( *it, e->documents.count(), e->documents ) );
}
else
{
return QList< QUrl >();
}
}
if ( !termList.count() )
return QList< QUrl >();
qSort( termList );
QVector minDocs = termList.takeFirst().documents;
for(QList::Iterator it = termList.begin(); it != termList.end(); ++it) {
Term *t = &(*it);
QVector docs = t->documents;
for(QVector::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) {
bool found = false;
for (QVector::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) {
if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
(*minDoc_it).frequency += (*doc_it).frequency;
found = true;
break;
}
}
if ( !found )
minDoc_it = minDocs.erase( minDoc_it );
else
++minDoc_it;
}
}
QList< QUrl > results;
qSort( minDocs );
if ( termSeq.isEmpty() ) {
for(QVector::Iterator it = minDocs.begin(); it != minDocs.end(); ++it)
results << docList.at((int)(*it).docNumber);
return results;
}
QUrl fileName;
for(QVector::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) {
fileName = docList[ (int)(*it).docNumber ];
if ( searchForPhrases( termSeq, seqWords, fileName, chmFile ) )
results << fileName;
}
return results;
}
bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook * chmFile )
{
QStringList parsed_document;
if ( !parseDocumentToStringlist( chmFile, filename, parsed_document ) )
return false;
miniDict.clear();
// Initialize the dictionary with the words in phrase(s)
for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt )
miniDict.insert( *cIt, new PosEntry( 0 ) );
// Fill the dictionary with the words from the document
unsigned int word_offset = 3;
- for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ )
+ for ( QStringList::ConstIterator it = parsed_document.constBegin(); it != parsed_document.constEnd(); it++, word_offset++ )
{
PosEntry * entry = miniDict[ *it ];
if ( entry )
entry->positions.append( word_offset );
}
// Dump it
/*
QDictIterator it( miniDict );
for( ; it.current(); ++it )
{
QString text( it.currentKey() );
QValueList pos = miniDict[text]->positions;
for ( unsigned int i = 1; i < pos.size(); i++ )
text += " " + QString::number( pos[i] );
qDebug( "%s", text.ascii());
}
*/
QList first_word_positions;
- for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ )
+ for ( QStringList::ConstIterator phrase_it = phrases.constBegin(); phrase_it != phrases.constEnd(); phrase_it++ )
{
QStringList phrasewords = phrase_it->split( ' ' );
first_word_positions = miniDict[ phrasewords[0] ]->positions;
for ( int j = 1; j < phrasewords.count(); ++j )
{
QList next_word_it = miniDict[ phrasewords[j] ]->positions;
QList::iterator dict_it = first_word_positions.begin();
while ( dict_it != first_word_positions.end() )
{
if ( next_word_it.indexOf( *dict_it + 1 ) != -1 )
{
(*dict_it)++;
++dict_it;
}
else
dict_it = first_word_positions.erase( dict_it );
}
}
}
if ( first_word_positions.count() )
return true;
return false;
}
};