Index: trunk/playground/base/kat/src/fulltext_html/fulltext_html.cpp
===================================================================
--- trunk/playground/base/kat/src/fulltext_html/fulltext_html.cpp (revision 472383)
+++ trunk/playground/base/kat/src/fulltext_html/fulltext_html.cpp (revision 472384)
@@ -1,175 +1,178 @@
/***************************************************************************
* Copyright (C) 2005 by Roberto Cappuccio and the Kat team *
* Roberto Cappuccio : roberto.cappuccio@gmail.com *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include
#include
#include
#include
-#include "textcat.h"
-
#include "fulltext_html.h"
#include "kat_export.h"
// C-linkage section: the extractor factory entry point, followed by hand-written
// prototypes for the textcat functions. This revision declares them here in place
// of the removed #include "textcat.h".
extern "C"
{
// Factory: allocates a new HtmlExtractor for `path` and returns it as a
// FulltextExtractor*. The object is created with a bare `new`; who deletes it is
// not visible here (presumably the caller / plugin loader -- confirm).
KAT_EXPORT FulltextExtractor* new_extractor( const QString& path )
{
return new HtmlExtractor( path );
}
+
+ void *textcat_Init( const char *conffile ); // returns an opaque handle; extract() treats a null result as failure
+ void textcat_Done( void *handle ); // called by extract() once it is finished with the handle
+ char *textcat_Classify( void *handle, const char *buffer, size_t size ); // result string is logged by extract()
+ char *textcat_Version(); // declared but not called in the code visible here
}
// Builds an extractor bound to the file at `path`.
//
// The file is opened read-only immediately; the outcome of that open call is
// remembered in m_open so that extract() can skip files that failed to open.
// m_bDone starts out false, i.e. no extraction has happened yet.
HtmlExtractor::HtmlExtractor( const QString& path )
    : m_bDone( false )
    , m_file( path )
{
    // Kept in the body (not the initializer list) so it always runs after
    // m_file has been constructed, whatever the member declaration order is.
    const bool opened = m_file.open( IO_ReadOnly );
    m_open = opened;
}
// Tears the extractor down, explicitly closing the file it was constructed with.
HtmlExtractor::~HtmlExtractor()
{
    // close() is issued unconditionally, regardless of whether the open in
    // the constructor succeeded.
    this->m_file.close();
}
// Extracts the textual content of the HTML file into `data`.
//
// Only the first call does real work: once m_bDone has been set, later calls
// assign QString::null to `data` and return. If the file could not be opened
// (m_open is false) `data` is left untouched and only the debug lines are emitted.
//
// When the file is open, the whole file is read into a NUL-terminated buffer,
// handed to textcat for classification (per the in-code comment this is for
// language detection; the result is only logged in the code visible here),
// converted to Unicode through `codec`, run through entity resolution and the
// tag-stripping replace() calls, whitespace-simplified and finally streamed
// into `data`.
void HtmlExtractor::extract( QString& data )
{
kdDebug() << "HTMLExtractor start" << endl;
// The extractor yields content once; every subsequent call returns a null string.
if ( m_bDone ) {
data = QString::null;
return;
}
m_bDone = true;
if ( m_open ) {
// One extra byte is reserved for the NUL terminator written below.
QByteArray buffer( m_file.size() + 1 );
// NOTE(review): the return value of readBlock() is ignored, so a short or
// failed read goes unnoticed.
m_file.readBlock( buffer.data(), m_file.size() );
buffer[ m_file.size() ] = '\0';
// NOTE(review): the raw bytes are turned into a QString here, before the codec
// is applied further down -- presumably lossy for non-Latin1 input; confirm.
QString s( buffer.data() );
// The textcat configuration path is hard-coded; a null handle means init failed.
void* h = textcat_Init( "/usr/share/apps/kat/language/conf.txt" );
if ( h ) {
/*** We only need a little text to determine the language ***/
// NOTE(review): despite the comment above, the entire buffer is passed, and the
// length includes the trailing NUL byte -- confirm this is intended.
char* result = textcat_Classify( h, buffer.data(), m_file.size() + 1 );
kdDebug() << "Result == " << result << endl;
textcat_Done( h );
}
else
{
kdDebug() << "Unable to init. Aborting." << endl;
}
//Try to get the encoding from HTML HTTP-EQUIV tag
//The supported encodings are:
// Latin1
// Big5 -- Chinese
// Big5-HKSCS -- Chinese
// eucJP -- Japanese
// eucKR -- Korean
// GB2312 -- Chinese
// GBK -- Chinese
// GB18030 -- Chinese
// JIS7 -- Japanese
// Shift-JIS -- Japanese
// TSCII -- Tamil
// utf8 -- Unicode, 8-bit
// utf16 -- Unicode
// KOI8-R -- Russian
// KOI8-U -- Ukrainian
// ISO8859-1 -- Western
// ISO8859-2 -- Central European
// ISO8859-3 -- Central European
// ISO8859-4 -- Baltic
// ISO8859-5 -- Cyrillic
// ISO8859-6 -- Arabic
// ISO8859-7 -- Greek
// ISO8859-8 -- Hebrew, visually ordered
// ISO8859-8-i -- Hebrew, logically ordered
// ISO8859-9 -- Turkish
// ISO8859-10
// ISO8859-13
// ISO8859-14
// ISO8859-15 -- Western (it is ISO8859-1 plus the Euro sign)
// IBM 850
// IBM 866
// CP874
// CP1250 -- Central European
// CP1251 -- Cyrillic
// CP1252 -- Western
// CP1253 -- Greek
// CP1254 -- Turkish
// CP1255 -- Hebrew
// CP1256 -- Arabic
// CP1257 -- Baltic
// CP1258
// Apple Roman
// TIS-620 -- Thai
// NOTE(review): the statements from here down to the tag-stripping replace()
// appear to have lost text in this listing -- string literals are empty or
// unterminated and `codec` has no visible definition, and the two empty
// literals streamed around `u` near the end look affected as well. Presumably
// markup-like content was stripped; restore these lines from the repository
// before relying on them.
QRegExp rxContentType = QRegExp( "name() << endl;
// Transform from whatever encoding to Unicode
QString u = codec->toUnicode( s );
u = KCharsets::resolveEntities( u ); // convert all HTML codes in the form &xxx;
u = u.replace( QRegExp( "", false ), " " ); // strip scripts
u = u.replace( QRegExp( "
" ), "\n" ); // transform all
in \n
u = u.replace( QRegExp( "<[^>]*>" ), " " ); // strip all HTML tags
u = u.simplifyWhiteSpace();
QTextOStream stream( &data );
stream << "";
stream << u;
stream << "";
}
kdDebug() << "HTMLExtractor end" << endl;
}