diff --git a/src/loader.cpp b/src/loader.cpp index c99b1bd..49a79ee 100644 --- a/src/loader.cpp +++ b/src/loader.cpp @@ -1,148 +1,147 @@ /* * loader.cpp * * Copyright (c) 2001, 2002, 2003 Frerich Raabe * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the * accompanying file 'COPYING'. */ #include "loader.h" #include "dataretriever.h" #include "documentsource.h" #include "feed.h" #include "global.h" #include "loaderutil_p.h" #include "parsercollection.h" #include -#include #include #include #include #include namespace Syndication { struct Loader::LoaderPrivate { LoaderPrivate() { } ~LoaderPrivate() { delete retriever; } DataRetriever *retriever = nullptr; Syndication::ErrorCode lastError = Success; int retrieverError = 0; QUrl discoveredFeedURL; QUrl url; }; Loader *Loader::create() { return new Loader; } Loader *Loader::create(QObject *object, const char *slot) { Loader *loader = create(); connect(loader, SIGNAL(loadingComplete(Syndication::Loader*,Syndication::FeedPtr,Syndication::ErrorCode)), object, slot); return loader; } Loader::Loader() : d(new LoaderPrivate) { } Loader::~Loader() { delete d; } void Loader::loadFrom(const QUrl &url, DataRetriever *retriever) { if (d->retriever != nullptr) { return; } d->url = url; d->retriever = retriever; connect(d->retriever, &DataRetriever::dataRetrieved, this, &Loader::slotRetrieverDone); d->retriever->retrieveData(url); } int Loader::retrieverError() const { return d->retrieverError; } Syndication::ErrorCode Loader::errorCode() const { return d->lastError; } void Loader::abort() { if (d && d->retriever) { d->retriever->abort(); delete d->retriever; d->retriever = nullptr; } emit loadingComplete(this, FeedPtr(), Aborted); delete this; } QUrl Loader::discoveredFeedURL() const { return d->discoveredFeedURL; } void Loader::slotRetrieverDone(const QByteArray &data, bool success) { d->retrieverError = d->retriever->errorCode(); ErrorCode status = Success; FeedPtr feed; delete d->retriever; d->retriever = nullptr; if (success) { DocumentSource src(data, d->url.url()); feed = parserCollection()->parse(src); if (parserCollection()->lastError() != Syndication::Success) { status = parserCollection()->lastError(); discoverFeeds(data); } } else { qCDebug(SYNDICATION_LOG) << "Retriever error:" << d->retrieverError; // retriever is a custom impl, so we set OtherRetrieverError status = OtherRetrieverError; } emit loadingComplete(this, feed, status); delete this; } void Loader::discoverFeeds(const QByteArray &data) { const QUrl url = LoaderUtil::parseFeed(data, d->url); if (!url.isEmpty()) { d->discoveredFeedURL = url; } } } // namespace Syndication diff --git a/src/loaderutil.cpp b/src/loaderutil.cpp index e84dccf..68d105f 100644 --- a/src/loaderutil.cpp +++ b/src/loaderutil.cpp @@ -1,111 +1,114 @@ /* * This file is part of the syndication library * * Copyright (C) 2019 Laurent Montel * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #include "loaderutil_p.h" #include +#include //#define DEBUG_PARSING_FEED #ifdef DEBUG_PARSING_FEED #include #include #endif QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url) { #ifdef DEBUG_PARSING_FEED qDebug() << " QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url)"; QFile headerFile(QStringLiteral("/tmp/bb.txt")); headerFile.open(QIODevice::WriteOnly | QIODevice::Text); QTextStream outHeaderStream(&headerFile); outHeaderStream << data; headerFile.close(); #endif QUrl discoveredFeedURL; QString str = QString::fromLatin1(data.constData()).simplified(); QString s2; //QTextStream ts( &str, QIODevice::WriteOnly ); //ts << data.data(); // "<[\\s]link[^>]*rel[\\s]=[\\s]\\\"[\\s]alternate[\\s]\\\"[^>]*>" // "type[\\s]=[\\s]\\\"application/rss+xml\\\"" // "href[\\s]=[\\s]\\\"application/rss+xml\\\"" - QRegExp rx(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*[\\s]*type[^=]*=\"application/rss\\+xml\"[^s][^s](?:[^>]*)[\\s]*[\\s]*[^s]*(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"), Qt::CaseInsensitive); - if (rx.indexIn(str) != -1) { - s2 = rx.cap(1); + QRegularExpression rx(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*" + "[\\s]*type[^=]*=\"application/rss\\+xml\"[^s][^s](?:[^>]*)" + "[\\s]*[\\s]*[^s]*(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"), + QRegularExpression::CaseInsensitiveOption); + QRegularExpressionMatch match; + if ((match = rx.match(str)).hasMatch()) { + s2 = match.captured(1); } else { - QRegExp rx2(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[\\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"), Qt::CaseInsensitive); - if (rx2.indexIn(str) != -1) { - s2 = rx2.cap(1); + const QRegularExpression rx2(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)" + "[\\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"), + QRegularExpression::CaseInsensitiveOption); + if ((match = rx2.match(str)).hasMatch()) { + s2 = match.captured(1); } else { - // does not support Atom/RSS autodiscovery.. try finding feeds by brute force.... - int pos = 0; QStringList feeds; QString host = url.host(); - rx.setPattern(QStringLiteral("(?:\\s]*)")); - while (pos >= 0) { - pos = rx.indexIn(str, pos); - s2 = rx.cap(1); - if (s2.endsWith(QLatin1String(".rdf")) || - s2.endsWith(QLatin1String(".rss")) || - s2.endsWith(QLatin1String(".xml"))) { + rx.setPattern(QStringLiteral("(?:\\s]*)")); + QRegularExpressionMatchIterator iter = rx.globalMatch(str); + while (iter.hasNext()) { + match = iter.next(); + s2 = match.captured(1); + if (s2.endsWith(QLatin1String(".rdf")) + || s2.endsWith(QLatin1String(".rss")) + || s2.endsWith(QLatin1String(".xml"))) { feeds.append(s2); } - if (pos >= 0) { - pos += rx.matchedLength(); - } } QUrl testURL; // loop through, prefer feeds on same host QStringList::const_iterator end(feeds.constEnd()); for (QStringList::const_iterator it = feeds.constBegin(); it != end; ++it) { testURL = QUrl(*it); if (testURL.host() == host) { s2 = *it; break; } } } } if (s2.isNull()) { return discoveredFeedURL; } if (QUrl(s2).isRelative()) { if (s2.startsWith(QLatin1String("//"))) { s2.prepend(url.scheme() + QLatin1Char(':')); discoveredFeedURL = QUrl(s2); } else if (s2.startsWith(QLatin1Char('/'))) { discoveredFeedURL = url; discoveredFeedURL.setPath(s2); } else { discoveredFeedURL = url; discoveredFeedURL.setPath(discoveredFeedURL.path() + QLatin1Char('/') + s2); } } else { discoveredFeedURL = QUrl(s2); } return discoveredFeedURL; } diff --git a/src/tools.cpp b/src/tools.cpp index 2c9d508..d4f465b 100644 --- a/src/tools.cpp +++ b/src/tools.cpp @@ -1,291 +1,291 @@ /* * This file is part of the syndication library * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #include "tools.h" #include "personimpl.h" #include #include #include #include -#include +#include #include #include namespace Syndication { QCryptographicHash md5Machine(QCryptographicHash::Md5); unsigned int calcHash(const QString &str) { return calcHash(str.toUtf8()); } unsigned int calcHash(const QByteArray &array) { if (array.isEmpty()) { return 0; } else { const char *s = array.data(); unsigned int hash = 5381; int c; while ((c = *s++)) { hash = ((hash << 5) + hash) + c; // hash*33 + c } return hash; } } static uint toTimeT(QDateTime &kdt) { if (kdt.isValid()) { //work around unspecified timezones/date-only timestamps by setting the time to 12:00 UTC if (kdt.time().isNull() || (kdt.time() == QTime(0, 0) && kdt.timeSpec() == Qt::LocalTime)) { kdt.setTimeSpec(Qt::UTC); kdt.setTime(QTime(12, 0)); } return kdt.toMSecsSinceEpoch() / 1000; } else { return 0; } } uint parseISODate(const QString &str) { QDateTime kdt = QDateTime::fromString(str, Qt::ISODate); return toTimeT(kdt); } uint parseRFCDate(const QString &str) { QDateTime kdt = QDateTime::fromString(str, Qt::RFC2822Date); return toTimeT(kdt); } uint parseDate(const QString &str, DateFormat hint) { if (str.isEmpty()) { return 0; } if (hint == RFCDate) { time_t t = parseRFCDate(str); return t != 0 ? t : parseISODate(str); } else { time_t t = parseISODate(str); return t != 0 ? t : parseRFCDate(str); } } QString dateTimeToString(uint date) { if (date == 0) { return QString(); } const QString format = QStringLiteral("ddd MMM d HH:mm:ss yyyy"); QDateTime dt; dt.setMSecsSinceEpoch(quint64(date) * 1000); return dt.toUTC().toString(format); } QString calcMD5Sum(const QString &str) { md5Machine.reset(); md5Machine.addData(str.toUtf8()); return QLatin1String(md5Machine.result().toHex().constData()); } QString resolveEntities(const QString &str) { return KCharsets::resolveEntities(str); } QString escapeSpecialCharacters(const QString &strp) { QString str(strp); str.replace(QLatin1Char('&'), QLatin1String("&")); str.replace(QLatin1Char('\"'), QLatin1String(""")); str.replace(QLatin1Char('<'), QLatin1String("<")); str.replace(QLatin1Char('>'), QLatin1String(">")); str.replace(QLatin1Char('\''), QLatin1String("'")); return str.trimmed(); } QString convertNewlines(const QString &strp) { QString str(strp); str.replace(QLatin1Char('\n'), QLatin1String("
")); return str; } QString plainTextToHtml(const QString &plainText) { QString str(plainText); str.replace(QLatin1Char('&'), QLatin1String("&")); str.replace(QLatin1Char('\"'), QLatin1String(""")); str.replace(QLatin1Char('<'), QLatin1String("<")); //str.replace(QLatin1Char('>'), QLatin1String(">")); str.replace(QLatin1Char('\n'), QLatin1String("
")); return str.trimmed(); } QString htmlToPlainText(const QString &html) { QString str(html); //TODO: preserve some formatting, such as line breaks - str.remove(QRegExp(QStringLiteral("<[^>]*>"))); // remove tags + str.remove(QRegularExpression(QStringLiteral("<[^>]*?>"))); // remove tags str = resolveEntities(str); return str.trimmed(); } -static QRegExp tagRegExp() +static QRegularExpression tagRegExp() { - static QRegExp exp(QStringLiteral("<\\w+.*/?>")); + static QRegularExpression exp(QStringLiteral("<\\w+.*/?>")); return exp; } bool stringContainsMarkup(const QString &str) { //check for entities - if (str.contains(QRegExp(QStringLiteral("&[a-zA-Z0-9#]+;")))) { + if (str.contains(QRegularExpression(QStringLiteral("&[a-zA-Z0-9#]+;")))) { return true; } const int ltc = str.count(QLatin1Char('<')); if (ltc == 0) { return false; } return str.contains(tagRegExp()); } bool isHtml(const QString &str) { //check for entities - if (str.contains(QRegExp(QStringLiteral("&[a-zA-Z0-9#]+;")))) { + if (str.contains(QRegularExpression(QStringLiteral("&[a-zA-Z0-9#]+;")))) { return true; } const int ltc = str.count(QLatin1Char('<')); if (ltc == 0) { return false; } return str.contains(tagRegExp()); } QString normalize(const QString &str) { return isHtml(str) ? str.trimmed() : plainTextToHtml(str); } QString normalize(const QString &strp, bool isCDATA, bool containsMarkup) { if (containsMarkup) { return strp.trimmed(); } else { if (isCDATA) { QString str = resolveEntities(strp); str = escapeSpecialCharacters(str); str = convertNewlines(str); str = str.trimmed(); return str; } else { QString str = escapeSpecialCharacters(strp); str = str.trimmed(); return str; } } } PersonPtr personFromString(const QString &strp) { QString str = strp.trimmed(); if (str.isEmpty()) { return PersonPtr(new PersonImpl()); } str = resolveEntities(str); QString name; QString uri; QString email; // look for something looking like a mail address ("foo@bar.com", // "") and extract it - QRegExp remail(QStringLiteral("\\s]+)>?")); // FIXME: user "proper" regexp, + const QRegularExpression remail(QStringLiteral("\\s]+)>?")); // FIXME: user "proper" regexp, // search kmail source for it - int pos = remail.indexIn(str); - if (pos != -1) { - QString all = remail.cap(0); - email = remail.cap(1); + QRegularExpressionMatch match = remail.match(str); + if (match.hasMatch()) { + const QString all = match.captured(0); + email = match.captured(1); str.remove(all); // remove mail address } // replace "mailto", "(", ")" (to be extended) email.remove(QStringLiteral("mailto:")); - email.remove(QRegExp(QStringLiteral("[\\(\\)]"))); + email.remove(QRegularExpression(QStringLiteral("[()]"))); // simplify the rest and use it as name name = str.simplified(); // after removing the email, str might have // the format "(Foo M. Bar)". We cut off // parentheses if there are any. However, if // str is of the format "Foo M. Bar (President)", // we should not cut anything. - QRegExp rename(QStringLiteral("^\\(([^\\)]*)\\)")); - - if (rename.exactMatch(name)) { - name = rename.cap(1); + QRegularExpression rename(QRegularExpression::anchoredPattern(QStringLiteral("^\\(([^)]*)\\)"))); + match = rename.match(name); + if (match.hasMatch()) { + name = match.captured(1); } name = name.isEmpty() ? QString() : name; email = email.isEmpty() ? QString() : email; uri = uri.isEmpty() ? QString() : uri; if (name.isEmpty() && email.isEmpty() && uri.isEmpty()) { return PersonPtr(new PersonImpl()); } return PersonPtr(new PersonImpl(name, uri, email)); } ElementType::ElementType(const QString &localnamep, const QString &nsp) : ns(nsp), localname(localnamep) { } bool ElementType::operator==(const ElementType &other) const { return localname == other.localname && ns == other.ns; } } // namespace Syndication