diff --git a/syndication/src/atom/atomtools.cpp b/syndication/src/atom/atomtools.cpp index 4b3609f72..38d12a37a 100644 --- a/syndication/src/atom/atomtools.cpp +++ b/syndication/src/atom/atomtools.cpp @@ -1,62 +1,62 @@ /* * This file is part of libsyndication * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. 
* */ #include "constants.h" #include "tools.h" #include "../elementwrapper.h" #include "../tools.h" #include #include namespace LibSyndication { namespace Atom { QString extractAtomText(const LibSyndication::ElementWrapper& parent, const QString& tagname) { QString str; QDomElement el = parent.firstElementByTagNameNS(atom1Namespace(), tagname); - QString type = el.attribute(QString::fromUtf8("type")); + QString type = el.attribute(QString::fromUtf8("type"), QString::fromUtf8("text")); - if (type.isEmpty() || type == QString::fromUtf8("text")) + if (type == QString::fromUtf8("text")) { str = plainTextToHtml(parent.extractElementTextNS(atom1Namespace(), tagname).simplified()); } else if (type == QString::fromUtf8("html")) { str = parent.extractElementTextNS(atom1Namespace(), tagname).simplified(); } else if (type == QString::fromUtf8("xhtml")) { str = ElementWrapper::childNodesAsXML(el).simplified(); } return str; } } // namespace Atom } // namespace LibSyndication diff --git a/syndication/src/elementwrapper.h b/syndication/src/elementwrapper.h index 026da8329..9700032a5 100644 --- a/syndication/src/elementwrapper.h +++ b/syndication/src/elementwrapper.h @@ -1,301 +1,301 @@ /* * This file is part of libsyndication * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. 
If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #ifndef LIBSYNDICATION_ELEMENTWRAPPER_H #define LIBSYNDICATION_ELEMENTWRAPPER_H #include #include "sharedptr.h" #include class QDomElement; template class QList; namespace LibSyndication { /** * A wrapper for XML elements. This is the base class for the (lazy) wrappers * used in the RSS2 and Atom parsers. The wrapped element can be accessed * via element(). It also contains several helper functions for XML processing. * * @author Frank Osterfeld */ class KDE_EXPORT ElementWrapper { public: /** * creates a element wrapper wrapping a null element. * isNull() will return @c true for these instances. */ ElementWrapper(); /** * Copy constructor.The instances share the same element. * @param other the element wrapper to copy */ ElementWrapper(const ElementWrapper& other); /** * Creates an element wrapper wrapping the DOM element @c element * @param element the element to wrap */ ElementWrapper(const QDomElement& element); /** * destructor */ virtual ~ElementWrapper(); /** * Assigns another element wrapper to this one. Both instances * share the same wrapped element instance. * * @param other the element wrapper to assign * @return reference to this instance */ virtual ElementWrapper& operator=(const ElementWrapper& other); /** * compares two wrappers. Two wrappers are equal if and only if * the wrapped elements are equal. * @param other another element wrapper to compare to */ bool operator==(const ElementWrapper& other) const; /** * returns the wrapped resource. */ const QDomElement& element() const; /** * returns whether the wrapped element is a null element * @return @c true if isNull() is true for the wrapped element, * @c false otherwise */ bool isNull() const; /** * returns the xml:base value to be used for the wrapped element. 
* The xml:base attribute establishes the base URI for resolving any * relative references found in its scope (its own element and all * descendants). (See also completeURI()) * * @return the xml:base value, or a null string if not set */ QString xmlBase() const; /** * returns the xml:lang value to be used for the wrapped element. * The xml:lang attribute indicates the natural language for its element * and all descendants. * * @return the xml:lang value, or a null string if not set */ QString xmlLang() const; - + /** * completes relative URIs with a prefix specified via xml:base. * * Example: * @code * xml:base="http://www.foo.org/", uri="announcements/bar.html" * @endcode * * is completed to @c http://www.foo.org/announcements/bar.html * * See also xmlBase(). * * @param uri a possibly relative URI * @return the resolved, absolute URI (using xml:base), if @c uri is * a relative, valid URI. If @c uri is not valid, absolute, or no * xml:base is set in the scope of this element, @c uri is returned * unmodified. */ QString completeURI(const QString& uri) const; /** * extracts the text from a child element, respecting namespaces. If * there is more than one child with the same tag name, the first one is * processed. * For instance, when the wrapped element is @c <hisElement>: * @code * * Hi there * * @endcode * @code * extractElementText("http://www.w3.org/2005/Atom", "title") * @endcode * will return the text content of @c atom:title, "Hi there". * (Assuming that "atom" is defined as "http://www.w3.org/2005/Atom") * * @param namespaceURI the namespace URI of the element to extract * @param localName the local name (local within its namespace) of the * element to extract * @return the (trimmed) text content of @c localName, or a null string * if there is no such tag */ QString extractElementTextNS(const QString& namespaceURI, const QString& localName) const; /** * extracts the text from a child element, ignoring namespaces. 
For * instance, when the wrapped element is @c <thisElement>: * @code * * Hi there * * @endcode * @c extractElementText("title") will return the text content * of @c title, "Hi there". * * @param tagName the name of the element to extract * @return the (trimmed) text content of @c tagName, or a null string if * there is no such tag */ QString extractElementText(const QString& tagName) const; /** * returns all child elements with tag name @c tagName * Contrary to QDomElement::elementsByTagName() only direct descendents * are returned. * * @param tagName the tag name of the elements to extract * @return a list of child elements with the given tag name */ QList elementsByTagName(const QString& tagName) const; /** * returns the child nodes of the wrapped element as XML. * * See childNodesAsXML(const QDomElement& parent) for details * @return XML serialization of the wrapped element's children */ QString childNodesAsXML() const; /** * concatenates the XML representations of all children. Example: If * @c parent is an @c xhtml:body element like * @code *

<xhtml:body><p>foo</p><blockquote>bar</blockquote></xhtml:body>
* @endcode * this function returns * @code *

<p>foo</p><blockquote>bar</blockquote>
* @endcode * * namespace and xml:base information are preserved. * * @param parent the DOM element whose children should be returned as * XML * @return XML serialization of parent's children */ static QString childNodesAsXML(const QDomElement& parent); /** * returns all child elements with tag name @c tagname * and namespace URI @c nsURI. * Contrary to QDomElement::elementsByTagNameNS() only direct * descendents are returned * * @param nsURI the namespace URI * @param tagName the local name (local within its namespace) of the * element to search for * @return a list of child elements with the given namespace URI * and tag name */ QList elementsByTagNameNS(const QString& nsURI, const QString& tagName) const; /** * searches the direct children of the wrapped element for an element * with a given namespace and tag name. * * @param nsURI the namespace URI * @param tagName the local name (local within its namespace) of the * element to search for * @return the first child element with the given namespace URI and tag * name, or a null element if no such element was found. */ QDomElement firstElementByTagNameNS(const QString& nsURI, const QString& tagName) const; /** * Returns the wrapped element's text or an empty string. * For more information, see QDomElement::text(); */ QString text() const; /** * Returns the attribute called name. If the attribute does not exist * defValue is returned. * (which is a null string by default). * * @param name tag name * @param defValue the default value */ QString attribute(const QString& name, const QString& defValue=QString()) const; /** * Returns the attribute with the local @c name localName and the * namespace URI @c nsURI. * If the attribute does not exist @c defValue is returned (which is a * null string by default). 
* * @param nsURI namespace URI * @param localName local tag name * @param defValue the default value */ QString attributeNS(const QString& nsURI, const QString& localName, const QString& defValue=QString()) const; /** * Returns true if this element has an attribute called @c name; * otherwise returns @c false. * * @param name the attribute name (without namespace) */ bool hasAttribute(const QString& name) const; /** * Returns true if this element has an attribute with the local name * localName and the namespace URI nsURI; otherwise returns false. * * @param nsURI namespace URI * @param localName local attribute name */ bool hasAttributeNS(const QString& nsURI, const QString& localName) const; private: class ElementWrapperPrivate; SharedPtr d; }; } // namespace LibSyndication #endif // LIBSYNDICATION_ELEMENTWRAPPER_H diff --git a/syndication/src/rdf/document.cpp b/syndication/src/rdf/document.cpp index 2d983d960..4da192f7d 100644 --- a/syndication/src/rdf/document.cpp +++ b/syndication/src/rdf/document.cpp @@ -1,170 +1,170 @@ /* * This file is part of libsyndication * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. 
* */ #include #include #include "document.h" #include "dublincore.h" #include "image.h" #include "item.h" #include "model.h" #include "resource.h" #include "rssvocab.h" #include "sequence.h" #include "statement.h" #include "syndication.h" #include "textinput.h" #include #include namespace LibSyndication { namespace RDF { Document::Document() : LibSyndication::SpecificDocument(), ResourceWrapper() { } Document::Document(ResourcePtr resource) : LibSyndication::SpecificDocument(), ResourceWrapper(resource) { } Document::~Document() { } bool Document::accept(DocumentVisitor* visitor) { return visitor->visitRDFDocument(this); } bool Document::isValid() const { return !isNull(); } QString Document::title() const { QString str = resource()->property(RSSVocab::self()->title())->asString(); - return htmlize(str); + return normalize(str); } QString Document::description() const { QString str = resource()->property(RSSVocab::self()->description())->asString(); - return htmlize(str); + return normalize(str); } QString Document::link() const { return resource()->property(RSSVocab::self()->link())->asString(); } DublinCore Document::dc() const { return DublinCore(resource()); } Syndication Document::syn() const { return Syndication(resource()); } QList Document::items() const { QList list; if (!resource()->hasProperty(RSSVocab::self()->items())) return list; NodePtr n = resource()->property(RSSVocab::self()->items())->object(); if (n->isSequence()) { Sequence* seq = static_cast(n.get()); QList items = seq->items(); QList::Iterator it = items.begin(); QList::Iterator end = items.end(); for ( ; it != end; ++it) { if ((*it)->isResource()) { // well, we need it as ResourcePtr // maybe this should go to the node // interface ResourcePtr asResource()? 
ResourcePtr ptr = resource()->model().createResource((static_cast((*it).get()))->uri()); Item item(ptr); list.append(item); } } } return list; } Image Document::image() const { ResourcePtr img = resource()->property(RSSVocab::self()->image())->asResource(); return img ? Image(img) : Image(); } TextInput Document::textInput() const { ResourcePtr ti = resource()->property(RSSVocab::self()->textinput())->asResource(); return ti ? TextInput(ti) : TextInput(); } QString Document::debugInfo() const { QString info; info += "### Document: ###################\n"; info += "title: #" + title() + "#\n"; info += "link: #" + link() + "#\n"; info += "description: #" + description() + "#\n"; info += dc().debugInfo(); info += syn().debugInfo(); Image img = image(); if (!img.resource() == 0L) info += img.debugInfo(); TextInput input = textInput(); if (!input.isNull()) info += input.debugInfo(); QList itlist = items(); QList::ConstIterator it = itlist.begin(); QList::ConstIterator end = itlist.end(); for ( ; it != end; ++it) info += (*it).debugInfo(); info += "### Document end ################\n"; return info; } } // namespace RDF } // namespace LibSyndication diff --git a/syndication/src/rdf/item.cpp b/syndication/src/rdf/item.cpp index f81472f7a..ee8f3819f 100644 --- a/syndication/src/rdf/item.cpp +++ b/syndication/src/rdf/item.cpp @@ -1,96 +1,96 @@ /* * This file is part of libsyndication * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. 
* * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #include "contentvocab.h" #include "dublincore.h" #include "item.h" #include "model.h" #include "rssvocab.h" #include "statement.h" #include #include #include namespace LibSyndication { namespace RDF { Item::Item() : ResourceWrapper() { } Item::Item(ResourcePtr resource) : ResourceWrapper(resource) { } Item::~Item() { } QString Item::title() const { QString str = resource()->property(RSSVocab::self()->title())->asString(); - return htmlize(str); + return normalize(str); } QString Item::description() const { QString str = resource()->property(RSSVocab::self()->description())->asString(); - return htmlize(str); + return normalize(str); } QString Item::link() const { return resource()->property(RSSVocab::self()->link())->asString(); } DublinCore Item::dc() const { return DublinCore(resource()); } QString Item::encodedContent() const { return resource()->property(ContentVocab::self()->encoded())->asString(); } QString Item::debugInfo() const { QString info; info += "### Item: ###################\n"; info += "title: #" + title() + "#\n"; info += "link: #" + link() + "#\n"; info += "description: #" + description() + "#\n"; info += "content:encoded: #" + encodedContent() + "#\n"; info += dc().debugInfo(); info += "### Item end ################\n"; return info; } bool Item::accept(SpecificItemVisitor* visitor) { return visitor->visitRDFItem(this); } } // namespace RDF } // namespace LibSyndication diff --git a/syndication/src/rss2/document.cpp b/syndication/src/rss2/document.cpp index a9284a092..ea650a204 100644 --- a/syndication/src/rss2/document.cpp +++ b/syndication/src/rss2/document.cpp @@ -1,334 +1,428 @@ /* * This file is part of libsyndication * * Copyright (C) 2005 Frank Osterfeld * * This library is free software; 
you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace LibSyndication { namespace RSS2 { - -Document::Document(const QDomElement& element) : SpecificDocument(), ElementWrapper(element) +class Document::DocumentPrivate +{ + public: + DocumentPrivate() : itemDescriptionIsCDATA(false), + itemDescriptionContainsMarkup(false), + itemDescGuessed(false), + itemTitleIsCDATA(false), + itemTitleContainsMarkup(false), + itemTitlesGuessed(false) + {} + mutable bool itemDescriptionIsCDATA; + mutable bool itemDescriptionContainsMarkup; + mutable bool itemDescGuessed; + mutable bool itemTitleIsCDATA; + mutable bool itemTitleContainsMarkup; + mutable bool itemTitlesGuessed; +}; + +Document::Document(const QDomElement& element) : SpecificDocument(), + ElementWrapper(element), + d(new DocumentPrivate) { } Document Document::fromXML(const QDomDocument& doc) { QDomNode channelNode = doc.namedItem(QString::fromUtf8("rss")).namedItem(QString::fromUtf8("channel")); return Document(channelNode.toElement()); } -Document::Document() : SpecificDocument(), ElementWrapper() +Document::Document() : SpecificDocument(), ElementWrapper(), d(new DocumentPrivate) +{ +} + +Document::Document(const Document& 
other) : SpecificDocument(other), ElementWrapper(other) +{ + d = other.d; +} + +Document::~Document() { } +Document& Document::operator=(const Document& other) +{ + ElementWrapper::operator=(other); + d = other.d; + return *this; +} bool Document::isValid() const { return !isNull(); } QString Document::title() const { - QString t = extractElementTextNS(QString(), QString::fromUtf8("title")); - - if (t.isNull()) - { - t = extractElementTextNS(dublinCoreNamespace(), - QString::fromUtf8("title")); - } - - return t; + return extractElementTextNS(QString(), QString::fromUtf8("title")); } QString Document::link() const { return extractElementTextNS(QString(), QString::fromUtf8("link") ); } QString Document::description() const { QString d = extractElementTextNS(QString(), QString::fromUtf8("description")); - - if (d.isNull()) - { - d = extractElementTextNS(dublinCoreNamespace(), - QString::fromUtf8("description")); - } - - return htmlize(d); + return normalize(d); } QString Document::language() const { QString lang = extractElementTextNS(QString(), QString::fromUtf8("language")); if (!lang.isNull()) { return lang; } else { return extractElementTextNS( dublinCoreNamespace(), QString::fromUtf8("language")); } } QString Document::copyright() const { QString rights = extractElementTextNS(QString(), QString::fromUtf8("copyright")); if (!rights.isNull()) { return rights; } else { // if is not provided, use return extractElementTextNS(dublinCoreNamespace(), QString::fromUtf8("rights")); } } QString Document::managingEditor() const { return extractElementTextNS(QString(), QString::fromUtf8("managingEditor")); } QString Document::webMaster() const { return extractElementTextNS(QString(), QString::fromUtf8("webMaster")); } time_t Document::pubDate() const { QString str = extractElementTextNS(QString(), QString::fromUtf8("pubDate")); if (!str.isNull()) { return parseDate(str, RFCDate); } else { // if there is no pubDate, check for dc:date str = 
extractElementTextNS(dublinCoreNamespace(), QString::fromUtf8("date")); return parseDate(str, ISODate); } } time_t Document::lastBuildDate() const { QString str = extractElementTextNS(QString(), QString::fromUtf8("lastBuildDate")); return parseDate(str, RFCDate); } QList Document::categories() const { QList categories; QList catNodes = elementsByTagNameNS(QString(), QString::fromUtf8("category")); QList::ConstIterator it = catNodes.begin(); for ( ; it != catNodes.end(); ++it) { categories.append(Category(*it)); } return categories; } QString Document::generator() const { return extractElementTextNS(QString(), QString::fromUtf8("generator")); } QString Document::docs() const { return extractElementTextNS(QString(), QString::fromUtf8("docs")); } Cloud Document::cloud() const { return Cloud(firstElementByTagNameNS(QString(), QString::fromUtf8("cloud"))); } int Document::ttl() const { bool ok; int c; QString text = extractElementTextNS(QString(), QString::fromUtf8("ttl")); c = text.toInt(&ok); return ok ? 
c : 0; } Image Document::image() const { return Image(firstElementByTagNameNS(QString(), QString::fromUtf8("image"))); } TextInput Document::textInput() const { TextInput ti = firstElementByTagNameNS(QString(), QString::fromUtf8("textInput")); if (!ti.isNull()) return ti; // Netscape's version of RSS 0.91 has textinput, not textInput return firstElementByTagNameNS(QString(), QString::fromUtf8("textinput")); } QSet Document::skipHours() const { QSet skipHours; QDomElement skipHoursNode = firstElementByTagNameNS(QString(), QString::fromUtf8("skipHours")); if (!skipHoursNode.isNull()) { ElementWrapper skipHoursWrapper(skipHoursNode); bool ok = false; QList hours = skipHoursWrapper.elementsByTagNameNS(QString(), QString::fromUtf8("hour")); QList::ConstIterator it = hours.begin(); for ( ; it != hours.end(); ++it) { int h = (*it).text().toInt(&ok); if (ok) skipHours.insert(h); } } return skipHours; } QSet Document::skipDays() const { QSet skipDays; QDomElement skipDaysNode = firstElementByTagNameNS(QString(), QString::fromUtf8("skipDays")); if (!skipDaysNode.isNull()) { ElementWrapper skipDaysWrapper(skipDaysNode); QHash weekDays; weekDays[QString::fromUtf8("Monday")] = Monday; weekDays[QString::fromUtf8("Tuesday")] = Tuesday; weekDays[QString::fromUtf8("Wednesday")] = Wednesday; weekDays[QString::fromUtf8("Thursday")] = Thursday; weekDays[QString::fromUtf8("Friday")] = Friday; weekDays[QString::fromUtf8("Saturday")] = Saturday; weekDays[QString::fromUtf8("Sunday")] = Sunday; QList days = skipDaysWrapper.elementsByTagNameNS(QString(), QString::fromUtf8("day")); for (QList::ConstIterator it = days.begin(); it != days.end(); ++it) { if (weekDays.contains((*it).text())) skipDays.insert(weekDays[(*it).text()]); } } return skipDays; } QList Document::items() const { - QList itemNodes = elementsByTagNameNS(QString(), QString::fromUtf8("item")); - QList items; - + + QList itemNodes = elementsByTagNameNS(QString(), QString::fromUtf8("item")); + + DocumentPtr doccpy(new 
Document(*this)); // pass + for (QList::ConstIterator it = itemNodes.begin(); it != itemNodes.end(); ++it) { - items.append(Item(*it)); + items.append(Item(*it, doccpy)); } - + return items; } QString Document::debugInfo() const { QString info; info += "### Document: ###################\n"; if (!title().isNull()) info += "title: #" + title() + "#\n"; if (!description().isNull()) info += "description: #" + description() + "#\n"; if (!link().isNull()) info += "link: #" + link() + "#\n"; if (!language().isNull()) info += "language: #" + language() + "#\n"; if (!copyright().isNull()) info += "copyright: #" + copyright() + "#\n"; if (!managingEditor().isNull()) info += "managingEditor: #" + managingEditor() + "#\n"; if (!webMaster().isNull()) info += "webMaster: #" + webMaster() + "#\n"; QString dpubdate = dateTimeToString(pubDate()); if (!dpubdate.isNull()) info += "pubDate: #" + dpubdate + "#\n"; QString dlastbuilddate = dateTimeToString(lastBuildDate()); if (!dlastbuilddate.isNull()) info += "lastBuildDate: #" + dlastbuilddate + "#\n"; if (!textInput().isNull()) info += textInput().debugInfo(); if (!cloud().isNull()) info += cloud().debugInfo(); if (!image().isNull()) info += image().debugInfo(); QList cats = categories(); for (QList::ConstIterator it = cats.begin(); it != cats.end(); ++it) info += (*it).debugInfo(); QList litems = items(); for (QList::ConstIterator it = litems.begin(); it != litems.end(); ++it) info += (*it).debugInfo(); info += "### Document end ################\n"; return info; } +void Document::getItemTitleFormatInfo(bool& isCDATA, bool& containsMarkup) const +{ + if (!d->itemTitlesGuessed) + { + QString titles; + QList litems = items(); + + if (litems.isEmpty()) + { + d->itemTitlesGuessed = true; + return; + } + + QDomElement titleEl = (*litems.begin()).firstElementByTagNameNS(QString(), QString::fromUtf8("title")); + d->itemTitleIsCDATA = titleEl.firstChild().isCDATASection(); + + int nmax = litems.size() < 10 ? 
litems.size() : 10; // we check a maximum of 10 items + int i = 0; + + QList::ConstIterator it = litems.begin(); + + while (i < nmax) + { + titles += (*it).originalTitle(); + ++it; + ++i; + } + + d->itemTitleContainsMarkup = stringContainsMarkup(titles); + d->itemTitlesGuessed = true; + } + + isCDATA = d->itemTitleIsCDATA; + containsMarkup = d->itemTitleContainsMarkup; +} + +void Document::getItemDescriptionFormatInfo(bool& isCDATA, bool& containsMarkup) const +{ + if (!d->itemDescGuessed) + { + QString desc; + QList litems = items(); + + + if (litems.isEmpty()) + { + d->itemDescGuessed = true; + return; + } + + QDomElement descEl = (*litems.begin()).firstElementByTagNameNS(QString(), QString::fromUtf8("description")); + d->itemDescriptionIsCDATA = descEl.firstChild().isCDATASection(); + + int nmax = litems.size() < 10 ? litems.size() : 10; // we check a maximum of 10 items + int i = 0; + + QList::ConstIterator it = litems.begin(); + + while (i < nmax) + { + desc += (*it).originalDescription(); + ++it; + ++i; + } + + d->itemDescriptionContainsMarkup = stringContainsMarkup(desc); + d->itemDescGuessed = true; + } + + isCDATA = d->itemDescriptionIsCDATA; + containsMarkup = d->itemDescriptionContainsMarkup; +} + bool Document::accept(DocumentVisitor* visitor) { return visitor->visitRSS2Document(this); } } // namespace RSS2 } // namespace LibSyndication diff --git a/syndication/src/rss2/document.h b/syndication/src/rss2/document.h index 9dc09f961..d16e2ebe7 100644 --- a/syndication/src/rss2/document.h +++ b/syndication/src/rss2/document.h @@ -1,275 +1,285 @@ /* * This file is part of libsyndication * * Copyright (C) 2005 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #ifndef LIBSYNDICATION_RSS2_DOCUMENT_H #define LIBSYNDICATION_RSS2_DOCUMENT_H #include #include #include class QDomDocument; class QDomElement; class QString; template class QList; template class QSet; namespace LibSyndication { namespace RSS2 { class Category; class Cloud; class Document; class Image; class Item; class TextInput; typedef SharedPtr DocumentPtr; /** * document implementation, representing an RSS feed from the 0.91-0.94/2.0 * family. * * @author Frank Osterfeld */ class KDE_EXPORT Document : public LibSyndication::SpecificDocument, - public ElementWrapper + public LibSyndication::ElementWrapper { public: /** * Parses an RSS2 document from an XML document. * TODO: More on supported formats etc. * * @param document The dom document to parse the document from * @return the document parsed from XML, or an invalid * document if parsing failed. */ static Document fromXML(const QDomDocument& document); /** * Default constructor, creates a null object, for which * isNull() is @c true and isValid() is @c false. */ Document(); - bool accept(DocumentVisitor* visitor); + Document(const Document& other); + + virtual ~Document(); + + Document& operator=(const Document& other); + + virtual bool accept(DocumentVisitor* visitor); /** * returns whether this document is valid or not. * Invalid documents do not contain any useful * information. */ bool isValid() const; /** * The title of the channel. * - * This method returns the content of the @c <title> element. 
If - * @c <title> is not available, the method returns * @c <dc:title> - * instead, if available. - * - * * @return title TODO: more on escaping/HTML */ QString title() const; /** * The URL to the HTML website corresponding to the channel. * * @return TODO */ QString link() const; /** * Phrase or sentence describing the channel. - * This method returns the content of the @c <description> element. If - * @c <description> is not available, the method returns - * @c <dc:description> instead, if available. - * * * @return TODO */ QString description() const; /** * the items contained in this document */ QList items() const; /** * * @return TODO */ QString language() const; /** * * Copyright notice for content in the channel. * This method returns the content of the @c <copyright> * element. If @c <copyright> is not available, the method returns * @c <dc:rights> instead, if available. * * @return copyright information, or a null string if not set */ QString copyright() const; /** * Email address for person responsible for editorial content. * * @return editor's email address, or a null string if not set */ QString managingEditor() const; /** * Email address for person responsible for technical issues relating * to channel. * * @return web master's email address, or a null string if not */ QString webMaster() const; /** * The publication date for the content in the channel. For example, * the New York Times publishes on a daily basis, the publication date * flips once every 24 hours. That's when the pubDate of the channel * changes. * This method returns the content of the @c <pubDate> element. If * @c <pubDate> is not available, the method returns * @c <dc:date> instead, if available. * * @return the publication date, or 0 if no date was specified or * parsing failed */ time_t pubDate() const; /** * The last time the content of the channel changed. 
* * @return the last build date, or 0 if no date was specified or parsing * failed */ time_t lastBuildDate() const; /** * Specifies one or more categories that the channel belongs to. * * @return TODO */ QList categories() const; /** * A string indicating the program used to generate the channel. * * @return description of the generator program, or a null string if * not set */ QString generator() const; /** * A URL that points to the documentation for the format used in the * RSS file. It's probably a pointer to the RSS specification. * It's for people who might stumble across an RSS file on a Web server * 25 years from now and wonder what it is. * * @return URL pointing to the format specification, or a null string if * not set */ QString docs() const; /** * Allows processes to register with a cloud to be notified of updates * to the channel, implementing a lightweight publish-subscribe * protocol for RSS feeds. * * @return cloud information, or a null object if not set */ Cloud cloud() const; /** * ttl stands for time to live. It's a number of minutes that indicates * how long a channel can be cached before refreshing from the source. * * @return the "time to live" in minutes, or 0 if not set */ int ttl() const; /** * Specifies a GIF, JPEG or PNG image that can be displayed with the * channel. * * @return the image, or a null object if not set */ Image image() const; /** * Specifies a text input box that can be displayed with the channel. * * @return the text input, or a null object if not set */ TextInput textInput() const; /** * Contains a set of hours (from 0 to 23), time in GMT, when the * channel is not updated. 
*/ QSet skipHours() const; /** days of week, used for skip days */ enum DayOfWeek { Monday = 0, /**< self-explanatory */ Tuesday = 1, /**< self-explanatory */ Wednesday = 2, /**< self-explanatory */ Thursday = 3, /**< self-explanatory */ Friday = 4, /**< self-explanatory */ Saturday = 5, /**< self-explanatory */ Sunday = 6 /**< self-explanatory */ }; /** * A set of week days where aggregators shouldn't read the channel. * */ QSet skipDays() const; - - + /** * Returns a description of the object and its children for * debugging purposes. * * @return debug string */ QString debugInfo() const; + + /** + * @internal + */ + void getItemTitleFormatInfo(bool& isCDATA, bool& containsMarkup) const; + + /** + * @internal + */ + void getItemDescriptionFormatInfo(bool& isCDATA, bool& containsMarkup) const; + private: Document(const QDomElement& element); + + class DocumentPrivate; + SharedPtr d; }; } // namespace RSS2 } // namespace LibSyndication #endif // LIBSYNDICATION_RSS2_DOCUMENT_H diff --git a/syndication/src/rss2/item.cpp b/syndication/src/rss2/item.cpp index 980429789..6a6696d55 100644 --- a/syndication/src/rss2/item.cpp +++ b/syndication/src/rss2/item.cpp @@ -1,232 +1,268 @@ /* * This file is part of libsyndication * * Copyright (C) 2005 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. 
If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include namespace LibSyndication { namespace RSS2 { -Item::Item() : ElementWrapper() +class Item::ItemPrivate +{ + public: + + SharedPtr doc; +}; + +Item::Item(SharedPtr doc) : ElementWrapper(), d(new ItemPrivate) +{ + d->doc = doc; +} + +Item::Item(const QDomElement& element, SharedPtr doc) : ElementWrapper(element), d(new ItemPrivate) +{ + d->doc = doc; +} + +Item::~Item() { } -Item::Item(const QDomElement& element) : ElementWrapper(element) +Item::Item(const Item& other) : ElementWrapper(other), SpecificItem(other) { + d = other.d; +} + +Item& Item::operator=(const Item& other) +{ + ElementWrapper::operator=(other); + SpecificItem::operator=(other); + d = other.d; + return *this; } QString Item::title() const { - QString t = extractElementTextNS(QString(), QString::fromUtf8("title")); + if (!d->doc) + return originalTitle(); - if (t.isNull()) - { - t = extractElementTextNS(dublinCoreNamespace(), - QString::fromUtf8("title")); - } - return htmlize(t); + bool isCDATA = false; + bool containsMarkup = false; + d->doc->getItemTitleFormatInfo(isCDATA, containsMarkup); + + return normalize(originalTitle(), isCDATA, containsMarkup); +} + + +QString Item::originalDescription() const +{ + return extractElementTextNS(QString(), QString::fromUtf8("description")); +} + +QString Item::originalTitle() const +{ + return extractElementTextNS(QString(), QString::fromUtf8("title")); } QString Item::link() const { return extractElementTextNS(QString(), QString::fromUtf8("link") ); } QString Item::description() const { - QString d = extractElementTextNS(QString(), QString::fromUtf8("description")); - - if (d.isNull()) - { - d = extractElementTextNS(dublinCoreNamespace(), - QString::fromUtf8("description")); - } + if (!d->doc) + return originalDescription(); + + 
bool isCDATA = false; + bool containsMarkup = false; + d->doc->getItemDescriptionFormatInfo(isCDATA, containsMarkup); - return htmlize(d); + return normalize(originalDescription(), isCDATA, containsMarkup); } QString Item::content() const { // parse encoded stuff from content:encoded, xhtml:body and friends into content return extractContent(*this); } QList Item::categories() const { QList cats = elementsByTagNameNS(QString(), QString::fromUtf8("category")); QList categories; QList::ConstIterator it = cats.begin(); for ( ; it != cats.end(); ++it) { categories.append(Category(*it)); } return categories; } QString Item::comments() const { return extractElementTextNS(QString(), QString::fromUtf8("comments") ); } QString Item::author() const { QString a = extractElementTextNS(QString(), QString::fromUtf8("author") ); if (!a.isNull()) { return a; } else { // if author is not available, fall back to dc:creator return extractElementTextNS(dublinCoreNamespace(), QString::fromUtf8("creator") ); } } QList Item::enclosures() const { QList encs = elementsByTagNameNS(QString(), QString::fromUtf8("enclosure")); QList enclosures; QList::ConstIterator it = encs.begin(); for ( ; it != encs.end(); ++it) { enclosures.append(Enclosure(*it)); } return enclosures; } QString Item::guid() const { return extractElementTextNS(QString(), QString::fromUtf8("guid") ); } bool Item::guidIsPermaLink() const { bool guidIsPermaLink = true; // true is default QDomElement guidNode = firstElementByTagNameNS(QString(), QString::fromUtf8("guid")); if (!guidNode.isNull()) { if (guidNode.attribute(QString::fromUtf8("isPermaLink")) == QString::fromUtf8("false")) { guidIsPermaLink = false; } } return guidIsPermaLink; } time_t Item::pubDate() const { QString str = extractElementTextNS(QString(), QString::fromUtf8("pubDate")); if (!str.isNull()) { return parseDate(str, RFCDate); } // if there is no pubDate, check for dc:date str = extractElementTextNS(dublinCoreNamespace(), QString::fromUtf8("date")); return 
parseDate(str, ISODate); } - + time_t Item::expirationDate() const { QString str = extractElementTextNS(QString(), QString::fromUtf8("expirationDate")); return parseDate(str, RFCDate); } Source Item::source() const { return Source(firstElementByTagNameNS(QString(), QString::fromUtf8("source"))); } QString Item::rating() const { return extractElementTextNS(QString(), QString::fromUtf8("rating") ); } QString Item::debugInfo() const { QString info; info += "### Item: ###################\n"; if (!title().isNull()) info += "title: #" + title() + "#\n"; if (!link().isNull()) info += "link: #" + link() + "#\n"; if (!description().isNull()) info += "description: #" + description() + "#\n"; if (!content().isNull()) info += "content: #" + content() + "#\n"; if (!author().isNull()) info += "author: #" + author() + "#\n"; if (!comments().isNull()) info += "comments: #" + comments() + "#\n"; QString dpubdate = dateTimeToString(pubDate()); if (!dpubdate.isNull()) info += "pubDate: #" + dpubdate + "#\n"; if (!guid().isNull()) info += "guid: #" + guid() + "#\n"; if (guidIsPermaLink()) info += "guid is PL: #true#\n"; if (!source().isNull()) info += source().debugInfo(); QList cats = categories(); for (QList::ConstIterator it = cats.begin(); it != cats.end(); ++it) info += (*it).debugInfo(); QList encs = enclosures(); for (QList::ConstIterator it = encs.begin(); it != encs.end(); ++it) info += (*it).debugInfo(); info += "### Item end ################\n"; return info; } bool Item::accept(SpecificItemVisitor* visitor) { return visitor->visitRSS2Item(this); } } // namespace RSS2 } // namespace LibSyndication diff --git a/syndication/src/rss2/item.h b/syndication/src/rss2/item.h index 20cfcf4e6..347a5c5b7 100644 --- a/syndication/src/rss2/item.h +++ b/syndication/src/rss2/item.h @@ -1,230 +1,248 @@ /* * This file is part of libsyndication * * Copyright (C) 2005 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU 
Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #ifndef LIBSYNDICATION_RSS2_ITEM_H #define LIBSYNDICATION_RSS2_ITEM_H +#include "document.h" + #include +#include #include #include class QDomDocument; class QDomElement; class QString; template class QList; namespace LibSyndication { class SpecificItemVisitor; namespace RSS2 { class Category; class Enclosure; class Source; /** * An Item, representing an entry in an RSS feed. * * @author Frank Osterfeld */ class KDE_EXPORT Item : public ElementWrapper, public LibSyndication::SpecificItem { public: /** * Default constructor, creates a null object, for which isNull() is * @c true. */ - Item(); + Item(SharedPtr doc=SharedPtr()); /** * Creates an Item object wrapping an @c <item> XML element. * * @param element The @c <item> element to wrap */ - Item(const QDomElement& element); + Item(const QDomElement& element, SharedPtr doc=SharedPtr()); + + Item(const Item& other); + + ~Item(); + + Item& operator=(const Item& other); bool accept(SpecificItemVisitor* visitor); /** * The title of the item. * * @return The title in plain text. Note that e.g. characters like <, * >, & are not escaped! * (TODO: this might change, check what makes more sense) - * This method returns the content of the @c <title> element. If - * @c <title> is not available, the method returns - * @c <dc:title> instead, if available. 
*/ QString title() const; /** * The URL of the item. This usually links to the web representation * of the item, e.g. the full news article. * * @return an URL, or a null string if not set */ QString link() const; /** * The item synopsis. This might contain a short summary of the * item, but also the full content. If content() is set, that usually * contains the full content instead. - * This method returns the content of the @c <description> element. - * If @c <description> is not available, the method returns * @c - * <dc:description> instead, if available. * * @return a string in HTML format (whitespace is irrelevant, * @c <br/> is used for newlines, "&", "<", ">" are escaped) * summarizing the item. QString::null if no description was specified. */ QString description() const; - + /** * Returns the actual content of the item. In RSS2, this can be stored * in various elements, e.g. in content:encoded, xhtml:body or * xhtml:div. If this is not set, description() might also contain the * content of the item. * * @return the content in HTML format (whitespace is irrelevant, * <br/> is used for newlines, "&", "<", ">" are escaped) * If no content is specified, QString::null is returned. */ QString content() const; /** * Set of categories this item is included in. * * @return a list of categories, possibly empty. */ QList categories() const; /** * URL of a page for comments relating to the item. * * @return an URL to the comments, or a null string if not set */ QString comments() const; /** * The email address of the author of this item. For newspapers and * magazines syndicating via RSS, the author is the person who wrote * the article that this item describes. For collaborative weblogs, the * author of the item might be different from the managing editor or * webmaster. * This method returns the content of the @c <author> element. If * @c <author> is not available, the method returns * @c <dc:creator> instead, if available. 
* * @return an email address of the author, or a null string if not * specified */ QString author() const; /** * Descriptions of media objects that are attached to the item. * Note that the RSS2 spec is a bit unclear about whether an item can * have multiple enclosures or not. Originally it was not intended, but * in reality, some tools out there specify multiple enclosures. * So most of the time, this list be either empty or contains a * single item, but don't take that for granted */ QList enclosures() const; /** * "guid stands for globally unique identifier. It's a string that * uniquely identifies the item. When present, an aggregator may choose * to use this string to determine if an item is new. * There are no rules for the syntax of a guid. Aggregators must view * them as a string. It's up to the source of the feed to establish the * uniqueness of the string." * * @return a guid string, or a null string if none specified in the * feed */ QString guid() const; /** * If @c true, it can be assumed that the guid is a permalink to the * item, that is, a url that can be opened in a Web browser, that * points to the full item. * * @return @c true if the guid is a permalink and can be interpreted as * URL */ - bool guidIsPermaLink() const; + bool guidIsPermaLink() const; /** * Indicates when the item was published. If it's a date in the future, * you may choose to not display the item until that date. * This returns the content of the @c <pubDate> element. If @c * <pubDate> is not available, the method returns * @c <dc:date> instead, if available. * * @return the publication date, or 0 if no date was specified or * parsing failed */ time_t pubDate() const; /** * expiration date, specifying a date when the item is not longer * available. * Only available in RSS 0.93. * * @return the expiration date, or 0 if no date was specified or * parsing failed */ time_t expirationDate() const; /** * A Platform for Internet Content Selection (PICS) rating tag. 
* More information on the format of the rating tag can be found here: * http://www.w3.org/PICS/ * * @return PICS rating information, or a null string if not specified */ QString rating() const; /** * The RSS channel that the item came from. See Source class for more * information. * * @return a Source object, or a null object (see Source.isNull()) if * not set. */ Source source() const; /** * Returns a description of the object and its children for debugging * purposes. * * @return debug string */ QString debugInfo() const; + + /** + * @internal + */ + QString originalDescription() const; + + /** + * @internal + */ + QString originalTitle() const; + + private: + + class ItemPrivate; + SharedPtr d; }; } // namespace RSS2 } // namespace LibSyndication #endif // LIBSYNDICATION_RSS2_ITEM_H diff --git a/syndication/src/tools.cpp b/syndication/src/tools.cpp index 784b308b1..fb1198b17 100644 --- a/syndication/src/tools.cpp +++ b/syndication/src/tools.cpp @@ -1,157 +1,216 @@ /* * This file is part of libsyndication * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. 
* */ #include "tools.h" #include #include #include #include #include #include #include namespace LibSyndication { KMD5 md5Machine; unsigned int calcHash(const QString& str) { return calcHash(str.toUtf8()); } unsigned int calcHash(const QByteArray& array) { if (array.isEmpty()) { return 0; } else { const char* s = array.data(); unsigned int hash = 5381; int c; while ( ( c = *s++ ) ) hash = ((hash << 5) + hash) + c; // hash*33 + c return hash; } } time_t parseISODate(const QString& str) { time_t res = KDateTime::fromString(str, KDateTime::ISODate).toTime_t(); return res != -1 ? res : 0; } time_t parseRFCDate(const QString& str) { time_t res = KDateTime::fromString(str, KDateTime::RFCDate).toTime_t(); return res != -1 ? res : 0; } time_t parseDate(const QString& str, DateFormat hint) { if (str.isEmpty()) return 0; if (hint == RFCDate) { time_t t = parseRFCDate(str); return t != 0 ? t : parseISODate(str); } else { time_t t = parseISODate(str); return t != 0 ? t : parseRFCDate(str); } } QString dateTimeToString(time_t date) { if (date == 0) return QString::null; QDateTime dt; dt.setTime_t(date); return dt.toString(); } QString calcMD5Sum(const QString& str) { md5Machine.reset(); md5Machine.update(str.toUtf8()); return QString(md5Machine.hexDigest().data()); } +QString resolveEntities(const QString& str) +{ + return KCharsets::resolveEntities(str); +} + +QString escapeSpecialCharacters(const QString& strp) +{ + QString str(strp); + str.replace("&", "&"); + str.replace("\"", """); + str.replace("<", "<"); + str.replace(">", ">"); + str.replace("\'", "'"); + return str; +} + +QString convertNewlines(const QString& strp) +{ + QString str(strp); + str.replace("\n", "
"); + return str; +} + QString plainTextToHtml(const QString& plainText) { QString str(plainText); str.replace("&", "&"); str.replace("\"", """); str.replace("<", "<"); //str.replace(">", ">"); str.replace("\n", "
"); - return str; + return str.simplified(); } QString htmlToPlainText(const QString& html) { QString str(html); //TODO: preserve some formatting, such as line breaks str.replace(QRegExp("<[^>]*>"), ""); // remove tags - str = KCharsets::resolveEntities(str); + str = resolveEntities(str); str = str.simplified(); - return str; } static QRegExp tagRegExp; static bool tagRegExpSet = false; +bool stringContainsMarkup(const QString& str) +{ + int ltc = str.count('<'); + if (ltc == 0 || ltc != str.count('>')) + return false; + + if (!tagRegExpSet) + { + tagRegExp = QRegExp("<\\w+.*/?>"); + tagRegExpSet = true; + } + return str.contains(tagRegExp); +} + bool isHtml(const QString& str) { - if (str != KCharsets::resolveEntities(str)) - return true; +// if (str != KCharsets::resolveEntities(str)) +// return true; int ltc = str.count('<'); if (ltc == 0 || ltc != str.count('>')) return false; if (!tagRegExpSet) { - tagRegExp = QRegExp("<[a-zA-Z]+.*/?>"); + tagRegExp = QRegExp("<\\w+.*/?>"); tagRegExpSet = true; } if (str.contains(tagRegExp)) return true; return false; } -QString htmlize(const QString& str) +QString normalize(const QString& str) +{ + return isHtml(str) ? str.simplified() : plainTextToHtml(str); +} + +QString normalize(const QString& strp, bool isCDATA, bool containsMarkup) { - return isHtml(str) ? 
str.simplified() : plainTextToHtml(str).simplified(); + if (containsMarkup) + return strp.simplified(); + else + { + if (isCDATA) + { + QString str = resolveEntities(strp); + str = escapeSpecialCharacters(str); + str = convertNewlines(str); + str = str.simplified(); + return str; + } + else + { + QString str = escapeSpecialCharacters(strp); + str = str.simplified(); + return str; + } + } } } // namespace LibSyndication diff --git a/syndication/src/tools.h b/syndication/src/tools.h index 0711cdf9e..aab5cbea0 100644 --- a/syndication/src/tools.h +++ b/syndication/src/tools.h @@ -1,154 +1,210 @@ /* * This file is part of libsyndication * * Copyright (C) 2006 Frank Osterfeld * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * */ #ifndef LIBSYNDICATION_TOOLS_H #define LIBSYNDICATION_TOOLS_H #include #include class QByteArray; class QString; namespace LibSyndication { /** * @internal */ unsigned int calcHash(const QString& str); /** * @internal */ unsigned int calcHash(const QByteArray& array); /** * @internal */ QString calcMD5Sum(const QString& str); /** date formats supported by date parsers */ enum DateFormat { ISODate, /**< ISO 8601 extended format. 
* (date: "2003-12-13",datetime: "2003-12-13T18:30:02.25", * datetime with timezone: "2003-12-13T18:30:02.25+01:00") */ RFCDate /** RFC 822. (e.g. "Sat, 07 Sep 2002 00:00:01 GMT") */ }; /** * parses a date string in ISO 8601 extended format. * (date: "2003-12-13",datetime: "2003-12-13T18:30:02.25", * datetime with timezone: "2003-12-13T18:30:02.25+01:00") * * @param str a string in ISO 8601 format * @return parsed date in seconds since epoch, 0 if no date could * be parsed from the string. */ KDE_EXPORT time_t parseISODate(const QString& str); /** * parses a date string as defined in RFC 822. * (Sat, 07 Sep 2002 00:00:01 GMT) * * @param str a string in RFC 822 format * @return parsed date in seconds since epoch, 0 if no date could * be parsed from the string. */ KDE_EXPORT time_t parseRFCDate(const QString& str); /** * parses a date string in ISO (see parseISODate()) or RFC 822 (see * parseRFCDate()) format. * It tries both parsers and returns the first valid parsing result found (or 0 * otherwise). * To speed up parsing, you can give a hint which format you expect. * The method will try the corresponding parser first then. * * @param str a date string * @param hint the expected format * @return parsed date in seconds since epoch, 0 if no date could * be parsed from the string. */ KDE_EXPORT time_t parseDate(const QString& str, DateFormat hint=RFCDate); /** * @internal * returns a string representation of a datetime. * this is used internally to create debugging output. * * @param date the date to convert * @return string representation of the date, or a null string if * @c date is 0 */ KDE_EXPORT QString dateTimeToString(time_t date); +/** + * resolves entities to respective unicode chars. + * + * @param str a string + */ +KDE_EXPORT +QString resolveEntities(const QString& str); + +/** + * replaces the characters < >, &, ", ' + * with &lt; &gt; &amp;, &quot; &apos;. 
+ * @param str the string to escape + */ +KDE_EXPORT +QString escapeSpecialCharacters(const QString& str); + +/** + * replaces newlines ("\n") by <br/> + * @param str to convert + */ +KDE_EXPORT +QString convertNewlines(const QString& str); + /** * converts a plain text string to HTML * * @param plainText a string in plain text. */ KDE_EXPORT QString plainTextToHtml(const QString& plainText); /** * converts a HTML string to plain text * * @param html string in HTML format * @return stripped text */ KDE_EXPORT QString htmlToPlainText(const QString& html); /** * guesses whether a string contains plain text or HTML * * @param str the string in unknown format * @return @c true if the heuristic thinks it's HTML, @c false * if thinks it is plain text */ KDE_EXPORT bool isHtml(const QString& str); +/** + * guesses whether a string contains (HTML) markup or not. This + * implements not an exact check for valid HTML markup, but a + * simple (and relatively fast) heuristic. + * + * @param str the string that might or might not contain markup + * @return @c true if the heuristic thinks it contains markup, @c false + * if thinks it is markup-free plain text + */ +KDE_EXPORT +bool stringContainsMarkup(const QString& str); + /** * Ensures HTML formatting for a string. * guesses via isHtml() if @c str contains HTML or plain text, and returns * plainTextToHtml(str) if it thinks it is plain text, or the unmodified @c str * otherwise. * * @param str a string with unknown content * @return string as HTML (as long as the heuristics work) */ KDE_EXPORT -QString htmlize(const QString& str); +QString normalize(const QString& str); + +/** + * normalizes a string based on feed-wide properties of tag content. + * It is based on the assumption that all items in a feed encode their + * title/description content in the same way (CDATA or not, plain text + * vs. HTML). isCDATA and containsMarkup are determined once by the feed, + * and then passed to this method. 
+ * + * The returned string contains HTML, with special characters <, >, + * &, ", and ' escaped, and all other entities resolved. + * Whitespace is collapsed, relevant whitespace is replaced by respective + * HTML tags (<br/>). + * + * @param str a string + * @param isCDATA whether the feed uses CDATA for the tag @c str was read from + * @param containsMarkup whether the feed uses HTML markup in the + * tag @c str was read from. + * @return string as HTML (as long as the heuristics work) + */ +KDE_EXPORT +QString normalize(const QString& str, bool isCDATA, bool containsMarkup); } // namespace LibSyndication #endif // LIBSYNDICATION_TOOLS_H diff --git a/syndication/tests/rss2/desc-a.xml.expected b/syndication/tests/rss2/desc-a.xml.expected index 4ab782be4..491006828 100644 --- a/syndication/tests/rss2/desc-a.xml.expected +++ b/syndication/tests/rss2/desc-a.xml.expected @@ -1,15 +1,15 @@ # Feed begin ###################### title: #RSS 2.0 content test A# link: #http://www.example.com/# description: #Feed Description# # Item begin ###################### id: #http://www.example.com/a/1# title: #CDATA in description - plain-text with HTML entities# link: #http://www.example.com/a/1# -description: #This entry contains a description embedded in CDATA, but the content really is plain-text, apart from HTML entities. Filler content — Scandinavian letters follow: æ ø å# +description: #This entry contains a description embedded in CDATA, but the content really is plain-text, apart from HTML entities.

Filler content — Scandinavian letters follow: æ ø å# datePublished: #Sun Mar 14 19:59:12 2004# dateUpdated: #Sun Mar 14 19:59:12 2004# commentsLink: #http://www.bersvendsen.com/arkiv/207.html#comments# # Item end ######################## # Feed end ######################## diff --git a/syndication/tests/rss2/desc-d.xml.expected b/syndication/tests/rss2/desc-d.xml.expected index 27540eddc..85402ec89 100644 --- a/syndication/tests/rss2/desc-d.xml.expected +++ b/syndication/tests/rss2/desc-d.xml.expected @@ -1,14 +1,14 @@ # Feed begin ###################### title: #RSS 2.0 Content test D# link: #http://www.example.com/# description: #Feed Description# # Item begin ###################### id: #http://www.example.com/a/1# title: #Plain text in CDATA# link: #http://www.example.com/a/1# -description: #This entry contains only plain text inside CDATA. No escaped entities of any sort.

< and > here. And national characters too: �,�# +description: #This entry contains only plain text inside CDATA. No escaped entities of any sort.

< and > here. And national characters too: �,�# datePublished: #Sun Mar 14 19:59:12 2004# dateUpdated: #Sun Mar 14 19:59:12 2004# # Item end ######################## # Feed end ######################## diff --git a/syndication/tests/rss2/inhabitat.xml.expected b/syndication/tests/rss2/inhabitat.xml.expected index 5cdc3654f..1617ed5f3 100644 --- a/syndication/tests/rss2/inhabitat.xml.expected +++ b/syndication/tests/rss2/inhabitat.xml.expected @@ -1,25 +1,25 @@ # Feed begin ###################### title: #Inhabitat# link: #http://www.inhabitat.com# description: #the future of design# copyright: #Copyright 2006# language: #eng# # Image begin ##################### url: #http://www.inhabitat.com/images/smalllogo.jpg# title: #INHABITAT# link: #http://www.inhabitat.com# height: #31# width: #88# # Image end ####################### # Item begin ###################### id: #1030@http://www.inhabitat.com/# title: #ELASTIC CO.# link: #http://www.inhabitat.com/entry_1030.php# -description: #Of all of the materials I wouldn't want to wear, rubber is right up at the top. It doesn't breathe, it catches the little hairs on your arms...but creating dancers' costumes out of rubber bands led designer Elodie Blanchard to great inspirations for home decor, which she produces under the name ElasticCo.

The most commercially appealing among her varied designs is the Mesh Collection, a line of tabletop accessories and lighting made with many small, colorful rubber bands. The containers are great for holding perishables that need a little ventilation in order to stay fresh, and the lamps diffuse light beautifully onto surrounding surfaces.

I wouldn't want to dance in a rubberband get-up, but I'd stash my fruit in an ElasticCo. bowl any day.

+ elasticco.com# +description: #Of all of the materials I wouldn't want to wear, rubber is right up at the top. It doesn't breathe, it catches the little hairs on your arms...but creating dancers' costumes out of rubber bands led designer Elodie Blanchard to great inspirations for home decor, which she produces under the name ElasticCo. The most commercially appealing among her varied designs is the Mesh Collection, a line of tabletop accessories and lighting made with many small, colorful rubber bands. The containers are great for holding perishables that need a little ventilation in order to stay fresh, and the lamps diffuse light beautifully onto surrounding surfaces. I wouldn't want to dance in a rubberband get-up, but I'd stash my fruit in an ElasticCo. bowl any day. + elasticco.com# content: #


Of all of the materials I wouldn't want to wear, rubber is right up at the top. It doesn't breathe, it catches the little hairs on your arms...but creating dancers' costumes out of rubber bands led designer Elodie Blanchard to great inspirations for home decor, which she produces under the name ElasticCo.

The most commercially appealing among her varied designs is the Mesh Collection, a line of tabletop accessories and lighting made with many small, colorful rubber bands. The containers are great for holding perishables that need a little ventilation in order to stay fresh, and the lamps diffuse light beautifully onto surrounding surfaces.

I wouldn't want to dance in a rubberband get-up, but I'd stash my fruit in an ElasticCo. bowl any day.

+ elasticco.com




# datePublished: #Sun Jan 15 21:20:00 2006# dateUpdated: #Sun Jan 15 21:20:00 2006# commentsLink: #http://www.inhabitat.com/entry_1030.php#comm# # Item end ######################## # Feed end ######################## diff --git a/syndication/tests/rss2/spreeblick.xml.expected b/syndication/tests/rss2/spreeblick.xml.expected index 599c93da0..a9eecf0da 100644 --- a/syndication/tests/rss2/spreeblick.xml.expected +++ b/syndication/tests/rss2/spreeblick.xml.expected @@ -1,61 +1,61 @@ # Feed begin ###################### title: #Spreeblick# link: #http://www.spreeblick.com# description: #I live by the river!# language: #de# # Image begin ##################### url: #http://spreeblick.com/wp-content/themes/spreeblick_2006/images/spreeblick_logo_small.png# height: #100# width: #100# # Image end ####################### # Item begin ###################### id: #http://www.spreeblick.com/2006/03/18/blog-block/# title: #Blog-Block# link: #http://www.spreeblick.com/2006/03/18/blog-block/# -description: #Ich sitze Block J, Tribüne 2, ganz links oben. Links neben mir Tanja. Es sind noch Plätze frei im „Größten Online-Stadion“! Update: Olé! Olé, Olé, Olé!# +description: #Ich sitze Block J, Tribüne 2, ganz links oben. Links neben mir Tanja. Es sind noch Plätze frei im „Größten Online-Stadion“!

Update:
Olé! Olé, Olé, Olé!# content: #

Ich sitze Block J, Tribüne 2, ganz links oben. Links neben mir Tanja. Es sind noch Plätze frei im „Größten Online-Stadion“!

online stadion

Update:

Olé! Olé, Olé, Olé!

stadion voll

# datePublished: #Sat Mar 18 00:42:26 2006# dateUpdated: #Sat Mar 18 00:42:26 2006# # Person begin #################### name: #Johnny# # Person end ###################### commentsLink: #http://www.spreeblick.com/2006/03/18/blog-block/#comments# # Item end ######################## # Item begin ###################### id: #http://www.spreeblick.com/2006/03/17/die-lustigste-reaktion-zum-thema/# title: #Die lustigste Reaktion zum “Thema”# link: #http://www.spreeblick.com/2006/03/17/die-lustigste-reaktion-zum-thema/# description: #Sie kam bisher von Charles, ihr wisst schon, der es sich hier besonders schwer macht indem er nicht nur keine Videos zeigt sondern auch noch auf ausländisch bloggt:
SELL! SELL!! SELL!!!# content: #

Sie kam bisher von Charles, ihr wisst schon, der es sich hier besonders schwer macht indem er nicht nur keine Videos zeigt sondern auch noch auf ausländisch bloggt:

SELL! SELL!! SELL!!!

# datePublished: #Fri Mar 17 17:47:59 2006# dateUpdated: #Fri Mar 17 17:47:59 2006# # Person begin #################### name: #Johnny# # Person end ###################### commentsLink: #http://www.spreeblick.com/2006/03/17/die-lustigste-reaktion-zum-thema/#comments# # Item end ######################## # Item begin ###################### id: #http://www.spreeblick.com/2006/03/17/toni-mahoni-14-liebeskummer/# title: #Toni Mahoni - 14: Liebeskummer# link: #http://www.spreeblick.com/2006/03/17/toni-mahoni-14-liebeskummer/# description: #Direkt-Link zu YouTube
iPod-Version (M4V, 3:01, 12,1 MB)
Handy-Version (3GP, 3:01, 1,6 MB)# content: #

toni mahoni


Direkt-Link zu YouTube
iPod-Version (M4V, 3:01, 12,1 MB)
Handy-Version (3GP, 3:01, 1,6 MB)

# datePublished: #Fri Mar 17 13:18:00 2006# dateUpdated: #Fri Mar 17 13:18:00 2006# # Person begin #################### name: #Johnny# # Person end ###################### # Enclosure begin ################# url: #http://spreeblick.com/videos/tonimahoni014.m4v# type: #video/mp4# length: #12684514# # Enclosure end ################### # Enclosure begin ################# url: #http://spreeblick.com/videos/tonimahoni014.3gp# type: #video/3gpp# length: #1703572# # Enclosure end ################### commentsLink: #http://www.spreeblick.com/2006/03/17/toni-mahoni-14-liebeskummer/#comments# # Item end ######################## # Feed end ######################## diff --git a/syndication/tests/rss2/sueddeutsche.rss2.xml.expected b/syndication/tests/rss2/sueddeutsche.rss2.xml.expected index 7c1139fd5..938fcb8a1 100644 --- a/syndication/tests/rss2/sueddeutsche.rss2.xml.expected +++ b/syndication/tests/rss2/sueddeutsche.rss2.xml.expected @@ -1,42 +1,42 @@ # Feed begin ###################### title: #sueddeutsche.de# link: #http://www.sueddeutsche.de/# description: #sueddeutsche.de# copyright: #sueddeutsche.de GmbH/Süddeutsche Zeitung GmbH# language: #de# # Image begin ##################### url: #http://www.sueddeutsche.de/img/g_sz_logo_144.gif# title: #sueddeutsche.de# link: #http://www.sueddeutsche.de/# height: #31# width: #88# # Image end ####################### # Item begin ###################### id: #http://www.sueddeutsche.de/kultur/special/218/65153/# title: #Special: Der 5er.Pack# link: #http://www.sueddeutsche.de/kultur/special/218/65153/# description: ## # Category begin ################## term: #Kultur# # Category end #################### # Item end ######################## # Item begin ###################### id: #http://www.sueddeutsche.de/ausland/artikel/985/67918/# -title: #Iran zu Atomstreit: ''Wir haben keine Angst vor dem Klamauk des Westens''# +title: #Iran zu Atomstreit: ''Wir haben keine Angst vor dem Klamauk des Westens''# link: 
#http://www.sueddeutsche.de/ausland/artikel/985/67918/# description: #Ebenso wie die US-Regierung will nun auch Großbritannien den Atomstreit mit Iran schnell vor den UN-Sicherheitsrat bringen. Premier Blair schließt keine Maßnahmen mehr aus, um das Nuklearprogramm von Teheran zu stoppen. Doch Irans Präsident Ahmadi-Nedschad zeigt sich unbeeindruckt.# # Category begin ################## term: #Ausland# # Category end #################### # Item end ######################## # Item begin ###################### id: #http://www.sueddeutsche.de/deutschland/artikel/995/67928/# title: #Geheime Kooperation: BND half Amerikanern im Irak-Krieg# link: #http://www.sueddeutsche.de/deutschland/artikel/995/67928/# description: #Trotz offizieller Ablehnung der Militärschläge durch die Bundesregierung soll der deutsche Geheimdienst die USA beim Ausspähen von Bombenzielen unterstützt haben. Das Kanzleramt war informiert.# # Category begin ################## term: #Deutschland# # Category end #################### # Item end ######################## # Feed end ########################