Updated 2007-02-11 08:27:42

I needed a simple, easy to work with, asyncronous safe RSS parser. I tried TclRSS [1], but it had heavy requirements, and I never actually figured out the correct combination of dependencies and their versions needed to make it work. I gave up on that and started writing my own RSS parser, using the subset of the TclRSS API that I had already written a program to. I found TAX: A Tiny API for XML (make sure you use the version at the bottom of the page, I just made that into a package and called it "tax 0.1") a small, good-enough Tcl XML parser and wrote an RSS parser from example RSS 2.0 feeds, and not from the specification (I was in a hurry). It works with my limited testing (digg.com, sourceforge, and cnn), but probably doesn't work for anything that is not exactly RSS 2.0 and pretty close to one of my examples.
 #! /usr/bin/env tclsh

 package require tax

 namespace eval rss {
        namespace eval channels {
        }
        namespace eval items {
        }
 }

 proc rss::__replace_entities {text} {
        return [string map [list "&nbsp;" " " "&gt;" ">" "&lt;" "<" "&amp;" "&"] $text]
 }

 proc rss::__strip_html {text} {
        # We replace entities here (i.e., twice) because HTML-inside-XML will have
        # the HTML entities escaped twice.
        return [__replace_entities [regsub -all -- {<[^>]*>} $text ""]]
 }

 proc rss::__tax_add_to_object {obj tag isClose isSelfClosing properties body} {
        upvar #0 $obj rssobj
        set channelid [namespace tail $obj]

        set tag [string tolower [string trim $tag]]
        if {$tag == "docstart"} {
                set rssobj(parent) [list]

                namespace eval ::rss::items::$channelid {}
        }
        if {$tag == "docstart" || $tag == "rss"} {
                return
        }
        if {[string index $tag 0] == "?"} {
                return
        }

        if {$isClose && !$isSelfClosing} {
                if {$tag == "item"} {
                        # We close tag items twice, because we add a fake open with the tag id
                        set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                }
                set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                return
        }

        lappend rssobj(parent) $tag

        set parent [lindex $rssobj(parent) end-1]
        switch -- $tag {
                "item" {
                        if {$parent == "channel"} {
                                set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info procs ::rss::items::${channelid}::*]] 0]]
                                if {$lastusedid == ""} {
                                        set lastusedid 0
                                }
                                set id "::rss::items::${channelid}::[expr $lastusedid + 1]"

                                proc $id [list command [list obj $obj] [list id $id]] {
                                        upvar #0 $obj rssobj
                                        switch -- [string tolower $command] {
                                                "title" {
                                                        set idx [list $id title]
                                                }
                                                "link" {
                                                        set idx [list $id link]
                                                }
                                                "description" {
                                                        set idx [list $id description]
                                                }
                                                "date" {
                                                        set idx [list $id pubdate]
                                                }
                                        }

                                        if {![info exists idx]} {
                                                return ""
                                        }
                                        if {![info exists rssobj($idx)]} {
                                                return ""
                                        }

                                        return $rssobj($idx)
                                }

                                lappend rssobj(items) $id

                                lappend rssobj(parent) $id
                        }
                }
                "title" {
                        set rssobj([list $parent title]) [__strip_html [__replace_entities $body]]
                }
                "link" {
                        set rssobj([list $parent link]) [__replace_entities $body]
                }
                "description" {
                        set rssobj([list $parent description]) [__strip_html [__replace_entities $body]]
                }
                "pubdate" {
                        catch {
                                set body [clock scan $body]
                        }
                        set rssobj([list $parent pubdate]) $body
                }
        }

        if {$isClose} {
                # For self-closing tags
                if {$tag == "item"} {
                        # We close tag items twice, because we add a fake open with the tag id
                        set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                }
                set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
        }

        return
 }

 # Return ID
 proc rss::parse {data} {
        set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info vars ::rss::channels::*]] 0]]
        if {$lastusedid == ""} {
                set lastusedid 0
        }
        set id "::rss::channels::[expr $lastusedid + 1]"

        upvar #0 $id rssobj

        tax::parse [list ::rss::__tax_add_to_object $id] $data

        proc $id [list command [list obj $id]] {
                upvar #0 $obj rssobj
                switch -- $command {
                        "items" {
                                set idx items
                        }
                        "description" {
                                set idx [list channel description]
                        }
                        "link" {
                                set idx [list channel link]
                        }
                }

                if {![info exists idx]} {
                        return ""
                }
                if {![info exists rssobj($idx)]} {
                        return ""
                }
                return $rssobj($idx)
        }

        return $id
 }

 proc rss::cleanup {id} {
        if {[string match "::rss::channels::*" $id]} {
                set channelid [namespace tail $id]
                foreach proc [info procs ::rss::items::${channelid}::*] {
                        rename $proc ""
                }
                unset -nocomplain $id
        }

        return 1
 }

 package provide rss 0.1

You'll notice that it is slightly inconsisent towards the middle.. I should have used a namespace under ::rss for both channels and items, but I didn't realize this until I didn't feel like changing it. Feel free to edit the above, or use it in your own code.

LV Do you have any examples to demonstrate?

Roy Keene Sure, a small example:
 #! /usr/bin/env tclsh

 package require rss
 package require http

 set token [http::geturl "http://www.digg.com/rss/index.xml"]
 set rssdata [http::data $token]
 http::cleanup $token

 set id [rss::parse $rssdata]
 foreach item [$id items] {
 	puts "[clock format [$item date]]: [$item title]: [$item description] ([$item link])"
 }

A more complete example:
 #! /usr/bin/env tclsh

 package require rss
 package require http
 package require Tk

 proc gui_bg_update_news {newsobj} {
 	set rssfeeds [list {http://news.google.com/news?ned=us&topic=h&output=rss} {http://rss.cnn.com/rss/cnn_world.rss}]

 	http::config -useragent {Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.7e}

 	if {![info exists ::rss_newsitems]} {
 		set ::rss_newsitems [list]
 	}
 	foreach url $rssfeeds {
 		catch {
 			http::geturl $url -command gui_bg_update_news_data
 		}
 	}

 	after 30000 [list gui_bg_update_news_text $newsobj]
 }

 proc gui_bg_update_news_text {newsobj} {

 	if {[llength $::rss_newsitems] != 0} {
 		$newsobj delete 0 end
 		unset -nocomplain ::rss_newsitems_urls
 		foreach item [lsort -dictionary -index 0 $::rss_newsitems] {
 			set date [lindex $item 0]
 			set title [lindex $item 1]
 			set desc [lindex $item 2]
 			set link [lindex $item 3]

 			$newsobj insert end "$title"
 			set ::rss_newsitems_urls($title) $link
 		}
 		set ::rss_newsitems [list]
 	}

 	after 120000 [list gui_bg_update_news $newsobj]
 }

 proc gui_bg_update_news_data {token} {
 	if {[http::ncode $token] != "200"} {
 		if {$::DEBUG} {
 			puts "Error opening url: [http::ncode $token]"
 		}
 		http::cleanup $token
 		return
 	}
 	set rssdata [http::data $token]

 	if {[catch {
 		set id [rss::parse $rssdata]
 		set newsitems [list]
 		foreach item [$id items] {
 			set newitem [list [$item date] [$item title] [$item description] [$item link]]
 			if {[lsearch $::rss_newsitems $newitem] == -1} {
 				lappend ::rss_newsitems $newitem
 			}
 		}
 	} err]} {
 		if {[info exists $::DEBUG]} {
 			puts "Error in RSS feed update: $err"
 			puts "$::errorInfo"
 		}
 	}

 	http::cleanup $token
 	if {[info exists id]} {
 		rss::cleanup $id
 	}
 }

 proc load_rss_url {newsobj x y} {
 	set idx [$newsobj nearest $y]
 	set idxbbox [$newsobj bbox $idx]

 	set idx_starty [expr [lindex $idxbbox 1]]
 	set idx_endy [expr $idx_starty + [lindex $idxbbox 3]]

 	if {$y < ($idx_starty - 4)} {
 		return
 	}
 	if {$y > ($idx_endy + 4)} {
 		return
 	}

 	set title [$newsobj get $idx]

 	if {![info exists ::rss_newsitems_urls($title)]} {
 		return
 	}

 	set url $::rss_newsitems_urls([$newsobj get $idx])

 	puts "Loading URL: $url"
 	# XXX: TODO, Figure out how to actually load a URL across platforms
 }

 listbox .newsInfo -width 80
 button .exit -text "Exit" -command exit

 pack .newsInfo -expand 1 -fill both
 pack .exit

 bind .newsInfo <Double-1> [list load_rss_url .newsInfo %x %y]
 after 1000 [list gui_bg_update_news .newsInfo]

Category Internet