- C:\_Ricci\sep>tclsh rss2html.tcl > t.html http://news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml

Again: no warranties at all, but I'm basically happy with the output of this script on my few test cases (Spiegel online, Tagesspiegel, BBC...) - feel free to add criticisms or improvements :) A tiny bash script called feedme reaps all the feeds that I want, on demand, and ActiveSync takes care of transferring the HTML pages to the little thing:

 tclsh x:/tcl/rss2html.tcl > Spiegel.htm www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml
 tclsh x:/tcl/rss2html.tcl > Tagesspiegel.htm www.tagesspiegel.de/feed/index.xml
 tclsh x:/tcl/rss2html.tcl > BBC.htm news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml

#-- usage message; main prints it to stderr when it is not given exactly
#-- one argument
set usage {
usage: rss2html.tcl rss_url > htmlfile
}
package require http
package require uri
# main -- entry point: turn one RSS feed into a single HTML page on stdout.
#
# argv  list that must hold exactly one element, the URL of the RSS feed.
#
# The page consists of a header, a table of contents (one linked title plus
# description per feed entry) and then, per entry, the text that readHTML
# extracts from the linked page. With any other argument count the usage
# message is written to stderr and the process exits with status 1.
proc main argv {
    if {[llength $argv] != 1} {
        puts stderr $::usage
        exit 1 ;# non-zero so that calling scripts can detect the misuse
    }
    set rss [lindex $argv 0]
    # readRSS yields a flat list of title / link / description triples
    set content [readRSS $rss]
    puts "<html><head/><body>From: $rss<br>"
    puts "Updated: [clock format [clock seconds] -format {%Y-%m-%d, %H:%M:%S}]<hr>"
    #-- pass 1: table of contents ("-" is a throw-away variable for the link)
    set n 0
    foreach {title - descr} $content {
        incr n
        puts "<br><a href=#$n>$title</a> $descr"
    }
    #-- pass 2: the pages themselves, anchored by the same running number
    set n 0
    foreach {title url -} $content {
        incr n
        puts "<hr><a name=$n><h4>$title</h4></a>"
        puts [readHTML $title $url]
    }
    puts "<hr>Reaped by rss2html :)</body></html>"
}
# readRSS -- download an RSS feed and return its entries.
#
# url  address of the feed.
#
# Returns a flat list of title / link / description triples, one triple for
# every </item> tag found in the feed text.
proc readRSS url {
    set token [geturl_followRedirects $url]
    upvar #0 $token state
    if {[info exists state(body)]} {
        set body $state(body)
    } else {
        set body "<html>not found :(</html>"
    }
    # The body has been copied, so the transfer state can be released now.
    http::cleanup $token

    # Start with empty fields: an entry closed before any <title>, <link> or
    # <description> was seen must not fail with "no such variable".
    set title ""
    set link ""
    set descr ""
    set res {}
    foreach {tag content} [html2txt $body] {
        switch -- $tag {
            <description> {set descr $content}
            <title> {set title $content}
            <link> {set link $content}
            </item> {
                lappend res $title $link $descr
                # only the description is reset; title and link keep their
                # last value until the next entry overwrites them
                set descr ""
            }
        }
    }
    return $res
}
# readHTML -- fetch a linked page and reduce it to simple HTML text.
#
# title  title of the feed entry; text fragments contained in it are skipped
#        so the heading is not repeated in the body
# url    address of the page
#
# Returns a string of text fragments, one per line, some prefixed with <p>.
proc readHTML {title url} {
    # If the link contains "*http://...", continue with the part after the
    # asterisk (presumably a redirector-style link -- TODO confirm).
    regexp {[*](http://.+)} $url -> url

    set token [geturl_followRedirects $url]
    upvar #0 $token state
    set body ""
    if {[info exists state(body)]} {set body $state(body)}
    # The body has been copied, so the transfer state can be released now.
    http::cleanup $token

    set res {}
    foreach {tag content} [html2txt $body] {
        set content [string trim [despace $content]]
        # short fragments are dropped
        if {[string length $content] < 20} continue
        # Literal substring test: page text must not be used as a glob
        # pattern, since it may contain ?, *, [ or \.
        if {[string first $content $title] >= 0} continue
        # fragments that look like script code or contain URLs are dropped
        if {[regexp {userAgent|navigator.platform|http|\(\)} $content]} continue
        switch -glob -- [string tolower $tag] {
            <br* - <div* - </div> - <p> - </p> - </script> -
            <li> - </li> - </ul> {append res <p>$content\n}
            <i> - <b> - </a> - </b> - <!--* - </em> {append res $content\n}
            default {#append res "\n<!-- [string trim $tag <>] - $content -->\n"}
        }
    }
    return $res
}
# html2txt -- split markup into tag / text pairs.
#
# html  HTML or XML text
#
# Returns a flat list {tag text tag text ...}: every tag is paired with the
# text that follows it up to the next "<" or ">" (spaces directly after the
# tag are dropped). Pairs whose text contains "src=" are left out. A few
# character entities are replaced by the characters they stand for.
proc html2txt {html} {
    # NOTE(review): the replacement table arrived here garbled and with an odd
    # number of elements, which makes [string map] fail on every call. It has
    # been rebuilt as an entity -> character table for the same characters
    # (U-umlaut, sharp s, a/o/u-umlaut, the two German quotation marks and the
    # double quote) -- confirm against the intended original.
    set charmap [list \
        "&Uuml;"  \u00DC  "&szlig;" \u00DF  "&auml;"  \u00E4 \
        "&ouml;"  \u00F6  "&uuml;"  \u00FC \
        "&bdquo;" '       "&ldquo;" ' \
        "&#220;"  \u00DC  "&#223;"  \u00DF  "&#228;"  \u00E4 \
        "&#246;"  \u00F6  "&#252;"  \u00FC \
        "&#8222;" '       "&#8220;" ' \
        "&quot;"  \" ]
    set res {}
    set re {(<[^>]+>) *([^<>]*)}
    foreach {all tag content} [regexp -all -inline $re $html] {
        if {![regexp src= $content]} {
            # Map each element on its own: mapping the string form of the
            # whole list could insert quotes that break its list structure.
            lappend res [string map $charmap $tag] [string map $charmap $content]
        }
    }
    return $res
}
# despace -- normalise whitespace.
#
# string  text to clean up
#
# Returns the text with every run of whitespace replaced by one blank and
# with leading and trailing whitespace removed.
proc despace string {
    regsub -all {\s+} $string " " collapsed
    return [string trim $collapsed]
}
#-- courtesy KPV's http://wiki.tcl.tk/11831
# geturl_followRedirects -- http::geturl that follows HTTP redirects.
#
# url   address to fetch
# args  further options, handed unchanged to http::geturl
#
# Returns the http token of the first response that is not a 301/302/303/307
# redirect with a Location header, or of the redirect response reached once
# 10 redirects have been followed. Tokens of the redirects that were followed
# are cleaned up here; the returned token remains for the caller.
proc geturl_followRedirects {url args} {
    array set URI [::uri::split $url] ;# Need host info from here
    set hopsLeft 10 ;# guards against servers that redirect in a circle
    while {1} {
        set token [eval [list http::geturl $url] $args]
        if {![string match {30[1237]} [::http::ncode $token]]} {return $token}
        array unset meta
        array set meta [set ${token}(meta)]
        if {![info exists meta(Location)] || [incr hopsLeft -1] < 0} {
            return $token
        }
        # drop the fields of the previous hop before splitting the new target
        array unset uri
        array set uri [::uri::split $meta(Location)]
        # this redirect response is no longer needed
        http::cleanup $token
        # a Location without host is completed with the host of the
        # address this procedure was called with
        if {$uri(host) eq ""} { set uri(host) $URI(host) }
        # problem w/ relative versus absolute paths
        set url [eval ::uri::join [array get uri]]
    }
}
main $argv

[metoto] - 2009-12-30 11:48:11

Hi, it's good that you know how to write the bash script for rss2html. Can you write a similar script for a web page, so that the URLs are printed live from any RSS feed? Thanks

