Updated 2011-05-27 12:08:09 by RLE

While DG was making an RSS reader/formatter for tclhttpd for dynamic page generation of news feeds, I bumped into some nasty problems with encodings and thought I'd share.

tDOM's [dom parse] command expects the xml string to have already been properly decoded and be in its "pure" state. The http package does encoding translations but only when the HTTP header for Content-Type contains a charset declaration. To make matters a bit more tricky, if the XML string contains an encoding value in the XML declaration, the XML parser will do a needless second conversion. Here's a working way to grab and parse XML with the http package:
 # Lie like a senator for google to stop giving me a 403..
 http::config -useragent {Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8) Gecko/20051111 Firefox/1.5}
 
 # Should the server uses this info, give it what we want to receive.
 http::config -accept "text/xml,application/xml,application/rss+xml,application/rdf+xml,application/atom+xml"
 
 # Returns the DOM object of the RSS feed.
 proc tmlrss::fetchXML {uri {recurse_limit 4}} {
 
    set token [http::geturl $uri]
    upvar #0 $token state
    if {[http::status $token] != "ok" || [http::ncode $token] != 200} {
        # was the error a redirect?  If so, do it..
        if {[http::ncode $token] == 302 && [incr recurse_limit -1] > 0} {
            array set meta $state(meta)
            set result [fetchXML $meta(Location) $recurse_limit]
            http::cleanup $token
            return $result
        }
        set err [http::code $token]
        http::cleanup $token
        return -code error $err
    }
    set xml [http::data $token]
    array set meta $state(meta)
    http::cleanup $token

    # Do we need to do encoding conversions or was it already done
    # in transit?
 
    if {[info exist meta(Content-Type)] && \
            [regexp -nocase {charset\s*=\s*(\S+)} $meta(Content-Type)]} {
 
        # Socket channel encodings already performed!  No Work to do
        # here.  See section 5.2.2 of the html spec.  Server set
        # encodings win.
 
    } else {
 
        # Read and perform charset mappings of the document rather than
        # hand off the literal string to [dom parse] as Tcl itself is
        # more encoding capable.
 
        set xml [encoding convertfrom [getXmlEncoding $xml] $xml]
    }
 
    # Strip the XML declaration, should it exist.  The encoding
    # conversions have already been performed.  Let's not let Expat
    # try to convert it a second time as the work has already been
    # performed.  As the parser assumes utf-8 by default without a
    # declaration, and the fact that tdom is handing expat the utf-8
    # string rep, this will work even though it doesn't look correct.
 
    return [dom parse -baseurl [uriBase $uri] [stripXmlDecl $xml]]
 }
 
 proc tmlrss::uriBase {uri} {
    array set info [uri::split $uri]
    set info(path) [file dirname $info(path)]
    return [eval uri::join [array get info]]
 }
 
 proc tmlrss::stripXmlDecl {xml} {
    if {![binary scan [string range $xml 0 3] "H8" firstBytes]} {
        # very short (< 4 Bytes) string
        return $xml
    }
 
    # If the entity has an XML Declaration, the first four characters
    # must be "<?xm".
    switch $firstBytes {
        "3c3f786d" {
            # Try to find the end of the XML Declaration
            set closeIndex [string first ">" $xml]
            if {$closeIndex == -1} {
                error "Weird XML data or not XML data at all"
            }
            set xml [string range $xml [expr {$closeIndex+1}] end]
        }
        default {
            # no declaration.
        }
    }
    return $xml
 }
 
 proc tmlrss::getXmlEncoding {xml} {
 
    # The autodetection of the encoding follows
    # XML Recomendation, Appendix F

    if {![binary scan $xml "H8" firstBytes]} {
        # very short (< 4 Bytes) file
         return iso8859-1
    }
    
    # If the entity has a XML Declaration, the first four characters
    # must be "<?xm".
    switch $firstBytes {
        "3c3f786d" {
            # UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS,
            # EUC, or any other 7-bit, 8-bit, or mixed-width encoding which 
            # ensures that the characters of ASCII have their normal positions,
            # width and values; the actual encoding declaration must be read to
            # detect which of these applies, but since all of these encodings
            # use the same bit patterns for the ASCII characters, the encoding
            # declaration itself can be read reliably.

            # Try to find the end of the XML Declaration
            set closeIndex [string first ">" $xml]
            if {$closeIndex == -1} {
                error "Weird XML data or not XML data at all"
            }
 
            set xmlDeclaration [string range $xml 0 [expr {$closeIndex}]]
            # extract the encoding information
            set pattern {^[^>]+encoding=[\x20\x9\xd\xa]*["']([^ "']+)['"]}
            # emacs: "
            if {![regexp $pattern $xml - encStr]} {
                # Probably something like <?xml version="1.0"?>. 
                # Without encoding declaration, pass-thru
                return iso8859-1
            } else {
                set encoding [tDOM::IANAEncoding2TclEncoding $encStr]
            }
        }
        default {
            # pass-thru
            set encoding iso8859-1
        }
    }
    return $encoding
 }

HaO What happens, if the server sends an XML-Document in unicode starting with a BOM (byte order mark) ? Or is this not relevant in this context ?