<?xml version="1.0" encoding="utf-8"?> <element><![CDATA[<not.an.element/>]]></element>
##+##########################################################################
#
# xml.tcl -- Simple XML parser
# by Keith Vetter, March 2004
#
namespace eval ::XML { variable XML "" loc 0}
proc ::XML::Init {xmlData} {
variable XML
variable loc
set XML [string trim $xmlData];
regsub -all {<!--.*?-->} $XML {} XML ;# Remove all comments
set loc 0
}
# Returns {XML|TXT|EOF|PI value attributes START|END|EMPTY}
proc ::XML::NextToken {{peek 0}} {
variable XML
variable loc
set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \
$XML all txt stok tok etok]
if {! $n} {return [list EOF]}
foreach {all0 all1} $all {txt0 txt1} $txt \
{stok0 stok1} $stok {tok0 tok1} $tok {etok0 etok1} $etok break
if {$txt1 >= $txt0} { ;# Got text
set txt [string range $XML $txt0 $txt1]
if {! $peek} {set loc [expr {$txt1 + 1}]}
return [list TXT $txt]
}
set token [string range $XML $tok0 $tok1] ;# Got something in brackets
if {! $peek} {set loc [expr {$all1 + 1}]}
if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} { ;# Is it CDATA stuff?
return [list TXT $txt]
}
# Check for Processing Instruction <?...?>
set type XML
if {[regexp {^\?(.*)\?$} $token => token]} {
set type PI
}
set attr ""
regexp {^(.*?)\s+(.*?)$} $token => token attr
set etype START ;# Entity type
if {$etok0 <= $etok1} {
if {$stok0 <= $stok1} { set token "/$token"} ;# Bad XML
set etype EMPTY
} elseif {$stok0 <= $stok1} {
set etype END
}
return [list $type $token $attr $etype]
}
# ::XML::IsWellFormed
# checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210)
#
# Returns "" if well-formed, error message otherwise
# missing:
# characters: doesn't check valid extended characters
# attributes: doesn't check anything: quotes, equals, unique, etc.
# text stuff: references, entities, parameters, etc.
# doctype internal stuff
#
proc ::XML::IsWellFormed {} {
set result [::XML::_IsWellFormed]
set ::XML::loc 0
return $result
}
;proc ::XML::_IsWellFormed {} {
array set emsg {
XMLDECLFIRST "The XML declaration must come first"
MULTIDOCTYPE "Only one DOCTYPE is allowed"
INVALID "Invalid document structure"
MISMATCH "Ending tag '$val' doesn't match starting tag"
BADELEMENT "Bad element name '$val'"
EOD "Only processing instructions allowed at end of document"
BADNAME "Bad name '$val'"
BADPI "No processing instruction starts with 'xml'"
}
# [1] document ::= prolog element Misc*
# [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
# [27] Misc ::= Comment | PI | S
# [28] doctypedecl ::= <!DOCTYPE...>
# [16] PI ::= <? Name ...?>
set seen 0 ;# 1 xml, 2 pi, 4 doctype
while {1} {
foreach {type val attr etype} [::XML::NextToken] break
if {$type eq "PI"} {
if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
return [subst $emsg(BADNAME)]
}
if {$val eq "xml"} { ;# XMLDecl
if {$seen != 0} { return $emsg(XMLDECLFIRST) }
# TODO: check version number exist and only encoding and
# standalone attributes are allowed
incr seen ;# Mark as seen XMLDecl
continue
}
if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)}
set seen [expr {$seen | 2}] ;# Mark as seen PI
continue
} elseif {$type eq "XML" && $val eq "!DOCTYPE"} { ;# Doctype
if {$seen & 4} { return $emsg(MULTIDOCTYPE) }
set seen [expr {$seen | 4}]
continue
}
break
}
# [39] element ::= EmptyElemTag | STag content ETag
# [40] STag ::= < Name (S Attribute)* S? >
# [42] ETag ::= </ Name S? >
# [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
# [44] EmptyElemTag ::= < Name (S Attribute)* S? />
#
set stack {}
set first 1
while {1} {
if {! $first} { ;# Skip first time in
foreach {type val attr etype} [::XML::NextToken] break
} else {
if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) }
set first 0
}
if {$type eq "EOF"} break
;# TODO: check attributes: quotes, equals and unique
if {$type eq "TXT"} continue
if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
return [subst $emsg(BADNAME)]
}
if {$type eq "PI"} {
if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
continue
}
if {$etype eq "START"} { ;# Starting tag
lappend stack $val
} elseif {$etype eq "END"} { ;# </tag>
if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] }
set stack [lrange $stack 0 end-1]
if {[llength $stack] == 0} break ;# Empty stack
} elseif {$etype eq "EMPTY"} { ;# <tag/>
}
}
# End-of-Document can only contain processing instructions
while {1} {
foreach {type val attr etype} [::XML::NextToken] break
if {$type eq "EOF"} break
if {$type eq "PI"} {
if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
continue
}
return $emsg(EOD)
}
return ""
}
################################################################
#
# Demo code
#
set xml {<?xml version="1.0" encoding="ISO-8859-1"?>
<loc version="1.0" src="Groundspeak">
<waypoint>
<name id="GCGPXK"><![CDATA[Playing Poker with the Squirrels by Rino 'n Rinette]]></name>
<coord lat="40.1548166" lon="-82.5202833"/>
<type>Geocache</type>
<link text="Cache Details">http://www.geocaching.com/seek/cache_details.aspx?wp=GCGPXK</link>
</waypoint><waypoint>
<name id="GC19DF"><![CDATA[Great Playground Caper by Treasure Hunters Inc.]]></name>
<coord lat="40.0667166666667" lon="-82.5358"/>
<type>Geocache</type>
<link text="Cache Details">http://www.geocaching.com/seek/cache_details.aspx?wp=GC19DF</link>
</waypoint>
</loc>
}
::XML::Init $xml
set wellFormed [::XML::IsWellFormed]
if {$wellFormed ne ""} {
puts "The xml is not well-formed: $wellFormed"
} else {
puts "The xml is well-formed"
while {1} {
foreach {type val attr etype} [::XML::NextToken] break
puts "looking at: $type '$val' '$attr' '$etype'"
if {$type == "EOF"} break
}
}Steve Ball: Comment #1: This looks like an xmlTextReader-style interface (ie. looping and reading one token at a time). That's interesting to me because I'm now using libxml2's xmlTextReader interface in TclXML and am considering introducing it at the scripting level.Comment #2: Your code above will work fine as long as the input XML is well-formed. It has absolutely no error checking at all! Error checking is where all the hard work is...KPV True on both accounts. I was working with validated XML so error checking was not important. But what do you expect from 25 lines of code?JE This is a reasonable approach to take. It's OK for a simple parser to omit error checking as long as you only feed it XML that's known to be good; in the context of a complete system, the well-formedness checks and validation can be Somebody Else's Problem. That said, the above code will fail on valid input too.MSW It's tcl, so I expect a fully featured all-in-one device suitable for every purpose (aka `eierlegende Wollmilchsau'' :)
KPV In response to comment #2 above, I added a routine to check for well-formedness. It checks for most of the major well-formed constraints such as properly nesting tags (that's easy), proper prolog, valid names, etc. It doesn't handle: names with extended unicode characters, anything to do with attributes (quotes, unique, character sets, etc.), text stuff like references, entities, parameters, etc., nor doctype internal stuff.Now I remember how much I hate XML. At first blush, XML seems so simple, but once you dig deep there's so many weird gotchas. Checking for well-formedness quadrupled the size of the code and even so it's not complete. I love XML's design goal #6 XML documents should be human-legible and reasonably clear [1].
AM (8 september 2009) Once you have parsed an XML file successfully, you can regard it as another way to specify source code ... Using XML files for source code

