% Tree mytree ::mytreeInsert some nodes
% mytree insert root 0 domNode4 % mytree insert root 0 domNode5 % mytree insert root 0 domNode6 % mytree children root domNode6 domNode5 domNode4 % mytree insert domNode4 0 domNode7 % mytree domDoc domDoc1 % [mytree domDoc] documentElement domNode2Perform an xpath query for all the leaves in the tree
% [[mytree domDoc] documentElement] selectNodes {//*[count(child::*)=0]}
domNode5 domNode6 domNode7} # Partial wrapping of tDOM functionality with tcllib tree interface
package require tdom
package require XOTcl
namespace import xotcl::*
Class Tree
Tree instproc init {{existingDomDoc ""}} {
[self] instvar domDoc
if {[string length $existingDomDoc] == 0} {
set domDoc [dom createDocument node]
} else {
set domDoc $existingDomDoc
}
[self] parametercmd domDoc
next
}
Tree instproc insertNode {parent idx child} {
# tcllib tree indexing starts at zero, but DOM starts at one
incr idx
set sibling [$parent child $idx]
if {[string length $sibling] == 0} {
return [$parent appendChild $child]
} else {
return [$parent insertBefore $child $sibling]
}
}
# tcllib tree allows an optional third argument to specify the
# name of the node. tDOM doesn't allow node names to be specified
# The easy way out--don't allow the third argument
Tree instproc insert {parent idx} {
[self] instvar domDoc
# Create the node
set newNode [$domDoc createElement node]
# Add the node to the tree
[self] insertNode $parent $idx $newNode
}
Tree instproc move {newParent idx nodeToMove args} {
foreach node [concat $nodeToMove $args] {
set oldParent [$node parentNode]
$oldParent removeChild $node
[self] insertNode $newParent $idx $node
incr idx
}
}
Tree instproc keyexists {node -key key} {
$node hasAttribute $key
}
Tree instproc keys {node} {
$node attributes
}
Tree instproc depth {node} {
return [llength [$node ancestor all]]
}
Tree instproc size {node} {
return [llength [$node descendant all]]
}
Tree instproc isleaf {node} {
return [expr [llength [$node descendant all]] == 0]
}
# This seems correct, but isn't giving me what I expect
Tree instproc index {node} {
return [llength [$node psibling all]]
}
# Try this instead
Tree instproc index {node} {
return [lsearch [[self] children [[self] parent $node]] $node]
}
Tree instproc numchildren {node} {
return [llength [$node child all]]
}
Tree instproc set {node args} {
switch [llength $args] {
0 {
$node getAttribute data
}
1 {
$node setAttribute data [lindex $args 0]
$node getAttribute data
}
2 {
$node getAttribute [lindex $args 1]
}
3 {
set switch [lindex $args 0]
set key [lindex $args 1]
set value [lindex $args 2]
$node setAttribute $key $value
$node getAttribute $key
}
default {error "wrong number of arguments"}
}
}
# Dynamically construct the simple command that have only a node argument
foreach {treeCmd domCmd} {children childNodes parent parentNode previous previousSibling next nextSibling delete delete} {
Tree instproc $treeCmd {node} "\$node $domCmd"
}
Tree instproc destroy {args} {
[self] instvar domDoc
$domDoc delete
next
}
# The tcllib interface always names the root node "root"
# tDOM doesn't have a way to specify node names. Therefore,
# install this filter to automatically convert "root" to the
# actual root element
Tree instproc convertRoot {node args} {
[self] instvar domDoc
# Convert input node from name "root"
if {$node == "root"} {
set node [$domDoc documentElement]
}
# Dispatch the method
set retVal [eval next $node $args]
# Convert output node to name "root"
if {$retVal == [$domDoc documentElement]} {
return root
} else {
return $retVal
}
}
Tree instfilter convertRoot
# Only call the filter for methods that have node as a first argument
Tree instfilterguard convertRoot {
([lsearch {init destroy} [self calledproc]] < 0) &&
([lsearch [Tree info instcommands] [self calledproc]] >= 0)
}LV So, what in version 2.0 or newer of struct::tree prevents the above from working - and is there a way to improve the code so that it would work in either version?
male - 2010-03-23 - I had the problem to parse very malformed html, not parsable by tDOM, so I decided to try tcllib's htmlparse package with the current [struct::tree] package. After parsing the html source text into the struct::tree, I convert the tree into a tDOM DOM to use XPaths to extract the needed information.Here the code usable with tcl 8.6 due to the usage of try/on/finally:
package require tdom;
package require htmlparse;
package require struct::tree;
namespace eval html2dom {
proc Attributes {data} {
set attributes [dict create];
foreach {=> name value} [regexp -inline -all -- {(\w+)(?:=\"?([^\"]*))?\"?} $data] {
dict set attributes $name [expr {$value eq "" ? $name : $value}];
}
return $attributes;
}
proc Walk {tree parentNode node dom parentDomNode} {
set type [$tree get $node type];
if {$parentNode eq $node} {
set domNode $parentDomNode;
} else {
if {$type eq "PCDATA"} {
set domNode [$dom createTextNode $type];
$domNode nodeValue [$tree get $node data];
$parentDomNode appendChild $domNode;
return;
}
set domNode [$dom createElement $type];
$parentDomNode appendChild $domNode;
}
if {[$tree keyexists $node data]} {
set data [$tree get $node data];
if {$data ne ""} {
set attributes [Attributes $data];
if {$attributes ne ""} {
$domNode setAttribute {*}$attributes;
}
}
}
foreach childNode [$tree children $node] {
[lindex [info level 0] 0] $tree $node $childNode $dom $domNode;
}
return;
}
proc convert {htmlFileName} {
try {
set htmlData [tDOM::xmlReadFile $htmlFileName];
set tree [struct::tree];
htmlparse::2tree $htmlData $tree;
htmlparse::removeVisualFluff $tree;
htmlparse::removeFormDefs $tree;
set dom [dom createDocument html];
set domDoc [$dom documentElement];
set treeRoot [$tree rootname];
Walk $tree $treeRoot $treeRoot $dom $domDoc;
} on error {reason options} {
if {[info exists tree]} {
$tree destroy;
}
if {[info exists dom]} {
$dom delete;
}
return -code error -options $options $reason;
} finally {
$tree destroy;
}
return $dom
}
namespace export -clear {[a-z]*};
namespace ensemble create;
}
set dom [html2dom convert $htmlFileName];
set domDoc [$dom documentElement];
set nodes [$domDoc selectNodes $xpath];
