Summary edit
Information about trie structuresReference edit
- trie
- nist.gov
- The trie Data Structure
- by Yehuda Shiran
- Tries and Suffix Trees
- Winter 1997 Class Notes for 308-251, McGill University
See Also edit
Description edit
From Wikipedia, the free encyclopediaIn computer science, a trie
NaviServer uses a trie for url dispatching:
NEM 2008-06-09: Here's a very simplistic trie implementation based on straight-forward use of nested dicts (typically a trie in C or Java would instead using a fixed-size array and indexing directly based on character (e.g. restricting words to contain only characters a-z)):
# trie.tcl -- # # Simple implementation of tries in Tcl. # package require Tcl 8.5 package provide trie 0.3 namespace eval ::trie { namespace export {[a-z]*} namespace ensemble create # create an empty trie proc create {} { dict create } # add a word to a trie contained in trieVar proc add {trieVar word} { upvar 1 $trieVar trie dict set trie {*}[split $word ""] END {} } # check if a given word is contained in a trie proc contains {trie word} { dict exists $trie {*}[split $word ""] END } # get the sub-trie of all words corresponding to a given prefix proc get {trie {prefix ""}} { if {$prefix eq ""} { return $trie } if {![dict exists $trie {*}[split $prefix ""]]} { return {} } dict get $trie {*}[split $prefix ""] } # iterate through all words in a trie calling a callback for each one. The # callback will be called with the string of each word. proc words {trie cmd {prefix ""}} { set tries [list [get $trie $prefix] $prefix] set i 0 while {[llength $tries] > $i} { set trie [lindex $tries $i] set prefix [lindex $tries [incr i]] # set tries [lassign $tries trie prefix] ;# VERY slow! if {[dict exists $trie END]} { uplevel 1 [linsert $cmd end $prefix] } dict for {k v} $trie { lappend tries $v $prefix$k } incr i } } # remove a word from a trie proc remove {trieVar word} { upvar 1 $trieVar trie if {![contains $trie $word]} { return } dict unset trie {*}[split $word ""] END # Could/should compact the trie at this point if no other words with # this word as a prefix. } # count the number of words in the trie proc size {trie {prefix ""}} { set count 0 words $trie count $prefix return $count } # private helpers proc count {args} { upvar 1 count var incr var } }And a quick test/demo:
proc test {} { set t [trie create] foreach word {howdy hello who where what when why how} { trie add t $word } puts "t = $t" puts "words:" trie words $t puts puts "all wh- words:" trie words $t puts "wh" trie remove t how puts "now:" trie words $t {lappend words} puts [join $words ", "] } # A bigger test -- read all words in a text into the trie proc read-trie file { set t [trie create] set in [open $file r] while {[gets $in line] >= 0} { foreach word [regexp -all -inline {[a-zA-Z]+} $line] { trie add t $word } } return $t } set t [read-trie ~/Desktop/ulyss12.txt] ;# James Joyce's Ulysses puts "size = [trie size $t]" dict for {k v} $trie { puts "$k = [trie size $v]" } puts "Words beginning with 'the':" trie words $t puts "the"Interestingly while testing this I noticed that it was taking a huge amount of time to calculate the number of distinct words in the trie (over a minute for just ~37000 words). Profiling revealed that the following idiom was to blame:
set xs [lassign $xs x y]which is used to pop elements off the front of a queue. Lassign seems to be quite pathologically slow in this case... Using just an index offset instead reduced the runtime to around <1 second.DKF: The issue is that the pop currently requires allocating a new array and copying all the elements over. Optimizing that away is really quite tricky indeed since it involves crossing abstraction levels in the compiler, but scripted K-like tricks with lreplace might get you some of the way.CMcC: contributes the following implementation of a Trie object.
# Trie data structure package provide Trie 1.0 if {[catch {package require Debug}]} { proc Debug.trie {args} { #puts stderr [uplevel subst $args] } } oo::class create ::Trie { variable trie id # search for longest prefix, return matching prefix, element and suffix method matches {t what} { set matches {} set wlen [string length $what] foreach k [lsort -decreasing -dictionary [dict keys $t]] { set klen [string length $k] set match "" for {set i 0} {$i < $klen && $i < $wlen && [string index $k $i] eq [string index $what $i] } {incr i} { append match [string index $k $i] } if {$match ne ""} { lappend matches $match $k } } Debug.trie {matches: $what -> $matches} if {[dict size $matches]} { # find the longest matching prefix set match [lindex [lsort -dictionary [dict keys $matches]] end] set mel [dict get $matches $match] set suffix [string range $what [string length $match] end] return [list $match $mel $suffix] } else { return {} ;# no matches } } # return next unique id if there's no proffered value method id {value} { if {$value} { return $value } else { return [incr id] } } # insert an element with a given optional value into trie # along path given by $args (no need to specify) method insert {what {value 0} args} { if {[llength $args]} { set t [dict get $trie {*}$args] } else { set t $trie } if {[dict exists $t $what]} { Debug.trie {$what is an exact match on path ($args $what)} if {[catch {dict size [dict get $trie {*}$args $what]} size]} { # the match is a leaf - we're done } else { # the match is a dict - we have to add a null dict set trie {*}$args $what "" [my id $value] } return ;# exact match - no change } # search for longest prefix set match [my matches $t $what] if {![llength $match]} { ;# no matching prefix - new element Debug.trie {no matching prefix of '$what' in $t - add it on path ($args $what)} dict set trie {*}$args $what [my id $value] return } lassign $match match mel suffix ;# prefix, element of match, suffix if {$match ne $mel} { # the matching element shares a prefix, but has a variant suffix # it must be split Debug.trie {splitting '$mel' along '$match'} set melC [dict get $t $mel] dict unset trie {*}$args $mel dict set trie {*}$args $match [string range $mel [string length $match] end] $melC } if {[catch {dict size [dict get $trie {*}$args $match]} size]} { # the match is a leaf - must be split if {$match eq $mel} { # the matching element shares a prefix, but has a variant suffix # it must be split Debug.trie {splitting '$mel' along '$match'} set melC [dict get $t $mel] dict unset trie {*}$args $mel dict set trie {*}$args $match "" $melC } Debug.trie {'$mel' is the longest prefix '$match' but was a leaf - insert '$suffix'} set melid [dict get $t $mel] dict set trie {*}$args $match $suffix [my id $value] } else { # it's a dict - keep searching Debug.trie {'$mel' is the longest prefix '$match' and is a dict - search for '$suffix' on path ($args $match)} my insert $suffix $value {*}$args $match } } # find a path matching an element $what # if the element's not found, return the nearest path method find_path {what args} { if {[llength $args]} { set t [dict get $trie {*}$args] } else { set t $trie } if {[dict exists $t $what]} { Debug.trie {$what is an exact match on path ($args $what)} return [list {*}$args $what] ;# exact match - no change } # search for longest prefix set match [my matches $t $what] if {![llength $match]} { return $args } lassign $match match mel suffix ;# prefix, element of match, suffix if {$match ne $mel} { # the matching element shares a prefix, but has a variant suffix # no match return $args } if {[catch {dict size [dict get $trie {*}$args $match]} size] || $size == 0} { # got to a non-matching leaf - no match return $args } else { # it's a dict - keep searching Debug.trie {'$mel' is the longest prefix '$match' and is a dict - search for '$suffix' on path ($args $match)} return [my find_path $suffix {*}$args $match] } } # given a trie, which may have been modified by deletion, # optimize it by removing empty nodes and coalescing singleton nodes method optimize {args} { if {[llength $args]} { set t [dict get $trie {*}$args] } else { set t $trie } if {[catch {dict size $t} size]} { Debug.trie {optimize leaf '$t' along '$args'} # leaf - leave it } else { switch -- $size { 0 { Debug.trie {optimize empty dict ($t) along '$args'} if {[llength $args]} { dict unset trie {*}$args } } 1 { Debug.trie {optimize singleton dict ($t) along '$args'} lassign $t k v if {[llength $args]} { dict unset trie {*}$args } append args $k if {[llength $v]} { dict set trie {*}$args $v } my optimize {*}$args } default { Debug.trie {optimize dict ($t) along '$args'} dict for {k v} $t { my optimize {*}$args $k } } } } } # delete element $what from trie method delete {what} { set path [my find_path $what] if {[join $path ""] eq $what} { Debug.trie {del '$what' along ($path) was [dict get $trie {*}$path]} if {[catch {dict size [dict get $trie {*}$path]} size]} { # got to a matching leaf - delete it dict unset trie {*}$path set path [lrange $path 0 end-1] } else { dict unset trie {*}$path "" } my optimize ;# remove empty and singleton elements } else { # nothing to delete, guess we're done } } # find the value of element $what in trie, # error if not found method find {what} { set path [my find_path $what] if {[join $path ""] eq $what} { if {[catch {dict size [dict get $trie {*}$path]} size]} { # got to a matching leaf - done return [dict get $trie {*}$path] } else { return [dict get $trie {*}$path ""] } } else { error "'$what' not found" } } # dump the trie as a string method dump {} { return $trie } # return a string rep of the trie sorted in dict order method order {{t {}}} { if {![llength $t]} { set t $trie } elseif {[llength $t] == 1} { return $t } set acc {} foreach key [lsort -dictionary [dict keys $t]] { lappend acc $key [my order [dict get $t $key]] } return $acc } # return the trie as a dict of names with values method flatten {{t {}} {prefix ""}} { if {![llength $t]} { set t $trie } elseif {[llength $t] == 1} { return [list $prefix $t] } set acc {} foreach key [dict keys $t] { lappend acc {*}[my flatten [dict get $t $key] $prefix$key] } return $acc } # overwrite the trie method set {t} { set trie $t } constructor {args} { set trie {} set id 0 foreach a $args { my insert $a } } } if {[info script] eq $argv0} { set data { rubber romane eunt domus romanus romulus rubens ruber rube rubicon rubicundus roman an antidote anecdotal ant all alloy allotrope allot aloe are ate be cataract catatonic catenary } ::Trie create example {*}$data puts "TRIE: [example dump]" puts "OTRIE: [example order]" example set [example order] puts "FLAT: [example flatten]" foreach d $data { puts "$d -> '[example find_path $d]' -> [example find $d]" } foreach d $data { example delete $d puts "DEL '$d': [example dump]" } puts "TRIE: [example dump]" }