Updated 2011-07-07 08:36:36 by dkf

Adaptive huffman coding has the advantage over static coding that the entire dataset does not need to be known in advance and the dictionary does not need to be transmitted separately from the data. It is also trickier to implement.

It would be wrong to call this implementation adaptive huffman coding because well, it isn't. It is an adaptive encoding, but my tree balancing routines are buggy and I've already spent enough time on this. To make it a true huffman encoding, just fix balance-FGK or balance-V.

The performance is hideously slow — around 200 characters per second on my machine. I think most of this is due to all the tree manipulation.

I intended to mate this code with the code in LZW, but the combination works pretty badly, probably because of my lack of a good idea for encoding the higher-ordered symbols effeciently.

JR
# this uses the tree data structure from tcllib
package require struct

namespace eval ::huffman {
    # reverse a string
    # This is used to store partial words in big-endian order.  'binary
    # format' will only store bits starting at the msb or lsb of a word, so we
    # do the formatting little-endian and reverse it
    proc reverse {str} {
        set rv {}
        foreach c [split $str ""] {
            set rv [linsert $rv 0 $c]
        }
        return [join $rv ""]
    }

    # some tree helper functions

    # create a tree and set up symbol->node and node->symbol arrays
    proc mktree {} {
        variable trees
        variable symbols
        set tree [::struct::tree]
        set trees(e0) root
        set symbols(root) e0
        $tree set root 0
        return $tree
    }

    # delete the tree and arrays
    proc deltree {tree} {
        variable trees
        variable symbols
        $tree destroy
        unset trees
        unset symbols
    }

    # given a node, find the bit path to that node from the root
    proc getpath {tree node} {
        set p {}
        while {$node != "root"} {
            append p [$tree index $node]
            set node [$tree parent $node]
        }
        return [reverse $p]
    }

    # 'tree swap' doesn't move child trees, so we move the two nodes
    # separately using this helper
    proc swap {tree node1 node2} {
        set np [$tree parent $node1]
        set ni [$tree index $node1]
        $tree move [$tree parent $node2] [$tree index $node2] $node1
        $tree move $np $ni $node2
    }

    # dump out the tree
    # for debugging
    proc dump-tree {tree} {
        variable symbols
        $tree walk root -type bfs -order pre -command {
            if {[info exists symbols(%n)]} {set sym $symbols(%n)} else {set sym "-"}
            puts "[string repeat "  " [%t depth %n]]%n($sym [%t get %n]): [%t children %n]"
        }
    }

    # increment weight of node, rebalance tree
    proc update-tree {tree node}  {
        balance-FGK $tree $node
    }

    # update the node weights and rebalance the tree using algorithm FGK.
    # not quite right, doesn't always find the correct sibling.
    proc balance-FGK {tree node}  {
        set cn $node
        while {$cn != "root"} {
            set next [$tree parent $cn]
            $tree set $cn [expr {[$tree get $cn]+1}]
            set sib [$tree next $cn]
            if {$sib == ""} {
                # if we're at the end of a row then the next sibling is the
                # start of the previous row.
                # this works if the tree is only 2 deep, otherwise it's really
                # wrong
                set sib [$tree previous $next]
            }
            if {$sib != "" && [$tree get $cn] > [$tree get $sib]} {
                swap $tree $cn $sib
            }
            set cn $next
        }
        # update root node
        $tree set $cn [expr {[$tree get $cn]+1}]
    }

    # update the node weights and rebalance the tree using algorithm V.
    # pretty much completely wrong
    proc balance-V {tree node} {
        # update the weights on the path to the added node
        set cn $node
        while {$cn != ""} {
            $tree set $cn [expr {[$tree get $cn]+1}]
            set cn [$tree parent $cn]
        }

        set pn {}
        # this is an attempt to walk the tree in 'implicit' order.
        # it doesn't work.
        $tree walk root -order post -type bfs -command {
            # if weight of current node is less than weight of previous node,
            # then swap
            if {$pn != "" && [%t get %n] < [%t get $pn]} {
                swap %t %n $pn
            }
            set pn %n
        }
    }

    # encode symbol (char/integer) as bitstring
    # handle int range 0-2^11
    # the range is beyond 0-255 because this is intended to handle a LZW
    # dictionary rather than just characters.
    # there's got to be a better way.
    # symbol is encoded as a length bit (0 = 8 bits, 1 = 11 bits) followed by
    # the data in big-endian order
    proc encsym {sym} {
        set ebs "" ;# encoded bit-string
        if {$sym < 256} {
            append ebs 0
            set si [binary format c $sym]
            binary scan $si B8 sv
            append ebs $sv
        } else {
            append ebs 1
            set si [binary format s $sym]
            # format in little-endian then reverse
            binary scan $si b11 sv
            append ebs [reverse $sv]
        }
        return $ebs
    }

    # decode the next symbol in the bitstring starting at index 'ind' using
    # the given tree
    # updates the tree
    # returns the symbol and the new index
    proc getsym {tree bits ind} {
        variable symbols
        set n root
        set bl [llength $bits]
        while {![$tree isleaf $n] && $ind < $bl} {
            set n [lindex [$tree children $n] [lindex $bits $ind]]
            incr ind
        }
        if {$symbols($n) == "e0"} {
            # new symbol
            set lb [lindex $bits $ind] ;# length bit
            incr ind
            if {$lb == "0"} {
                set sb [join [lrange $bits $ind [expr {$ind+7}]] ""]
                incr ind 8
                binary scan [binary format B8 $sb] c sym
                # mask to keep result unsigned
                set sym [expr {$sym & 0xff}]
            } else {
                set sb [join [lrange $bits $ind [expr {$ind+10}]] ""]
                incr ind 11
                # reverse the bitstring and decode as little-endian
                binary scan [binary format b11 [reverse $sb]] s sym
                # mask to keep result unsigned
                set sym [expr {$sym & 0x7ff}]
            }
            set symbols($n) $sym
        } else {
            set sym $symbols($n)
        }
        # add the symbol to the tree and/or update the tree
        addsym $tree $sym
        return [list $sym $ind]
    }

    # insert the symbol into the tree if it doesn't exist
    # update the tree and the mapping arrays
    # returns code for symbol
    proc addsym {tree sym} {
        variable trees
        variable symbols
        set rs {}
        if {![info exists trees($sym)]} {
            # if symbol doesn't exist: split e0 into new node and new e0
            append rs [getpath $tree $trees(e0)] [encsym $sym]
            set n $trees(e0)
            set trees($sym) [$tree insert $n 0]
            $tree set $trees($sym) 0
            set trees(e0) [$tree insert $n 1]
            $tree set $trees(e0) 0
            set symbols($trees($sym)) $sym
            set symbols($trees(e0)) e0
        } else {
            # symbol exists, just emit code and update
            append rs [getpath $tree $trees($sym)]
        }
        update-tree $tree $trees($sym)
        return $rs
    }

    # input is a list of integers
    proc encode {rval} {
        set tree [mktree]
        set l [llength $rval]
        set c 0
        foreach rv $rval {
            append bs [addsym $tree $rv]
            incr c
            # progress meter
            puts -nonewline stderr [format "%3.2f%%\r" [expr {double($c)/$l*100}]]
        }
        deltree $tree
        return $bs
    }

    # decode bitstring into list of integers
    proc decode {bs} {
        set tree [mktree]
        set bits [split $bs ""]
        set ind 0
        set ds {}
        set bl [llength $bits]
        while {$ind < $bl} {
            set si [getsym $tree $bits $ind]
            lappend ds [lindex $si 0]
            set ind [lindex $si 1]
            # progress meter
            puts -nonewline stderr [format "%3.2f%%\r" [expr {double($ind)/$bl*100}]]
        }
        return $ds
    }
}
# end namespace eval ::huffman

# convert string to list
proc s2l {str} {
    set l {}
    foreach c [split $str ""] {
        binary scan $c c num
        lappend l $num
    }
    return $l
}

# convert list to string
proc l2s {list} {
    set s ""
    foreach num $list {
        append s [binary format c $num]
    }
    return $s
}

# some testing
set st "Hello, world!"
# set st [read stdin]
puts "input length: [string length $st] bytes"
set cst [huffman::encode [s2l $st]]
puts "compressed length: [expr {[string length $cst]/8}] (bytes) [format "%3.2f%%" [expr {(1-([string length $cst]/8)/double([string length $st]))*100}]]"
set dst [l2s [huffman::decode $cst]]
if {[string compare $dst $st] == 0} {
    puts "input and output match"
} else {
    puts "input and output differ"
}