- they return a list {$length $offset} of the best substring found, indicating failure with a zero length
- they indicate failure if the longest match has ($length < 3), or if it has (length < 5) and ($offset > 255)
- they exploit the property that the last match of length (L+K) with K>0 must occur at a larger (or equal) offset than the last match of length L. This permits to restrict the searches dynamically.
- they are faster than the previous one: 5x for the first one, 2x for the second one.
Timing on real files, in seconds
# lz77.tcl - 5482 bytes (your version)
Times: 1.25 0.33 0.11 (100% 26% 8%)
Sizes: 1991 1990 1981
# lz77.test - 18674 bytes (your version)
Times: 7.72 1.45 0.39 (100% 18% 5%)
Sizes: 3901 3891 3896
# ~/.tkchatrc - 106483 bytes (mine; how come it is that large?)- Times
- 283 76 31 (100% 26% 10%)
Sizes: 30671 30727 30653
# #############################################
# FILE: lz77.tcl - the code for the compression
namespace eval lz77 {
variable lookback 255
variable Escape1 "\x01"
variable Escape2 "\x02"
set K32 [expr {32 * 1024}]
}
proc ::lz77::encode {data} {
variable Escape1
variable Escape2
variable K32
set output ""
set dataLength [string length $data]
append output [string index $data 0]
for {set i 1} {$i < $dataLength} {} {
foreach {length offset} [maxSubstring $i $data] break
if { $length > 0 } {
if { $offset < 255 && $length < 255 } {
set offChar [format %c $offset]
set lenChar [format %c $length]
append output $Escape1$lenChar$offChar
} else {
append output $Escape2
append output [binary format S $length]
append output [binary format S $offset]
}
incr i $length
} else {
set char [string index $data $i]
if { ($char eq $Escape1) || ($char eq $Escape2) } {
append output $Escape1
}
append output $char
incr i
}
}
return $output
}
proc ::lz77::decode {data} {
variable Escape1
variable Escape2
variable EscEsc
set output ""
for {set i 0} {$i < [string length $data]} {incr i} {
set char [string index $data $i]
if { $char eq $Escape1 } {
set char [string index $data [incr i]]
if { ($char eq $Escape1) || ($char eq $Escape2)} {
append output $char
} else {
scan $char %c length
scan [string index $data [incr i]] %c offset
set index [expr {[string length $output] - $offset}]
for {set j 0} {$j < $length} {incr j} {
append output [string index $output $index]
incr index
}
}
} elseif { $char eq $Escape2 } {
binary scan [string range $data [incr i] [incr i]] S length
binary scan [string range $data [incr i] [incr i]] S offset
set index [expr {[string length $output] - $offset}]
for {set j 0} {$j < $length} {incr j} {
append output [string index $output $index]
incr index
}
} else {
append output $char
}
}
return $output
}The best variant for maxSubstring looks first for the best match of length 3. having found a match of length L, it grows it as much as possible to (L+K), K>=0 - and then looks for a match of length (L+K+1). This is guaranteed to make a single pass through the list, and is the fastest variant to date:Note that this could include a binary search for the longest match in the inner while-loop for a possible improvement, replacing the linear char-by-char search implemented here.
proc ::lz77::maxSubstring {index data} {
variable K32
set upperBound [expr {[string length $data] - $index}]
if { $upperBound >= $K32} {
set upperBound [expr {$K32 -1}]
}
set lastPossible [expr {$index + $upperBound -1}]
if { $index > $K32 } {
#
# Note that this will be expensive for long strings: the
# data is copied over at each call. Have yet to find a
# better way.
#
set data [string range $data [expr {$index - $K32}] $lastPossible]
set index $K32
set lastPossible [expr {$index + $upperBound-1}]
}
set offset 0
set lastIdx [expr {$index + 2}]
set toMatch [string range $data $index $lastIdx]
while {$lastIdx <= $lastPossible} {
set matchIdx [string last $toMatch $data [expr {$lastIdx - $offset -1}]]
if {$matchIdx < 0} break
# Found a match; find out exactly how long it is
set offset [expr {$index - $matchIdx}]
set next [incr lastIdx]
while {($lastIdx <= $lastPossible) && \
([string index $data $lastIdx] eq \
[string index $data [expr {$lastIdx - $offset}]])} {
incr lastIdx
}
append toMatch [string range $data $next $lastIdx]
}
set len [expr {$lastIdx - $index}]
if {$len < 5} {
if {($len == 2) || ($offset > 255)} {
return {0 0}
}
}
return [list $len $offset]
}Closely related to the algorithm at LZ77 Compression (using a binary search for the maximal length) we have
proc ::lz77::maxSubstring {index data} {
variable K32
set upperBound [expr {[string length $data] - $index}]
if { [string length $data] >= ($index + $K32)} {
set upperBound [expr {$K32 -1}]
}
set lastPossible [expr {$index + $upperBound -1}]
#
# This never returns a match shorter than 5 at arbitrary
# distance. Reduce the data if possible, as it will be searched
# repeatedly.
#
set start 0
if { $index > $K32 } {
set data [string range $data [expr {$index - $K32}] $lastPossible]
set index $K32
}
set matchLength 0 ;# identical to the lowerBound
set matchOffset 1
if {$upperBound > 4} {
set currString [string range $data $index [expr {$index + 4}]]
set start [string last $currString $data [expr {$index +3}]]
if {$start >= 0} {
set matchLength 5 ;# identical to the lowerBound
set matchOffset [expr {$index - $start}]
if {$start} {
set start [string first $currString $data]
if {$start > 0} {
incr index -$start
set data [string range $data $start end]
}
}
} else {
set upperBound 4
}
}
if {$matchLength == 0} {
#
# No match at 5 or more; as this never returns a match shorter than 5
# if it is farther than 255 from the string, we only have to look for
# matches of length 4 or 3 in the last 255 bytes.
#
if {$index > 255} {
set start [expr {$index -255}]
set index 255
set data [string range $data $start [expr {$start + 2}]]
}
}
#
# Now we really start looking
#
while { ($upperBound > 2) && ($upperBound > $matchLength) } {
set currentLength [expr {($upperBound + $matchLength +1)/2}]
if { $currentLength < 3 } { set currentLength 3 }
set currString [string range $data $index [expr {$index + $currentLength -1}]]
# The last possible match has to begin farther away than the
# longest match already found.
set lastIndex [expr {$index - $matchOffset + $currentLength -1}]
if { ([set matchIndex [string last $currString $data $lastIndex]] >= 0) } {
# Note that (matchLength < currentLength) by construction
set matchOffset [expr {$index - $matchIndex}]
set matchLength $currentLength
} else {
set upperBound [expr {$currentLength - 1}]
}
}
return [list $matchLength $matchOffset]
}The tests are taken from LZ77 Compression, adapted to the new maxSubstring interface:
# #############################################
# FILE: lz77.test - tests for lz77
package require tcltest
namespace import tcltest::*
set fName [file join [file dirname [info script]] lz77.tcl]
source $fName
# ========================================
test encode-1.1 {
encode a simple string in using LZ77
} -body {
set encoded [::lz77::encode {abcdebcdef}]
set expect "abcde\x01\x04\x04"
append expect f
if { [string length $expect] != [string length $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "ENCODED: $exList -> $enList"
return "String lengths were not the same: \
[string length $expect] != [string length $encoded]"
}
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "EXPECTED: $exList\nENCODED : $enList"
return "Strings were not equal"
}
} -result {}
test encode-1.2 {
encode a simple string in using LZ77
} -body {
set encoded [::lz77::encode {Blah blah blah blah blah!}]
set expect "Blah b\x01\x12\x05!"
if { [string length $expect] != [string length $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "EXPECTED: $exList\nENCODED : $enList"
return "String lengths were not the same: \
[string length $expect] != [string length $encoded]"
}
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "EXPECTED: $exList\nENCODED : $enList"
return "Strings were not equal"
}
} -result {}
test encode-1.3 {
encode a string with multiple matches
} -body {
set string {This is a string with multiple strings within it}
set expect "This \x01\x03\x03"
append expect "a string with multiple\x01\x07\x15"
append expect "s\x01\x05\x16"
append expect "in it"
set encoded [::lz77::encode $string]
if { [string length $expect] != [string length $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "EXPECT: $exList"
puts "ENCODE: $enList"
return "String lengths were not the same: \
[string length $expect] != [string length $encoded]"
}
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "EXPECT: $exList"
puts "ENCODE: $enList"
return "Strings were not equal"
}
} -result {}
test encode-1.4 {
Encode a long (>255 <255*255) string, to use second escape
} -setup {
set original "abcdefghij"
for {set i 0} {$i < 254} {incr i} {
append original [expr {$i % 10}]
}
append original "abcdefg"
set expect "abcdefghij0123456789"
append expect "\x01\xF4\x0a"
append expect "\x02\x00\x07\x01\x08"
} -body {
set encoded [::lz77::encode $original]
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "EXPECT: $exList"
puts "ENCODE: $enList"
return "Strings were not equal"
}
} -result {}
# ========================================
test decode-1.1 {
decode a simple string using LZ77
} -body {
set decoded [::lz77::decode "Blah b\x01\x12\x05!"]
set expect {Blah blah blah blah blah!}
if { [string length $expect] != [string length $decoded] } {
puts "$expect != $decoded"
return "String lengths were not the same: \
[string length $expect] != [string length $decoded]"
}
if { ![string equal $expect $decoded] } {
puts "$expect != $decoded"
return "Strings were not equal"
}
} -result {}
# ========================================
test cycle-1.1 {
cycle a string through encode and decode
} -body {
set original "This is a string with multiple strings within it"
set encoded [::lz77::encode $original]
set changed [::lz77::decode $encoded]
if { ![string equal $original $changed] } {
puts "Not Equal:\nORIGINAL: $original\nENCODED : $changed"
binary scan $encoded c* enList
puts $enList
return "The strings were not equal"
}
return
} -result {}
test cycle-1.2 {
cycle a string through encode and decode
} -body {
set original "This is a string of text, \
whereherehereherehe parts of the string\
have text that is in other parts of the string"
set encoded [::lz77::encode $original]
set changed [::lz77::decode $encoded]
if { ![string equal $original $changed] } {
puts "Not Equal:\nORIGINAL: $original\nENCODED : $changed"
binary scan $encoded c* enList
puts $enList
return "The strings were not equal"
}
return
} -result {}
test cycle-1.3 {
Encode a long (>255 <255*255) string, to use second escape\
and decode it
} -setup {
set original "abcdefghij"
for {set i 0} {$i < 254} {incr i} {
append original [expr {$i % 10}]
}
append original "abcdefg"
} -body {
set encoded [::lz77::encode $original]
set decoded [::lz77::decode $encoded]
if { ![string equal $original $decoded] } {
binary scan $original c* orList
binary scan $decoded c* deList
puts "ORIGINAL: $orList"
puts "DECODED : $deList"
return "Strings were not equal"
}
} -result {}
test cycle-1.4 {
cycle a string through encode and decode
} -body {
global fName
set f [open $fName]
set original [read $f]
close $f
set encoded [::lz77::encode $original]
set changed [::lz77::decode $encoded]
if { ![string equal $original $changed] } {
#puts "Not Equal:\nORIGINAL: $original\nENCODED : $changed"
return "The strings were not equal"
}
return
} -result {}
test cycle-1.5 {
cycle a string through encode and decode
} -body {
set f [open [info script]]
set original [read $f]
close $f
set encoded [::lz77::encode $original]
set changed [::lz77::decode $encoded]
if { ![string equal $original $changed] } {
#puts "Not Equal:\nORIGINAL: $original\nENCODED : $changed"
return "The strings were not equal"
}
return
} -result {}
# ========================================
# Special cases
# When there's an escape in the input text
# Encode it as \x01\x01, since we can't have a repeat length of 1
test escape-1.1 {
An escape in the input data is coded as the escape, followed\
by \x01
} -setup {
unset -nocomplain original expect encoded exList enList
} -body {
set original "ab\x01" ; append original cd
set expect "ab\x01\x01" ; append expect cd
set encoded [::lz77::encode $original]
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "$exList ->\n$enList"
return "The strings were not equal"
}
} -result {}
test escape-1.2 {
An escape in the input data is coded as the escape, followed\
by \x01
} -setup {
unset -nocomplain original expect encoded exList enList
} -body {
set original "ab\x01" ; append original cd
set encoded [::lz77::encode $original]
set decoded [::lz77::decode $encoded]
if { ![string equal $original $decoded] } {
binary scan $original c* orList
binary scan $decoded c* deList
puts "$orList ->\n$deList"
return "The strings were not equal"
}
} -result {}
test escape-1.2 {
An escaped escape should not interfere with runs surrounding it
} -setup {
unset -nocomplain original expect encoded exList enList
} -body {
set original "abcdebcde\x01" ; append original cd
set expect "abcde\x01\x04\x04\x01\x01" ; append expect cd
set encoded [::lz77::encode $original]
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "$exList ->\n$enList"
return "The strings were not equal"
}
} -result {}
# If there's multiple matches, get the longest one possible
# "These blah is blah blah blah!"
# ^ the match for here
# ^ should start here
# ^ not here
test longest-1.1 {
Get the longest match possible
} -setup {
unset -nocomplain original expect encoded exList enList
set original {These blah is blah blah blah!}
set expect "These blah is\x01\x06\x08"
append expect "\x01\x09\x05!"
} -body {
set encoded [::lz77::encode $original]
if { ![string equal $expect $encoded] } {
binary scan $expect c* exList
binary scan $encoded c* enList
puts "Expect: $exList ->\nEncode: $enList"
puts "DECODE: [::lz77::decode $encoded]"
return "The strings were not equal"
}
} -result {}
# ========================================
test maxSubstring-1.1 {
Find the max substring for a string with only one match
} -setup {
set string {abcdefcdefg}
set index 6
} -body {
::lz77::maxSubstring $index $string
} -result {4 4}
test maxSubstring-1.2 {
Make sure we can get matches of the min length
} -setup {
set string {This is a}
set index 5
} -body {
::lz77::maxSubstring $index $string
} -result {3 3}
test maxSubstring-1.3 {
Return 0 if no match
} -setup {
set string {abcdefghijk}
set index 6
} -body {
lindex [::lz77::maxSubstring $index $string] 0
} -result 0
test maxSubstring-1.4 {
Find the max substring for a string with multiple matches
} -setup {
set string {aaaabbbbaaaaaaaa}
set index 9
} -body {
::lz77::maxSubstring $index $string
} -result {7 1}
test maxSubstring-1.5 {
For a very long string (>32k), remove everything 32k \
past the index
} -setup {
set original "abcdef"
for {set i 0} {$i < 1024} {incr i} {
append original "0123456789012345678901234567890123456789"
}
append original "abcdef"
set index 17
} -body {
::lz77::maxSubstring $index $original
} -result {32767 10}
test maxSubstring-1.6 {
For a very long string (>32k), remove anything more than 32k\
before the index
} -setup {
set original "abcdef"
for {set i 0} {$i < 1024} {incr i} {
append original "0123456789012345678901234567890123456789"
append original "0123456789012345678901234567890123456789"
append original "0123456789012345678901234567890123456789"
}
append original "abcdef"
set index 40000
} -body {
::lz77::maxSubstring $index $original
} -result {32767 10}
test maxSubstring-1.7 {
} -setup {
set data "abcdefghij012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123abcdefg"
set index 274
} -body {
::lz77::maxSubstring $index $data
} -result {7 274}Category Compression - Category File

