Updated 2011-04-29 18:44:41 by hae

if 0 { KBK: The longest common subsequence problem is:

Given two lists L1 and L2, compute the largest set of ordered pairs
`    {x1,y1), (x2,y2), ..., (xn, yn)`

such that
```    x1 < x2 < ... < xn
y1 < y2 < ... < yn```

and
`    L1[x1] = L2[y1] ; L1[x2] = L2[y2] ; ... L1[xn] = L2[yn]`

Solving this problem is critical to the implementation of diff in Tcl.

Here is one Tcl implementation of the "folklore algorithm."

Note that Hunt and McIlroy have published a much better algorithm (used in the Unix 'diff' command) that is implemented at diff in Tcl. This page is here to hold some of the historical discussion.

}
``` namespace eval list {
}

namespace eval list::longestCommonSubsequence {
namespace export compare
}

# Internal procedure that indexes into the 2-dimensional array t,
# which corresponds to the sequence y, looking for the (i,j)th element.

proc list::longestCommonSubsequence::Index { t y i j } {
set indx [expr { ([llength \$y] + 1) * (\$i + 1) + (\$j + 1) }]
return [lindex \$t \$indx]
}

# Internal procedure that implements Levenshtein to derive the longest
# common subsequence of two lists x and y.

proc list::longestCommonSubsequence::ComputeLCS { x y } {
set t [list]
for { set i -1 } { \$i < [llength \$y] } { incr i } {
lappend t 0
}
for { set i 0 } { \$i < [llength \$x] } { incr i } {
lappend t 0
for { set j 0 } { \$j < [llength \$y] } { incr j } {
if { [string equal [lindex \$x \$i] [lindex \$y \$j]] } {
set lastT [Index \$t \$y [expr { \$i - 1 }] [expr {\$j - 1}]]
set nextT [expr {\$lastT + 1}]
} else {
set lastT1 [Index \$t \$y \$i [expr { \$j - 1 }]]
set lastT2 [Index \$t \$y [expr { \$i - 1 }] \$j]
if { \$lastT1 > \$lastT2 } {
set nextT \$lastT1
} else {
set nextT \$lastT2
}
}
lappend t \$nextT
}
}
return \$t
}

# Internal procedure that traces through the array built by ComputeLCS
# and finds a longest common subsequence -- specifically, the one that
# is lexicographically first.

proc list::longestCommonSubsequence::TraceLCS { t x y } {
set trace {}
set i [expr { [llength \$x] - 1 }]
set j [expr { [llength \$y] - 1 }]
set k [expr { [Index \$t \$y \$i \$j] - 1 }]
while { \$i >= 0 && \$j >= 0 } {
set im1 [expr { \$i - 1 }]
set jm1 [expr { \$j - 1 }]
if { [Index \$t \$y \$i \$j] == [Index \$t \$y \$im1 \$jm1] + 1
&& [string equal [lindex \$x \$i] [lindex \$y \$j]] } {
lappend trace xy [list \$i \$j]
set i \$im1
set j \$jm1
} elseif { [Index \$t \$y \$im1 \$j] > [Index \$t \$y \$i \$jm1] } {
lappend trace x \$i
set i \$im1
} else {
lappend trace y \$j
set j \$jm1
}
}
while { \$i >= 0 } {
lappend trace x \$i
incr i -1
}
while { \$j >= 0 } {
lappend trace y \$j
incr j -1
}
return \$trace
}

# list::longestCommonSubsequence::compare --
#
#       Compare two lists for the longest common subsequence
#
# Arguments:
#       x, y - Two lists of strings to compare
#       matched - Callback to execute on matched elements, see below
#       unmatchedX - Callback to execute on unmatched elements from the
#                    first list, see below.
#       unmatchedY - Callback to execute on unmatched elements from the
#                    second list, see below.
#
# Results:
#       None.
#
# Side effects:
#       Whatever the callbacks do.
#
# The 'compare' procedure compares the two lists of strings, x and y.
# It finds a longest common subsequence between the two.  It then walks
# the lists in order and makes the following callbacks:
#
# For an element that is common to both lists, it appends the index in
# the first list, the index in the second list, and the string value of
# the element as three parameters to the 'matched' callback, and executes
# the result.
#
# For an element that is in the first list but not the second, it appends
# the index in the first list and the string value of the element as two
# parameters to the 'unmatchedX' callback and executes the result.
#
# For an element that is in the second list but not the first, it appends
# the index in the second list and the string value of the element as two
# parameters to the 'unmatchedY' callback and executes the result.

proc list::longestCommonSubsequence::compare { x y
matched
unmatchedX unmatchedY } {
set t [ComputeLCS \$x \$y]
set trace [TraceLCS \$t \$x \$y]
set i [llength \$trace]
while { \$i > 0 } {
set indices [lindex \$trace [incr i -1]]
set type [lindex \$trace [incr i -1]]
switch -exact -- \$type {
xy {
set c \$matched
eval lappend c \$indices
lappend c [lindex \$x [lindex \$indices 0]]
uplevel 1 \$c
}
x {
set c \$unmatchedX
lappend c \$indices
lappend c [lindex \$x \$indices]
uplevel 1 \$c
}
y {
set c \$unmatchedY
lappend c \$indices
lappend c [lindex \$y \$indices]
uplevel 1 \$c
}
}
}
return
}```

# With this code in hand, we can now write the external parts of a diff command. The various options of diff alter how it displays the comparison, but not its fundamental operation. Here's an external wrapper that gives very simple-minded output.
``` namespace import list::longestCommonSubsequence::compare

proc umx { index value } {
variable lastx
variable xlines
append xlines "< " \$value \n
set lastx \$index
}

proc umy { index value } {
variable lasty
variable ylines
append ylines "> " \$value \n
set lasty \$index
}

proc matched { index1 index2 value } {
variable lastx
variable lasty
variable xlines
variable ylines
if { [info exists lastx] && [info exists lasty] } {
puts "[expr { \$lastx + 1 }],\${index1}c[expr {\$lasty + 1 }],\${index2}"
puts -nonewline \$xlines
puts "----"
puts -nonewline \$ylines
} elseif { [info exists lastx] } {
puts "[expr { \$lastx + 1 }],\${index1}d\${index2}"
puts -nonewline \$xlines
} elseif { [info exists lasty] } {
puts  "\${index1}a[expr {\$lasty + 1 }],\${index2}"
puts -nonewline \$ylines
}
catch { unset lastx }
catch { unset xlines }
catch { unset lasty }
catch { unset ylines }
}

# Really, we should read the first file in like this:
#    set f0 [open [lindex \$argv 0] r]
#    set x [split [read \$f0] \n]
#    close \$f0
# But I'll just provide some sample lines:

set x {}
for { set i 0 } { \$i < 10 } { incr i } {
lappend x a r a d e d a b r a x
}

# The second file, too, should be read in like this:
#    set f1 [open [lindex \$argv 1] r]
#    set y [split [read \$f1] \n]
#    close \$f1
# Once again, I'll just do some sample lines.

set y {}
for { set i 0 } { \$i < 10 } { incr i } {
lappend y a b r a c a d a b r a
}

compare \$x \$y matched umx umy
matched [llength \$x] [llength \$y] {}```

if 0 {

}