magiccompile.tcl:
# file to compile the magic file from magic(5) into a tcl program
package require fileutil
source magiclib.tcl
source magictree.tcl
namespace eval magic {}
set magic::debug 0
# parse an individual line
proc magic::parseline {line {maxlevel 10000}} {
# calculate the line's level
set unlevel [string trimleft $line >]
set level [expr [string length $line] - [string length $unlevel]]
if {$level > $maxlevel} {
return -code continue "Skip - too high a level"
}
# regexp parse line into (offset, type, value, command)
set parse [regexp -expanded -inline {^(\S+)\s+(\S+)\s*((\S|(\B\s))*)\s*(.*)$} $unlevel]
if {$parse == {}} {
error "Can't parse: '$unlevel'"
}
# unpack parsed line
set value ""
set command ""
foreach {junk offset type value junk1 junk2 command} $parse break
# handle trailing spaces
if {[string index $value end] eq "\\"} {
append value " "
}
if {[string index $command end] eq "\\"} {
append command " "
}
if {$value == ""} {
error "no value" ;# badly formatted line
}
variable debug
if {$debug > 1} {
puts "level:$level offset:$offset type:$type value:'$value' command:'$command'"
}
# return the line's fields
return [list $level $offset $type $value $command]
}
# process a magic file
proc magic::process {file {maxlevel 10000}} {
variable level ;# level of line
variable linenum ;# line number
set level 0
set script {}
set linenum 0
::fileutil::foreachLine line $file {
incr linenum
set line [string trim $line " "]
if {[string index $line 0] eq "#"} {
continue ;# skip comments
} elseif {$line == ""} {
continue ;# skip blank lines
} else {
# parse line
if {[catch {parseline $line $maxlevel} parsed]} {
continue ;# skip erroring lines
}
# got a valid line
foreach {level offset type value message} $parsed break
# strip comparator out of value field,
# (they are combined)
set compare [string index $value 0]
switch -glob -- $value {
[<>]=* {
set compare [string range $value 0 1]
set value [string range $value 2 end]
}
<* -
>* -
&* -
^* {
set value [string range $value 1 end]
}
=* {
set compare "=="
set value [string range $value 1 end]
}
!* {
set compare "!="
set value [string range $value 1 end]
}
x {
# this is the 'don't care' match
# used for collecting values
set value ""
}
default {
# the default comparator is equals
set compare "=="
if {[string match {\\[<!>=]*} $value]} {
set value [string range $value 1 end]
}
}
}
# process type field
set qual ""
switch -glob -- $type {
pstring* -
string* {
# String or Pascal string type
# extract string match qualifiers
set type [split $type /]
set qual [lindex $type 1]
set type [lindex $type 0]
# convert pstring to string + qualifier
if {$type eq "pstring"} {
append qual "p"
set type "string"
}
# protect hashes in output script value
set value [string map [list "\#" "\\#" \" \\\" \{ \\\{ \} \\\}] $value]
if {($value eq "\\0") && ($compare eq ">")} {
# record 'any string' match
set value ""
set compare x
} elseif {$compare eq "!="} {
# string doesn't allow !match
set value !$value
set compare "=="
}
if {$type ne "string"} {
# don't let any odd string types sneak in
puts stderr "Reject String: ${file}:$linenum $type - $line"
continue
}
}
regex {
# I am *not* going to handle regex
puts stderr "Reject Regex: ${file}:$linenum $type - $line"
continue
}
*byte* -
*short* -
*long* -
*date* {
# Numeric types
# extract numeric match &qualifiers
set type [split $type &]
set qual [lindex $type 1]
if {$qual ne ""} {
# this is an &-qualifier
set qual &$qual
} else {
# extract -qualifier from type
set type [split $type -]
set qual [lindex $type 1]
if {$qual ne ""} {
set qual -$qual
}
}
set type [lindex $type 0]
# perform value adjustments
if {$compare ne "x"} {
# trim redundant Long value qualifier
set value [string trimright $value L]
if {[catch {set value [expr $value]} x eo]} {
# check that value is representable in tcl
puts stderr "Reject Value Error: ${file}:$linenum '$value' '$line' - $eo"
continue;
}
# coerce numeric value into hex
set value [format "0x%x" $value]
}
}
default {
# this is not a type we can handle
puts stderr "Reject Unknown Type: ${file}:$linenum $type - $line"
continue
}
}
}
# collect some summaries
variable debug
if {$debug} {
variable types
set types($type) $type
variable quals
set quals($qual) $qual
}
#puts "$linenum level:$level offset:$offset type:$type qual:$qual compare:$compare value:'$value' message:'$message'"
# protect hashes in output script message
set message [string map [list "\#" "\\\#" \" \\\" \} \\\} ( \\( ) \\)] $message]
if {![string match "(*)" $offset]} {
catch {set offset [expr $offset]}
}
# record is the complete match command,
# encoded for tcl code generation
set record [list $linenum $type $qual $compare $offset $value $message]
if {$script == {}} {
# the original script has level 0,
# regardless of what the script says
set level 0
}
if {$level == 0} {
# add a new 0-level record
lappend script $record
} else {
# find the growing edge of the script
set depth [lrepeat [expr $level] end]
while {[catch {
# get the insertion point
set insertion [lindex $script {*}$depth]
}]} {
# handle scripts which jump levels,
# reduce depth to current-depth+1
set depth [lreplace $depth end end]
}
# add the record at the insertion point
lappend insertion $record
# re-insert the record into its correct position
lset script {*}$depth $insertion
}
}
#puts "Script: $script"
return $script
}
proc magic::install {args} {
foreach arg $args {
proc magic::/[file tail $arg] {} [magic::compile $arg]
}
}
# compile up magic files or directories of magic files
proc magic::compile {args} {
set tcl ""
set script {}
foreach arg $args {
if {[file type $arg] == "directory"} {
foreach file [glob [file join $arg *]] {
set script1 [process $file]
lappend script [list file $file] {*}$script1
#append tcl "magic::file_start $file" \n
#append tcl [run $script1] \n
}
} else {
set file $arg
set script1 [process $file]
lappend script [list file $file] {*}$script1
#append tcl "magic::file_start $file" \n
#append tcl [run $script1] \n
}
}
#puts stderr $script
puts "\# $args"
set t [2tree $script]
set tcl [treegen $t root]
puts [treedump $t]
#set tcl [run $script]
return $tcl
}
proc magic::generate {args} {
foreach arg $args {
set out [::open [file tail $arg].tcl w]
puts $out "proc magic::/[file tail $arg] {} \{"
puts $out [magic::compile $arg]
puts $out "return {}"
puts $out \}
::close $out
}
}
#puts [magic::compile {*}$argv]
#puts stderr "typemap: [array get magic::typemap]"
magic::generate {*}$argv
#set script [magic::compile {} /usr/share/misc/file/magic]
#puts "\# types:[array names magic::types]"
#puts "\# quals:[array names magic::quals]"
#puts "Script: $script"
return
-----
magiclib.tcl:
# TODO:
# Required Functionality:
# implement full offset language
# implement pstring (pascal string, blerk)
# implement regex form (blerk!)
# implement string qualifiers
# Optimisations:
# reorder tests according to expected or observed frequency
# this conflicts with reduction in strength optimisations.
# Rewriting within a level will require pulling apart the
# list of tests at that level and reordering them.
# There is an inconsistency between handling at 0-level and
# deeper level - this has to be removed or justified.
# Hypothetically, every test at the same level should be
# mutually exclusive, but this is not given, and should be
# detected. If true, this allows reduction in strength to switch
# on Numeric tests
# reduce Numeric tests at the same level to switches
#
# - first pass through clauses at same level to categorise as
# variant values over same test (type and offset).
# work out some way to cache String comparisons
# Reduce seek/reads for String comparisons at same level:
#
# - first pass through clauses at same level to determine string ranges.
#
# - String tests at same level over overlapping ranges can be
# written as sub-string comparisons over the maximum range
# this saves re-reading the same string from file.
#
# - common prefix strings will have to be guarded against, by
# sorting string values, then sorting the tests in reverse length order.
namespace eval magic {}
set magic::debug 0
set magic::optimise 1
# open the file to be scanned
proc magic::open {file} {
variable fd
set fd [::open $file]
fconfigure $fd -translation binary
# fill the string cache
# the vast majority of magic strings are in the first 4k of the file.
variable strbuf
set strbuf [read $fd 4096]
# clear the fetch cache
variable cache
catch {unset cache}
variable result
set result ""
variable string
set string ""
variable numeric
set numeric -9999
return $fd
}
proc magic::close {file} {
variable fd
::close $fd
}
# mark the start of a magic file in debugging
proc magic::file_start {name} {
variable debug
if {$debug} {
puts stderr "File: $name"
}
}
# return the emitted result
proc magic::result {{msg ""}} {
variable result
if {$msg != ""} {
emit $msg
}
return -code return $result
}
# emit a message
proc magic::emit {msg} {
variable string
variable numeric
set msg [::string map [list \\b "" %s $string %ld $numeric %d $numeric] $msg]
variable result
append result " " $msg
set result [string trim $result " "]
}
# handle complex offsets - TODO
proc magic::offset {where} {
variable debug
#if {$debug} {
puts stderr "OFFSET: $where"
#}
return 0
}
# fetch and cache a value from the file
proc magic::fetch {where what scan} {
variable cache
variable numeric
if {![info exists cache($where,$what,$scan)]} {
variable fd
seek $fd $where
binary scan [read $fd $what] $scan numeric
set cache($where,$what,$scan) $numeric
} else {
set numeric $cache($where,$what,$scan)
}
return $numeric
}
# maps magic typenames to field characteristics: size, binary scan format
array set magic::typemap {
byte {1 c}
ubyte {1 c}
short {2 S}
ushort {2 S}
beshort {2 S}
leshort {2 s}
ubeshort {2 S}
uleshort {2 s}
long {4 I}
belong {4 I}
lelong {4 i}
ubelong {4 I}
ulelong {4 i}
date {2 S}
bedate {2 S}
ledate {2 s}
ldate {4 I}
beldate {4 I}
leldate {4 i}
}
# generate short form names
foreach {n v} [array get magic::typemap] {
foreach {len scan} $v {
#puts stderr "Adding $scan - [list $len $scan]"
set magic::typemap($scan) [list $len $scan]
break
}
}
proc magic::Nv {type offset {qual ""}} {
variable typemap
variable numeric
# unpack the type characteristics
foreach {size scan} $typemap($type) break
# fetch the numeric field
set numeric [fetch $offset $size $scan]
if {$qual != ""} {
# there's a mask to be applied
set numeric [expr $numeric $qual]
}
variable debug
if {$debug} {
puts stderr "NV $type $offset $qual: $numeric"
}
return $numeric
}
# Numeric - get bytes of $type at $offset and $compare to $val
# qual might be a mask
proc magic::N {type offset comp val {qual ""}} {
variable typemap
variable numeric
# unpack the type characteristics
foreach {size scan} $typemap($type) break
# fetch the numeric field
set numeric [fetch $offset $size $scan]
if {$comp == "x"} {
# anything matches - don't care
return 1
}
# get value in binary form, then back to numeric
# this avoids problems with sign, as both values are
# [binary scan]-converted identically
binary scan [binary format $scan $val] $scan val
if {$qual != ""} {
# there's a mask to be applied
set numeric [expr $numeric $qual]
}
set c [expr $val $comp $numeric] ;# perform comparison
variable debug
if {$debug} {
puts stderr "numeric $type: $val $comp $numeric / $qual - $c"
}
return $c
}
proc magic::getString {offset len} {
# cache the first 1k of the file
variable string
set end [expr {$offset + $len - 1}]
if {$end < 4096} {
# in the string cache
variable strbuf
set string [string range $strbuf $offset $end]
} else {
# an unusual one
variable fd
seek $fd $offset ;# move to the offset
set string [read $fd $len]
}
return $string
}
proc magic::S {offset comp val {qual ""}} {
variable fd
variable string
# convert any backslashes
set val [subst -nocommands -novariables $val]
if {$comp eq "x"} {
# match anything - don't care, just get the value
set string ""
seek $fd $offset ;# move to the offset
while {([::string length $string] < 100)
&& [::string is print [set c [read $fd 1]]]} {
if {[string is space $c]} {
break
}
append string $c
}
return 1
}
# get the string and compare it
set string [getString $offset [::string length $val]]
set cmp [::string compare $val $string]
set c [expr $cmp $comp 0]
variable debug
if {$debug} {
puts "String '$val' $comp '$string' - $c"
if {$c} {
puts "offset $offset - $string"
}
}
return $c
}
----
magictree.tcl
package require struct::list
package require struct::tree
namespace eval magic {}
proc magic::path {tree} {
$tree set root path {}
foreach child [$tree children root] {
$tree walk $child -type dfs node {
set path [$tree get [$tree parent $node] path]
lappend path [$tree index $node]
$tree set $node path $path
}
}
}
proc magic::tree_el {tree parent file line type qual comp offset val message args} {
set node [$tree insert $parent end]
set path [$tree get $parent path]
lappend path [$tree index $node]
$tree set $node path $path
# generate a proc call type for the type, Numeric or String
variable typemap
switch -glob -- $type {
*byte* -
*short* -
*long* -
*date* {
set otype N
set type [lindex $typemap($type) 1]
}
*string {
set otype S
}
default {
puts stderr "Unknown type: '$type'"
}
}
foreach key {line type qual comp offset val message file otype} {
if {[catch {
$tree set $node $key [set $key]
} result eo]} {
puts "Tree: $eo - $file $line $type"
}
}
# now add children
foreach el $args {
tree_el $tree $node $file {*}$el
}
return $node
}
proc magic::2tree {script} {
variable tree;
set tree [::struct::tree]
$tree set root path ""
$tree set root otype Root
$tree set root type root
$tree set root message "unknown"
# generate a test for each match
set file "unknown"
foreach el $script {
#puts "EL: $el"
if {[lindex $el 0] eq "file"} {
set file [lindex $el 1]
} else {
append result [tree_el $tree root $file {*}$el]
}
}
optNum $tree root
#optStr $tree root
puts stderr "Script contains [llength [$tree children root]] discriminators"
path $tree
return $tree
}
proc magic::isStr {tree node} {
return [expr {"S" eq [$tree get $node otype]}]
}
proc magic::sortRegion {r1 r2} {
set cmp 0
if {[catch {
if {[string match (*) $r1] || [string match (*) $r2]} {
set cmp [string compare $r1 $r2]
} else {
set cmp [expr {[lindex $r1 0] - [lindex $r2 0]}]
if {!$cmp} {
set cmp 0
set cmp [expr {[lindex $r1 1] - [lindex $r2 1]}]
}
}
} result eo]} {
set cmp [string compare $r1 $r2]
}
return $cmp
}
proc magic::optStr {tree node} {
variable regions
catch {unset regions}
array set regions {}
optStr1 $tree $node
puts stderr "Regions [array statistics regions]"
foreach region [lsort -index 0 -command magic::sortRegion [array name regions]] {
puts "$region - $regions($region)"
}
}
proc magic::optStr1 {tree node} {
# traverse each numeric element of this node's children,
# categorising them
set kids [$tree children $node]
foreach child $kids {
optStr1 $tree $child
}
set strings [$tree children $node filter magic::isStr]
#puts stderr "optstr: $node: $strings"
variable regions
foreach el $strings {
#if {[$tree get $el otype] eq "String"} {
# puts "[$tree getall $el] - [string length [$tree get $el val]]"
#}
if {[$tree get $el comp] eq "x"} {
continue
}
set offset [$tree get $el offset]
set len [string length [$tree get $el val]]
lappend regions([list $offset $len]) $el
}
}
proc magic::isNum {tree node} {
return [expr {"N" eq [$tree get $node otype]}]
}
proc magic::switchNSort {tree n1 n2} {
return [expr {[$tree get $n1 val] - [$tree get $n1 val]}]
}
proc magic::optNum {tree node} {
# traverse each numeric element of this node's children,
# categorising them
set kids [$tree children $node]
foreach child $kids {
optNum $tree $child
}
set numerics [$tree children $node filter magic::isNum]
#puts stderr "optNum: $node: $numerics"
if {[llength $numerics] < 2} {
return
}
array set offsets {}
foreach el $numerics {
if {[$tree get $el comp] ne "=="} {
continue
}
lappend offsets([$tree get $el type],[$tree get $el offset],[$tree get $el qual]) $el
}
#puts "Offset: stderr [array get offsets]"
foreach {match nodes} [array get offsets] {
if {[llength $nodes] < 2} {
continue
}
catch {unset matcher}
foreach n $nodes {
set nv [expr [$tree get $n val]]
if {[info exists matcher($nv)]} {
puts stderr "Node <[$tree getall $n]> clashes with <[$tree getall $matcher($nv)]>"
} else {
set matcher($nv) $n
}
}
foreach {type offset qual} [split $match ,] break
set switch [$tree insert $node [$tree index [lindex $nodes 0]]]
$tree set $switch otype Switch
$tree set $switch message $match
$tree set $switch offset $offset
$tree set $switch type $type
$tree set $switch qual $qual
set nodes [lsort -command [list magic::switchNSort $tree] $nodes]
$tree move $switch end {*}$nodes
set path [$tree get [$tree parent $switch] path]
lappend path [$tree index $switch]
$tree set $switch path $path
}
}
proc magic::treedump {tree} {
set result ""
$tree walk root -type dfs node {
set path [$tree get $node path]
set depth [llength $path]
append result [string repeat " " $depth] [list $path] ": " [$tree get $node type]
if {[$tree keyexists $node offset]} {
append result ,[$tree get $node offset]
}
if {[$tree keyexists $node qual]} {
set q [$tree get $node qual]
if {$q ne ""} {
append result ,$q
}
}
if {[$tree keyexists $node comp]} {
append result [$tree get $node comp]
}
if {[$tree keyexists $node val]} {
append result [$tree get $node val]
}
if {$depth == 1} {
set msg [$tree get $node message]
set n $node
while {($n != {}) && ($msg == "")} {
set n [lindex [$tree children $n] 0]
if {$n != {}} {
set msg [$tree get $n message]
}
}
append result " " ( $msg )
if {[$tree keyexists $node file]} {
append result " - " [$tree get $node file]
}
}
#append result " <" [$tree getall $node] >
append result \n
}
return $result
}
proc magic::treegen {tree node} {
return "[treegen1 $tree $node]\nresult\n"
}
proc magic::treegen1 {tree node} {
set result ""
foreach k {otype type offset comp val qual message} {
if {[$tree keyexists $node $k]} {
set $k [$tree get $node $k]
}
}
if {$otype eq "N"} {
set type [list N $type]
} elseif {$otype eq "S"} {
set type S
}
switch $otype {
N -
S {
# this is a complex offset - call the offset interpreter
if {[string match "(*)" $offset]} {
set offset "\[offset $offset\]"
}
append result "if \{\[$type $offset $comp [list $val] $qual\]\} \{"
if {[$tree isleaf $node]} {
if {$message != ""} {
append result "emit [list $message]"
} else {
append result "emit [$tree get $node path]"
}
} else {
if {$message != ""} {
append result "emit [list $message]\n"
}
foreach child [$tree children $node] {
append result [treegen1 $tree $child]
}
#append result "\nreturn \$result"
}
append result "\}\n"
}
Root {
foreach child [$tree children $node] {
append result [treegen1 $tree $child]
}
}
Switch {
# this is a complex offset - call the offset interpreter
if {[string match "(*)" $offset]} {
set offset "\[offset $offset\]"
}
append result "switch -- \[Nv $type $offset $qual\] "
variable typemap
set scan [lindex $typemap($type) 1]
foreach child [$tree children $node] {
binary scan [binary format $scan [$tree get $child val]] $scan val
append result "$val \{"
if {[$tree isleaf $child]} {
append result "emit [list [$tree get $child message]]"
} else {
append result "emit [list [$tree get $child message]]\n"
foreach grandchild [$tree children $child] {
append result [treegen1 $tree $grandchild]
}
}
append result "\} "
}
append result "\n"
}
}
return $result
}
returnWhew ... let us never speak of file discrimination programs again :)

