Keith Vetter 2007-06-06 : I came across an
html utility written by Eric Raymond called
imgsizer. It edits web pages inserting missing
WIDTH and
HEIGHT attributes to image tags [
1]. By having these tags, browsers can load pages faster exploiting multithreading image loading; without them, browsers must do sequential image loading.
I wanted to use Eric's tool but it's written in Python. So I just decided to rewrite it in tcl.
I first tried using
TDOM to parse web pages, but it was too brittle and couldn't handle malformed
html. This version uses regular expressions to find the image tags to update.
If you have
TclMagick, then this utility will run in pure tcl. Otherwise, it uses
Img for figuring out image size which means that Tk also gets loaded. (Using
tcllib's jpeg, gif and png library is left as an exercise for the reader.)
##+##########################################################################
#
# imgsizer.tsh -- splices in WIDTH and HEIGHT parameters for HTML IMG tags
# This allows browsers to load pages faster by multi-threading the image
# loading rather than strict sequential loading.
#
# by Keith Vetter, May 2007
# inspired by Eric Raymond's version: http://www.catb.org/~esr/software.html
package require http
package require cmdline
set OPT(haveMagick) [expr {! [catch {package require TclMagick}]}]
if {! $OPT(haveMagick)} {package require Img}
catch {wm withdraw .}
array set OPT {version 0.1 root . keepOld 0 noOverwrite 0 verbose 1}
##+##########################################################################
#
# ImageSizer -- fixes up WIDTH and HEIGHT parameters for IMG tag in one file
# uses regexp to find img tags (TDOM was too fragile)
#
proc ImageSizer {fname} {
INFO 1 "$fname\n"
set fin [open $fname r]
set data [read $fin]; list
close $fin
set result {}
set last 0
set ::CHANGED 0
foreach idx [regexp -inline -all -indices {<img.*?>} $data] {
foreach {start end} $idx break
append result [string range $data $last [expr {$start-1}]]
set imgTag [string range $data $start $end]
append result [HandleImgTag $imgTag]
set last [expr {$end+1}]
}
append result [string range $data $last end]
SaveResult $result $fname
}
##+##########################################################################
#
# HandleImgTag -- fixes up one image tag
#
proc HandleImgTag {imgTag} {
GetAllAttributes $imgTag
set src [GetAttribute src "<none>"]
if {$src eq "<none>"} {
WARN "Image tag without src"
return $imgTag
}
INFO 2 " $src"
if {[file pathtype $src] ne "relative"} {
set src [file join $::OPT(root) ".$src"]
}
set w [GetAttribute width "?"]
set h [GetAttribute height "?"]
if {[string first "%" $w] != -1 ||
[string first "%" $h] != -1} {
INFO 2 " -- skipping: %\n"
return $imgTag
}
if {$::OPT(noOverwrite) && [string is integer -strict $w] &&
[string is integer -strict $h]} {
INFO 2 " -- skipping: noOverwrite\n"
return $imgTag
}
if {[catch {foreach {w2 h2} [GetImageSize $src] break}]} {
INFO 2 "\n"
WARN "ERROR: cannot read image dimensions for '$src'"
return $imgTag
}
if {$w ne $w2 || $h ne $h2} {incr ::CHANGED}
set ::ATTR(width) $w2
set ::ATTR(height) $h2
INFO 2 " ($w,$h) => ($w2,$h2)\n"
return [RebuildImgTag]
}
##+##########################################################################
#
# RebuildImgTag -- returns new image tag with all its attributes
#
proc RebuildImgTag {} {
array set attr [array get ::ATTR]
set all [concat src width height [array names attr]]
set html "<img"
foreach arr $all {
if {! [info exists attr($arr)]} continue
set value $attr($arr)
set delim "\""
if {[string is integer -strict $value]} {
set delim ""
} elseif {[string first "\"" $value] != -1} {
set delim "'"
}
append html " $arr=$delim$value$delim"
unset attr($arr)
}
append html "/>"
return $html
}
##+##########################################################################
#
# GetAttribute -- returns attribute value, using default if not found
#
proc GetAttribute {which default} {
set which [string tolower $which]
if {[info exists ::ATTR($which)]} {
return $::ATTR($which)
}
return $default
}
##+##########################################################################
#
# GetAllAttributes -- extracts all attributes for a given tag into global ATTR
#
proc GetAllAttributes {thisTag} {
set last 2
unset -nocomplain ::ATTR
while {1} {
set next [regexp -inline -nocase -indices -start $last {\s.*?=.} $thisTag]
if {$next eq {}} break
foreach {start end} [lindex $next 0] break
set name [string range $thisTag [expr {$start+1}] [expr {$end-2}]]
set name [string tolower $name]
set delim [string index $thisTag $end]
if {$delim eq "'" || $delim eq "\""} {
set start2 [expr {$end+1}]
set end2 [string first $delim $thisTag $start2]
} else {
set start2 $end
set idx [regexp -indices -inline -start $start2 {\s|>} $thisTag]
set end2 [lindex $idx 0 1]
}
set value [string range $thisTag $start2 [expr {$end2 - 1}]]
set last $end2
set ::ATTR($name) $value
}
}
##+##########################################################################
#
# GetImageSize -- returns the size of an image
#
proc GetImageSize {iname} {
if {[info exists ::CACHE($iname)]} { ;# Cache to avoid downloads
return $::CACHE($iname)
}
if {[regexp {(?i)^http:} $iname]} { ;# Is iname really a URL???
set ::CACHE($iname) [GetWebImageSize $iname]
return $::CACHE($iname)
}
if {$::OPT(haveMagick)} {
set wand [magick create wand]
$wand ReadImage $iname
set w [$wand width]
set h [$wand height]
} else {
set img [image create photo -file $iname]
set w [image width $img]
set h [image height $img]
image delete $img
}
set ::CACHE($iname) [list $w $h]
return [list $w $h]
}
##+##########################################################################
#
# GetWebImageSize -- returns the size of an image after first downloading
# from the web
#
proc GetWebImageSize {url} {
set token [::http::geturl $url]
::http::wait $token
set idata [::http::data $token] ; list
::http::cleanup $token
if {$::OPT(haveMagick)} {
set wand [magick create wand]
$wand ReadImageBlob $idata
set w [$wand width]
set h [$wand height]
} else {
set img [image create photo -data $idata]
set w [image width $img]
set h [image height $img]
image delete $img
}
return [list $w $h]
}
##+##########################################################################
#
# SaveResult -- safely saves our result while safely moving files around.
#
proc SaveResult {html fname} {
if {$::CHANGED == 0} {
INFO 2 " no change\n"
return
}
set tempname [GetTempName $fname ".tmp"]
set backname [GetTempName $fname ".bak"]
set fout [open $tempname w]
puts -nonewline $fout $html
close $fout
file rename $fname $backname
file rename $tempname $fname
if {! $::OPT(keepOld)} {
file delete $backname
}
}
##+##########################################################################
#
# GetTempName -- returns an unused filename based on a given basename and
# extension. Not bullet-proof, race condition exists but good enough for now.
#
proc GetTempName {base extension} {
set fname "$base$extension"
if {! [file exists $fname]} { return $fname }
for {set i 1} {$i < 1000} {incr i} {
set fname [format "%s%s%03d" $base $extension $i]
if {! [file exists $fname]} { return $fname }
}
for {set i 1} {$i < 1000} {incr i} {
set rand [expr {int(rand()*0x7FFFffff)}]
set fname "$base$extension$rand"
if {! [file exists $fname]} { return $fname }
}
error "Could not create tempfile '$fname' '$extension'"
}
proc ParseArgs {} {
global argc argv OPT
for {set i 0} {$i < $argc} {incr i} {
set arg [lindex $argv $i]
switch -regexp -- $arg {
^--document-root$ -
^-d$ { set OPT(root) [lindex $argv [incr i]]}
^-d { set OPT(root) [string range $arg 2 end]}
^--no-overwrite$ -
^-n$ { set OPT(noOverwrite) 1 }
^--keep-original$ -
^-k$ { set OPT(keepOld) 1 }
^-q$ { set OPT(verbose) 0 }
^-v$ { incr OPT(verbose) }
^-V$ { DoHelp version }
^-h$ - ^-?$ - ^--help DoHelp
^--$ { incr i; break }
^- { WARN "unknown option: \"$arg\""; DoHelp usage }
default { break }
}
}
set argc [expr {$argc - $i}]
if {$argc <= 0 && ! $::tcl_interactive} { DoHelp usage }
set argv [lrange $argv $i end]
}
proc WARN {msg} { puts stderr $msg}
proc INFO {lvl msg} {if {$lvl <= $::OPT(verbose)} {puts -nonewline $msg }}
proc DoHelp {{what help}} {
if {$what eq "version"} {
puts "imgsizer version $::OPT(version)"
exit
}
set txt "imgsizer ?-d documentRoot? ?-n? ?-k? html-files"
if {$what eq "usage"} {
puts "usage: $txt"
exit
}
append txt "\n\n"
append txt "The imgsizer script automatically inserts WIDTH and HEIGHT parameters\n"
append txt "for IMG tags for HTML files. These parameters enable browsers to show\n"
append txt "pages faster by multi-threading the image loading rather than strict\n"
append txt "sequential loading.\n\n"
append txt "This script will insert missing WIDTH and HEIGHT parameters and\n"
append txt "correct existing parameters unless they contain a percent sign (%),\n"
append txt "or if you gave the '-n' or '--no-overwrite' switch.\n\n"
append txt "Options:\n"
append txt " -d, --document-root Directory for absolute image filenames\n"
append txt " (i.e, ones which contain a leading '/')\n"
append txt " -n, --no-overwrite Do not overwrite existing image size info\n"
append txt " -k, --keep-original Keep original source files with .bak extension\n"
append txt " -V Show version information\n"
append txt " -v Verbose output\n"
append txt " -q Quiet output\n"
puts $txt
exit
}
################################################################
ParseArgs
if {$tcl_interactive} return ;# For debugging
foreach arg $argv {
if {$tcl_platform(platform) ne "windows"} {
ImageSizer $arg
continue
}
foreach fname [glob -nocomplain -- $arg] { ;# Manual glob'ing
ImageSizer $fname
}
}
exit ;# Img package loads Tk
return