package require http
package require tdom
package require Tk
bind all <Key-Return> {imdb $::query}
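# look the query up via Google's "I'm Feeling Lucky" (restricted to imdb.com)
# and fill in title, plot outline, rating and number of votes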
proc imdb {input} {
set ua "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.7e"
http::config -useragent $ua
# don't build query strings by hand; http::formatQuery does the URL-encoding for us
set query [http::formatQuery btnI "I'm Feeling Lucky" q "site:imdb.com $input"]
set http [http::geturl http://www.google.com/search?$query]
# get HTTP header info
upvar 0 $http arr
array set meta $arr(meta)
# redirection url
set url $meta(Location)
set ::url $url
.f1.bi configure -state normal
# cleanup
http::cleanup $http
set http [http::geturl $url]
set html [http::data $http]
http::cleanup $http
set doc [dom parse -html $html]
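# pull title, plot outline, rating and vote count out of the parsed page with XPath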
set ::title [[$doc selectNodes {/html/head/title[1]}] asText]
set outline [$doc selectNodes {string(//div[h5="Plot Summary:"]/text())}]
if {$outline eq {}} {
set outline [$doc selectNodes {string(//div[h5="Plot Outline:"]/text())}]
}
set outline [string trim $outline]
.s delete 1.0 end
.s insert 1.0 $outline
set ::rating [[$doc selectNodes {//div[@class="general rating"]/b[2]} ] asText]
set votes [[$doc selectNodes {//div[@class="general rating"]/small/a[1]} ] asText]
$doc delete
set ::votes [string map {, {}} [lindex [split $votes] 0]]
}
proc go {url} {
exec cmd /c start {} $url &
}
label .lq -text Query:
entry .q -textvariable query
label .lt -text Title:
entry .t -textvariable title
label .lr -text Rating:
entry .r -textvariable rating
label .lv -text Votes:
entry .v -textvariable votes
label .ls -text Outline:
text .s -height 3 -wrap word
label .lu -text URL:
entry .u -textvariable url
grid .lq .q -sticky ew
grid .lt .t -sticky ew
grid .lr .r -sticky ew
grid .lv .v -sticky ew
grid .ls .s -sticky ew
grid .lu .u -sticky ew
frame .f1
grid .f1 -column 1 -sticky ew
button .f1.bs -text Search -command {imdb $::query}
button .f1.bi -text "Goto IMDB" -state disabled -command {go $::url}
grid .f1.bs .f1.bi
grid columnconfigure . 1 -weight 1
grid columnconfigure .f1 {0 1} -weight 1

LES on 20080821: This is a nice idea, but the application above doesn't quite work for me. First, Google doesn't like Lynx, so we have to tell it a different lie about the user agent. Second, the code above tries to obtain from the headers information that my tests never retrieved: $meta(Location). So I skipped that altogether. Third, I added a switch that also lets you open the IMDB page in your default browser on Linux, provided you have KDE (not necessarily running it as your window manager). Fourth, dom chokes on the HTML, so I did what countless developers never hesitate to frown upon: I replaced dom parsing with regular expressions. Why? Because it works, that's why.

LV: I don't know about this specific application, but I use Lynx with Google and IMDB on a regular basis and have not run into cases where "Google doesn't like Lynx" myself.

LES: With Lynx as the user agent in this application, the HTML is not the expected Google results page. It is another page saying that my browser is not authorized, that I should read the usage terms, etc., or something to that effect.

MJ: Very strange; both versions work correctly for me. The $meta(Location) part is there because "I'm Feeling Lucky" answers with an HTTP 302 redirect, which the http package does not follow automatically. I have no clue why this doesn't work for you, though; very strange indeed. Regarding the use of regexp instead of tdom: "parsing" with regular expressions will break as well, only differently. I guess the main conclusion to draw is that web scraping like this is a hit-and-miss game. The only robust way to use a service like this is through a web service, which has issues of its own (SOAP does not make me feel clean).
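For reference, here is a minimal sketch of following such a redirect by hand with the http package; the helper name follow_redirects and the limit of five hops are made up for illustration and appear in neither version on this page:

package require http
proc follow_redirects {url {limit 5}} {
    for {set i 0} {$i < $limit} {incr i} {
        set token [http::geturl $url]
        # hand anything other than a redirect status straight back to the caller
        # (the "ni" operator needs Tcl 8.5 or later)
        if {[http::ncode $token] ni {301 302 303 307}} {
            return $token
        }
        # same trick as the tdom version above: the Location header lives in the meta list
        upvar #0 $token state
        array set meta $state(meta)
        set url $meta(Location)
        http::cleanup $token
    }
    error "too many redirects for $url"
}

With something like this, the first version could fetch the result page with set http [follow_redirects http://www.google.com/search?$query] instead of reading $meta(Location) itself.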
LES: Here is my code:
package require http
package require Tk
bind all <Key-Return> {imdb $::query}
proc imdb {input} {
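# Google turned away the Lynx user agent (see the discussion above), so claim to be Firefox instead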
http::config -useragent "Firefox 2.0.2"
set query [http::formatQuery btnG Search q "site:imdb.com $input"]
set http [http::geturl http://www.google.com/search?$query]
set html [http::data $http]
http::cleanup $http
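# grab the first result link that points at an IMDB title page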
regexp {<div class=g><a href=['"]?([^<>'" ]+imdb.com/title[^<>'" ]+)} $html => ::url
.f1.bi configure -state normal
set http [http::geturl $::url]
set html [http::data $http]
http::cleanup $http
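# scrape rating, genre and plot outline out of the raw HTML with regular expressions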
regexp {<b>User Rating:</b>(.*?)</div>} $html => _rating
regsub -all {(<[^<>]+>|more|\n)} $_rating {} _rating
set ::rating [string trim $_rating]
regexp {<h5>Genre:</h5>(.*?)</div>} $html => _genre
regsub -all {(<[^<>]+>|more)} $_genre {} _genre
set ::genre [string trim $_genre]
regexp {<h5>Plot Outline:</h5>(.*?)<} $html => _plot
set ::plot [string trim $_plot]
.s delete 1.0 end
.s insert 1.0 "$::genre\n$::plot"
}
proc go {url} {
if {$::tcl_platform(platform) eq "windows"} {
exec cmd /c start {} $url &
} else { exec kfmclient exec $url }
}
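# An aside, not part of LES's code: on Linux desktops without KDE, the freedesktop
# xdg-open utility (where installed) can stand in for kfmclient:
proc go_xdg {url} {
exec xdg-open $url &
}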
label .lq -text Query:
entry .q -textvariable query
label .lt -text Title:
entry .t -textvariable title
label .lr -text Rating:
entry .r -textvariable rating
label .ls -text Outline:
text .s -height 5 -wrap word
label .lu -text URL:
entry .u -textvariable ::url
grid .lq .q -sticky ew
grid .lt .t -sticky ew
grid .lr .r -sticky ew
grid .ls .s -sticky ew
grid .lu .u -sticky ew
frame .f1
grid .f1 -column 1 -sticky ew
button .f1.bs -text Search -command {imdb $::query}
button .f1.bi -text "Goto IMDB" -state disabled -command {go $::url}
grid .f1.bs .f1.bi
grid columnconfigure . 1 -weight 1
grid columnconfigure .f1 {0 1} -weight 1

See also: Web scraping

