Michael Schlenker:
A very simple package i wrote to parse SOIF Objects (
RFC 2655, [
1]) as used by the harvest search engine [
2].
Drop me a note, if you find it useful:
# Parse SOIF Objects as defined by RFC 2655
#
# (c) 2002 Michael Schlenker <[email protected]>
#
#
#
# License: Use under the same license as the tcl core.
#
#
# uses uri package from tcllib
package require Tcl 8.2
package require uri
package provide SOIF 0.1
namespace eval SOIF {
set version 0.1
set notalnumregexp {[^a-zA-Z0-9\-_]}
set identifier {([a-zA-Z0-9\-_]+)\{([0-9]+)\}(:\t)}
}
# -- SOIF::parse
#
# Description: Parse a SOIF object into a
# list of values.
#
# Input: single SOIF object
#
# Output: list of the form
# "TEMPLATE-TYPE URL ATTRIBUTE-VALUELIST"
#
#
proc SOIF::parse {obj} {
variable notalnumregexp
variable identifier
# check for @ symbol
set k [string first @ $obj]
if {$k == -1} {
error "No SOIF Object"
}
# check for template type
set l [string first \{ $obj ]
if {$l == -1} {
error "No SOIF Template Type"
}
set template_type ""
set template_type [string trim \
[string range $obj [expr {$k+1}] [expr {$l-1}]]]
# validate, that it is alphanumeric
if {[regexp $notalnumregexp $template_type]} {
error "Template Type \"$template_type\" not valid \
alphanumeric template-type."
}
# check for URL
set m $l
set url_candidate ""
while {[string length $url_candidate]==0} {
# the rfc is unclear how to identify
# the url, trying this
set n [string first "\n" $obj $m]
set url_candidate [string trim \
[string range $obj [expr {$m+1}] $n]]
set m [expr {$n+1}]
}
# handle the special case that no url is given
if {![string equal $url_candidate "-"]} {
# check if this is a URL here,
# this should throw an error
# if no valid url is found
if {[catch {uri::split $url_candidate} clist] == 1} {
error "URL \"$url_candidate\" not of \
known type."
}
}
set url $url_candidate
set attvalue ""
# header is done, now parse attribute value pairs
set start $n
while {[regexp -indices -start $start --\
$identifier $obj -> id length delimeter]} {
set id_text [string range $obj \
[lindex $id 0] [lindex $id 1]]
set length [string range $obj \
[lindex $length 0] [lindex $length 1]]
set offset [expr [lindex $delimeter 1] +1]
set value [string range \
$obj $offset [expr {$offset+$length-1}]]
lappend attvalue $id_text $value
set start [expr {$offset+$length}]
}
# all identifiers and values have been parsed
# check for closing \}
if {![regexp -indices -start $start -- {\}} $obj -> dummy]} {
error "Missing close brace on obj"
}
set result [list $template_type $url $attvalue]
return $result
}
# -- SOIF::readObjectFromFile
#
# Description: Reads a SOIF Object from File
# (only one object per file should be used)
#
# Input: Filename
#
# Output: SOIF Object
#
proc SOIF::readObjectFromFile { filename } {
if {![file exists $filename]} {
error "No file \"${filename}\" exists."
}
if {[catch {open $filename r} fid]} {
error "Opening \"$filename\" failed."
}
# set the translation to binary,
# as SOIF can contain arbitrary data
fconfigure $fid -translation binary
set obj [read $fid]
if {[catch {close $fid}]} {
error "Closing \"$filename\" with \
channel ID \"$fid\" failed."
}
return $obj
}
# -- SOIF::writeObjectToFile
#
# Description: Writes the string rep of a SOIF Object to Disk
# The string rep should be built
# with SOIF::create.
#
# Input: Filename
# SOIF-Object
#
# Output: --
#
proc SOIF::writeObjectToFile { filename obj } {
if {![file exists $filename]} {
error "File \"$filename\" exists, cannot write"
}
if {[catch {open $filename w+} fid]} {
error "File \"$filename\" could not be \
opened for writing."
}
# set the translation to binary,
# as SOIF can contain arbitrary data
fconfigure $fid -translation binary
puts -nonewline $fid $obj
if {[catch {close $fid}]} {
error "Closing \"$filename\" with channel \
ID \"$fid\" failed."
}
}
# -- SOIF::create
#
# Description: Creates a string rep from the parts
# of a SOIF object.
#
# Bugs: Does not check, if the data it gets
# is well formed.
#
# Input: Template-Type
# URL
# Identifier-Value List
#
# Output: SOIF String rep
#
proc SOIF::create {template-type url attvaluelist} {
set obj ""
append obj "@${template-type} \{ "
append obj $url
append obj "\r\n"
foreach {attribute value} $attlist {
set length [string length $value]
set identifier "$attribute\{$length\}:\t"
append obj $identifier $value
# prettyprinting with extra newlines
append obj "\r\n"
}
append obj "\}\n"
return $obj
}