Updated 2007-12-06 13:38:26 by dkf

Michael Schlenker:

A very simple package i wrote to parse SOIF Objects (RFC 2655, [1]) as used by the harvest search engine [2].

Drop me a note, if you find it useful:
  # Parse SOIF Objects as defined by RFC 2655
  #
  # (c) 2002 Michael Schlenker <[email protected]>
  #
  #
  #
  # License: Use under the same license as the tcl core.
  #
  #

  # uses uri package from tcllib
  package require Tcl 8.2
  package require uri
  package provide SOIF 0.1

  namespace eval SOIF {
	set version 0.1
	set notalnumregexp {[^a-zA-Z0-9\-_]}
	set identifier     {([a-zA-Z0-9\-_]+)\{([0-9]+)\}(:\t)}
  }

  # -- SOIF::parse
  #
  # Description:		Parse a SOIF object into a
  #                             list of values.
  #
  # Input:		        single SOIF object
  #
  # Output:		        list of the form
  #                             "TEMPLATE-TYPE URL ATTRIBUTE-VALUELIST"
  #
  #

  proc SOIF::parse {obj} {
	variable notalnumregexp
	variable identifier

	# check for @ symbol
	set k [string first @ $obj]
	if {$k == -1} {
		error "No SOIF Object"
	}

	# check for template type
	set l [string first \{ $obj ]
	if {$l == -1} {
		error "No SOIF Template Type"
	}
	set template_type ""
	set template_type [string trim \
            [string range $obj [expr {$k+1}] [expr {$l-1}]]]

        # validate, that it is alphanumeric
	if {[regexp $notalnumregexp $template_type]} {
		error "Template Type \"$template_type\" not valid \
                       alphanumeric template-type."
	}

	# check for URL
	set m $l
	set url_candidate ""
	while {[string length $url_candidate]==0} {
		# the rfc is unclear how to identify
                # the url, trying this
		set n [string first "\n" $obj $m]

		set url_candidate [string trim \
                       [string range $obj [expr {$m+1}] $n]]
		set m [expr {$n+1}]
	}
	# handle the special case that no url is given
	if {![string equal $url_candidate "-"]} {

		# check if this is a URL here,
                # this should throw an error
		# if no valid url is found
		if {[catch {uri::split $url_candidate} clist] == 1} {
			error "URL \"$url_candidate\" not of \
                               known type."
		}

	}
	set url $url_candidate
	set attvalue ""
	# header is done, now parse attribute value pairs
	set start $n
	while {[regexp -indices -start $start --\
                 $identifier $obj -> id length delimeter]} {
		set id_text [string range $obj \
                              [lindex $id 0] [lindex $id 1]]
		set length [string range $obj \
                              [lindex $length 0] [lindex $length 1]]
		set offset [expr [lindex $delimeter 1] +1]
		set value [string range \
                             $obj $offset [expr {$offset+$length-1}]]
		lappend attvalue $id_text $value
		set start [expr {$offset+$length}]
	}

	# all identifiers and values have been parsed
	# check for closing \}

	if {![regexp -indices -start $start -- {\}} $obj -> dummy]} {
		error "Missing close brace on obj"
	}

	set result [list $template_type $url $attvalue]
	return $result
  }

  # -- SOIF::readObjectFromFile
  #
  # Description:		Reads a SOIF Object from File
  #                       (only one object per file should be used)
  #
  # Input:		Filename
  #
  # Output:		SOIF Object
  #

  proc SOIF::readObjectFromFile { filename } {

	if {![file exists $filename]} {
		error "No file \"${filename}\" exists."
	}

	if {[catch {open $filename r} fid]} {
		error "Opening \"$filename\" failed."
	}
	# set the translation to binary,
        # as SOIF can contain arbitrary data
	fconfigure $fid -translation binary

	set obj [read $fid]
	if {[catch {close $fid}]} {
		error "Closing \"$filename\" with \
                       channel ID \"$fid\" failed."
	}
	return $obj
  }

  # -- SOIF::writeObjectToFile
  #
  # Description:	Writes the string rep of a SOIF Object to Disk
  #			The string rep should be built
  #                     with SOIF::create.
  #
  # Input:		Filename
  #			SOIF-Object
  #
  # Output:		--
  #

  proc SOIF::writeObjectToFile { filename obj } {

	if {![file exists $filename]} {
		error "File \"$filename\" exists, cannot write"
	}

	if {[catch {open $filename w+} fid]} {
		error "File \"$filename\" could not be \
                       opened for writing."
	}

	# set the translation to binary,
        # as SOIF can contain arbitrary data

        fconfigure $fid -translation binary

	puts -nonewline $fid $obj

	if {[catch {close $fid}]} {
		error "Closing \"$filename\" with channel \
                       ID \"$fid\" failed."
	}
  }

  # -- SOIF::create
  #
  # Description:         Creates a string rep from the parts
  #                      of a SOIF object.
  #
  # Bugs:		Does not check, if the data it gets
  #                     is well formed.
  #
  # Input:		Template-Type
  #			URL
  #			Identifier-Value List
  #
  # Output:		SOIF String rep
  #

  proc SOIF::create {template-type url attvaluelist} {
	set obj ""
	append obj "@${template-type} \{ "
	append obj $url
	append obj "\r\n"
	foreach {attribute value} $attlist {
		set length [string length $value]
		set identifier "$attribute\{$length\}:\t"
		append obj $identifier $value
		# prettyprinting with extra newlines
		append obj "\r\n"
	}
	append obj "\}\n"

	return $obj
  }