#!/bin/sh
# the next line restarts using tclsh \
exec tclsh "$0" "$@"
#---------------
# Perform multiple search on text block.
#---------------
# Args:
# needle list of search items, wildcards permitted
# haystack block of text upon to search
# Options:
# -exact (default: 1) match all items in the needle list
# -nocase (default: 0) ignore case
# -token (default: "") split needle string by token
# -verbose (default: 0) return full details of the search matches
# Returns:
# * Various patterns based upon options settings
# * Default settings would result in returning 1 (true) or 0 (false)
# whilst matching an exact needle pattern within the haystack string.
# * Setting -verbose to 1, will result in a complete listing of the
# occurrences of the needle in the haystack.
# * Each matched item will be reported in the form of a duple, where the
# first element contains the character position and word number of the
# matched item in the haystack, followed by the matched item itself.
# e.g. {idx pos} match
#
proc pattern_search {needles haystack args} {
# set defaults and assign options values from args
array set opts [list -nocase 0 -token "" -verbose 0 -exact 1]
array set opts $args
# tokenize needle string
if { $opts(-token) != "" } { set needle [split $needles $opts(-token)] }
set i 0 ;# counter for successful matches per line
set word_number 0 ;# word number
set needle_tally "" ;# tally of which needle patterns have been found
set found ""
foreach wrd $haystack {
foreach sub $needles {
if { $opts(-nocase) } {
set id [string match -nocase $sub $wrd ]
} else {
set id [string match $sub $wrd ]
}
if { $id == 1 } {
lappend needle_tally $sub
lappend found $word_number $wrd
incr i
}
}
incr word_number
}
set needle_tally [lsort -unique $needle_tally]
set needle [lsort -unique $needles]
# exact match for occurrences of needles in haystack
if { $opts(-exact) } {
if { $needle_tally == $needle } {
if { $opts(-verbose) } {
return [pattern_search_verbose $found $haystack]
}
} else { set i 0 }
}
if { $opts(-verbose) } {
return [pattern_search_verbose $found $haystack] }
# not an exact match
if {$i >= 1} { return 1 }
# no matches whatsoever
return 0
}
#---------------
# get a detailed list of the needles found in the haystack
#---------------
# Args:
# found list of matched needles
# haystack text examined
# Returns:
# formatted list of found needles in haystack
# {character-position word-number} needle
#
proc pattern_search_verbose { found haystack } {
set res ""
set idx -1
foreach {pos match} $found {
set idx [string first $match $haystack $idx+1]
lappend res [list $idx $pos] $match
}
return $res
}
# example uses
puts "1) [pattern_search "B F" "A B C D E F G" ]" ;# -nocase 0 -token "" -verbose 0 -exact 1
puts "2) [pattern_search "B H" "A B C D E F G" -nocase 1]" ;# -token "" -verbose 0 -exact 1
puts "3) [pattern_search "b" "A B C D E F G" -nocase 1 -exact 0]" ;# -token "" -verbose 0
puts "4) [pattern_search "d*" "A B C DOG E F G" -exact 0 -nocase 1]" ;# -token "" -verbose 0
puts "5) [pattern_search "A+D+G+Z" "A B C D E F G" -token +]" ;# -nocase 0 -verbose 0 -exact 1
puts "6) [pattern_search "A* *b* *G" "APPLE EBB C D E F EGG bottle Grape Apple APPLE Grape Garden" -exact 1 -verbose 1]" ;# -token ""
puts "7) [pattern_search "A* *b* *G" "APPLE EBB C D E F EGG bottle Grape Apple APPLE Grape Garden" -nocase 1 -exact 0 -verbose 1]" ;# -token ""
puts "8) >[pattern_search eggs basket -verbose 1]<"ak - 2017-06-20 19:14:47Second idea I had upon reading and skimming was this
- Convert the needles from glob syntax to regex (* -> .*, ? -> .)
- Then put all the needles together as a big alternation (i.e. foo|bar|...). This might need parens around each needle to separate them properly.
- Run regexp -indices -all to find the matches.
. While that is limited to fixed strings it could be used to find candidates based on the fixed prefix (suffix?) of each pattern and then check the small set of candidates for full match.
