An important part of any Scripted Compiler is the ability actually process the system language underlying the scripting language. In the case of Tcl this is the C Language.The first step is always to separate the input stream into tokens, each representing one semantic atom. In compiler speak, lexing.The following script lexes a string containing C source into a list of tokens. It assumes that the sources are free of preprocessor statements like "#include", "#define", etc.Also note that the script is built upon the base package provided in Scripted Lexing. While this means the code shown here is quite tailored to parsing for a compiler the general principle used is broad enough to allow for many variations. Examples:
- Keep the whitespace as tokens. Might be required for a pretty-printer.
- Treat comments as whitespace and remove them. True compiler. Keeping the comments, but not other whitespace as in the script below is more something for a code analyzer looking for additional data (meta-data) in comments. See Source Navigator for a tool in this area.
- Modify the definitions, convert the keywords and punctuation into single byte codes, and refrain from splitting/listifying the result. Sort of a special method for compressing C sources.
[andreask@pliers trans]$ ./driver -noraw -notoken tclIO.c
__________________________________________________
tclIO.c:
242918 characters
Lexing in 13446065 microseconds
= 13.446065 seconds
= 55.35227937 usec/char
__________________________________________________Not bad for a lexer written in a scripting language IMHO.TODO- Read up on C syntax. I believe that I currently do not recognize all possible types of numbers.
clex.tcl (The code, finally :)
# -*- tcl -*-
# Lexing C
package require lexbase
package provide clex 2.0
namespace eval clex {
# Define the lexer symbols for the language 'C', as an example.
namespace import ::lexbase::*
DefStart
DefP ( LPAREN ; DefP ) RPAREN ; DefP -> DEREF
DefP < LT ; DefP <= LE ; DefP == EQ
DefP > GT ; DefP >= GE ; DefP != NE
DefP \[ LBRACKET ; DefP \] RBRACKET ; DefP = ASSIGN
DefP \{ LBRACE ; DefP \} RBRACE ; DefP *= MUL_ASSIGN
DefP . DOT ; DefP , COMMA ; DefP /= DIV_ASSIGN
DefP ++ INCR_OP ; DefP -- DECR_OP ; DefP %= REM_ASSIGN
DefP & ADDR_BITAND ; DefP * MULT_STAR ; DefP += PLUS_ASSIGN
DefP + PLUS ; DefP - MINUS ; DefP -= MINUS_ASSIGN
DefP ~ BITNOT ; DefP ! LOGNOT ; DefP <<= LSHIFT_ASSIGN
DefP / DIV ; DefP % REM ; DefP >>= RSHIFT_ASSIGN
DefP << LSHIFT ; DefP >> RSHIFT ; DefP &= BITAND_ASSIGN
DefP ^ BITEOR ; DefP && LOGAND ; DefP ^= BITEOR_ASSIGN
DefP | BITOR ; DefP || LOGOR ; DefP |= BITOR_ASSIGN
DefP ? QUERY ; DefP : COLON ; DefP \; SEMICOLON
DefP ... ELLIPSIS ; DefP ~= BITNOT_ASSIGN
DefK typedef ; DefK extern ; DefK static ; DefK auto ; DefK register
DefK void ; DefK char ; DefK short ; DefK int ; DefK long
DefK float ; DefK double ; DefK signed ; DefK unsigned
DefK goto ; DefK continue ; DefK break ; DefK return
DefK case ; DefK default ; DefK switch
DefK struct ; DefK union ; DefK enum
DefK while ; DefK do ; DefK for
DefK const ; DefK volatile
DefK if ; DefK else
DefK sizeof
DefM COMMENT ::clex::C_comment_begin ::clex::C_comment_end
DefM COMMENT ::clex::C99_comment_begin ::clex::C99_comment_end
DefM STRING_LITERAL ::clex::C_string_begin ::clex::C_string_end
DefM STRING_LITERAL ::clex::C_char_begin ::clex::C_char_end
# Floats containing '.'s have to be matched early because the '.'
# is later seen as punctuation.
DefM CONSTANT ::clex::C_floatA_begin ::clex::C_floatA_end
DefM CONSTANT ::clex::C_floatB_begin ::clex::C_floatB_end
DefI IDENT
DefWS {[ \t\v\f\r\n]+}
DefRxM {^0x[[:xdigit:]]+} CONSTANT
DefRxM {^\d+} CONSTANT
DefEnd
}
proc ::clex::C_comment_begin {string start} {
return [string first "/*" $string $start]
}
proc ::clex::C_comment_end {string start} {
incr start 2 ; # Skip behind /*
set stop [string first "*/" $string $start]
incr stop 1 ; # Skip to /
return $stop
}
proc ::clex::C99_comment_begin {string start} {
string first // $string $start
}
proc ::clex::C99_comment_end {string start} {
regexp -indices -start $start {//(?:\\.|[^\n\\])*(?:\n|$)} $string range
lindex $range 1
}
proc ::clex::C_string_begin {string start} {
return [string first "\"" $string $start]
}
proc ::clex::C_string_end {string start} {
# The next vari-sized thing is a "-quoted string.
# Finding its end is bit more difficult, because we have
# to accept \" as one character inside of the string. "
set from $start
while 1 {
incr from
set stop [string first "\"" $string $from]
# Note that we do not use [string first] to look for a \",
# but simply check the preceding character. That is less
# expensive than possibly running through the whole string.
incr stop -1
if {[string equal [string index $string $stop] "\\"]} {
incr stop 2
set from $stop
continue
}
incr stop
break
}
return $stop
}
proc ::clex::C_char_begin {string start} {
return [string first "'" $string $start]
}
proc ::clex::C_char_end {string start} {
# The next vari-sized thing is a '-quoted string.
# Finding its end is bit more difficult, because we have
# to accept \' as one character inside of the string. "
set from $start
while 1 {
incr from
set stop [string first "'" $string $from]
# Note that we do not use [string first] to look for a \",
# but simply check the preceding character. That is less
# expensive than possibly running through the whole string.
incr stop -1
if {[string equal [string index $string $stop] "\\"]} {
incr stop 2
set from $stop
continue
}
incr stop
break
}
return $stop
}
proc ::clex::C_floatA_begin {string start} {
upvar stash stash
if {[regexp -indices -start $start {\W([0-9]*\.[0-9]+([eEdD][+-]?[0-9]+)?)\W} $string -> match]} {
#puts a==[string range $string [lindex $match 0] [lindex $match 1]]
set stash(float-a) [lindex $match 1]
return [lindex $match 0]
}
return -1
}
proc ::clex::C_floatA_end {string start} {
upvar stash stash
return $stash(float-a)
}
proc ::clex::C_floatB_begin {string start} {
upvar stash stash
if {[regexp -indices -start $start {\W([0-9]+\.[0-9]*([eEdD][+-]?[0-9]+)?)\W} $string -> match]} {
#puts b==[string range $string [lindex $match 0] [lindex $match 1]]
set stash(float-b) [lindex $match 1]
return [lindex $match 0]
}
return -1
}
proc ::clex::C_floatB_end {string start} {
upvar stash stash
return $stash(float-b)
return -1
}driver
#!/usr/bin/env tclsh
# -*- tcl -*-
set time 1
set token 1
set raw 1
while {1} {
switch -exact -- [lindex $argv 0] {
-notime {set time 0}
-notoken {set token 0}
-noraw {set raw 0}
default {break}
}
set argv [lrange $argv 1 end]
}
source lexbase.tcl
source clex.tcl
# Read file, lex it, time the execution to measure performance
set data [read [set fh [open [set fname [lindex $argv 0]]]]][close $fh]
set len [string length $data]
set usec [lindex [time {set res [lexbase::lex $data]}] 0]
foreach {sym attr} $res break
foreach {aidx aval} $attr break
if {$time} {
# Write performance statistics.
puts __________________________________________________
puts "$fname:"
puts "\t$len characters"
puts "\tLexing in $usec microseconds"
puts "\t = [expr {double($usec)/1000000}] seconds"
puts "\t = [expr {double($usec)/$len}] usec/char"
}
if {$token} {
# Generate tokenized listing of the input, using the lexing results as input.
puts __________________________________________________
set av 0
foreach s $sym {
switch -glob -- $s {
*- {puts "$s <<[lindex $aval [lindex $aidx $av]]>>" ; incr av}
* {puts "$s"}
}
}
}
if {$raw} {
# Dump the raw lexer result.
puts __________________________________________________
puts Symbols___________________________________________
puts $sym
puts ""
puts Attribute-Indices_________________________________
puts $aidx
puts ""
puts Attribute-Data____________________________________
puts \{[join $aval "\} \{"]\}
puts ""
puts __________________________________________________
}
puts __________________________________________________AMG: Here's another lexer (I say "scanner") for C that uses ylex:
# cscanner.tcl
package require ylex
# Create the object used to assemble the scanner.
yeti::ylex CScannerFactory -name CScanner
# On error, print the filename, line number, and column number.
CScannerFactory code error {
if {$file ne {}} {
puts -nonewline $verbout $file:
}
puts $verbout "$line:$column: $yyerrmsg"
}
# Define public variables and methods.
CScannerFactory code public {
variable file {} ;# Current file name, or empty string if none.
variable line 1 ;# Current line number.
variable column 1 ;# Current column number.
variable typeNames {} ;# List of TYPE_NAME tokens.
# addTypeName --
# Adds a typedef name to the list of names treated as TYPE_NAME.
method addTypeName {name} {
lappend typeNames $name
}
}
# Define internal methods.
CScannerFactory code private {
# result --
# Common result handler for matches. Updates the line and column counts,
# and returns the arguments if provided.
method result {args} {
set text [string map {\r ""} $yytext]
set start 0
while {$start < [string length $text]} {
regexp -start $start {([^\n\t]*)([\n\t]?)} $text chunk body space
incr column [string length $body]
if {$space eq "\n"} {
set column 1
incr line
} elseif {$space eq "\t"} {
set column [expr {(($column + 7) & ~3) + 1}]
}
incr start [string length $chunk]
}
if {[llength $args]} {
return -level 2 $args
}
}
# lineDirective --
# Processes #line directives.
method lineDirective {} {
if {[regexp {^\s*#line (\d+)(?: "(.+)")?\n$} $yytext _ line newFile]
&& $newFile ne ""} {
set file [subst -nocommands -novariables $newFile]
}
}
# tokenType --
# Decides if a token is TYPE_NAME or IDENTIFIER according to $typeNames.
method tokenType {} {
if {$yytext in $typeNames} {
return TYPE_NAME
} else {
return IDENTIFIER
}
}
# scanChar --
# Converts character literals to integers.
method scanChar {char} {
set char [subst -nocommands -novariables $char]
if {[string length $char] != 1} {
error "multi-character constants not supported"
}
scan $char %c
}
# scanStr --
# Converts string literals to Tcl strings.
method scanStr {string} {
subst -nocommands -novariables $string
}
}
# Define useful abbreviations for upcoming regular expressions.
CScannerFactory macro {
C {(?://(?:\\.|[^\n\\])*(?:\n|$))}
E {(?:[eE][+-]?\d+)}
FS {[fFlL]}
IS {(?:[uU]?[lL]{0,2}|[lL]{0,2}[uU]?)}
}
# Generate a regular expression matching any simple token. The value of such
# tokens is the uppercase version of the token string itself.
foreach token {
auto bool break case char const continue default do double else enum extern
float for goto if int long register return short signed sizeof static
struct switch typedef union unsigned void volatile while ... >>= <<= +=
-= *= /= %= &= ^= |= >> << ++ -- -> && || <= >= == != ; \{ \} , : = ( ) [
] . & ! ~ - + * / % < > ^ | ?
} {
lappend pattern [regsub -all {[][*+?{}()|.^$]} $token {\\&}]
}
set pattern (?:[join $pattern |])
# Match simple tokens.
CScannerFactory add $pattern {result [string toupper $yytext]}
# Match and decode more complex tokens.
CScannerFactory add {
{[ \t\v\n\f]} {result}
{/\*.*?\*/} {result}
{<C>} {result}
{(?n)^\s*#line[^\n]*\n} {lineDirective}
{[a-zA-Z_]\w*\M} {result [tokenType] $yytext}
{0[xX]([[:xdigit:]]+)<IS>\M} {result CONSTANT [scan $1 %x]}
{0([0-7]+)<IS>\M} {result CONSTANT [scan $1 %o]}
{(\d+)<IS>\M} {result CONSTANT [scan $1 %d]}
{L?'((?:[^\\']|\\.)+)'} {result CONSTANT [scanChar $1]}
{(\d+<E>)<FS>?\M} {result CONSTANT [scan $1 %f]}
{(\d*\.\d+<E>?)<FS>?\M} {result CONSTANT [scan $1 %f]}
{(\d+\.\d*<E>?)<FS>?\M} {result CONSTANT [scan $1 %f]}
{L?"((?:[^\\"]|\\.)+)"} {result STRING_LITERAL [scanStr $1]}
{.} {error "invalid character \"$yytext\""}
}
# Create the CScanner class. You might want to cache the generated script to
# avoid dependency on ylex and to improve startup time.
eval [CScannerFactory dump]
itcl::delete object CScannerFactoryIt's quite different than the code given at the top of this page. The primary difference is that it directly uses the various symbols like "+" as the terminal names. Since we're using Tcl, I don't see a problem with this. I find that it makes the grammar much more readable.
