Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

Finite State Machine based VBScript Parser

15 views
Skip to first unread message

Alex K. Angelopoulos (MVP)

unread,
Dec 22, 2002, 2:39:36 PM12/22/02
to
Below is a new variation on the parser, this one using the finite state machine
approach.

The use is more in the sense of a general thinking approach to it than a precise
implementation. Most VB family parsers have to be substantial hand-tuned; this
one is no exception, and is actually even more so handicapped than necessary
since I am still getting the idea of how a finite state machine can be
optimized.

A well-done FSM description of a VBScript parser would actually start from a
chart which shows along the left vertical axis a set of state names for states
which the script would be in; and along the top horzontal axis would show
classes of input - continuation character (_), quote mark, etc. In each
intersecting box would be some action taken by the parser based on current state
and the input character (one of something like the following: ignore the
character, add it to a token we are building, emit the current characters we
have added to a new token, or something more complicated based on looking ahead
at characters). One of the actions in this simplified form would be a possible
transition to a new state.

The nice thing about such a chart is that it could theoretically be "read" by an
application or script, and the data therein used to produce the actual code used
for parsing. In fact, that is essentially how most lexing tools work.
Unfortunately, they tend to be written in a language such as C and the output
they produce is also C code, which steps around one point of this exercise -
doing it all within script.

In any case, here is a hand-tuned pseudo-FSM implementation of a VBScript
parser; it consists of a small script to split up a script (itself in this case)
and then show some basic stats and finally echo out a stripped version of the
script itself. The output is not all that interesting at this point, but hey -
I'm rather proud of my masochism - er, dedication - in getting this far with
it...


'newparser.vbs
' implements a finite state machine based VBScript parsing method
Set parser = new fsmParser

retVal = parser.loadScriptFromFile(WScript.ScriptFullName)
WScript.Echo "Loaded script successfully?", CStr(retVal)
WScript.Echo "total atoms (characters):", parser.atomCount
wscript.echo ""
parser.TokenizeScript

parser.rebuild
WScript.Echo parser.script


Class fsmParser


dim Debugging
dim atom_Alpha, atom_Comment, atom_Continuation, atom_Date, atom_Dot
dim atom_EndScript, atom_Join, atom_linearWhiteSpace, atom_Numeric
dim atom_Operator, atom_Other, atom_Quote, atom_Terminator
dim state_Ground, state_inComment, state_inContinuation, state_inDate
dim state_inIdentifier, state_inJoin, state_inOperator, state_inQuote
dim state_inTerminator
dim atoms, atomic_states(), statements(), tokens(), atomtypes()
dim tokentypes(), tokentype, structures()
dim state, current_position, current_atom, current_atomtype, token
dim m_indentOutput, m_script


property let indentOutput(bIndent)
m_indentOutput = bIndent
end property


Property get atomCount
atomCount = UBound(atoms) + 1
End Property


Property Get TokenCount
dim i
for i = 0 to UBound(statements)
TokenCount = TokenCount + UBound(statements(i)) + 1
next
end Property


Property get Instructions
instructions = statements
end property


Property get script
script = m_script
end property


Sub act_Ground
Select Case current_atom
Case atom_Alpha
state = state_inIdentifier
Case atom_Comment
state = state_inComment
Case atom_Continuation
Case atom_Date
state = state_inDate
Case atom_Dot
state = state_inIdentifier
Case atom_EndScript
EmitScript
Case atom_Join
EmitStatement
Case atom_linearWhiteSpace
' Do Nothing
Case atom_Numeric
state = state_inIdentifier
Case atom_Operator
Case atom_Other
Case atom_Quote
state = state_inQuote
Case atom_Terminator
EmitStatement
End Select
end sub


sub rebuild
dim tmpStatements(), i, j, thistoken, thistokentype, lasttokentype
redim tmpStatements(UBound(statements))
redim structures(0) 'first one will always be global code
for i = 0 to UBound(statements)
sTmp = ""
for j = 0 to UBound(statements(i)(0))
thistoken = statements(i)(0)(j)
thistokentype = statements(i)(1)(j)
select case True
case thistoken = "="
sTmp = sTmp & " " & thistoken
lasttokentype = thistokentype
case thistoken = ","
sTmp = sTmp & thistoken
lasttokentype = thistokentype
case thistokentype = "stringliteral"
sTmp = sTmp & " " & quote(thistoken)
lasttokentype = thistokentype
case IsBracket(thistoken)
sTmp = sTmp & thistoken
lasttokentype = "bracket"
case else
If lasttokentype = "bracket" then
sTmp = sTmp & thistoken
else
sTmp = sTmp & " " & thistoken
end if
lasttokentype = thistokentype
end select
next
tmpStatements(i) = Trim(sTmp)
next
m_script = Join(tmpStatements, vbLf)
end sub


Sub Class_Initialize
Enum_Atoms
Enum_States
state = state_Ground
current_position = 0
redim tokens(-1)
redim tokentypes(-1)
redim statements(-1)
Debugging = False
End Sub


Sub TokenizeScript
for current_position = 0 to Ubound(atoms)
current_atom = atoms(current_position)
current_atomtype = atomic_states(current_position)
'wscript.echo Current_atomtype
Select case state
case state_Ground
DebugOut "state is ground."
doGround
case state_inComment
DebugOut "state is comment."
doComment
case state_inContinuation
doContinuation
case state_inIdentifier
doIdentifier
case state_inQuote
DebugOut "state is quote."
doQuote
End select
'wscript.echo current_position, current_atom, state
'wscript.quit
next
end sub


Function loadScriptFromFile(filename)
atoms = StringToNumericArray(ReadFile(filename))
GetAtomTypes
If Len(atoms(0)) > 0 Then loadScriptFromFile = True
End Function


Function loadScriptFromString(sData)
atoms = StringToNumericArray(sData)
GetAtomTypes
If Len(atoms(0)) > 0 Then loadScriptFromString = True
End Function


Sub GetAtomTypes
Redim atomic_states(UBound(atoms))
for i = 0 to UBound(atoms)
atomic_states(i) = CharType(atoms(i))
next
end sub


Function CharType(iChr)
dim i
Select Case True
Case (iChr>=65 and iChr<=90) _
or (iChr>=97 and iChr<=122)
CharType = atom_Alpha
Case iChr = 39
CharType = atom_Comment
Case iChr = 95
CharType = atom_Continuation
Case iChr = 35
CharType = atom_Date
Case iChr = 46
CharType = atom_Dot
Case iChr = 0
CharType = atom_EndScript
Case iChr = 58
CharType = atom_Join
Case (iChr>=48 and iChr<=57)
CharType = atom_Numeric
Case (iChr = 9) or (iChr = 11) or (iChr = 12) or (iChr = 32)
CharType = atom_linearWhiteSpace
Case (iChr = 33) or (iChr = 38) or (iChr = 47) or (iChr = 92) _
or (iChr = 94) or (iChr>=40 and iChr<=45) _
or (iChr>=60 and iChr<=62)
CharType = atom_Operator
Case iChr = 34
CharType = atom_Quote
Case (iChr = 10) or (iChr = 13)
CharType = atom_Terminator
Case Else
' 1,8,14-31,36,37,59,63-64,91,93,96,123+
CharType = atom_Other
End Select
End Function


Sub Enum_Atoms
atom_linearWhiteSpace = 0
atom_Alpha = 1
atom_Comment = 2
atom_Continuation = 3
atom_Date = 4
atom_Dot = 5
atom_EndScript = 6
atom_Join = 7
atom_Numeric = 8
atom_Operator = 9
atom_Other = 10
atom_Quote = 11
atom_Terminator = 12
End Sub


Sub Enum_States
state_Ground = 0
state_inQuote = 1
state_inComment = 2
state_inContinuation = 3
state_inIdentifier = 4
state_inDate = 5
state_inJoin = 6
End Sub


Function ReadFile(FilePath)
'Given the path to a file, will return entire contents
' works with either ANSI or Unicode
Dim FSO, CurrentFile
Const ForReading = 1, TristateUseDefault = -2, _
DoNotCreateFile = False
Set FSO = createobject("Scripting.FileSystemObject")
If FSO.FileExists(FilePath) Then
If FSO.GetFile(FilePath).Size>0 Then
Set CurrentFile = FSO.OpenTextFile(FilePath, ForReading, _
False, TristateUseDefault)
If CurrentFile.AtEndOfStream <> True Then
ReadFile = CurrentFile.ReadAll: CurrentFile.Close
End If
End If
End If
End Function


Function StringToArray(sData)
' takes a single string of arbitrary length
' returns a 0-based array, 1 character per element
Dim aTmp()
Redim aTmp(Len(sData) - 1)
for i = 0 to UBound(aTmp)
aTmp(i) = Mid(sData, 1 + i, 1)
Next
stringToArray = aTmp
End Function


Function StringToNumericArray(sData)
' takes a single string of arbitrary length
' returns a 0-based array, 1 character per element
Dim aTmp()
Redim aTmp(Len(sData) - 1)
for i = 0 to UBound(aTmp)
aTmp(i) = Asc(Mid(sData, 1 + i, 1))
Next
StringToNumericArray = aTmp
End Function


Sub doComment
Select Case current_atomtype
Case atom_Terminator
EmitStatement
state = state_Ground
End Select
end sub


Sub doGround
Select Case current_atomtype
Case atom_Terminator
EmitStatement
Case atom_Join
EmitStatement
' because it's a termination here...
Case atom_Alpha
state = state_inIdentifier
token = Chr(current_atom)
Case atom_Numeric
state = state_inIdentifier
token = Chr(current_atom)
Case atom_Dot
' Technically an operator if we see it here
doOperator
Case atom_Continuation
' Technically an operator if we see it here
state = state_inContinuation
Case atom_Comment state = state_inComment
Case atom_Quote state = state_inQuote
Case atom_Operator
doOperator
End Select
end sub


Sub doContinuation
Select Case current_atomtype
Case atom_linearWhiteSpace
' do nothing - white space is OK
Case atom_Terminator
DebugOut atomic_states(current_position) _
& atomic_states(current_position + 1)
if atomic_states(current_position + 1) = atom_Terminator Then
' do nothing
else
state = state_Ground
end if
Case Else
End Select
end sub


Sub doDate
Select Case current_atomtype
Case atom_Terminator
Case Else
End Select
end sub


Sub doIdentifier
tokentype = "identifier"
Select Case current_atomtype
Case atom_Terminator
doTerminator
Case atom_linearWhiteSpace
emitToken
state = state_Ground
Case atom_Operator
emitToken
doOperator
Case atom_Join
EmitStatement
Case Else
token = token & Chr(current_atom)
End Select
end sub


Sub doQuote
tokentype = "stringliteral"
Select Case current_atomtype
Case atom_Terminator
emitStatement
Case atom_Quote
if atomic_states(current_position + 1) = atom_Quote Then
token = token & Chr(current_atom)
else
emitToken
end if
Case Else
token = token & Chr(current_atom)
End Select
end sub


Sub doOperator
tokentype = "operator"
token = Chr(current_atom)
EmitToken
state = state_Ground
end sub


Sub doTerminator
DebugOut "terminating a statement."
EmitToken
EmitStatement
state = state_Ground
end sub


Sub EmitToken
If Len(token)>0 Then
DebugOut "emitting token."
redim preserve tokens(UBound(tokens)+1)
redim preserve tokentypes(UBound(tokens))
tokens(UBound(tokens)) = token
tokentypes(UBound(tokens)) = tokentype
token = ""
End If
state = state_Ground
end sub


sub EmitStatement
emitToken
If UBound(tokens) > -1 Then
DebugOut "emitting statement."
redim preserve statements(UBound(statements)+1)
statements(UBound(statements)) = Array(tokens, tokentypes)
redim tokens(-1)
redim tokentypes(-1)
end if
end sub


sub DebugOut(sData)
If Debugging Then WScript.Echo sData
end sub


function VbsReserved
VbsReserved = Array("and", "as", "boolean", "byref", "byte", "byval", _
"call", "case", "class", "const", "currency", "debug", "dim", "do", _
"double", "each", "else", "elseif", "empty", "end", "endif", "enum", _
"eqv", "event", "exit", "false", "for", "function", "get", "goto", _
"if", "imp", "implements", "in", "integer", "is", "let", "like", _
"long", "loop", "lset", "me", "mod", "new", "next", "not", "nothing", _
"null", "on", "option", "optional", "or", "paramarray", "preserve", _
"private", "public", "raiseevent", "redim", "rem", "resume", "rset", _
"select", "set", "shared", "single", "static", "stop", "sub", "then", _
"to", "true", "type", "typeof", "until", "variant", "wend", "while", _
"with", "xor")
end function


function IsBracket(token)
If (token = "(") or (token = ")") Then
IsBracket = True
Else
IsBracket = false
end if
end function

function quote(token)
quote = Chr(34) & token & Chr(34)
end function

End Class


0 new messages