I have a few issues that I don't see a solution for, and I was hoping to get some help or ideas.
I am writing an ES2019 parser, i.e. a parser for the latest version of JavaScript. After getting a pretty functional parser (it passes 1800+ of the 2000 tests from the ES test suite), I realized that the S? productions (S being my rule for WhiteSpace / LineTerminator / Comment) I had sprinkled throughout the grammar were matching no space in places where they should have been requiring a space.
All of my tokens have the S rule attached on the left, because automatic semicolon insertion gives meaning to certain whitespace on the right, e.g. LineTerminators. I decided that all keywords and identifiers should have a mandatory space before them, and that punctuation should have an optional space before it, in most cases. This makes the parser behave the same as other JavaScript parsers, e.g. Esprima. Now here is the problem: with the space attached on the left, the input must start with an S production if the first token is a keyword, or it will not match at all.
The only workaround I can think of is to call the parse method of the generated parser with a space prepended to the input, and to modify the location function to subtract 1 from the column if line === 1.
VariableDeclaration and AssignmentExpression use automatic semicolon insertion, so semicolons can be omitted if desired.
Though it is not present in this small modified grammar, the space between `function` and the identifier is also significant; consider these with the Esprima parser:
{
// When true, Node() attaches a `loc` object to every node it builds.
const includeLoc = true;
// When true (and includeLoc is also true), Node() additionally sets `loc.source`.
const includeSource = false;
// Start-position objects whose column has already been shifted by Node().
// Used so that a start inherited through several nodes is shifted only once.
const zeroBasedStarts = new WeakSet();

/**
 * Decorates an AST node (or token object) with location information.
 *
 * When `includeLoc` is false the node is returned untouched. Otherwise the
 * current match's `location()` is attached as `base.loc`, with the `offset`
 * fields removed and the end column reduced by one.
 *
 * @param {object} base - The node/token object to decorate; it is mutated and returned.
 * @param {object} [openingToken] - A token or node previously produced by Node().
 *   When given, the new node starts where `openingToken` starts instead of
 *   where the current rule started matching.
 * @returns {object} The same `base` object.
 */
function Node(base, openingToken) {
  if (!includeLoc) {
    return base;
  }

  const loc = location();
  delete loc.start.offset;
  delete loc.end.offset;

  if (includeSource) {
    // NOTE(review): `source` is not defined anywhere in the visible
    // initializer; confirm it exists before turning includeSource on.
    loc.source = source || text().trim();
  }

  if (openingToken) {
    const openingStart = openingToken.loc.start;
    // Copy instead of aliasing: the opening token's own location must not
    // change, and a start that was already shifted must not be shifted again
    // when the node it belongs to is reused as an opening token.
    const start = Object.assign({}, openingStart);
    if (!zeroBasedStarts.has(openingStart)) {
      start.column -= 1;
    }
    zeroBasedStarts.add(start);
    loc.start = start;
  }

  loc.end.column -= 1;
  base.loc = loc;
  return base;
}
}
/*a = a:$.+ {
return Node({
text: a,
textLength: a.length,
});
}*/
// Entry rule: the input is parsed as a Program.
start = Program

// A Program is one or more statements. Three statement shapes are supported,
// tried in this order:
//   1. `function name() {}`   (empty parameter list, empty body)
//   2. `var a = 1, b = 2`     followed by EOS
//   3. `a = 1`                followed by EOS
Program =
  statements:(
      keyword:function name:Identifier left_parentheses right_parentheses block:BlockStatement {
        const fn = {
          type: 'FunctionExpression',
          id: name,
          params: [],
          body: block,
          generator: false,
          async: false
        };
        // The node starts at the `function` keyword token.
        return Node(fn, keyword);
      }
    / var declarators:VariableDeclarators EOS {
        const declaration = {
          "type": "VariableDeclaration",
          "declarations": declarators,
          "kind": "var"
        };
        return Node(declaration);
      }
    / target:Identifier operator:equals value:number EOS {
        // The inner AssignmentExpression is a plain object; only the
        // enclosing ExpressionStatement goes through Node().
        const assignment = {
          "type": "AssignmentExpression",
          "operator": operator.text,
          "left": target,
          "right": value
        };
        return Node({
          "type": "ExpressionStatement",
          "expression": assignment
        });
      }
  )+ {
    const program = {
      "type": "Program",
      "body": statements,
      "sourceType": "script"
    };
    return Node(program);
  }
// One `identifier = number` binding inside a `var` declaration.
// The resulting node starts where the identifier node starts.
VariableDeclarator =
  id:Identifier equals init:number {
    const declarator = {
      "type": "VariableDeclarator",
      "id": id,
      "init": init
    };
    return Node(declarator, id);
  }
// Comma-separated list of one or more declarators, returned as a flat array.
VariableDeclarators =
  head:VariableDeclarator tail:(comma next:VariableDeclarator { return next; })* {
    return [head, ...tail];
  }
// An empty `{}` block; the node starts at the opening brace token.
BlockStatement =
  open:left_brace right_brace {
    const block = {
      "type": "BlockStatement",
      "body": []
    };
    return Node(block, open);
  }
// The `function` keyword, preceded by one or more S matches. Yields the keyword token.
function =
  S+ keyword:function_token { return keyword; }

// An identifier, preceded by one or more S matches. Yields an Identifier
// node whose location starts at the identifier token.
Identifier =
  S+ token:IdentifierToken {
    const identifier = {
      type: 'Identifier',
      name: token.text
    };
    return Node(identifier, token);
  }
// Punctuation wrappers: an optional S match, then the token, which is
// returned unchanged.
comma = S? token:comma_token { return token; }

left_brace = S? token:left_brace_token { return token; }

right_brace = S? token:right_brace_token { return token; }

left_parentheses = S? token:left_parentheses_token { return token; }

right_parentheses = S? token:right_parentheses_token { return token; }

// The `var` keyword: requires an S match on the left and, via lookahead,
// an S match on the right (which is not consumed here).
var = S+ token:var_token &S { return token; }

semicolon = S? token:semicolon_token { return token; }

equals = S? token:equals_token { return token; }
// A decimal integer literal, optionally preceded by an S match.
// `$number_token` captures the matched digits as a string, so the Literal
// reflects the number actually written in the input rather than a fixed 1.
number =
  S? raw:$number_token {
    return Node({
      "type": "Literal",
      "value": Number(raw),
      "raw": raw
    });
  }
// S: one or more whitespace characters, line terminator sequences, or comments.
S = (WhiteSpace / LineTerminatorSequence / Comment)+

// A single non-line-breaking whitespace character.
WhiteSpace = TAB / VT / FF / SP / NBSP / ZWNBSP / USP

// A single line-terminator character.
LineTerminator = LF / CR / LS / PS

// One logical line break. A CR followed by LF is matched by the last
// alternative as a single two-character sequence.
LineTerminatorSequence =
    LF
  / CR !LF
  / LS
  / PS
  / CR LF
// Either comment form; the block form is tried first.
Comment = MultiLineComment / SingleLineComment

// `/* ... */`; yields the text between the delimiters.
MultiLineComment =
  solidus_token asterisk_token body:$MultiLineCommentChars* asterisk_token solidus_token {
    return body;
  }
// `/* ... */` that contains no line terminator; yields the text between the
// delimiters.
// The body is matched one character at a time so that the line-terminator
// check applies to every character. (Delegating to the recursive
// MultiLineCommentChars rule would only check the first character of each
// run, letting line breaks slip through.)
MultiLineCommentNoLB =
  solidus_token asterisk_token
  a:$(!LineTerminator !(asterisk_token solidus_token) SourceCharacter)*
  asterisk_token solidus_token { return a; }
// Like S, but built only from pieces that are meant not to cross a line
// break: whitespace, `//` comments, and single-line block comments.
SnoLB = (WhiteSpace / SingleLineComment / MultiLineCommentNoLB)+
// End of statement. Alternatives are ordered so that explicit terminators
// win over inferred ones:
//   1. an explicit `;`, possibly on a later line;
//   2. end of input, possibly after trailing whitespace, blank lines, or comments;
//   3. a line break on the current line;
//   4. an upcoming `}` (checked by lookahead, not consumed).
// Trying the line-break alternative first would stop at the first newline
// and leave a following `;` or trailing blank lines unconsumed.
EOS =
    S? semicolon_token
  / S? EOF
  / SnoLB? LineTerminatorSequence
  / SnoLB? &right_brace_token
// End of statement for contexts where only SnoLB may precede the terminator:
// `;`, a line break, an upcoming `}` (lookahead only), or end of input.
// Not referenced by any rule visible in this grammar excerpt.
EOSnoLB =
    SnoLB? semicolon_token
  / SnoLB? LineTerminatorSequence
  / SnoLB? &right_brace_token
  / SnoLB? EOF
// Body of a block comment, matched recursively.
// Either a non-asterisk character followed by more body, or an asterisk that
// is not followed by `/` (so the closing `*/` is never consumed here).
MultiLineCommentChars =
MultiLineNotAsteriskChar MultiLineCommentChars?
/ asterisk_token !solidus_token PostAsteriskCommentChars?
// Continuation of a comment body immediately after an asterisk: either a
// character that is neither `/` nor `*`, or another asterisk that is not
// followed by `/`.
PostAsteriskCommentChars =
MultiLineNotForwardSlashOrAsteriskChar MultiLineCommentChars?
/ asterisk_token !solidus_token PostAsteriskCommentChars?
// Any single character except `*`.
MultiLineNotAsteriskChar =
!asterisk_token SourceCharacter
// Any single character except `/` and `*`.
MultiLineNotForwardSlashOrAsteriskChar =
!(solidus_token / asterisk_token) SourceCharacter
// `// ...` up to, but not including, the next line terminator; yields the
// text after the two slashes (null when the comment is empty).
SingleLineComment =
  solidus_token solidus_token body:$SingleLineCommentChars? { return body; }

// One or more comment characters, matched recursively.
SingleLineCommentChars = SingleLineCommentChar SingleLineCommentChars?

// Any single character that is not a line terminator.
SingleLineCommentChar = !LineTerminator SourceCharacter
// Whitespace characters used by WhiteSpace.
TAB    = "\u0009"
VT     = "\u000B"
FF     = "\u000C"
SP     = "\u0020"
NBSP   = "\u00A0"
ZWNBSP = "\uFEFF"
USP    = [\u1680\u2000-\u200A\u202F\u205F\u3000]

// Not referenced by any rule visible in this grammar excerpt.
ZWNJ = "\u200C"
ZWJ  = "\u200D"

// Line-terminator characters used by LineTerminator / LineTerminatorSequence.
LF = "\u000A"
CR = "\u000D"
LS = "\u2028"
PS = "\u2029"

// End of input: succeeds only when no character remains.
EOF = !.
// Raw tokens: each matches a literal and yields `{ text }` decorated by Node().
asterisk_token = '*' { return Node({ text: text() }); }

comma_token = ',' { return Node({ text: text() }); }

equals_token = '=' { return Node({ text: text() }); }

function_token = 'function' { return Node({ text: text() }); }

left_brace_token = '{' { return Node({ text: text() }); }

left_parentheses_token = '(' { return Node({ text: text() }); }
// A decimal integer: a non-zero digit followed by any digits, or a single digit.
// The alternatives are parenthesised so the action applies to both of them;
// without the parentheses the action belongs to the second alternative only
// and the first one yields a raw array of matched characters.
number_token =
  ([1-9][0-9]* / [0-9]) { return Node({text: text()}); }
// Raw tokens: each matches a literal and yields `{ text }` decorated by Node().
right_brace_token = '}' { return Node({ text: text() }); }

right_parentheses_token = ')' { return Node({ text: text() }); }

semicolon_token = ';' { return Node({ text: text() }); }

solidus_token = '/' { return Node({ text: text() }); }

var_token = 'var' { return Node({ text: text() }); }
// An ASCII identifier: a letter, `_` or `$`, followed by any number of
// letters, digits, `_` or `$`. A leading digit is rejected so that numeric
// literals are never read as identifiers.
// NOTE(review): reserved words such as `var` are not excluded here, and
// non-ASCII identifier characters are not supported.
IdentifierToken =
  [A-Za-z_$] [A-Za-z0-9_$]* { return Node({text: text()}); }
// Any single character of the input.
SourceCharacter = .