The WhiteSpace / Comment conundrums

43 views
Skip to first unread message

jas...@snmpstack.com

unread,
Sep 28, 2018, 1:55:29 PM9/28/18
to PEG.js: Parser Generator for JavaScript
I have a few issues that I don't see a solution for, was hoping to get some help/ideas

I am writing an ES2019 parser, i.e. one for the latest version of JavaScript. After getting a pretty functional parser (it passes 1800+/2000 tests from the ES test suite), I realized that the S? (S being my rule for WhiteSpace / LineTerminator / Comment) I had sprinkled throughout the grammar was matching no space in places where it should have been requiring a space.

All of my tokens have the S rule associated on the left, because automatic semicolon insertion gives meaning to certain whitespace, e.g. LineTerminators, on the right. I decided that all Keywords / Identifiers should have a mandatory space before them, and that punctuation should have an optional space before it, in most cases. This makes the parser behave the same as other JavaScript parsers, e.g. Esprima. Now here is the problem: with the space associated to the left, the input must start with an S production if the first token is a keyword, or it will not match at all.

The only workaround I can think of is to call the parse method of the generated parser with a space prepended to it, and modify the location function to subtract 1 from the column if the line === 1. 

I am including a simple grammar that accepts three kinds of input:

  • a simple empty function e.g. function test () {}
  • a variable declaration for numbers only e.g. var a = 1; or var a =1, x=2, g=3;
  • an assignment expression for numbers only e.g. test = 1; or test = 3;
All prefaced with a space of course

Variable Declaration and AssignmentExpression use automatic semicolon insertion so semicolons can be omitted if desired

The AST output should be the same as what an ESTree-compliant parser would produce, e.g. Esprima (http://esprima.org/demo/parse.html), though this is not 100% checked. I have modified the loc function to match what Esprima shows for location, e.g. subtracting 1 from the end column. I have also added functionality so that locations reflect the tokens themselves, i.e. they do not include WhiteSpace/Comments.

Why this is important:

  • compare the output of 
    • var test = 1;
    • vartest = 1;
    • var vartest = 1;
The space between var and test is significant, as it changes which rule will match, so just replacing every S+ with S? won't work correctly.
Though it is not exercised by this small modified grammar, the space between function and the identifier is also significant. Consider these with the Esprima parser:

  • function test() {}
  • functiontest() [line break here] {}
They lead to wildly different ASTs.

So the questions I would like answered / advice is:

  • How do you account for WhiteSpace / Comments at the beginning of an input when S is mandatory and the Space is associated to the left of the token?  
  • Is the location function giving wrong information? Try un-commenting the `a` rule at the beginning of the sample grammar and inputting a single line: the end location is always shown as 1 plus the length. Shouldn't it be the same as the length?
SAMPLE Grammar illustrating issues

{
  // Emit an ESTree-style `loc` on every object built by Node().
  const includeLoc = true;
  // When true, Node() additionally fills in `loc.source`.
  const includeSource = false;
  // `loc` objects whose start was copied from an opening token and is therefore
  // already 0-based. Consulted so the same column is never shifted twice when a
  // node built by Node() is later reused as the opening anchor of a parent node.
  const zeroBasedStarts = new WeakSet();

  /**
   * Attaches location data to `base` and returns it.
   *
   * The location is taken from PEG.js `location()` for the rule being matched,
   * with the `offset` fields removed and the end column reduced by one.
   * When `openingToken` is supplied, the start position is taken from that
   * token instead (shifted to a 0-based column), so leading whitespace and
   * comments consumed by the rule are not part of the node's range.
   *
   * @param base          object to decorate; mutated and returned
   * @param openingToken  optional object previously returned by Node() whose
   *                      `loc.start` marks where this node really begins
   * @returns `base`, with a `loc` property when `includeLoc` is true
   */
  function Node(base, openingToken) {
    if (!includeLoc) {
      return base;
    }

    const loc = location();
    delete loc.start.offset;
    delete loc.end.offset;

    if (includeSource) {
      // The original read an undeclared `source` identifier, which throws a
      // ReferenceError; read it from the parser options instead.
      // NOTE(review): confirm `options.source` is the intended origin.
      loc.source = options.source || text().trim();
    }

    if (openingToken) {
      const opening = openingToken.loc;
      // Copy rather than alias: decrementing a shared object would also move
      // the opening token's own start, and would move it again on every reuse.
      const shift = zeroBasedStarts.has(opening) ? 0 : 1;
      loc.start = {
        line: opening.start.line,
        column: opening.start.column - shift
      };
      zeroBasedStarts.add(loc);
    }

    loc.end.column -= 1;
    base.loc = loc;
    return base;
  }
}
/*a = a:$.+ {
return Node({
    text: a,
        textLength: a.length,
    });
}*/
// First active rule of the grammar; it simply delegates to Program.
start = Program
// Program: one or more top-level items, each of which is
//   - an empty function:            function <Identifier> ( ) <BlockStatement>
//   - a var declaration:            var <VariableDeclarators> <EOS>
//   - a numeric assignment:         <Identifier> = <number> <EOS>
// The items are collected into the "body" of an ESTree-style Program node.
Program =
  body:(
      keyword:function name:Identifier left_parentheses right_parentheses block:BlockStatement {
        // Anchor the node at the `function` keyword so leading space is excluded.
        return Node({
          type: 'FunctionExpression',
          id: name,
          params: [],
          body: block,
          generator: false,
          async: false
        }, keyword);
      }
    / var declarations:VariableDeclarators EOS {
        return Node({
          "type": "VariableDeclaration",
          "declarations": declarations,
          "kind": "var"
        });
      }
    / target:Identifier operator:equals value:number EOS {
        return Node({
          "type": "ExpressionStatement",
          "expression": {
            "type": "AssignmentExpression",
            "operator": operator.text,
            "left": target,
            "right": value
          }
        });
      }
  )+ {
    return Node({
      "type": "Program",
      "body": body,
      "sourceType": "script"
    });
  }

// A single `name = number` declarator; the node is anchored at the identifier.
VariableDeclarator =
  id:Identifier equals init:number {
    return Node({
      "type": "VariableDeclarator",
      "id": id,
      "init": init
    }, id);
  }
    
// Comma-separated list of declarators, returned as a flat array.
VariableDeclarators =
  head:VariableDeclarator tail:(comma next:VariableDeclarator { return next; })* {
    return [head, ...tail];
  }

// An empty block `{}`; the node is anchored at the opening brace token.
BlockStatement =
  open:left_brace right_brace {
    return Node({
      "type": "BlockStatement",
      "body": []
    }, open);
  }

// The `function` keyword, which must be preceded by whitespace/comments.
function =
  S+ token:function_token { return token; }
// An identifier preceded by mandatory whitespace/comments. The resulting
// node is anchored at the identifier token, not at the leading whitespace.
Identifier =
  S+ token:IdentifierToken {
    return Node({
      type: 'Identifier',
      name: token.text
    }, token);
  }

// Punctuation wrappers: optional leading whitespace/comments, then the token.
// Each returns the bare token object so callers can use it as a location anchor.
comma             = S? token:comma_token             { return token; }

left_brace        = S? token:left_brace_token        { return token; }
right_brace       = S? token:right_brace_token       { return token; }
left_parentheses  = S? token:left_parentheses_token  { return token; }

right_parentheses = S? token:right_parentheses_token { return token; }
    
// The `var` keyword: mandatory whitespace before it, and whitespace must
// follow it (checked with a lookahead, not consumed) so that `vartest` is
// not read as `var` + `test`.
var =
  S+ token:var_token &S { return token; }
    
// Optional leading whitespace/comments, then the punctuation token.
semicolon = S? token:semicolon_token { return token; }

equals    = S? token:equals_token    { return token; }

// A numeric literal with optional leading whitespace/comments.
// The matched digits are captured as text with `$`, so "value" and "raw"
// reflect the actual input. Previously every number was reported as
// value 1 / raw "1" regardless of what was matched.
number =
  S? digits:$number_token {
    return Node({
      "type": "Literal",
      "value": Number.parseInt(digits, 10),
      "raw": digits
    });
  }
    
// One or more whitespace characters, line terminators or comments.
S = (WhiteSpace / LineTerminatorSequence / Comment)+

// A single non-line-breaking whitespace character.
WhiteSpace = TAB / VT / FF / SP / NBSP / ZWNBSP / USP

// A single line-terminator character.
LineTerminator = LF / CR / LS / PS

// A complete line break; CR LF is consumed as one unit, and a lone CR only
// matches when it is not followed by LF.
LineTerminatorSequence =
    LF
  / CR !LF
  / LS
  / PS
  / CR LF

// Either comment form.
Comment = MultiLineComment / SingleLineComment

// A /* ... */ comment; returns the text between the delimiters.
MultiLineComment =
  solidus_token asterisk_token body:$MultiLineCommentChars* asterisk_token solidus_token { return body; }

// A /* ... */ comment that contains no line terminator; returns the text
// between the delimiters.
// The body is scanned one character at a time. The previous form guarded only
// the first character of each MultiLineCommentChars match, and that rule
// greedily consumes the rest of the body, so comments spanning several lines
// were still accepted.
MultiLineCommentNoLB =
    solidus_token asterisk_token a:$(!(asterisk_token solidus_token) !LineTerminator SourceCharacter)* asterisk_token solidus_token { return a; }

// Whitespace and comments that do not contain a line break.
SnoLB = (WhiteSpace / SingleLineComment / MultiLineCommentNoLB)+

// End of statement, allowing the semicolon to be omitted: a line break,
// an upcoming `}` (lookahead only), end of input, or an explicit `;`.
EOS =
    SnoLB? LineTerminatorSequence
  / SnoLB? &right_brace_token
  / S? EOF
  / S? semicolon_token

// End of statement where no line break may precede the terminator.
EOSnoLB =
    SnoLB? semicolon_token
  / SnoLB? LineTerminatorSequence
  / SnoLB? &right_brace_token
  / SnoLB? EOF

// Body of a multi-line comment. Matches either a non-asterisk character
// followed by more body, or an asterisk that is not immediately followed by
// a solidus (i.e. not the closing delimiter) followed by post-asterisk body.
// The rule is recursive, so one match consumes as much of the body as it can.
MultiLineCommentChars =
    MultiLineNotAsteriskChar MultiLineCommentChars? 
  / asterisk_token !solidus_token PostAsteriskCommentChars? 

// Body characters that directly follow an asterisk: a character that is
// neither a solidus nor an asterisk followed by more body, or another
// asterisk that is not followed by a solidus.
PostAsteriskCommentChars =
    MultiLineNotForwardSlashOrAsteriskChar MultiLineCommentChars? 
  / asterisk_token !solidus_token PostAsteriskCommentChars? 

// Any single character except an asterisk.
MultiLineNotAsteriskChar =
    !asterisk_token SourceCharacter 

// Any single character except a solidus or an asterisk.
MultiLineNotForwardSlashOrAsteriskChar =
    !(solidus_token / asterisk_token) SourceCharacter 

// A // comment; returns the text after the two slashes (possibly empty).
// The line terminator itself is not consumed.
SingleLineComment =
  solidus_token solidus_token body:$SingleLineCommentChars? { return body; }

// One or more comment characters (right-recursive).
SingleLineCommentChars =
  SingleLineCommentChar SingleLineCommentChars?

// Any single character that is not a line terminator.
SingleLineCommentChar =
  !LineTerminator SourceCharacter
    
// Whitespace code points.
TAB    = "\u0009"
VT     = "\u000B"
FF     = "\u000C"
SP     = "\u0020"
NBSP   = "\u00A0"
ZWNBSP = "\uFEFF"
USP    = [\u1680\u2000-\u200A\u202F\u205F\u3000]

// Zero-width joiner characters.
ZWNJ   = "\u200C"
ZWJ    = "\u200D"

// Line-terminator code points.
LF     = "\u000A"
CR     = "\u000D"
LS     = "\u2028"
PS     = "\u2029"

// End of input: succeeds only when no character remains.
EOF = !.

// Raw tokens: each matches its literal text and returns an object carrying
// that text, decorated by Node().
asterisk_token         = '*'        { return Node({ text: text() }); }

comma_token            = ','        { return Node({ text: text() }); }

equals_token           = '='        { return Node({ text: text() }); }

function_token         = 'function' { return Node({ text: text() }); }

left_brace_token       = '{'        { return Node({ text: text() }); }
left_parentheses_token = '('        { return Node({ text: text() }); }

// Decimal integer token: a non-zero digit followed by any digits, or a
// single digit (which covers "0").
// The alternatives are parenthesised so the action applies to both. Without
// the parentheses the action binds only to the last alternative, and numbers
// matched by the first one are returned as a raw nested array, not a token.
number_token =
  ([1-9][0-9]* / [0-9]) { return Node({text: text()}); }
// Raw tokens: each matches its literal text and returns an object carrying
// that text, decorated by Node().
right_brace_token       = '}'   { return Node({ text: text() }); }

right_parentheses_token = ')'   { return Node({ text: text() }); }

semicolon_token         = ';'   { return Node({ text: text() }); }

solidus_token           = '/'   { return Node({ text: text() }); }
var_token               = 'var' { return Node({ text: text() }); }

// Identifier token (ASCII subset): a letter, `_` or `$`, followed by any
// number of letters, digits, `_` or `$`.
// A leading digit is no longer accepted, so input such as `1abc` is not
// treated as an identifier. Keywords are not excluded by this rule.
IdentifierToken =
  [A-Za-z_$] [A-Za-z0-9_$]* { return Node({text: text()}); }

// Matches any single character.
SourceCharacter = .




jas...@snmpstack.com

unread,
Sep 28, 2018, 2:43:56 PM9/28/18
to PEG.js: Parser Generator for JavaScript
I think I found a solution to the first problem, though I don't like it....

instead of S+ and S? I created the rules 

// One or more whitespace characters, line terminators or comments.
WS = 
   (WhiteSpace / LineTerminatorSequence / Comment)+ 
   
// Mandatory separator: real whitespace, or nothing at all while the
// `beginning` flag from the initializer is still true.
// NOTE(review): `beginning` is cleared by the first Node() call, including
// calls made on a path that later backtracks, so the flag may be cleared before
// the first token is finally matched (e.g. `vartest = 1` at start of input).
S =
WS
    / &{ return beginning === true} !(WhiteSpace / LineTerminatorSequence / Comment) 

// Optional separator: zero or more whitespace/line terminator/comment items.
s = 
(WhiteSpace / LineTerminatorSequence / Comment)*

and added a let beginning = true; in the initializer, and then the first time Node() is called it sets beginning to false 

This works, but it gets pretty tricky; I'm not sure it will scale to the full grammar without a lot of heartache. Any better ideas?


{
  // True until the first Node() call; read by the S rule so that mandatory
  // whitespace can be omitted before the first token.
  let beginning = true;
  // Emit an ESTree-style `loc` on every object built by Node().
  const includeLoc = true;
  // When true, Node() additionally fills in `loc.source`.
  const includeSource = false;
  // `loc` objects whose start was copied from an opening token and is therefore
  // already 0-based. Consulted so the same column is never shifted twice when a
  // node built by Node() is later reused as the opening anchor of a parent node.
  const zeroBasedStarts = new WeakSet();

  /**
   * Attaches location data to `base`, clears the `beginning` flag, and
   * returns `base`.
   *
   * The location is taken from PEG.js `location()` for the rule being matched,
   * with the `offset` fields removed and the end column reduced by one.
   * When `openingToken` is supplied, the start position is taken from that
   * token instead (shifted to a 0-based column), so leading whitespace and
   * comments consumed by the rule are not part of the node's range.
   *
   * @param base          object to decorate; mutated and returned
   * @param openingToken  optional object previously returned by Node() whose
   *                      `loc.start` marks where this node really begins
   * @returns `base`, with a `loc` property when `includeLoc` is true
   */
  function Node(base, openingToken) {
    if (includeLoc) {
      const loc = location();
      delete loc.start.offset;
      delete loc.end.offset;

      if (includeSource) {
        // The original read an undeclared `source` identifier, which throws a
        // ReferenceError; read it from the parser options instead.
        // NOTE(review): confirm `options.source` is the intended origin.
        loc.source = options.source || text().trim();
      }

      if (openingToken) {
        const opening = openingToken.loc;
        // Copy rather than alias: decrementing a shared object would also move
        // the opening token's own start, and would move it again on every reuse.
        const shift = zeroBasedStarts.has(opening) ? 0 : 1;
        loc.start = {
          line: opening.start.line,
          column: opening.start.column - shift
        };
        zeroBasedStarts.add(loc);
      }

      loc.end.column -= 1;
      base.loc = loc;
    }

    // Any node or token having been built means input start has been passed.
    beginning = false;
    return base;
  }
}
/*a = a:$.+ {
return Node({
    text: a,
        textLength: a.length,
    });
}*/
start = Program
Program =
exps:(func:function a:Identifier left_parentheses right_parentheses b:BlockStatement {
        return Node({
type: 'FunctionExpression',
id: a,
params: [],
body: b,
generator: false,
            async: false
}, func);
}
S a:function_token &WS { return a; }
// An identifier preceded by the mandatory separator S. The resulting node
// is anchored at the identifier token, not at the leading whitespace.
Identifier =
  S token:IdentifierToken {
    return Node({
      type: 'Identifier',
      name: token.text
    }, token);
  }

// Punctuation wrappers: optional separator `s`, then the token.
// Each returns the bare token object so callers can use it as a location anchor.
comma             = s token:comma_token             { return token; }

left_brace        = s token:left_brace_token        { return token; }
right_brace       = s token:right_brace_token       { return token; }
left_parentheses  = s token:left_parentheses_token  { return token; }

right_parentheses = s token:right_parentheses_token { return token; }
    
// The `var` keyword: mandatory separator S before it, and real whitespace
// must follow it (checked with a lookahead, not consumed).
var =
  S token:var_token &WS { return token; }
    
// Optional separator `s`, then the punctuation token.
semicolon = s token:semicolon_token { return token; }

equals    = s token:equals_token    { return token; }

// A numeric literal with an optional leading separator.
// The matched digits are captured as text with `$`, so "value" and "raw"
// reflect the actual input. Previously every number was reported as
// value 1 / raw "1" regardless of what was matched.
number =
  s digits:$number_token {
    return Node({
      "type": "Literal",
      "value": Number.parseInt(digits, 10),
      "raw": digits
    });
  }
    

    
// One or more whitespace characters, line terminators or comments.
WS =
  (WhiteSpace / LineTerminatorSequence / Comment)+

// Mandatory separator before keywords and identifiers: real whitespace, or
// nothing at all when the parser is at the very start of the input.
// The start-of-input test is derived from the current parse position rather
// than from a flag mutated inside actions. Such a flag is cleared by actions
// that run on a path which later backtracks (e.g. `vartest = 1` first matches
// `var`, then fails the lookahead), after which no alternative could match
// at the start of the input any more.
S =
    WS
  / &{ return location().start.offset === 0; }

// Optional separator: zero or more whitespace/line terminator/comment items.
s =
  (WhiteSpace / LineTerminatorSequence / Comment)*
// Zero or more whitespace/comment items that do not contain a line break.
SnoLB = (WhiteSpace / SingleLineComment / MultiLineCommentNoLB)*

// End of statement, allowing the semicolon to be omitted: a line break,
// an upcoming `}` (lookahead only), end of input, or an explicit `;`.
EOS =
    SnoLB? LineTerminatorSequence
  / SnoLB? &right_brace_token
  / s EOF
  / s semicolon_token
!(Keyword WS)  a:$[A-Za-z0-9]+ { return Node({text: text()}); }
    
// The reserved words this sample grammar knows about.
Keyword = 'var' / 'function'

// Matches any single character.
SourceCharacter = .
Reply all
Reply to author
Forward
0 new messages