Lexing C++11 Raw_String

53 views
Skip to first unread message

Dennis Ashley

unread,
Mar 12, 2016, 11:36:43 PM3/12/16
to antlr-discussion

I managed to hack this out myself; any comments or possible improvements would be greatly appreciated.  Can this be done without ACTIONs?  That would be great.

FYI, I was able to test this using ANTLRWorks2.1.  The test cases from the spec and the dozen or so I came up with all passed.

The one drawback is that \" and D_CHAR_SEQ are part of the text of Raw_String passed to the parser.  The parser can strip them out, but it would be nice if the lexer separated the token.

    grammar SampleRaw;

    @lexer::members {
        // Delimiter text recorded by the action in D_CHAR_SEQ: the first
        // capture is stored in d_char_start, later captures in d_char_end.
        // Both are reset to the empty string at the end of Raw_String.
        String d_char_start = "";
        String d_char_end   = "";
    }

    // A single double-quote character; used by Raw_String as its opening and
    // closing quote. The parser rule string_literal spells the same literal.
    DQuote          : '\"' ;
    // The letter R; string_literal spells the same literal before Raw_String.
    RAW             : 'R' ;
    ENCODING_PREFIX : 'u8' | [uUL] ;
    S_CHAR_SEQ      : S_CHAR+ ;
    fragment S_CHAR
            /* any member of the source character set except the
               double_quote ", backslash \, or NEW_LINE character
             */
        : ~[\"\\\n\r]
        | ESCAPE_SEQUENCE
        | UNIV_CHAR_NAME
        ;
    fragment ESCAPE_SEQUENCE
        : SIMPLE_ESCAPE_SEQ
        | OCT_ESCAPE_SEQ
        | HEX_ESCAPE_SEQ
        ;
    fragment SIMPLE_ESCAPE_SEQ  // one of
        : '\\' '\''
        | '\\' '\"'
        | '\\' '?'
        | '\\' '\\'
        | '\\' 'a'
        | '\\' 'b'
        | '\\' 'f'
        | '\\' 'n'
        | '\\' 'r'
        | '\\' 't'
        | '\\' 'v'
        ;
    fragment OCT_ESCAPE_SEQ
        : [0-3] ( OCT_DIGIT OCT_DIGIT? )?
        | [4-7] ( OCT_DIGIT )?
        ;
    fragment OCT_DIGIT
        : [0-7]
        ;
    fragment HEX_ESCAPE_SEQ
        : '\\' 'x' HEX_DIGIT+
        ;
    fragment HEX_DIGIT
        : [a-zA-Z0-9]
        ;
    fragment UNIV_CHAR_NAME
        : '\\' 'u' HEX_QUAD
        | '\\' 'U' HEX_QUAD HEX_QUAD
        ;
    fragment HEX_QUAD
        : HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
        ;

    Raw_String
        : DQuote
          ( /* CASE when D_CHAR is empty
               ACTION in D_CHAR_SEQ attempts to reset variable d_char_start
               if it is empty, so handle this case statically
             */
            '('
                ( ~')'         // Anything but )
                | ')' ~('\"')  // ) Actually OK, can't be followed by "
                               //  )" these are the terminating chars
                )*
            ')'
          | D_CHAR_SEQ  // Action in D_CHAR_SEQ stores its value (START)
            '('
                ( ~')'  // Anything but )
                | ')'
                    ( (' '|'('|')'|'\\'|'\t'|'\u000b'|'\f'|'\n'|'\r') // ~D_CHAR
                    | D_CHAR_SEQ  // value stored (END)
                        ( { ( d_char_start != d_char_end ) }?
                            /* D_CHAR_SEQs Don't match, continue
                               consuming characters
                             */
                        | { ( d_char_start == d_char_end ) }? ~'\"'
                            /* D_CHAR_SEQs Do match, consume
                               D_CHAR_SEQ and check next character
                             */
                       )
                    )
                )*
            ')'
            D_CHAR_SEQ  /* No need to check here,
                           Matching Terminating CHARS is only way to get out
                           of loop above
                         */
          )
          DQuote { d_char_start = ""; d_char_end = ""; } // Reset Variables
        ;
    D_CHAR_SEQ      // Should be limited to 16 characters
        : D_CHAR+  {
                    if ( d_char_start == "" ) {
                        d_char_start = getText() ;
                    }
                    else {
                        d_char_end   = getText() ;
                    } }
        ;
    fragment D_CHAR
            /* any member of the basic source character set except
               space, the left parenthesis (, the right parenthesis ),
               the backslash \, and the control characters representing
               horizontal tab, vertical tab, form feed, and newline.
             */
        : ~[ \")(\\\t\v\f\n\r]
        ;
    WS
        : [\u0000-\u0020\u007f]+
        ;
    // Entry rule: optional WS, then any number of string literals, each
    // optionally followed by WS, then end of input.
    start
        :  WS? ( string_literal WS? )* EOF
        ;
    string_literal
        : ENCODING_PREFIX? '\"' S_CHAR_SEQ '\"'
        | ENCODING_PREFIX? 'R' Raw_String
        ;
SampleRaw.g4

Dennis Ashley

unread,
Mar 18, 2016, 4:09:08 PM3/18/16
to antlr-discussion


On Saturday, March 12, 2016 at 10:36:43 PM UTC-6, Dennis Ashley wrote:

I managed to hack this out myself; any comments or possible improvements would be greatly appreciated.  Can this be done without ACTIONs?  That would be great.

FYI, I was able to test this using ANTLRWorks2.1.  The test cases from the spec and the dozen or so I came up with all passed.

The one drawback is that \" and D_CHAR_SEQ are part of the text of Raw_String passed to the parser.  The parser can strip them out, but it would be nice if the lexer separated the token.

    grammar SampleRaw;

    Reg_String
        : '\"' S_CHAR* '\"'
        : 'R'
          '\"'
          D_CHAR_SEQ?
          '('
          /* The following loop consumes characters until it matches the
             terminating sequence of characters for the RAW STRING
           - The predicates are mutually exclusive, so Only one will
             ever execute in each loop pass
           */
          (  /* If Current Text Does Not End with Initial D_CHAR Delimiter
                take any TOKEN ( Character )
              */
            {
             !getText().endsWith( ")"
                                + getText().substring( getText().indexOf("\"") + 1
                                                     , getText().indexOf("(")
                                                     )
                                )
            }?
            .
          |  /* If Current Text Does End with Initial D_CHAR Delimiter
                Check following TOKEN ( Character ) does not complete the Raw String
              - If the following Characters is \", we fall out of the loop
              */
            {
              getText().endsWith( ")"
                                + getText().substring( getText().indexOf("\"") + 1
                                                     , getText().indexOf("(")
                                                     )
                                )
            }?
            ~'\"'
          )*
          '\"'              // Match Closing Double Quote
        ;
    fragment D_CHAR_SEQ     // Should be limited to 16 characters
        : D_CHAR+
        ;
    fragment D_CHAR
            /* any member of the basic source character set except
               space, the left parenthesis (, the right parenthesis ),
               the backslash \, and the control characters representing
               horizontal tab, vertical tab, form feed, and newline.
             */
        :  ~[\u0009-\u000d ()\\]
        ;
    ENCODING_PREFIX         //  one of
        : 'u8'
        | [uUL]
        ;
    WhiteSpace
        : [ \u0000-\u0020\u007f]+ -> skip
        ;
    // Entry rule: zero or more string literals followed by end of input.
    start
        : string_literal* EOF
        ;
    string_literal
        : ENCODING_PREFIX? Reg_String
        | ENCODING_PREFIX? Raw_String
        ;

Dennis Ashley

unread,
Mar 18, 2016, 4:18:44 PM3/18/16
to antlr-discussion
This is the best solution I could come up with!  It works on the test cases I could come up with, and it is simple.
Reply all
Reply to author
Forward
0 new messages