I managed to hack this out myself, any comments or possible improvements would be greatly appreciated. If this can be done without ACTIONs? That would be great?
FYI, I was able to test this using ANTLRWorks2.1. The test cases from the spec and the dozen or so I came up with all passed.
The one draw back is that \" and D_CHAR_SEQ are part of the text of Raw_String passed to the parser. The parser can strip them out, but it would nice if the lexer separated the token.
grammar SampleRaw;
@lexer::members {
String d_char_start = "";
String d_char_end = "";
}
DQuote : '\"' ;
RAW : 'R' ;
ENCODING_PREFIX : 'u8' | [uUL] ;
S_CHAR_SEQ : S_CHAR+ ;
fragment S_CHAR
/* any member of the source character set except the
double_quote ", backslash \, or NEW_LINE character
*/
: ~[\"\\\n\r]
| ESCAPE_SEQUENCE
| UNIV_CHAR_NAME
;
fragment ESCAPE_SEQUENCE
: SIMPLE_ESCAPE_SEQ
| OCT_ESCAPE_SEQ
| HEX_ESCAPE_SEQ
;
fragment SIMPLE_ESCAPE_SEQ // one of
: '\\' '\''
| '\\' '\"'
| '\\' '?'
| '\\' '\\'
| '\\' 'a'
| '\\' 'b'
| '\\' 'f'
| '\\' 'n'
| '\\' 'r'
| '\\' 't'
| '\\' 'v'
;
fragment OCT_ESCAPE_SEQ
: [0-3] ( OCT_DIGIT OCT_DIGIT? )?
| [4-7] ( OCT_DIGIT )?
;
fragment OCT_DIGIT
: [0-7]
;
fragment HEX_ESCAPE_SEQ
: '\\' 'x' HEX_DIGIT+
;
fragment HEX_DIGIT
: [a-zA-Z0-9]
;
fragment UNIV_CHAR_NAME
: '\\' 'u' HEX_QUAD
| '\\' 'U' HEX_QUAD HEX_QUAD
;
fragment HEX_QUAD
: HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
;
Raw_String
: DQuote
( /* CASE when D_CHAR is empty
ACTION in D_CHAR_SEQ attempts to reset variable d_char_start
if it is empty, so handle this case statically
*/
'('
( ~')' // Anything but )
| ')' ~('\"') // ) Actually OK, can't be followed by "
// )" these are the terminating chars
)*
')'
| D_CHAR_SEQ // Action in D_CHAR_SEQ stores its value (START)
'('
( ~')' // Anything but )
| ')'
( (' '|'('|')'|'\\'|'\t'|'\u000b'|'\f'|'\n'|'\r') // ~D_CHAR
| D_CHAR_SEQ // value stored (END)
( { ( d_char_start != d_char_end ) }?
/* D_CHAR_SEQs Don't match, continue
consuming characters
*/
| { ( d_char_start == d_char_end ) }? ~'\"'
/* D_CHAR_SEQs Do match, consume
D_CHAR_SEQ and check next character
*/
)
)
)*
')'
D_CHAR_SEQ /* No need to check here,
Matching Terminating CHARS is only way to get out
of loop above
*/
)
DQuote { d_char_start = ""; d_char_end = ""; } // Reset Variables
;
D_CHAR_SEQ // Should be limited to 16 characters
: D_CHAR+ {
if ( d_char_start == "" ) {
d_char_start = getText() ;
}
else {
d_char_end = getText() ;
} }
;
fragment D_CHAR
/* any member of the basic source character set except
space, the left parenthesis (, the right parenthesis ),
the backslash \, and the control characters representing
horizontal tab, vertical tab, form feed, and newline.
*/
: ~[ \")(\\\t\v\f\n\r]
;
WS
: [\u0000-\u0020\u007f]+
;
start
: WS? ( string_literal WS? )* EOF
;
string_literal
: ENCODING_PREFIX? '\"' S_CHAR_SEQ '\"'
| ENCODING_PREFIX? 'R' Raw_String
;