@@ -31,6 +31,7 @@ use std::char;
3131use std:: fmt;
3232use std:: slice;
3333use std:: str;
34+ use std:: str:: Utf8Error ;
3435
3536/// A structure used to lex the s-expression syntax of WAT files.
3637///
@@ -99,6 +100,12 @@ pub enum TokenKind {
99100 /// The payload here is the original source text.
100101 Keyword ,
101102
103+ /// An annotation (like `@foo`).
104+ ///
105+ /// All annotations start with `@` and the payload will be the name of the
106+ /// annotation.
107+ Annotation ,
108+
102109 /// A reserved series of `idchar` symbols. Unknown what this is meant to be
103110 /// used for, you'll probably generate an error about an unexpected token.
104111 Reserved ,
@@ -136,8 +143,15 @@ pub enum FloatKind {
136143}
137144
/// Coarse classification of the text consumed while lexing a reserved token,
/// based on how many idchars and string literals it contained.
enum ReservedKind {
    /// Exactly one string literal and no idchars: `"..."`.
    String,
    /// One or more `idchars!()` bytes and no string literals.
    Idchars,
    /// A leading `$` followed by a single string literal: `$"..."`.
    IdString,
    /// A leading `@` followed by a single string literal: `@"..."`.
    AnnotationString,
    /// Any other mixture of strings and idchars.
    Reserved,
}
143157
@@ -199,6 +213,16 @@ pub enum LexError {
199213 /// version to behave differently than the compiler-visible version, so
200214 /// these are simply rejected for now.
201215 ConfusingUnicode ( char ) ,
216+
217+ /// An invalid utf-8 sequence was found in a quoted identifier, such as
218+ /// `$"\ff"`.
219+ InvalidUtf8Id ( Utf8Error ) ,
220+
221+ /// An empty identifier was found, or a lone `$`.
222+ EmptyId ,
223+
224+ /// An empty annotation was found, or a lone `@`.
225+ EmptyAnnotation ,
202226}
203227
204228/// A sign token for an integer.
@@ -420,14 +444,21 @@ impl<'a> Lexer<'a> {
420444 if let Some ( ret) = self . classify_number ( src) {
421445 return Ok ( Some ( ret) ) ;
422446 // https://webassembly.github.io/spec/core/text/values.html#text-id
423- } else if * c == b'$' && src . len ( ) > 1 {
447+ } else if * c == b'$' {
424448 return Ok ( Some ( TokenKind :: Id ) ) ;
449+ // part of the WebAssembly/annotations proposal
450+ // (no online url yet)
451+ } else if * c == b'@' {
452+ return Ok ( Some ( TokenKind :: Annotation ) ) ;
425453 // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
426454 } else if b'a' <= * c && * c <= b'z' {
427455 return Ok ( Some ( TokenKind :: Keyword ) ) ;
428456 }
429457 }
430458
459+ ReservedKind :: IdString => return Ok ( Some ( TokenKind :: Id ) ) ,
460+ ReservedKind :: AnnotationString => return Ok ( Some ( TokenKind :: Annotation ) ) ,
461+
431462 // ... otherwise this was a conglomeration of idchars,
432463 // strings, or just idchars that don't match a prior rule,
433464 // meaning this falls through to the fallback `Reserved`
@@ -538,15 +569,15 @@ impl<'a> Lexer<'a> {
538569 /// eaten. The classification assists in determining what the actual token
539570 /// here eaten looks like.
540571 fn parse_reserved ( & self , pos : & mut usize ) -> Result < ( ReservedKind , & ' a str ) , Error > {
541- let mut idchars = false ;
572+ let mut idchars = 0u32 ;
542573 let mut strings = 0u32 ;
543574 let start = * pos;
544575 while let Some ( byte) = self . input . as_bytes ( ) . get ( * pos) {
545576 match byte {
546577 // Normal `idchars` production which appends to the reserved
547578 // token that's being produced.
548579 idchars ! ( ) => {
549- idchars = true ;
580+ idchars += 1 ;
550581 * pos += 1 ;
551582 }
552583
@@ -575,9 +606,13 @@ impl<'a> Lexer<'a> {
575606 }
576607 let ret = & self . input [ start..* pos] ;
577608 Ok ( match ( idchars, strings) {
578- ( false , 0 ) => unreachable ! ( ) ,
579- ( false , 1 ) => ( ReservedKind :: String , ret) ,
580- ( true , 0 ) => ( ReservedKind :: Idchars , ret) ,
609+ ( 0 , 0 ) => unreachable ! ( ) ,
610+ ( 0 , 1 ) => ( ReservedKind :: String , ret) ,
611+ ( _, 0 ) => ( ReservedKind :: Idchars , ret) ,
612+ // Pattern match `@"..."` and `$"..."` for string-based
613+ // identifiers and annotations.
614+ ( 1 , 1 ) if ret. starts_with ( "$" ) => ( ReservedKind :: IdString , ret) ,
615+ ( 1 , 1 ) if ret. starts_with ( "@" ) => ( ReservedKind :: AnnotationString , ret) ,
581616 _ => ( ReservedKind :: Reserved , ret) ,
582617 } )
583618 }
@@ -813,6 +848,37 @@ impl<'a> Lexer<'a> {
813848 }
814849 }
815850
851+ /// Parses an id-or-string-based name from `it`.
852+ ///
853+ /// Note that `it` should already have been lexed and this is just
854+ /// extracting the value. If the token lexed was `@a` then this should point
855+ /// to `a`.
856+ ///
857+ /// This will automatically detect quoted syntax such as `@"..."` and the
858+ /// byte string will be parsed and validated as utf-8.
859+ ///
860+ /// # Errors
861+ ///
862+ /// Returns an error if a quoted byte string is found and contains invalid
863+ /// utf-8.
864+ fn parse_name ( it : & mut str:: Chars < ' a > ) -> Result < Cow < ' a , str > , LexError > {
865+ if it. clone ( ) . next ( ) == Some ( '"' ) {
866+ it. next ( ) ;
867+ match Lexer :: parse_str ( it, true ) ? {
868+ Cow :: Borrowed ( bytes) => match std:: str:: from_utf8 ( bytes) {
869+ Ok ( s) => Ok ( Cow :: Borrowed ( s) ) ,
870+ Err ( e) => Err ( LexError :: InvalidUtf8Id ( e) ) ,
871+ } ,
872+ Cow :: Owned ( bytes) => match String :: from_utf8 ( bytes) {
873+ Ok ( s) => Ok ( Cow :: Owned ( s) ) ,
874+ Err ( e) => Err ( LexError :: InvalidUtf8Id ( e. utf8_error ( ) ) ) ,
875+ } ,
876+ }
877+ } else {
878+ Ok ( Cow :: Borrowed ( it. as_str ( ) ) )
879+ }
880+ }
881+
816882 fn hexnum ( it : & mut str:: Chars < ' _ > ) -> Result < u32 , LexError > {
817883 let n = Lexer :: hexdigit ( it) ?;
818884 let mut last_underscore = false ;
@@ -878,28 +944,23 @@ impl<'a> Lexer<'a> {
878944 std:: iter:: from_fn ( move || self . parse ( & mut pos) . transpose ( ) )
879945 }
880946
881- /// Returns whether an annotation is present at `pos` and the name of the
882- /// annotation.
883- pub fn annotation ( & self , mut pos : usize ) -> Option < & ' a str > {
947+ /// Returns whether an annotation is present at `pos`. If it is present then
948+ /// `Ok(Some(token))` is returned corresponding to the token, otherwise
949+ /// `Ok(None)` is returned. If the next token cannot be parsed then an error
950+ /// is returned.
951+ pub fn annotation ( & self , mut pos : usize ) -> Result < Option < Token > , Error > {
884952 let bytes = self . input . as_bytes ( ) ;
885953 // Quickly reject anything that for sure isn't an annotation since this
886954 // method is used every time an lparen is parsed.
887955 if bytes. get ( pos) != Some ( & b'@' ) {
888- return None ;
956+ return Ok ( None ) ;
889957 }
890- match self . parse ( & mut pos) {
891- Ok ( Some ( token) ) => {
892- match token. kind {
893- TokenKind :: Reserved => { }
894- _ => return None ,
895- }
896- if token. len == 1 {
897- None // just the `@` character isn't a valid annotation
898- } else {
899- Some ( & token. src ( self . input ) [ 1 ..] )
900- }
901- }
902- Ok ( None ) | Err ( _) => None ,
958+ match self . parse ( & mut pos) ? {
959+ Some ( token) => match token. kind {
960+ TokenKind :: Annotation => Ok ( Some ( token) ) ,
961+ _ => Ok ( None ) ,
962+ } ,
963+ None => Ok ( None ) ,
903964 }
904965 }
905966}
@@ -913,9 +974,49 @@ impl Token {
913974 /// Returns the identifier, without the leading `$` symbol, that this token
914975 /// represents.
915976 ///
977+ /// Note that this method returns the contents of the identifier. With a
978+ /// string-based identifier this means that escapes have been resolved to
979+ /// their string-based equivalent.
980+ ///
916981 /// Should only be used with `TokenKind::Id`.
917- pub fn id < ' a > ( & self , s : & ' a str ) -> & ' a str {
918- & self . src ( s) [ 1 ..]
982+ ///
983+ /// # Errors
984+ ///
985+ /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
986+ /// which is invalid utf-8.
987+ pub fn id < ' a > ( & self , s : & ' a str ) -> Result < Cow < ' a , str > , Error > {
988+ let mut ch = self . src ( s) . chars ( ) ;
989+ let dollar = ch. next ( ) ;
990+ debug_assert_eq ! ( dollar, Some ( '$' ) ) ;
991+ let id = Lexer :: parse_name ( & mut ch) . map_err ( |e| self . error ( s, e) ) ?;
992+ if id. is_empty ( ) {
993+ return Err ( self . error ( s, LexError :: EmptyId ) ) ;
994+ }
995+ Ok ( id)
996+ }
997+
998+ /// Returns the annotation, without the leading `@` symbol, that this token
999+ /// represents.
1000+ ///
1001+ /// Note that this method returns the contents of the identifier. With a
1002+ /// string-based identifier this means that escapes have been resolved to
1003+ /// their string-based equivalent.
1004+ ///
1005+ /// Should only be used with `TokenKind::Annotation`.
1006+ ///
1007+ /// # Errors
1008+ ///
1009+ /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
1010+ /// which is invalid utf-8.
1011+ pub fn annotation < ' a > ( & self , s : & ' a str ) -> Result < Cow < ' a , str > , Error > {
1012+ let mut ch = self . src ( s) . chars ( ) ;
1013+ let at = ch. next ( ) ;
1014+ debug_assert_eq ! ( at, Some ( '@' ) ) ;
1015+ let id = Lexer :: parse_name ( & mut ch) . map_err ( |e| self . error ( s, e) ) ?;
1016+ if id. is_empty ( ) {
1017+ return Err ( self . error ( s, LexError :: EmptyAnnotation ) ) ;
1018+ }
1019+ Ok ( id)
9191020 }
9201021
9211022 /// Returns the keyword this token represents.
@@ -1061,6 +1162,16 @@ impl Token {
10611162 val,
10621163 }
10631164 }
1165+
1166+ fn error ( & self , src : & str , err : LexError ) -> Error {
1167+ Error :: lex (
1168+ Span {
1169+ offset : self . offset ,
1170+ } ,
1171+ src,
1172+ err,
1173+ )
1174+ }
10641175}
10651176
10661177impl < ' a > Integer < ' a > {
@@ -1107,6 +1218,9 @@ impl fmt::Display for LexError {
11071218 InvalidUnicodeValue ( c) => write ! ( f, "invalid unicode scalar value 0x{:x}" , c) ?,
11081219 LoneUnderscore => write ! ( f, "bare underscore in numeric literal" ) ?,
11091220 ConfusingUnicode ( c) => write ! ( f, "likely-confusing unicode character found {:?}" , c) ?,
1221+ InvalidUtf8Id ( _) => write ! ( f, "malformed UTF-8 encoding of string-based id" ) ?,
1222+ EmptyId => write ! ( f, "empty identifier" ) ?,
1223+ EmptyAnnotation => write ! ( f, "empty annotation id" ) ?,
11101224 }
11111225 Ok ( ( ) )
11121226 }
@@ -1254,10 +1368,10 @@ mod tests {
12541368
12551369 #[ test]
12561370 fn id ( ) {
1257- fn get_id ( input : & str ) -> & str {
1371+ fn get_id ( input : & str ) -> String {
12581372 let token = get_token ( input) ;
12591373 match token. kind {
1260- TokenKind :: Id => token. id ( input) ,
1374+ TokenKind :: Id => token. id ( input) . unwrap ( ) . to_string ( ) ,
12611375 other => panic ! ( "not id {:?}" , other) ,
12621376 }
12631377 }
@@ -1267,6 +1381,23 @@ mod tests {
12671381 assert_eq ! ( get_id( "$0^" ) , "0^" ) ;
12681382 assert_eq ! ( get_id( "$0^;;" ) , "0^" ) ;
12691383 assert_eq ! ( get_id( "$0^ ;;" ) , "0^" ) ;
1384+ assert_eq ! ( get_id( "$\" x\" ;;" ) , "x" ) ;
1385+ }
1386+
#[test]
fn annotation() {
    // Lexes the first token of `input` and returns its annotation name,
    // panicking if the token is not an annotation.
    fn get_annotation(input: &str) -> String {
        let token = get_token(input);
        match token.kind {
            TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
            other => panic!("not annotation {:?}", other),
        }
    }
    assert_eq!(get_annotation("@foo"), "foo");
    assert_eq!(get_annotation("@foo "), "foo");
    assert_eq!(get_annotation("@f "), "f");
    // Quoted form: the name is the decoded string contents, so the input
    // must be `@"x"` (no space inside the quotes) to yield "x".
    assert_eq!(get_annotation("@\"x\" "), "x");
    assert_eq!(get_annotation("@0 "), "0");
}
12711402
12721403 #[ test]
@@ -1294,7 +1425,6 @@ mod tests {
12941425 other => panic ! ( "not reserved {:?}" , other) ,
12951426 }
12961427 }
1297- assert_eq ! ( get_reserved( "$ " ) , "$" ) ;
12981428 assert_eq ! ( get_reserved( "^_x " ) , "^_x" ) ;
12991429 }
13001430
0 commit comments