@@ -3,7 +3,6 @@ use std::char;
33use std:: convert:: TryFrom ;
44use std:: fmt;
55use std:: str;
6- use unicode_normalization:: char:: canonical_combining_class;
76use unicode_xid:: UnicodeXID ;
87
98use self :: Token :: * ;
@@ -90,7 +89,6 @@ pub enum Token {
9089pub enum Error {
9190 InvalidCharInString ( usize , char ) ,
9291 InvalidCharInId ( usize , char ) ,
93- IdNotSSNFC ( usize ) ,
9492 IdPartEmpty ( usize ) ,
9593 InvalidEscape ( usize , char ) ,
9694 // InvalidHexEscape(usize, char),
@@ -472,50 +470,39 @@ fn is_keylike_continue(ch: char) -> bool {
472470}
473471
474472pub fn validate_id ( start : usize , id : & str ) -> Result < ( ) , Error > {
475- // Ids must be in stream-safe NFC.
476- if !unicode_normalization:: is_nfc_stream_safe ( & id) {
477- return Err ( Error :: IdNotSSNFC ( start) ) ;
478- }
479-
480473 // IDs must have at least one part.
481474 if id. is_empty ( ) {
482475 return Err ( Error :: IdPartEmpty ( start) ) ;
483476 }
484477
485478 // Ids consist of parts separated by '-'s.
486479 for part in id. split ( "-" ) {
487- // Parts must be non-empty and start with a non-combining XID start.
488- match part. chars ( ) . next ( ) {
480+ // Parts must be non-empty and contain either all ASCII lowercase or
481+ // all ASCII uppercase.
482+ let upper = match part. chars ( ) . next ( ) {
489483 None => return Err ( Error :: IdPartEmpty ( start) ) ,
490484 Some ( first) => {
491- // Require the first character of each part to be non-combining,
492 // so that if a source language uses `CamelCase`, they won't
493- // combine with the last character of the previous part.
494- if canonical_combining_class ( first) != 0 {
495- return Err ( Error :: InvalidCharInId ( start, first) ) ;
496- }
497-
498- // Require the first character to be a XID start.
499- if !UnicodeXID :: is_xid_start ( first) {
485+ if first. is_ascii_lowercase ( ) {
486+ false
487+ } else if first. is_ascii_uppercase ( ) {
488+ true
489+ } else {
500490 return Err ( Error :: InvalidCharInId ( start, first) ) ;
501491 }
502-
503- // TODO: Disallow values with 'Grapheme_Extend = Yes', to
504- // prevent them from combining with previous parts?
505-
506- // TODO: Disallow values with 'Grapheme_Cluster_Break = SpacingMark'?
507492 }
508493 } ;
509494
510- // Some XID values are not valid ID part values.
511495 for ch in part. chars ( ) {
512- // Disallow uppercase and underscore, so that identifiers
513- // consistently use `kebab-case`, and source languages can map
514- // identifiers according to their own conventions (which might use
515- // `CamelCase` or `snake_case` or something else) without worrying
516- // about collisions.
517- if ch. is_uppercase ( ) || ch == '_' || !UnicodeXID :: is_xid_continue ( ch) {
518- return Err ( Error :: InvalidCharInId ( start, ch) ) ;
496+ if ch. is_ascii_digit ( ) {
497+ // Digits are accepted in both uppercase and lowercase segments.
498+ } else if upper {
499+ if !ch. is_ascii_uppercase ( ) {
500+ return Err ( Error :: InvalidCharInId ( start, ch) ) ;
501+ }
502+ } else {
503+ if !ch. is_ascii_lowercase ( ) {
504+ return Err ( Error :: InvalidCharInId ( start, ch) ) ;
505+ }
519506 }
520507 }
521508 }
@@ -595,7 +582,6 @@ impl fmt::Display for Error {
595582 Error :: InvalidCharInString ( _, ch) => write ! ( f, "invalid character in string {:?}" , ch) ,
596583 Error :: InvalidCharInId ( _, ch) => write ! ( f, "invalid character in identifier {:?}" , ch) ,
597584 Error :: IdPartEmpty ( _) => write ! ( f, "identifiers must have characters between '-'s" ) ,
598- Error :: IdNotSSNFC ( _) => write ! ( f, "identifiers must be in stream-safe NFC" ) ,
599585 Error :: InvalidEscape ( _, ch) => write ! ( f, "invalid escape in string {:?}" , ch) ,
600586 }
601587 }
@@ -614,7 +600,6 @@ pub fn rewrite_error(err: &mut anyhow::Error, file: &str, contents: &str) {
614600 | Error :: NewlineInString ( at)
615601 | Error :: InvalidCharInString ( at, _)
616602 | Error :: InvalidCharInId ( at, _)
617- | Error :: IdNotSSNFC ( at)
618603 | Error :: IdPartEmpty ( at)
619604 | Error :: InvalidEscape ( at, _) => * at,
620605 } ;
@@ -627,17 +612,17 @@ fn test_validate_id() {
627612 validate_id ( 0 , "apple" ) . unwrap ( ) ;
628613 validate_id ( 0 , "apple-pear" ) . unwrap ( ) ;
629614 validate_id ( 0 , "apple-pear-grape" ) . unwrap ( ) ;
630- validate_id ( 0 , "garçon" ) . unwrap ( ) ;
631- validate_id ( 0 , "hühnervögel" ) . unwrap ( ) ;
632- validate_id ( 0 , "москва" ) . unwrap ( ) ;
633- validate_id ( 0 , "東京" ) . unwrap ( ) ;
634- validate_id ( 0 , "東-京" ) . unwrap ( ) ;
635- validate_id ( 0 , "garçon-hühnervögel-москва-東京" ) . unwrap ( ) ;
636- validate_id ( 0 , "garçon-hühnervögel-москва-東-京" ) . unwrap ( ) ;
637615 validate_id ( 0 , "a0" ) . unwrap ( ) ;
638616 validate_id ( 0 , "a" ) . unwrap ( ) ;
639617 validate_id ( 0 , "a-a" ) . unwrap ( ) ;
640618 validate_id ( 0 , "bool" ) . unwrap ( ) ;
619+ validate_id ( 0 , "APPLE" ) . unwrap ( ) ;
620+ validate_id ( 0 , "APPLE-PEAR" ) . unwrap ( ) ;
621+ validate_id ( 0 , "APPLE-PEAR-GRAPE" ) . unwrap ( ) ;
622+ validate_id ( 0 , "apple-PEAR-grape" ) . unwrap ( ) ;
623+ validate_id ( 0 , "APPLE-pear-GRAPE" ) . unwrap ( ) ;
624+ validate_id ( 0 , "ENOENT" ) . unwrap ( ) ;
625+ validate_id ( 0 , "is-XML" ) . unwrap ( ) ;
641626
642627 assert ! ( validate_id( 0 , "" ) . is_err( ) ) ;
643628 assert ! ( validate_id( 0 , "0" ) . is_err( ) ) ;
@@ -652,7 +637,6 @@ fn test_validate_id() {
652637 assert ! ( validate_id( 0 , "a-" ) . is_err( ) ) ;
653638 assert ! ( validate_id( 0 , "-a" ) . is_err( ) ) ;
654639 assert ! ( validate_id( 0 , "Apple" ) . is_err( ) ) ;
655- assert ! ( validate_id( 0 , "APPLE" ) . is_err( ) ) ;
656640 assert ! ( validate_id( 0 , "applE" ) . is_err( ) ) ;
657641 assert ! ( validate_id( 0 , "-apple-pear" ) . is_err( ) ) ;
658642 assert ! ( validate_id( 0 , "apple-pear-" ) . is_err( ) ) ;
@@ -675,11 +659,10 @@ fn test_validate_id() {
675659 assert ! ( validate_id( 0 , "_Znwj" ) . is_err( ) ) ;
676660 assert ! ( validate_id( 0 , "__i386" ) . is_err( ) ) ;
677661 assert ! ( validate_id( 0 , "__i386__" ) . is_err( ) ) ;
678- assert ! ( validate_id( 0 , "ENOENT" ) . is_err( ) ) ;
679662 assert ! ( validate_id( 0 , "Москва" ) . is_err( ) ) ;
680663 assert ! ( validate_id( 0 , "garçon-hühnervögel-Москва-東京" ) . is_err( ) ) ;
681664 assert ! ( validate_id( 0 , "😼" ) . is_err( ) , "non-identifier" ) ;
682- assert ! ( validate_id( 0 , "\u{212b} " ) . is_err( ) , "not NFC " ) ;
665+ assert ! ( validate_id( 0 , "\u{212b} " ) . is_err( ) , "non-ascii " ) ;
683666}
684667
685668#[ test]
@@ -716,6 +699,13 @@ fn test_tokenizer() {
716699 assert_eq ! ( collect( "%a-a" ) . unwrap( ) , vec![ Token :: ExplicitId ] ) ;
717700 assert_eq ! ( collect( "%bool" ) . unwrap( ) , vec![ Token :: ExplicitId ] ) ;
718701 assert_eq ! ( collect( "%" ) . unwrap( ) , vec![ Token :: ExplicitId ] ) ;
702+ assert_eq ! ( collect( "APPLE" ) . unwrap( ) , vec![ Token :: Id ] ) ;
703+ assert_eq ! ( collect( "APPLE-PEAR" ) . unwrap( ) , vec![ Token :: Id ] ) ;
704+ assert_eq ! ( collect( "APPLE-PEAR-GRAPE" ) . unwrap( ) , vec![ Token :: Id ] ) ;
705+ assert_eq ! ( collect( "apple-PEAR-grape" ) . unwrap( ) , vec![ Token :: Id ] ) ;
706+ assert_eq ! ( collect( "APPLE-pear-GRAPE" ) . unwrap( ) , vec![ Token :: Id ] ) ;
707+ assert_eq ! ( collect( "ENOENT" ) . unwrap( ) , vec![ Token :: Id ] ) ;
708+ assert_eq ! ( collect( "is-XML" ) . unwrap( ) , vec![ Token :: Id ] ) ;
719709
720710 assert_eq ! ( collect( "func" ) . unwrap( ) , vec![ Token :: Func ] ) ;
721711 assert_eq ! (
0 commit comments