Skip to content

Commit 00a82e6

Browse files
authored
Implement the component-model lexing rules for identifiers. (#382)
* Implement the component-model lexing rules for identifiers. The [component-model grammar] for kebab-case identifiers now looks like this: ``` name ::= <word> | <name>-<word> word ::= [a-z][0-9a-z]* | [A-Z][0-9A-Z]* ``` Implement the rules. This continues to use XID rules for the initial lexing, as that corresponds to what users might accidentally use, so that we can issue appropriate errors in those cases. The precise grammar is validated in a separate step. [component-model grammar]: https://github.com/WebAssembly/component-model/blob/main/design/mvp/Explainer.md#instance-definitions * Add more lexing tests. * Update the tests in tests/codegen/conventions.wit. * Comment out identifiers that collide when mapped to snake_case, for now. See WebAssembly/component-model#118.
1 parent 9370a15 commit 00a82e6

4 files changed

Lines changed: 41 additions & 74 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 25 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/wit-parser/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ id-arena = "2"
99
anyhow = { workspace = true }
1010
pulldown-cmark = { workspace = true }
1111
unicode-xid = "0.2.2"
12-
unicode-normalization = "0.1.19"
1312

1413
[dev-dependencies]
1514
rayon = "1"

crates/wit-parser/src/ast/lex.rs

Lines changed: 33 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ use std::char;
33
use std::convert::TryFrom;
44
use std::fmt;
55
use std::str;
6-
use unicode_normalization::char::canonical_combining_class;
76
use unicode_xid::UnicodeXID;
87

98
use self::Token::*;
@@ -90,7 +89,6 @@ pub enum Token {
9089
pub enum Error {
9190
InvalidCharInString(usize, char),
9291
InvalidCharInId(usize, char),
93-
IdNotSSNFC(usize),
9492
IdPartEmpty(usize),
9593
InvalidEscape(usize, char),
9694
// InvalidHexEscape(usize, char),
@@ -472,50 +470,39 @@ fn is_keylike_continue(ch: char) -> bool {
472470
}
473471

474472
pub fn validate_id(start: usize, id: &str) -> Result<(), Error> {
475-
// Ids must be in stream-safe NFC.
476-
if !unicode_normalization::is_nfc_stream_safe(&id) {
477-
return Err(Error::IdNotSSNFC(start));
478-
}
479-
480473
// IDs must have at least one part.
481474
if id.is_empty() {
482475
return Err(Error::IdPartEmpty(start));
483476
}
484477

485478
// Ids consist of parts separated by '-'s.
486479
for part in id.split("-") {
487-
// Parts must be non-empty and start with a non-combining XID start.
488-
match part.chars().next() {
480+
// Parts must be non-empty and contain either all ASCII lowercase or
481+
// all ASCII uppercase.
482+
let upper = match part.chars().next() {
489483
None => return Err(Error::IdPartEmpty(start)),
490484
Some(first) => {
491-
// Require the first character of each part to be non-combining,
492-
// so that if a source langauge uses `CamelCase`, they won't
493-
// combine with the last character of the previous part.
494-
if canonical_combining_class(first) != 0 {
495-
return Err(Error::InvalidCharInId(start, first));
496-
}
497-
498-
// Require the first character to be a XID start.
499-
if !UnicodeXID::is_xid_start(first) {
485+
if first.is_ascii_lowercase() {
486+
false
487+
} else if first.is_ascii_uppercase() {
488+
true
489+
} else {
500490
return Err(Error::InvalidCharInId(start, first));
501491
}
502-
503-
// TODO: Disallow values with 'Grapheme_Extend = Yes', to
504-
// prevent them from combining with previous parts?
505-
506-
// TODO: Disallow values with 'Grapheme_Cluster_Break = SpacingMark'?
507492
}
508493
};
509494

510-
// Some XID values are not valid ID part values.
511495
for ch in part.chars() {
512-
// Disallow uppercase and underscore, so that identifiers
513-
// consistently use `kebab-case`, and source languages can map
514-
// identifiers according to their own conventions (which might use
515-
// `CamelCase` or `snake_case` or something else) without worrying
516-
// about collisions.
517-
if ch.is_uppercase() || ch == '_' || !UnicodeXID::is_xid_continue(ch) {
518-
return Err(Error::InvalidCharInId(start, ch));
496+
if ch.is_ascii_digit() {
497+
// Digits are accepted in both uppercase and lowercase segments.
498+
} else if upper {
499+
if !ch.is_ascii_uppercase() {
500+
return Err(Error::InvalidCharInId(start, ch));
501+
}
502+
} else {
503+
if !ch.is_ascii_lowercase() {
504+
return Err(Error::InvalidCharInId(start, ch));
505+
}
519506
}
520507
}
521508
}
@@ -595,7 +582,6 @@ impl fmt::Display for Error {
595582
Error::InvalidCharInString(_, ch) => write!(f, "invalid character in string {:?}", ch),
596583
Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {:?}", ch),
597584
Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
598-
Error::IdNotSSNFC(_) => write!(f, "identifiers must be in stream-safe NFC"),
599585
Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {:?}", ch),
600586
}
601587
}
@@ -614,7 +600,6 @@ pub fn rewrite_error(err: &mut anyhow::Error, file: &str, contents: &str) {
614600
| Error::NewlineInString(at)
615601
| Error::InvalidCharInString(at, _)
616602
| Error::InvalidCharInId(at, _)
617-
| Error::IdNotSSNFC(at)
618603
| Error::IdPartEmpty(at)
619604
| Error::InvalidEscape(at, _) => *at,
620605
};
@@ -627,17 +612,17 @@ fn test_validate_id() {
627612
validate_id(0, "apple").unwrap();
628613
validate_id(0, "apple-pear").unwrap();
629614
validate_id(0, "apple-pear-grape").unwrap();
630-
validate_id(0, "garçon").unwrap();
631-
validate_id(0, "hühnervögel").unwrap();
632-
validate_id(0, "москва").unwrap();
633-
validate_id(0, "東京").unwrap();
634-
validate_id(0, "東-京").unwrap();
635-
validate_id(0, "garçon-hühnervögel-москва-東京").unwrap();
636-
validate_id(0, "garçon-hühnervögel-москва-東-京").unwrap();
637615
validate_id(0, "a0").unwrap();
638616
validate_id(0, "a").unwrap();
639617
validate_id(0, "a-a").unwrap();
640618
validate_id(0, "bool").unwrap();
619+
validate_id(0, "APPLE").unwrap();
620+
validate_id(0, "APPLE-PEAR").unwrap();
621+
validate_id(0, "APPLE-PEAR-GRAPE").unwrap();
622+
validate_id(0, "apple-PEAR-grape").unwrap();
623+
validate_id(0, "APPLE-pear-GRAPE").unwrap();
624+
validate_id(0, "ENOENT").unwrap();
625+
validate_id(0, "is-XML").unwrap();
641626

642627
assert!(validate_id(0, "").is_err());
643628
assert!(validate_id(0, "0").is_err());
@@ -652,7 +637,6 @@ fn test_validate_id() {
652637
assert!(validate_id(0, "a-").is_err());
653638
assert!(validate_id(0, "-a").is_err());
654639
assert!(validate_id(0, "Apple").is_err());
655-
assert!(validate_id(0, "APPLE").is_err());
656640
assert!(validate_id(0, "applE").is_err());
657641
assert!(validate_id(0, "-apple-pear").is_err());
658642
assert!(validate_id(0, "apple-pear-").is_err());
@@ -675,11 +659,10 @@ fn test_validate_id() {
675659
assert!(validate_id(0, "_Znwj").is_err());
676660
assert!(validate_id(0, "__i386").is_err());
677661
assert!(validate_id(0, "__i386__").is_err());
678-
assert!(validate_id(0, "ENOENT").is_err());
679662
assert!(validate_id(0, "Москва").is_err());
680663
assert!(validate_id(0, "garçon-hühnervögel-Москва-東京").is_err());
681664
assert!(validate_id(0, "😼").is_err(), "non-identifier");
682-
assert!(validate_id(0, "\u{212b}").is_err(), "not NFC");
665+
assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
683666
}
684667

685668
#[test]
@@ -716,6 +699,13 @@ fn test_tokenizer() {
716699
assert_eq!(collect("%a-a").unwrap(), vec![Token::ExplicitId]);
717700
assert_eq!(collect("%bool").unwrap(), vec![Token::ExplicitId]);
718701
assert_eq!(collect("%").unwrap(), vec![Token::ExplicitId]);
702+
assert_eq!(collect("APPLE").unwrap(), vec![Token::Id]);
703+
assert_eq!(collect("APPLE-PEAR").unwrap(), vec![Token::Id]);
704+
assert_eq!(collect("APPLE-PEAR-GRAPE").unwrap(), vec![Token::Id]);
705+
assert_eq!(collect("apple-PEAR-grape").unwrap(), vec![Token::Id]);
706+
assert_eq!(collect("APPLE-pear-GRAPE").unwrap(), vec![Token::Id]);
707+
assert_eq!(collect("ENOENT").unwrap(), vec![Token::Id]);
708+
assert_eq!(collect("is-XML").unwrap(), vec![Token::Id]);
719709

720710
assert_eq!(collect("func").unwrap(), vec![Token::Func]);
721711
assert_eq!(

tests/codegen/conventions.wit

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ foo: func(x: ludicrous-speed)
1414
apple: func()
1515
apple-pear: func()
1616
apple-pear-grape: func()
17-
garçon: func()
18-
hühnervögel: func()
19-
москва: func()
20-
東-京: func()
21-
garçon-hühnervögel-москва-東-京: func()
2217
a0: func()
2318

19+
// Comment out identifiers that collide when mapped to snake_case, for now; see
20+
// https://github.com/WebAssembly/component-model/issues/118
21+
//APPLE: func()
22+
//APPLE-pear-GRAPE: func()
23+
//apple-PEAR-grape: func()
24+
25+
is-XML: func()
26+
2427
%explicit: func()
2528
%explicit-kebab: func()
2629

0 commit comments

Comments
 (0)