Skip to content

Commit 8dc6ddf

Browse files
authored
Simplify and clarify some value type parsing (#1601)
Reading over #1600 I remembered that I have a difficult time understanding how value types are encoded in wasm. There's a number of overlapping concerns and a bit of duplication within `wasmparser` itself. I've tried to leave an explanatory comment for myself in the future which can also hopefully help serve others as well. Along the way I've also managed to remove `ValType::is_valtype_byte` with some simpler logic to avoid duplication of all the value type bytes that are supported.
1 parent 10d2e21 commit 8dc6ddf

2 files changed

Lines changed: 92 additions & 30 deletions

File tree

crates/wasmparser/src/binary_reader.rs

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -732,14 +732,27 @@ impl<'a> BinaryReader<'a> {
732732
pub(crate) fn read_block_type(&mut self) -> Result<BlockType> {
733733
let b = self.peek()?;
734734

735-
// Check for empty block
736-
if b == 0x40 {
737-
self.position += 1;
738-
return Ok(BlockType::Empty);
739-
}
740-
741-
// Check for a block type of form [] -> [t].
742-
if ValType::is_valtype_byte(b) {
735+
// Block types are encoded as either 0x40, a `valtype`, or `s33`. All
736+
// current `valtype` encodings are negative numbers when encoded with
737+
// sleb128, but it's also required that valtype encodings are in their
738+
// canonical form. For example an overlong encoding of -1 as `0xff 0x7f`
739+
// is not valid and it is required to be `0x7f`. This means that we
740+
// can't simply match on the `s33` that pops out below since reading the
741+
// whole `s33` might read an overlong encoding.
742+
//
743+
// To test for this the first byte `b` is inspected. The highest bit,
744+
// the continuation bit in LEB128 encoding, must be clear. The next bit,
745+
// the sign bit, must be set to indicate that the number is negative. If
746+
// these two conditions hold then we're guaranteed that this is a
747+
// negative number.
748+
//
749+
// After this a value type is read directly instead of looking for an
750+
// indexed value type.
751+
if b & 0x80 == 0 && b & 0x40 != 0 {
752+
if b == 0x40 {
753+
self.position += 1;
754+
return Ok(BlockType::Empty);
755+
}
743756
return Ok(BlockType::Type(self.read()?));
744757
}
745758

crates/wasmparser/src/readers/core/types.rs

Lines changed: 71 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,18 +1369,10 @@ pub enum HeapType {
13691369
NoExn,
13701370
}
13711371

1372-
impl ValType {
1373-
pub(crate) fn is_valtype_byte(byte: u8) -> bool {
1374-
match byte {
1375-
0x7F | 0x7E | 0x7D | 0x7C | 0x7B | 0x70 | 0x6F | 0x64 | 0x63 | 0x6E | 0x71 | 0x72
1376-
| 0x74 | 0x73 | 0x6D | 0x6B | 0x6A | 0x6C | 0x69 => true,
1377-
_ => false,
1378-
}
1379-
}
1380-
}
1381-
13821372
impl<'a> FromReader<'a> for StorageType {
13831373
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
1374+
// NB: See `FromReader<'a> for ValType` for a table of how this
1375+
// interacts with other value encodings.
13841376
match reader.peek()? {
13851377
0x78 => {
13861378
reader.read_u8()?;
@@ -1397,6 +1389,53 @@ impl<'a> FromReader<'a> for StorageType {
13971389

13981390
impl<'a> FromReader<'a> for ValType {
13991391
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
1392+
// Decoding value types is sort of subtle because the space of what's
1393+
// being decoded here is actually spread out across an number of
1394+
// locations. This comment here is intended to serve as a bit of a
1395+
// reference to what's being decoded here and how it interacts with
1396+
// other locations.
1397+
//
1398+
// Note that all value types are encoded as canonical-form negative
1399+
// numbers in the sleb128 encoding scheme. Currently in the wasm spec
1400+
// sleb128 isn't actually used but it looks to be modelled to allow it
1401+
// one day. In the meantime the current values used are:
1402+
//
1403+
// | sleb128 | decimal | type | notes |
1404+
// |---------|---------|--------------|------------------------------|
1405+
// | 0x7F | -1 | i32 | |
1406+
// | 0x7E | -2 | i64 | |
1407+
// | 0x7D | -3 | f32 | |
1408+
// | 0x7C | -4 | f64 | |
1409+
// | 0x7B | -5 | v128 | simd proposal |
1410+
// | 0x78 | -8 | i8 | gc proposal, in `FieldType` |
1411+
// | 0x77 | -9 | i16 | gc proposal, in `FieldType` |
1412+
// | 0x74 | -12 | noexn | gc + exceptions proposal |
1413+
// | 0x73 | -13 | nofunc | gc proposal |
1414+
// | 0x72 | -14 | noextern | gc proposal |
1415+
// | 0x71 | -15 | nullref | gc proposal |
1416+
// | 0x70 | -16 | func | reference types proposal |
1417+
// | 0x6F | -17 | extern | reference types proposal |
1418+
// | 0x6E | -18 | any | gc proposal |
1419+
// | 0x6D | -19 | eq | gc proposal |
1420+
// | 0x6C | -20 | i31 | gc proposal |
1421+
// | 0x6B | -21 | struct | gc proposal |
1422+
// | 0x6A | -22 | array | gc proposal |
1423+
// | 0x69 | -23 | exnref | gc + exceptions proposal |
1424+
// | 0x64 | -28 | ref $t | gc proposal, prefix byte |
1425+
// | 0x63 | -29 | ref null $t | gc proposal, prefix byte |
1426+
// | 0x60 | -32 | func $t | prefix byte |
1427+
// | 0x5f | -33 | struct $t | gc proposal, prefix byte |
1428+
// | 0x5e | -34 | array $t | gc proposal, prefix byte |
1429+
// | 0x50 | -48 | sub $t | gc proposal, prefix byte |
1430+
// | 0x4F | -49 | sub final $t | gc proposal, prefix byte |
1431+
// | 0x4E | -50 | rec $t | gc proposal, prefix byte |
1432+
// | 0x40 | -64 | ε | empty block type |
1433+
//
1434+
// Note that not all of these encodings are parsed here, for example
1435+
// 0x78 as the encoding for `i8` is parsed only in `FieldType`. The
1436+
// parsing of `FieldType` will delegate here without actually consuming
1437+
// anything though so the encoding 0x78 still must be disjoint and not
1438+
// read here otherwise.
14001439
match reader.peek()? {
14011440
0x7F => {
14021441
reader.read_u8()?;
@@ -1427,32 +1466,36 @@ impl<'a> FromReader<'a> for ValType {
14271466

14281467
impl<'a> FromReader<'a> for RefType {
14291468
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
1469+
// NB: See `FromReader<'a> for ValType` for a table of how this
1470+
// interacts with other value encodings.
14301471
match reader.read()? {
1431-
0x70 => Ok(RefType::FUNC.nullable()),
1432-
0x6F => Ok(RefType::EXTERN.nullable()),
1433-
0x6E => Ok(RefType::ANY.nullable()),
1434-
0x71 => Ok(RefType::NONE.nullable()),
1435-
0x72 => Ok(RefType::NOEXTERN.nullable()),
1436-
0x73 => Ok(RefType::NOFUNC.nullable()),
1437-
0x6D => Ok(RefType::EQ.nullable()),
1438-
0x6B => Ok(RefType::STRUCT.nullable()),
1439-
0x6A => Ok(RefType::ARRAY.nullable()),
1440-
0x6C => Ok(RefType::I31.nullable()),
1441-
0x69 => Ok(RefType::EXN.nullable()),
1442-
0x74 => Ok(RefType::NOEXN.nullable()),
14431472
byte @ (0x63 | 0x64) => {
14441473
let nullable = byte == 0x63;
14451474
let pos = reader.original_position();
14461475
RefType::new(nullable, reader.read()?)
14471476
.ok_or_else(|| crate::BinaryReaderError::new("type index too large", pos))
14481477
}
1478+
0x69 => Ok(RefType::EXN.nullable()),
1479+
0x6A => Ok(RefType::ARRAY.nullable()),
1480+
0x6B => Ok(RefType::STRUCT.nullable()),
1481+
0x6C => Ok(RefType::I31.nullable()),
1482+
0x6D => Ok(RefType::EQ.nullable()),
1483+
0x6E => Ok(RefType::ANY.nullable()),
1484+
0x6F => Ok(RefType::EXTERN.nullable()),
1485+
0x70 => Ok(RefType::FUNC.nullable()),
1486+
0x71 => Ok(RefType::NONE.nullable()),
1487+
0x72 => Ok(RefType::NOEXTERN.nullable()),
1488+
0x73 => Ok(RefType::NOFUNC.nullable()),
1489+
0x74 => Ok(RefType::NOEXN.nullable()),
14491490
_ => bail!(reader.original_position(), "malformed reference type"),
14501491
}
14511492
}
14521493
}
14531494

14541495
impl<'a> FromReader<'a> for HeapType {
14551496
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
1497+
// NB: See `FromReader<'a> for ValType` for a table of how this
1498+
// interacts with other value encodings.
14561499
match reader.peek()? {
14571500
0x70 => {
14581501
reader.read_u8()?;
@@ -1669,6 +1712,8 @@ fn read_composite_type(
16691712
opcode: u8,
16701713
reader: &mut BinaryReader,
16711714
) -> Result<CompositeType, BinaryReaderError> {
1715+
// NB: See `FromReader<'a> for ValType` for a table of how this
1716+
// interacts with other value encodings.
16721717
Ok(match opcode {
16731718
0x60 => CompositeType::Func(reader.read()?),
16741719
0x5e => CompositeType::Array(reader.read()?),
@@ -1679,6 +1724,8 @@ fn read_composite_type(
16791724

16801725
impl<'a> FromReader<'a> for RecGroup {
16811726
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
1727+
// NB: See `FromReader<'a> for ValType` for a table of how this
1728+
// interacts with other value encodings.
16821729
match reader.peek()? {
16831730
0x4e => {
16841731
reader.read_u8()?;
@@ -1702,6 +1749,8 @@ impl<'a> FromReader<'a> for RecGroup {
17021749
impl<'a> FromReader<'a> for SubType {
17031750
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
17041751
let pos = reader.original_position();
1752+
// NB: See `FromReader<'a> for ValType` for a table of how this
1753+
// interacts with other value encodings.
17051754
Ok(match reader.read_u8()? {
17061755
opcode @ (0x4f | 0x50) => {
17071756
let idx_iter = reader.read_iter(MAX_WASM_SUPERTYPES, "supertype idxs")?;

0 commit comments

Comments
 (0)