|
| 1 | +From a3ac7f97a0dacdbc4df51322541cb4b1d063fe12 Mon Sep 17 00:00:00 2001 |
| 2 | +From: AllSpark <allspark@microsoft.com> |
| 3 | +Date: Thu, 9 Apr 2026 09:13:59 +0000 |
| 4 | +Subject: [PATCH] Security hardening for external_data: whitelist keys, |
| 5 | + parse-time bounds, file-size validation; warnings; export constants |
| 6 | + |
| 7 | +Upstream-reference: https://github.com/onnx/onnx/commit/e30c6935d67cc3eca2fa284e37248e7c0036c46b.patch |
| 8 | +Note: The original patch authored by AllSpark was backported by Aninda <v-anipradhan@microsoft.com> to apply to version 2.0.0 of PyTorch on Azure Linux. |
| 9 | + |
| 10 | +--- |
| 11 | + third_party/onnx/onnx/external_data_helper.py | 102 ++++++++++++++++-- |
| 12 | + 1 file changed, 91 insertions(+), 11 deletions(-) |
| 13 | + |
| 14 | +diff --git a/third_party/onnx/onnx/external_data_helper.py b/third_party/onnx/onnx/external_data_helper.py |
| 15 | +index 6763b11d..27a0a407 100644 |
| 16 | +--- a/third_party/onnx/onnx/external_data_helper.py |
| 17 | ++++ b/third_party/onnx/onnx/external_data_helper.py |
| 18 | +@@ -3,12 +3,65 @@ import os |
| 19 | + import re |
| 20 | + import sys |
| 21 | + import uuid |
| 22 | ++import warnings |
| 23 | + from itertools import chain |
| 24 | +-from typing import Callable, Iterable, Optional |
| 25 | ++from typing import Callable, Iterable, Optional, IO |
| 26 | + |
| 27 | + import onnx.onnx_cpp2py_export.checker as c_checker |
| 28 | + from .onnx_pb import AttributeProto, GraphProto, ModelProto, TensorProto |
| 29 | + |
| 30 | ++# Security: 3-layer defense against malicious external_data entries (GHSA-538c-55jv-c5g9) |
| 31 | ++# |
| 32 | ++# Layer 1 (here) — Attribute whitelist: Only spec-defined keys are accepted. |
| 33 | ++# Unknown keys are warned and ignored, preventing arbitrary attribute injection (CWE-915). |
| 34 | ++# |
| 35 | ++# Layer 2 (ExternalDataInfo.__init__) — Bounds validation: offset and length must be |
| 36 | ++# non-negative integers. Catches invalid values at parse time (CWE-400). |
| 37 | ++# |
| 38 | ++# Layer 3 (load_external_data_for_tensor) — File-size validation: offset and length are |
| 39 | ++# checked against actual file size before reading. This is the critical safety net that |
| 40 | ++# prevents memory exhaustion regardless of how the model was constructed (CWE-400). |
| 41 | ++# |
| 42 | ++# 'basepath' is included because set_external_data() writes it to protobuf entries; |
| 43 | ++# it must survive save/load round-trips. |
| 44 | ++_ALLOWED_EXTERNAL_DATA_KEYS = frozenset({"location", "offset", "length", "checksum", "basepath"}) |
| 45 | ++_SORTED_ALLOWED_KEYS = sorted(_ALLOWED_EXTERNAL_DATA_KEYS) |
| 46 | ++_MAX_UNKNOWN_KEYS_IN_WARNING = 10 |
| 47 | ++ |
| 48 | ++ |
| 49 | ++def _validate_external_data_file_bounds( |
| 50 | ++ data_file: IO[bytes], |
| 51 | ++ info: ExternalDataInfo, |
| 52 | ++ tensor_name: str, |
| 53 | ++) -> bytes: |
| 54 | ++ """Validate offset/length against actual file size and read data. |
| 55 | ++ |
| 56 | ++ Layer 3 defense-in-depth (CWE-400): prevents memory exhaustion even if the |
| 57 | ++ model was crafted via direct protobuf APIs that bypass Python parsing. |
| 58 | ++ |
| 59 | ++ Returns the raw bytes read from the file. |
| 60 | ++ """ |
| 61 | ++ file_size = os.fstat(data_file.fileno()).st_size |
| 62 | ++ |
| 63 | ++ if info.offset is not None: |
| 64 | ++ if info.offset > file_size: |
| 65 | ++ raise ValueError( |
| 66 | ++ f"External data offset ({info.offset}) exceeds file size ({file_size}) for tensor {tensor_name!r}" |
| 67 | ++ ) |
| 68 | ++ data_file.seek(info.offset) |
| 69 | ++ |
| 70 | ++ if info.length is not None: |
| 71 | ++ read_start = info.offset if info.offset is not None else 0 |
| 72 | ++ available = file_size - read_start |
| 73 | ++ if info.length > available: |
| 74 | ++ raise ValueError( |
| 75 | ++ f"External data length ({info.length}) exceeds available data ({available} bytes from offset {read_start}) for tensor {tensor_name!r}" |
| 76 | ++ ) |
| 77 | ++ return data_file.read(info.length) |
| 78 | ++ return data_file.read() |
| 79 | ++ |
| 80 | ++_MAX_KEY_DISPLAY_LENGTH = 100 |
| 81 | ++ |
| 82 | + |
| 83 | + class ExternalDataInfo: |
| 84 | + def __init__(self, tensor: TensorProto) -> None: |
| 85 | +@@ -18,14 +71,45 @@ class ExternalDataInfo: |
| 86 | + self.checksum = None |
| 87 | + self.basepath = "" |
| 88 | + |
| 89 | ++ unknown_keys: set[str] = set() |
| 90 | ++ unknown_key_count = 0 |
| 91 | + for entry in tensor.external_data: |
| 92 | +- setattr(self, entry.key, entry.value) |
| 93 | ++ # Layer 1: reject unknown keys (CWE-915 defense-in-depth) |
| 94 | ++ if entry.key in _ALLOWED_EXTERNAL_DATA_KEYS: |
| 95 | ++ setattr(self, entry.key, entry.value) |
| 96 | ++ else: |
| 97 | ++ unknown_key_count += 1 |
| 98 | ++ if len(unknown_keys) < _MAX_UNKNOWN_KEYS_IN_WARNING: |
| 99 | ++ truncated = entry.key[:_MAX_KEY_DISPLAY_LENGTH] |
| 100 | ++ if len(entry.key) > _MAX_KEY_DISPLAY_LENGTH: |
| 101 | ++ truncated += "..." |
| 102 | ++ unknown_keys.add(truncated) |
| 103 | ++ |
| 104 | ++ if unknown_keys: |
| 105 | ++ shown = sorted(unknown_keys) |
| 106 | ++ extra = unknown_key_count - len(shown) |
| 107 | ++ key_list = repr(shown) |
| 108 | ++ if extra > 0: |
| 109 | ++ key_list += f" and {extra} more" |
| 110 | ++ warnings.warn( |
| 111 | ++ f"Ignoring unknown external data key(s) {key_list} for tensor {tensor.name!r}. " |
| 112 | ++ f"Allowed keys: {_SORTED_ALLOWED_KEYS}", |
| 113 | ++ stacklevel=2, |
| 114 | ++ ) |
| 115 | + |
| 116 | +- if self.offset: |
| 117 | ++ if self.offset is not None: |
| 118 | + self.offset = int(self.offset) |
| 119 | ++ if self.offset < 0: |
| 120 | ++ raise ValueError( |
| 121 | ++ f"External data offset must be non-negative, got {self.offset} for tensor {tensor.name!r}" |
| 122 | ++ ) |
| 123 | + |
| 124 | +- if self.length: |
| 125 | ++ if self.length is not None: |
| 126 | + self.length = int(self.length) |
| 127 | ++ if self.length < 0: |
| 128 | ++ raise ValueError( |
| 129 | ++ f"External data length must be non-negative, got {self.length} for tensor {tensor.name!r}" |
| 130 | ++ ) |
| 131 | + |
| 132 | + |
| 133 | + def load_external_data_for_tensor(tensor: TensorProto, base_dir: str) -> None: |
| 134 | +@@ -42,13 +126,9 @@ def load_external_data_for_tensor(tensor: TensorProto, base_dir: str) -> None: |
| 135 | + base_dir, info.location, tensor.name |
| 136 | + ) |
| 137 | + with open(external_data_file_path, "rb") as data_file: |
| 138 | +- if info.offset: |
| 139 | +- data_file.seek(info.offset) |
| 140 | +- |
| 141 | +- if info.length: |
| 142 | +- tensor.raw_data = data_file.read(info.length) |
| 143 | +- else: |
| 144 | +- tensor.raw_data = data_file.read() |
| 145 | ++ tensor.raw_data = _validate_external_data_file_bounds( |
| 146 | ++ data_file, info, tensor.name |
| 147 | ++ ) |
| 148 | + |
| 149 | + |
| 150 | + def load_external_data_for_model(model: ModelProto, base_dir: str) -> None: |
| 151 | +-- |
| 152 | +2.34.1 |
| 153 | + |
0 commit comments