From 34b25973510df3b4f54023f3421c0da027f2660c Mon Sep 17 00:00:00 2001 From: Kirit93 Date: Tue, 19 May 2026 09:07:19 -0700 Subject: [PATCH 1/5] feat(telemetry): add anonymous opt-out usage telemetry Signed-off-by: Kirit93 --- .cargo/config.toml | 8 - .github/workflows/branch-checks.yml | 12 +- .github/workflows/branch-docs.yml | 4 +- .github/workflows/branch-e2e.yml | 2 +- .github/workflows/ci-image.yml | 6 +- .github/workflows/deb-package.yml | 10 +- .github/workflows/docker-build.yml | 4 +- .github/workflows/driver-vm-linux.yml | 10 +- .github/workflows/driver-vm-macos.yml | 16 +- .github/workflows/e2e-gpu-test.yaml | 2 +- .github/workflows/e2e-test.yml | 2 +- .github/workflows/helm-lint.yml | 7 - .github/workflows/issue-triage.yml | 4 +- .github/workflows/release-auto-tag.yml | 2 +- .github/workflows/release-dev.yml | 54 +- .github/workflows/release-tag.yml | 65 +- .github/workflows/release-vm-kernel.yml | 26 +- .github/workflows/rpm-package.yml | 4 +- .github/workflows/rust-native-build.yml | 4 +- .github/workflows/test-gpu.yml | 2 +- .github/workflows/vouch-check.yml | 4 +- .github/workflows/vouch-command.yml | 2 +- .gitignore | 3 - .packit.yaml | 10 +- CONTRIBUTING.md | 19 +- Cargo.lock | 7 +- README.md | 12 +- architecture/build.md | 4 +- architecture/gateway.md | 87 - crates/openshell-cli/Cargo.toml | 1 - crates/openshell-cli/src/main.rs | 296 -- crates/openshell-cli/src/run.rs | 597 +--- .../tests/ensure_providers_integration.rs | 63 +- .../openshell-cli/tests/mtls_integration.rs | 27 - .../tests/provider_commands_integration.rs | 314 +- .../sandbox_create_lifecycle_integration.rs | 30 - .../sandbox_name_fallback_integration.rs | 28 - crates/openshell-core/Cargo.toml | 2 + crates/openshell-core/src/config.rs | 8 +- crates/openshell-core/src/lib.rs | 3 +- crates/openshell-core/src/metadata.rs | 142 +- crates/openshell-core/src/paths.rs | 27 - crates/openshell-core/src/telemetry.rs | 266 ++ crates/openshell-driver-docker/src/lib.rs | 26 +- 
crates/openshell-driver-docker/src/tests.rs | 10 +- crates/openshell-driver-podman/NETWORKING.md | 21 +- crates/openshell-driver-podman/README.md | 16 +- .../openshell-driver-podman/src/container.rs | 2 +- crates/openshell-driver-vm/README.md | 19 - .../scripts/openshell-vm-sandbox-init.sh | 7 - crates/openshell-driver-vm/src/lib.rs | 1 - crates/openshell-driver-vm/src/nft_ruleset.rs | 92 - crates/openshell-driver-vm/src/runtime.rs | 108 +- .../src/events/network_activity.rs | 2 +- crates/openshell-ocsf/src/format/shorthand.rs | 4 +- .../src/objects/firewall_rule.rs | 2 +- crates/openshell-providers/src/lib.rs | 14 +- crates/openshell-providers/src/profiles.rs | 246 +- .../src/providers/claude.rs | 2 +- crates/openshell-sandbox/Cargo.toml | 1 - .../src/activity_aggregator.rs | 153 + .../openshell-sandbox/src/bypass_monitor.rs | 24 +- crates/openshell-sandbox/src/child_env.rs | 6 +- crates/openshell-sandbox/src/grpc_client.rs | 13 +- crates/openshell-sandbox/src/l7/graphql.rs | 1 + crates/openshell-sandbox/src/l7/relay.rs | 18 + crates/openshell-sandbox/src/l7/websocket.rs | 1 + crates/openshell-sandbox/src/lib.rs | 214 +- crates/openshell-sandbox/src/policy_local.rs | 2 +- .../src/provider_credentials.rs | 62 +- crates/openshell-sandbox/src/proxy.rs | 246 +- .../src/sandbox/linux/mod.rs | 1 - .../src/sandbox/linux/netns.rs | 540 +++- .../src/sandbox/linux/nft_ruleset.rs | 148 - crates/openshell-sandbox/src/secrets.rs | 61 +- .../postgres/005_add_resource_version.sql | 5 - .../sqlite/005_add_resource_version.sql | 5 - crates/openshell-server/src/auth/authz.rs | 62 - crates/openshell-server/src/certgen.rs | 6 +- crates/openshell-server/src/cli.rs | 218 +- crates/openshell-server/src/compute/mod.rs | 455 +-- crates/openshell-server/src/compute/vm.rs | 21 +- crates/openshell-server/src/config_file.rs | 43 - crates/openshell-server/src/defaults.rs | 155 - crates/openshell-server/src/grpc/mod.rs | 85 +- crates/openshell-server/src/grpc/policy.rs | 595 +--- 
crates/openshell-server/src/grpc/provider.rs | 2623 ++--------------- crates/openshell-server/src/grpc/sandbox.rs | 969 +----- crates/openshell-server/src/grpc/service.rs | 223 +- .../openshell-server/src/grpc/validation.rs | 21 - crates/openshell-server/src/inference.rs | 218 +- crates/openshell-server/src/lib.rs | 32 +- .../openshell-server/src/persistence/mod.rs | 336 +-- .../src/persistence/postgres.rs | 239 +- .../src/persistence/sqlite.rs | 222 +- .../openshell-server/src/persistence/tests.rs | 459 +-- .../openshell-server/src/provider_refresh.rs | 1221 -------- .../openshell-server/src/service_routing.rs | 1 - crates/openshell-server/src/ssh_sessions.rs | 1 - .../src/supervisor_session.rs | 6 +- crates/openshell-server/src/telemetry.rs | 246 ++ crates/openshell-server/tests/common/mod.rs | 28 - .../tests/supervisor_relay_integration.rs | 27 - crates/openshell-tui/src/lib.rs | 9 - deploy/deb/init-gateway-config.sh | 56 + deploy/deb/openshell-gateway.service | 11 +- deploy/docker/Dockerfile.ci | 8 +- deploy/docker/Dockerfile.gateway | 11 +- deploy/helm/openshell/.helmignore | 1 - deploy/helm/openshell/README.md | 28 +- deploy/helm/openshell/README.md.gotmpl | 79 - deploy/helm/openshell/values.yaml | 123 +- deploy/man/openshell-gateway.8.md | 79 +- deploy/man/openshell-gateway.env.5.md | 127 + deploy/man/openshell.1.md | 4 +- deploy/rpm/CONFIGURATION.md | 160 +- deploy/rpm/QUICKSTART.md | 15 +- deploy/rpm/TROUBLESHOOTING.md | 29 +- deploy/rpm/gateway.toml.default | 30 - deploy/rpm/init-gateway-env.sh | 140 + deploy/snap/README.md | 26 +- deploy/snap/bin/openshell-gateway-wrapper | 22 +- deploy/snap/meta/snap.yaml.in | 6 + docs/about/installation.mdx | 10 +- docs/reference/gateway-auth.mdx | 4 +- docs/reference/gateway-config.mdx | 25 +- docs/reference/sandbox-compute-drivers.mdx | 40 +- docs/sandboxes/manage-providers.mdx | 56 +- docs/security/best-practices.mdx | 4 +- e2e/rust/Cargo.toml | 7 +- e2e/rust/e2e-podman.sh | 4 +- e2e/rust/src/harness/cli.rs | 107 - 
e2e/rust/src/harness/mod.rs | 1 - e2e/rust/tests/gateway_resume.rs | 112 +- e2e/rust/tests/podman_gateway_resume.rs | 81 - e2e/rust/tests/provider_auto_create.rs | 18 +- e2e/rust/tests/vm_gateway_resume.rs | 110 +- e2e/rust/tests/websocket_conformance.rs | 6 +- e2e/support/gateway-common.sh | 22 +- e2e/with-docker-gateway.sh | 48 +- e2e/with-kube-gateway.sh | 9 +- e2e/with-podman-gateway.sh | 47 +- examples/bring-your-own-container/Dockerfile | 4 +- mise.lock | 156 +- mise.toml | 16 +- openshell.spec | 92 +- openshell_telemetry_schema.json | 287 ++ proto/datamodel.proto | 9 +- proto/openshell.proto | 155 +- providers/anthropic.yaml | 22 + providers/{claude-code.yaml => claude.yaml} | 2 +- providers/codex.yaml | 22 + providers/copilot.yaml | 26 + providers/gitlab.yaml | 26 + providers/openai.yaml | 22 + providers/opencode.yaml | 22 + providers/outlook.yaml | 7 + pyproject.toml | 1 - python/openshell/release_formula_test.py | 91 +- rfc/0003-gateway-configuration/README.md | 2 +- snapcraft.yaml | 6 + tasks/ci.toml | 2 +- tasks/helm.toml | 24 +- tasks/scripts/helm-k3s-local.sh | 13 +- tasks/scripts/package-deb.sh | 2 + tasks/scripts/release.py | 84 +- tasks/test.toml | 2 +- 167 files changed, 4016 insertions(+), 11535 deletions(-) delete mode 100644 .cargo/config.toml create mode 100644 crates/openshell-core/src/telemetry.rs delete mode 100644 crates/openshell-driver-vm/src/nft_ruleset.rs create mode 100644 crates/openshell-sandbox/src/activity_aggregator.rs delete mode 100644 crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs delete mode 100644 crates/openshell-server/migrations/postgres/005_add_resource_version.sql delete mode 100644 crates/openshell-server/migrations/sqlite/005_add_resource_version.sql delete mode 100644 crates/openshell-server/src/defaults.rs delete mode 100644 crates/openshell-server/src/provider_refresh.rs create mode 100644 crates/openshell-server/src/telemetry.rs create mode 100755 deploy/deb/init-gateway-config.sh delete mode 100644 
deploy/helm/openshell/README.md.gotmpl create mode 100644 deploy/man/openshell-gateway.env.5.md delete mode 100644 deploy/rpm/gateway.toml.default create mode 100644 deploy/rpm/init-gateway-env.sh delete mode 100644 e2e/rust/src/harness/cli.rs delete mode 100644 e2e/rust/tests/podman_gateway_resume.rs create mode 100644 openshell_telemetry_schema.json create mode 100644 providers/anthropic.yaml rename providers/{claude-code.yaml => claude.yaml} (98%) create mode 100644 providers/codex.yaml create mode 100644 providers/copilot.yaml create mode 100644 providers/gitlab.yaml create mode 100644 providers/openai.yaml create mode 100644 providers/opencode.yaml create mode 100644 providers/outlook.yaml diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index 0005fc2bd..000000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -[env] -# z3-sys bindgen needs the z3 include path. On some distros (e.g. RHEL/Fedora) -# the header lives in /usr/include/z3/ rather than /usr/include/. The extra -I -# is harmless on systems where the path doesn't exist. 
-BINDGEN_EXTRA_CLANG_ARGS = "-I/usr/include/z3" diff --git a/.github/workflows/branch-checks.yml b/.github/workflows/branch-checks.yml index 54084fddd..f7bc6ad1f 100644 --- a/.github/workflows/branch-checks.yml +++ b/.github/workflows/branch-checks.yml @@ -30,7 +30,7 @@ jobs: outputs: should_run: ${{ steps.gate.outputs.should_run }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - id: gate uses: ./.github/actions/pr-gate @@ -46,7 +46,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Mark workspace as safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" @@ -70,7 +70,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Install tools run: mise install --locked @@ -95,7 +95,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Install tools run: mise install --locked @@ -148,7 +148,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Install tools run: mise install --locked @@ -173,7 +173,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Install tools run: mise install --locked diff --git a/.github/workflows/branch-docs.yml b/.github/workflows/branch-docs.yml index 3b2a4099e..1368bc775 100644 --- a/.github/workflows/branch-docs.yml +++ b/.github/workflows/branch-docs.yml 
@@ -19,7 +19,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + uses: actions/checkout@v6 - name: Check Fern preview availability id: fern-preview @@ -34,7 +34,7 @@ jobs: fi - name: Setup Node.js - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6 + uses: actions/setup-node@v6 with: node-version: "24" diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index 49f9ddb03..3d8dd5928 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -18,7 +18,7 @@ jobs: outputs: should_run: ${{ steps.gate.outputs.should_run }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - id: gate uses: ./.github/actions/pr-gate with: diff --git a/.github/workflows/ci-image.yml b/.github/workflows/ci-image.yml index f0e7caef3..327ce0733 100644 --- a/.github/workflows/ci-image.yml +++ b/.github/workflows/ci-image.yml @@ -35,10 +35,10 @@ jobs: runs-on: ${{ matrix.runner }} timeout-minutes: 60 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Log in to GitHub Container Registry - uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -91,7 +91,7 @@ jobs: timeout-minutes: 10 steps: - name: Log in to GitHub Container Registry - uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} diff --git a/.github/workflows/deb-package.yml b/.github/workflows/deb-package.yml index 6ae6127a4..72628a23a 100644 --- a/.github/workflows/deb-package.yml +++ b/.github/workflows/deb-package.yml @@ -42,24 +42,24 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: 
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] }} - name: Download CLI artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: cli-linux-${{ matrix.arch }} path: package-input/ - name: Download gateway artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: gateway-binary-linux-${{ matrix.arch }} path: package-input/ - name: Download VM driver artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: driver-vm-linux-${{ matrix.arch }} path: package-input/ @@ -85,7 +85,7 @@ jobs: tasks/scripts/package-deb.sh - name: Upload Debian package artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: deb-linux-${{ matrix.arch }} path: artifacts/*.deb diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index e997c6fb9..450d6b5c5 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -186,7 +186,7 @@ jobs: DOCKER_PUSH: ${{ inputs.push && '1' || '0' }} DOCKER_PLATFORM: ${{ matrix.platform }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] || github.sha }} fetch-depth: 0 @@ -207,7 +207,7 @@ jobs: buildkitd-config: /etc/buildkit/buildkitd.toml - name: Download Rust binary artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: ${{ needs.resolve.outputs.artifact_prefix }}-linux-${{ matrix.arch }} path: prebuilt-rust-binary diff --git a/.github/workflows/driver-vm-linux.yml b/.github/workflows/driver-vm-linux.yml index 
9e63b3aa2..8ad4073ca 100644 --- a/.github/workflows/driver-vm-linux.yml +++ b/.github/workflows/driver-vm-linux.yml @@ -32,7 +32,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] }} @@ -66,7 +66,7 @@ jobs: done - name: Upload runtime artifacts - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: vm-driver-kernel-runtime-tarballs path: runtime-artifacts/vm-runtime-*.tar.zst @@ -100,7 +100,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: ${{ inputs['image-tag'] }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] }} fetch-depth: 0 @@ -125,7 +125,7 @@ jobs: run: apt-get update && apt-get install -y --no-install-recommends zstd && rm -rf /var/lib/apt/lists/* - name: Download kernel runtime tarball - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: vm-driver-kernel-runtime-tarballs path: runtime-download/ @@ -188,7 +188,7 @@ jobs: -C target/release openshell-driver-vm - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: driver-vm-linux-${{ matrix.arch }} path: artifacts/*.tar.gz diff --git a/.github/workflows/driver-vm-macos.yml b/.github/workflows/driver-vm-macos.yml index 5618d3731..915e007c9 100644 --- a/.github/workflows/driver-vm-macos.yml +++ b/.github/workflows/driver-vm-macos.yml @@ -32,7 +32,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] }} @@ 
-60,7 +60,7 @@ jobs: run: test -f runtime-artifacts/vm-runtime-darwin-aarch64.tar.zst - name: Upload runtime artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: vm-driver-macos-kernel-runtime-tarball path: runtime-artifacts/vm-runtime-darwin-aarch64.tar.zst @@ -79,7 +79,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: ${{ inputs['image-tag'] }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] }} fetch-depth: 0 @@ -113,7 +113,7 @@ jobs: run: mise x -- sccache --show-stats - name: Upload supervisor bundle - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: driver-vm-supervisor-arm64 path: target/vm-runtime-compressed/openshell-sandbox.zst @@ -135,7 +135,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] }} fetch-depth: 0 @@ -156,7 +156,7 @@ jobs: run: apt-get update && apt-get install -y --no-install-recommends zstd && rm -rf /var/lib/apt/lists/* - name: Download kernel runtime tarball - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: vm-driver-macos-kernel-runtime-tarball path: runtime-download/ @@ -171,7 +171,7 @@ jobs: tasks/scripts/vm/compress-vm-runtime.sh - name: Download bundled supervisor - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: driver-vm-supervisor-arm64 path: target/vm-runtime-compressed-macos/ @@ -214,7 +214,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: 
actions/upload-artifact@v7 with: name: driver-vm-macos path: artifacts/*.tar.gz diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index 78cd7e4d1..0004bcbe2 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -55,7 +55,7 @@ jobs: # probe below and by the e2e tests in e2e/rust/tests/gpu_device_selection.rs. OPENSHELL_E2E_GPU_PROBE_IMAGE: "nvcr.io/nvidia/base/ubuntu:noble-20251013" steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index aabddee96..db8010d0f 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -58,7 +58,7 @@ jobs: OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] || github.sha }} diff --git a/.github/workflows/helm-lint.yml b/.github/workflows/helm-lint.yml index 9f3e2fbcb..78739793c 100644 --- a/.github/workflows/helm-lint.yml +++ b/.github/workflows/helm-lint.yml @@ -9,10 +9,6 @@ on: - "pull-request/[0-9]+" paths: - "deploy/helm/**" - - "mise.toml" - - "mise.lock" - - "tasks/helm.toml" - - ".github/workflows/helm-lint.yml" workflow_dispatch: env: @@ -60,8 +56,5 @@ jobs: - name: Lint Helm chart run: mise run helm:lint - - name: Check Helm chart README - run: mise run helm:docs:check - - name: Run Helm chart unit tests run: mise run helm:test diff --git a/.github/workflows/issue-triage.yml b/.github/workflows/issue-triage.yml index 4aa3d6697..b59d8ba34 100644 --- a/.github/workflows/issue-triage.yml +++ b/.github/workflows/issue-triage.yml @@ -14,7 +14,7 @@ jobs: steps: - name: Check 
contributor permissions id: contributor - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: result-encoding: string script: | @@ -46,7 +46,7 @@ jobs: - name: Add triage label if: steps.contributor.outputs.result == 'true' - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: script: | await github.rest.issues.addLabels({ diff --git a/.github/workflows/release-auto-tag.yml b/.github/workflows/release-auto-tag.yml index 2b10a5b6e..f89c506d7 100644 --- a/.github/workflows/release-auto-tag.yml +++ b/.github/workflows/release-auto-tag.yml @@ -20,7 +20,7 @@ jobs: create-tag: runs-on: ubuntu-latest steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 diff --git a/.github/workflows/release-dev.yml b/.github/workflows/release-dev.yml index 94c950772..f1df71b3f 100644 --- a/.github/workflows/release-dev.yml +++ b/.github/workflows/release-dev.yml @@ -33,7 +33,7 @@ jobs: rpm_version: ${{ steps.v.outputs.rpm_version }} rpm_release: ${{ steps.v.outputs.rpm_release }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -123,7 +123,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: dev steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -147,7 +147,7 @@ jobs: ls -la ${{ matrix.output_path }} - name: Upload wheel artifacts - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: python-wheels-${{ matrix.artifact }} path: ${{ matrix.output_path }} @@ -170,7 +170,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: dev steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - 
uses: actions/checkout@v6 with: fetch-depth: 0 @@ -193,7 +193,7 @@ jobs: ls -la target/wheels/*.whl - name: Upload wheel artifacts - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: python-wheels-macos path: target/wheels/*.whl @@ -231,7 +231,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: dev steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -307,7 +307,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: cli-linux-${{ matrix.arch }} path: artifacts/*.tar.gz @@ -332,7 +332,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -369,7 +369,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: cli-macos path: artifacts/*.tar.gz @@ -401,7 +401,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -454,7 +454,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: gateway-binary-linux-${{ matrix.arch }} path: artifacts/*.tar.gz @@ -479,7 +479,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -521,7 +521,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: 
actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: gateway-binary-macos path: artifacts/*.tar.gz @@ -553,7 +553,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: fetch-depth: 0 @@ -604,7 +604,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: supervisor-binary-linux-${{ matrix.arch }} path: artifacts/*.tar.gz @@ -666,52 +666,52 @@ jobs: outputs: wheel_filenames: ${{ steps.wheel_filenames.outputs.wheel_filenames }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Download all CLI artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: cli-* path: release/ merge-multiple: true - name: Download gateway binary artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: gateway-binary-* path: release/ merge-multiple: true - name: Download supervisor binary artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: supervisor-binary-* path: release/ merge-multiple: true - name: Download VM driver artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: driver-vm-* path: release/ merge-multiple: true - name: Download wheel artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: python-wheels-* path: release/ merge-multiple: true - name: Download Debian package artifacts - 
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: deb-linux-* path: release/ merge-multiple: true - name: Download RPM package artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: rpm-linux-* path: release/ @@ -785,7 +785,7 @@ jobs: cat release/openshell.rb - name: Attest VM driver artifacts - uses: actions/attest@59d89421af93a897026c735860bf21b6eb4f7b26 # v4 + uses: actions/attest@v4 with: subject-path: | release/openshell-driver-vm-x86_64-unknown-linux-gnu.tar.gz @@ -793,7 +793,7 @@ jobs: release/openshell-driver-vm-aarch64-apple-darwin.tar.gz - name: Prune managed assets from dev release - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: script: | const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/'); @@ -848,7 +848,7 @@ jobs: git push --force origin dev - name: Create / update GitHub Release - uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2 + uses: softprops/action-gh-release@v2 with: name: OpenShell Development Build prerelease: true @@ -894,7 +894,7 @@ jobs: permissions: packages: write steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - uses: ./.github/actions/release-helm-oci with: diff --git a/.github/workflows/release-tag.yml b/.github/workflows/release-tag.yml index 4552551ff..18bf74db5 100644 --- a/.github/workflows/release-tag.yml +++ b/.github/workflows/release-tag.yml @@ -48,7 +48,7 @@ jobs: # Commit resolved from RELEASE_TAG, used for image tags and downstream metadata source_sha: ${{ steps.v.outputs.source_sha }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -152,7 +152,7 @@ jobs: 
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: ${{ needs.compute-versions.outputs.semver }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -177,7 +177,7 @@ jobs: ls -la ${{ matrix.output_path }} - name: Upload wheel artifacts - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: python-wheels-${{ matrix.artifact }} path: ${{ matrix.output_path }} @@ -200,7 +200,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: ${{ needs.compute-versions.outputs.semver }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -224,7 +224,7 @@ jobs: ls -la target/wheels/*.whl - name: Upload wheel artifacts - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: python-wheels-macos path: target/wheels/*.whl @@ -262,7 +262,7 @@ jobs: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENSHELL_IMAGE_TAG: ${{ needs.compute-versions.outputs.semver }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -339,7 +339,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: cli-linux-${{ matrix.arch }} path: artifacts/*.tar.gz @@ -364,7 +364,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -402,7 +402,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - 
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: cli-macos path: artifacts/*.tar.gz @@ -434,7 +434,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -488,7 +488,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: gateway-binary-linux-${{ matrix.arch }} path: artifacts/*.tar.gz @@ -520,7 +520,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -572,7 +572,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: supervisor-binary-linux-${{ matrix.arch }} path: artifacts/*.tar.gz @@ -597,7 +597,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 @@ -640,7 +640,7 @@ jobs: ls -lh artifacts/ - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: gateway-binary-macos path: artifacts/*.tar.gz @@ -702,54 +702,54 @@ jobs: outputs: wheel_filenames: ${{ steps.wheel_filenames.outputs.wheel_filenames }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} - name: Download all CLI artifacts - uses: 
actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: cli-* path: release/ merge-multiple: true - name: Download gateway binary artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: gateway-binary-* path: release/ merge-multiple: true - name: Download supervisor binary artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: supervisor-binary-* path: release/ merge-multiple: true - name: Download VM driver artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: driver-vm-* path: release/ merge-multiple: true - name: Download wheel artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: python-wheels-* path: release/ merge-multiple: true - name: Download Debian package artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: deb-linux-* path: release/ merge-multiple: true - name: Download RPM package artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: rpm-linux-* path: release/ @@ -798,16 +798,15 @@ jobs: cat release/openshell.rb - name: Attest VM driver artifacts - uses: actions/attest@59d89421af93a897026c735860bf21b6eb4f7b26 # v4 + uses: actions/attest@v4 with: subject-path: | - release/*.tar.gz - release/*.deb - release/*.rpm - release/*.whl + release/openshell-driver-vm-x86_64-unknown-linux-gnu.tar.gz + release/openshell-driver-vm-aarch64-unknown-linux-gnu.tar.gz + release/openshell-driver-vm-aarch64-apple-darwin.tar.gz - name: Prune removed VM checksum asset - uses: 
actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: script: | const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/'); @@ -829,7 +828,7 @@ jobs: } - name: Create GitHub Release - uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2 + uses: softprops/action-gh-release@v2 with: name: OpenShell ${{ env.RELEASE_TAG }} prerelease: false @@ -870,12 +869,12 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 15 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} - name: Setup Node.js - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6 + uses: actions/setup-node@v6 with: node-version: "24" @@ -898,7 +897,7 @@ jobs: permissions: packages: write steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.tag || github.ref }} diff --git a/.github/workflows/release-vm-kernel.yml b/.github/workflows/release-vm-kernel.yml index d94bfe399..abd7a633c 100644 --- a/.github/workflows/release-vm-kernel.yml +++ b/.github/workflows/release-vm-kernel.yml @@ -47,7 +47,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Mark workspace safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" @@ -63,7 +63,7 @@ jobs: --output artifacts/vm-runtime-linux-aarch64.tar.zst - name: Upload runtime artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: vm-runtime-linux-arm64 path: artifacts/vm-runtime-linux-aarch64.tar.zst @@ -73,7 +73,7 @@ jobs: # the aarch64 Linux kernel as a byte array — it is OS-agnostic and can # be compiled into a .dylib by Apple's cc without rebuilding the kernel. 
- name: Upload kernel.c for macOS build - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: kernel-c-arm64 path: | @@ -97,7 +97,7 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Mark workspace safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" @@ -113,7 +113,7 @@ jobs: --output artifacts/vm-runtime-linux-x86_64.tar.zst - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: vm-runtime-linux-amd64 path: artifacts/vm-runtime-linux-x86_64.tar.zst @@ -130,7 +130,7 @@ jobs: env: RUSTC_WRAPPER: "" steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Install dependencies run: | @@ -140,7 +140,7 @@ jobs: brew install lld dtc xz - name: Download pre-built kernel.c - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: name: kernel-c-arm64 path: target/kernel-artifact @@ -156,7 +156,7 @@ jobs: --output artifacts/vm-runtime-darwin-aarch64.tar.zst - name: Upload artifact - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: vm-runtime-macos-arm64 path: artifacts/vm-runtime-darwin-aarch64.tar.zst @@ -176,17 +176,17 @@ jobs: attestations: write artifact-metadata: write steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - name: Download all runtime artifacts - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + uses: actions/download-artifact@v4 with: pattern: vm-runtime-* path: release/ merge-multiple: true - name: Attest VM runtime artifacts - uses: 
actions/attest@59d89421af93a897026c735860bf21b6eb4f7b26 # v4 + uses: actions/attest@v4 with: subject-path: | release/vm-runtime-linux-aarch64.tar.zst @@ -201,7 +201,7 @@ jobs: git push --force origin vm-runtime - name: Prune stale runtime assets from vm-runtime release - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: script: | const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/'); @@ -224,7 +224,7 @@ jobs: } - name: Create / update vm-runtime GitHub Release - uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2 + uses: softprops/action-gh-release@v2 with: name: OpenShell VM Runtime prerelease: true diff --git a/.github/workflows/rpm-package.yml b/.github/workflows/rpm-package.yml index e96b19958..e0607c3ff 100644 --- a/.github/workflows/rpm-package.yml +++ b/.github/workflows/rpm-package.yml @@ -54,7 +54,7 @@ jobs: pandoc python3-devel git-core \ cargo-rpm-macros - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs.checkout-ref }} fetch-depth: 0 @@ -87,7 +87,7 @@ jobs: ls -lah artifacts/ - name: Upload RPM artifacts - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: rpm-linux-${{ matrix.arch }} path: artifacts/*.rpm diff --git a/.github/workflows/rust-native-build.yml b/.github/workflows/rust-native-build.yml index 1086ee5e8..edb1bfb7a 100644 --- a/.github/workflows/rust-native-build.yml +++ b/.github/workflows/rust-native-build.yml @@ -84,7 +84,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 with: ref: ${{ inputs['checkout-ref'] || github.sha }} fetch-depth: 0 @@ -247,7 +247,7 @@ jobs: ls -lh "$STAGE/" - name: Upload artifact - uses: 
actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + uses: actions/upload-artifact@v7 with: name: ${{ inputs['artifact-name'] != '' && inputs['artifact-name'] || format('rust-binary-{0}-linux-{1}', inputs.component, inputs.arch) }} path: prebuilt-binaries/${{ inputs.arch }}/${{ steps.target.outputs.binary }} diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index 4d62ccefd..37fdcbb94 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -19,7 +19,7 @@ jobs: outputs: should_run: ${{ steps.gate.outputs.should_run }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@v6 - id: gate uses: ./.github/actions/pr-gate with: diff --git a/.github/workflows/vouch-check.yml b/.github/workflows/vouch-check.yml index 2eeeb949f..db7a540eb 100644 --- a/.github/workflows/vouch-check.yml +++ b/.github/workflows/vouch-check.yml @@ -18,7 +18,7 @@ jobs: - name: Check org membership id: org-check if: env.ORG_READ_TOKEN != '' - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: github-token: ${{ secrets.ORG_READ_TOKEN }} result-encoding: string @@ -42,7 +42,7 @@ jobs: - name: Check if contributor is vouched if: steps.org-check.outputs.result != 'skip' - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: script: | const author = context.payload.pull_request.user.login; diff --git a/.github/workflows/vouch-command.yml b/.github/workflows/vouch-command.yml index 366dd6a0e..309a4ae36 100644 --- a/.github/workflows/vouch-command.yml +++ b/.github/workflows/vouch-command.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Process /vouch command - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + uses: actions/github-script@v9 with: script: | const commenter = context.payload.comment.user.login; diff 
--git a/.gitignore b/.gitignore index 24a77fce2..15cc9658d 100644 --- a/.gitignore +++ b/.gitignore @@ -196,9 +196,6 @@ artifacts/ # Local mise settings mise.local.toml -# Local Codex app state -.codex/ - # Ignore plans for now architecture/plans diff --git a/.packit.yaml b/.packit.yaml index 5d1f65063..3a608111b 100644 --- a/.packit.yaml +++ b/.packit.yaml @@ -35,10 +35,12 @@ actions: - 'bash -c "echo openshell-${PACKIT_PROJECT_VERSION}.tar.gz"' fix-spec-file: - # Update the canonical version macro. Version:, Source0:, Source1:, and all - # other version references expand from %{openshell_version} so only this - # one line needs updating. - - 'bash -c "sed -i -r \"s/^%global openshell_version .*/%global openshell_version ${PACKIT_RPMSPEC_VERSION}/\" openshell.spec"' + # Update Source0 to the generated tarball name + - 'bash -c "sed -i \"s|^Source0:.*|Source0: openshell-${PACKIT_PROJECT_VERSION}.tar.gz|\" openshell.spec"' + # Update Source1 to the generated vendor tarball name + - 'bash -c "sed -i \"s|^Source1:.*|Source1: openshell-${PACKIT_PROJECT_VERSION}-vendor.tar.xz|\" openshell.spec"' + # Update Version + - 'bash -c "sed -i -r \"s/^Version:(\\s*)\\S+/Version:\\1${PACKIT_RPMSPEC_VERSION}/\" openshell.spec"' # Update Release - 'bash -c "RELEASE=${OPENSHELL_RPM_RELEASE:-${PACKIT_RPMSPEC_RELEASE}} && sed -i -r \"s/^Release:(\\s*)\\S+/Release:\\1${RELEASE}%{?dist}/\" openshell.spec"' # Keep embedded binary metadata aligned with the release workflow. 
Python diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9dddac69a..5c091c6c4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -174,16 +174,15 @@ openshell sandbox create -- codex These are the primary `mise` tasks for day-to-day development: -| Task | Purpose | -| -------------------- | ------------------------------------------------------- | -| `mise run gateway` | Run a standalone gateway for local development | -| `mise run sandbox` | Create or reconnect to the dev sandbox | -| `mise run test` | Default test suite | -| `mise run e2e` | Default end-to-end test lane | -| `mise run ci` | Full local CI checks (lint, compile/type checks, tests) | -| `mise run docs` | Validate Fern docs locally | -| `mise run helm:docs` | Regenerate the Helm chart README | -| `mise run clean` | Clean build artifacts | +| Task | Purpose | +| ------------------ | ------------------------------------------------------- | +| `mise run gateway` | Run a standalone gateway for local development | +| `mise run sandbox` | Create or reconnect to the dev sandbox | +| `mise run test` | Default test suite | +| `mise run e2e` | Default end-to-end test lane | +| `mise run ci` | Full local CI checks (lint, compile/type checks, tests) | +| `mise run docs` | Validate Fern docs locally | +| `mise run clean` | Clean build artifacts | ## Project Structure diff --git a/Cargo.lock b/Cargo.lock index 9e73bce83..450c8dba8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3408,7 +3408,6 @@ dependencies = [ "anyhow", "base64 0.22.1", "bytes", - "chrono", "clap", "clap_complete", "crossterm 0.28.1", @@ -3456,11 +3455,13 @@ dependencies = [ name = "openshell-core" version = "0.0.0" dependencies = [ + "chrono", "ipnet", "miette", "prost", "prost-types", "protobuf-src", + "reqwest 0.12.28", "serde", "serde_json", "tempfile", @@ -4969,9 +4970,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" [[package]] name = "rustls-webpki" -version = "0.103.13" +version = "0.103.12" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06" dependencies = [ "aws-lc-rs", "ring", diff --git a/README.md b/README.md index 574347642..f2111e7bc 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The sandbox container includes the following tools by default: | Category | Tools | | ---------- | -------------------------------------------------------- | | Agent | `claude`, `opencode`, `codex`, `copilot` | -| Language | `python` (3.14), `node` (22) | +| Language | `python` (3.13), `node` (22) | | Developer | `gh`, `git`, `vim`, `nano` | | Networking | `ping`, `dig`, `nslookup`, `nc`, `traceroute`, `netstat` | @@ -243,6 +243,16 @@ All implementation work is human-gated — agents propose plans, humans approve, OpenShell is built agent-first — your agent is your first collaborator. Before opening issues or submitting code, point your agent at the repo and let it use the skills in `.agents/skills/` to investigate, diagnose, and prototype. See [CONTRIBUTING.md](CONTRIBUTING.md) for the full agent skills table, contribution workflow, and development setup. +## Telemetry + +OpenShell collects anonymous telemetry to help improve the project for developers. This data is not used to track individual user behavior. It helps us understand aggregate usage of sandbox, provider, and policy workflows so we can prioritize product improvements and share usage trends with the community. + +Disable telemetry with `OPENSHELL_TELEMETRY_ENABLED=false`. See the [telemetry schema](openshell_telemetry_schema.json) for details. + +Telemetry events are limited to anonymous operational categories and counts, such as sandbox lifecycle outcomes, provider profile buckets, policy decision counts, and aggregate network activity denial categories. 
OpenShell telemetry does not collect sandbox names or IDs, hostnames, file paths, binary paths, prompts, credentials, provider names, model names, or user content. + +Opting out applies only to telemetry emitted by OpenShell. Third-party services, model providers, inference endpoints, agents, or tools that you configure and use with OpenShell may have their own terms and privacy practices. + ## Notice and Disclaimer This software automatically retrieves, accesses or interacts with external materials. Those retrieved materials are not distributed with this software and are governed solely by separate terms, conditions and licenses. You are solely responsible for finding, reviewing and complying with all applicable terms, conditions, and licenses, and for verifying the security, integrity and suitability of any retrieved materials for your specific use case. This software is provided "AS IS", without warranty of any kind. The author makes no representations or warranties regarding any retrieved materials, and assumes no liability for any losses, damages, liabilities or legal consequences from your use or inability to use this software or any retrieved materials. Use this software and the retrieved materials at your own risk. diff --git a/architecture/build.md b/architecture/build.md index 8a4212aa7..79ce59be9 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -40,8 +40,8 @@ the staging directory before running Buildx. Runtime layout: -- **Gateway**: `gcr.io/distroless/cc-debian13:nonroot` base, GNU-linked binary at - `/usr/local/bin/openshell-gateway`, runs as UID/GID `1000:1000`. +- **Gateway**: `nvcr.io/nvidia/distroless/cc` base, GNU-linked binary at + `/usr/local/bin/openshell-gateway`, runs as UID/GID `65532:65532`. - **Supervisor**: `scratch` base, static musl binary at `/openshell-sandbox`. 
Static linkage is required because the image is mounted/extracted into sandbox environments (Docker extraction, Podman image volumes, Kubernetes diff --git a/architecture/gateway.md b/architecture/gateway.md index 04e64a73f..e9cbe187d 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -83,7 +83,6 @@ The storage schema is intentionally narrow: | `version` | Optional monotonically increasing version for scoped records. | | `status` | Optional workflow state for records such as policy revisions or draft policy chunks. | | `dedup_key` and `hit_count` | Optional policy-advisor fields for coalescing repeated observations. | -| `resource_version` | Monotonically increasing counter for optimistic concurrency control. Incremented atomically on each update. | | `payload` | Prost-encoded protobuf payload for the full domain object. | | `created_at_ms` and `updated_at_ms` | Gateway timestamps used for ordering and list output. | | `labels` | JSON object carrying Kubernetes-style object labels for filtering and organization. | @@ -114,92 +113,6 @@ default WAL journal mode), which mirror the same sensitive contents. Persisted state includes sandboxes, providers, SSH sessions, policy revisions, settings, inference configuration, and deployment records. -### Optimistic Concurrency (CAS) - -Every object row carries a `resource_version` that the database increments -atomically on each write. Concurrent mutations use compare-and-swap (CAS): the -writer reads the current version, applies changes, and writes back with a -`WHERE resource_version = ` guard. If another writer updated the row -in between, the guard fails and the caller receives a `Conflict` error. - -This matters for HA deployments where multiple gateway replicas share the same -Postgres database, and for single-node deployments where concurrent gRPC -handlers or the reconciler mutate the same sandbox. 
- -**Compile-time enforcement.** The unconditional write methods `put` and -`put_message` are gated behind `#[cfg(test)]`. Production code must use -`put_if` with an explicit `WriteCondition` or `update_message_cas`. The -compiler rejects any other write path, making non-CAS writes structurally -impossible outside of tests. - -Every write goes through one of three conditions: - -- `MustCreate` -- insert-only. The database rejects the write with a - `UniqueViolation` error if a row with that ID already exists. Handlers match - on the structured `PersistenceError::UniqueViolation { .. }` variant to - distinguish creation conflicts from other failures. -- `MatchResourceVersion(v)` -- update-only. The database rejects the write - with a `Conflict` error if the current version differs from `v`. -- `Unconditional` -- test-only; not reachable in production builds. - -**Creates.** All create paths use `MustCreate` and hydrate the response -directly from the `WriteResult` returned by `put_if`, which carries the -assigned `resource_version`, `created_at_ms`, and `updated_at_ms`. This -eliminates a read-after-write round trip and the race window that would come -with it. - -**Updates.** The `update_message_cas` helper makes a single CAS attempt: it -fetches the current object, applies a mutation closure, and writes with a -`MatchResourceVersion` condition. On conflict the persistence layer returns a -`Conflict` error, which gRPC handlers map to `ABORTED` status so the client -(or the next watch/reconcile event) can retry with fresh state. There is no -automatic retry loop. - -The helper accepts an `expected_version` parameter that selects between two -modes: - -- **Server-driven** (`expected_version = 0`): the helper uses the version it - just read from the database. Internal operations (reconciler, policy status - reports, compute phase transitions) use this mode because the caller does - not track versions. 
-- **Client-driven** (`expected_version != 0`): the helper validates that the - caller's version matches the current database version before applying the - mutation. If they diverge it returns `Conflict` without attempting the - write. Client-facing operations that carry an `expected_resource_version` - field use this mode: `AttachSandboxProvider`, `DetachSandboxProvider`, - `UpdateProvider`, and `UpdateConfig` (policy backfill path). - -**Lists.** The `list_messages` and `list_messages_with_selector` helpers decode -protobuf payloads from list results and hydrate `resource_version` from the -authoritative database column into each decoded message, mirroring the -`get_message` pattern. This ensures list responses carry correct versions -without requiring callers to manually hydrate each record. - -**Deletes.** Delete operations are not yet CAS-protected -- the delete request -protos do not carry `expected_resource_version`. A `delete_if` primitive exists -in the persistence layer but is not wired into gRPC handlers. - -**Coverage.** All `ObjectMeta`-bearing message types have write-condition -coverage: - -| Type | Create | Update | List | -|---|---|---|---| -| Sandbox | `MustCreate` | `update_message_cas` | `list_messages` | -| Provider | `MustCreate` | `update_message_cas` | `list_messages` | -| ProviderProfile | `MustCreate` | (immutable) | `list_messages` | -| InferenceRoute | `MustCreate` | `update_message_cas` | `list_messages` | -| SandboxPolicy | scoped versioning | scoped versioning | scoped query | -| Settings | `Mutex`-guarded | `Mutex`-guarded | single-row | - -Global settings updates use a Tokio `Mutex` to serialize multi-step -validation within a single gateway process, with CAS on the underlying -persistence write as defense in depth. In an HA deployment with multiple -gateways, the Mutex alone would be insufficient. Sandbox-scoped settings -rely entirely on CAS without a Mutex. 
- -The `resource_version` is surfaced to clients through `ObjectMeta` in proto -responses. Database migrations backfill existing rows with version 1. - Policy and runtime settings are delivered together through the effective sandbox config path. A gateway-global policy can override sandbox-scoped policy. The sandbox supervisor polls for config revisions and hot-reloads dynamic policy diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index b69a9629b..543fed6c3 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -33,7 +33,6 @@ tokio = { workspace = true } tonic = { workspace = true, features = ["tls", "tls-native-roots"] } # CLI -chrono = "0.4" clap = { workspace = true } clap_complete = { workspace = true } crossterm = { workspace = true } diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 3a8c344d3..62a7b0ec1 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -653,23 +653,6 @@ enum OutputFormat { Json, } -#[derive(Clone, Debug, ValueEnum)] -enum CliProviderRefreshStrategy { - Oauth2RefreshToken, - Oauth2ClientCredentials, - GoogleServiceAccountJwt, -} - -impl CliProviderRefreshStrategy { - fn as_str(&self) -> &'static str { - match self { - Self::Oauth2RefreshToken => "oauth2_refresh_token", - Self::Oauth2ClientCredentials => "oauth2_client_credentials", - Self::GoogleServiceAccountJwt => "google_service_account_jwt", - } - } -} - impl OutputFormat { fn as_str(&self) -> &'static str { match self { @@ -725,10 +708,6 @@ enum ProviderCommands { config: Vec, }, - /// Manage provider credential refresh. - #[command(subcommand, help_template = SUBCOMMAND_HELP_TEMPLATE)] - Refresh(ProviderRefreshCommands), - /// Fetch a provider by name. #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] Get { @@ -787,10 +766,6 @@ enum ProviderCommands { /// Provider config key/value pair. 
#[arg(long = "config", value_name = "KEY=VALUE")] config: Vec, - - /// Credential expiry (`KEY=TIMESTAMP`). Accepts epoch milliseconds or RFC3339. A zero timestamp clears expiry. - #[arg(long = "credential-expires-at", value_name = "KEY=TIMESTAMP")] - credential_expires_at: Vec, }, /// Delete providers by name. @@ -802,77 +777,6 @@ enum ProviderCommands { }, } -#[derive(Subcommand, Debug)] -enum ProviderRefreshCommands { - /// Show provider credential refresh status. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Status { - /// Provider name. - #[arg(add = ArgValueCompleter::new(completers::complete_provider_names))] - name: String, - - /// Optional credential key to filter by. - #[arg(long = "credential-key")] - credential_key: Option, - }, - - /// Configure refresh metadata for a provider credential. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Configure { - /// Provider name. - #[arg(add = ArgValueCompleter::new(completers::complete_provider_names))] - name: String, - - /// Injectable credential key, for example `MS_GRAPH_ACCESS_TOKEN`. - #[arg(long = "credential-key")] - credential_key: String, - - /// Refresh strategy. - #[arg(long, value_enum)] - strategy: CliProviderRefreshStrategy, - - /// Non-injectable refresh material (`KEY=VALUE`). - #[arg(long = "material", value_name = "KEY=VALUE")] - material: Vec, - - /// Material keys that are secret and must not be exposed. - #[arg(long = "secret-material-key", value_name = "KEY")] - secret_material_keys: Vec, - - /// Expiry for the current credential. Accepts epoch milliseconds or RFC3339. - #[arg( - long = "credential-expires-at", - value_name = "TIMESTAMP", - value_parser = run::parse_credential_expiry_cli_value - )] - credential_expires_at: Option, - }, - - /// Record a gateway-owned credential rotation request. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Rotate { - /// Provider name. 
- #[arg(add = ArgValueCompleter::new(completers::complete_provider_names))] - name: String, - - /// Injectable credential key, for example `MS_GRAPH_ACCESS_TOKEN`. - #[arg(long = "credential-key")] - credential_key: String, - }, - - /// Delete refresh metadata for a provider credential. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Delete { - /// Provider name. - #[arg(add = ArgValueCompleter::new(completers::complete_provider_names))] - name: String, - - /// Injectable credential key, for example `MS_GRAPH_ACCESS_TOKEN`. - #[arg(long = "credential-key")] - credential_key: String, - }, -} - #[derive(Subcommand, Debug)] enum ProviderProfileCommands { /// Export a provider profile. @@ -2737,55 +2641,6 @@ async fn main() -> Result<()> { ) .await?; } - ProviderCommands::Refresh(command) => match command { - ProviderRefreshCommands::Status { - name, - credential_key, - } => { - run::provider_refresh_status( - endpoint, - &name, - credential_key.as_deref(), - &tls, - ) - .await?; - } - ProviderRefreshCommands::Configure { - name, - credential_key, - strategy, - material, - secret_material_keys, - credential_expires_at, - } => { - run::provider_refresh_config( - endpoint, - run::ProviderRefreshConfigInput { - name: &name, - credential_key: &credential_key, - strategy: strategy.as_str(), - material: &material, - secret_material_keys: &secret_material_keys, - credential_expires_at_ms: credential_expires_at, - }, - &tls, - ) - .await?; - } - ProviderRefreshCommands::Rotate { - name, - credential_key, - } => { - run::provider_rotate(endpoint, &name, &credential_key, &tls).await?; - } - ProviderRefreshCommands::Delete { - name, - credential_key, - } => { - run::provider_refresh_delete(endpoint, &name, &credential_key, &tls) - .await?; - } - }, ProviderCommands::Get { name } => { run::provider_get(endpoint, &name, &tls).await?; } @@ -2830,7 +2685,6 @@ async fn main() -> Result<()> { from_existing, credentials, config, - credential_expires_at, } 
=> { run::provider_update( endpoint, @@ -2838,7 +2692,6 @@ async fn main() -> Result<()> { from_existing, &credentials, &config, - &credential_expires_at, &tls, ) .await?; @@ -3719,155 +3572,6 @@ mod tests { } } - #[test] - fn provider_refresh_commands_parse() { - let status = Cli::try_parse_from([ - "openshell", - "provider", - "refresh", - "status", - "my-graph", - "--credential-key", - "MS_GRAPH_ACCESS_TOKEN", - ]) - .expect("provider refresh status should parse"); - assert!(matches!( - status.command, - Some(Commands::Provider { - command: Some(ProviderCommands::Refresh(ProviderRefreshCommands::Status { - name, - credential_key: Some(key) - })) - }) if name == "my-graph" && key == "MS_GRAPH_ACCESS_TOKEN" - )); - - let config = Cli::try_parse_from([ - "openshell", - "provider", - "refresh", - "configure", - "my-graph", - "--credential-key", - "MS_GRAPH_ACCESS_TOKEN", - "--strategy", - "oauth2-client-credentials", - "--material", - "tenant_id=abc", - "--secret-material-key", - "client_secret", - "--credential-expires-at", - "1767225600000", - ]) - .expect("provider refresh configure should parse"); - assert!(matches!( - config.command, - Some(Commands::Provider { - command: Some(ProviderCommands::Refresh( - ProviderRefreshCommands::Configure { - strategy: CliProviderRefreshStrategy::Oauth2ClientCredentials, - credential_expires_at: Some(1_767_225_600_000), - .. 
- } - )) - }) - )); - - let rotate = Cli::try_parse_from([ - "openshell", - "provider", - "refresh", - "rotate", - "my-graph", - "--credential-key", - "MS_GRAPH_ACCESS_TOKEN", - ]) - .expect("provider refresh rotate should parse"); - assert!(matches!( - rotate.command, - Some(Commands::Provider { - command: Some(ProviderCommands::Refresh(ProviderRefreshCommands::Rotate { - name, - credential_key - })) - }) if name == "my-graph" && credential_key == "MS_GRAPH_ACCESS_TOKEN" - )); - - let delete = Cli::try_parse_from([ - "openshell", - "provider", - "refresh", - "delete", - "my-graph", - "--credential-key", - "MS_GRAPH_ACCESS_TOKEN", - ]) - .expect("provider refresh delete should parse"); - assert!(matches!( - delete.command, - Some(Commands::Provider { - command: Some(ProviderCommands::Refresh(ProviderRefreshCommands::Delete { - name, - credential_key - })) - }) if name == "my-graph" && credential_key == "MS_GRAPH_ACCESS_TOKEN" - )); - } - - #[test] - fn provider_update_accepts_credential_expiry() { - let cli = Cli::try_parse_from([ - "openshell", - "provider", - "update", - "my-graph", - "--credential", - "MS_GRAPH_ACCESS_TOKEN=abc", - "--credential-expires-at", - "MS_GRAPH_ACCESS_TOKEN=1767225600000", - ]) - .expect("provider update should parse credential expiry"); - - assert!(matches!( - cli.command, - Some(Commands::Provider { - command: Some(ProviderCommands::Update { - credential_expires_at, - .. 
- }) - }) if credential_expires_at == vec!["MS_GRAPH_ACCESS_TOKEN=1767225600000"] - )); - } - - #[test] - fn provider_refresh_config_accepts_rfc3339_credential_expiry() { - let cli = Cli::try_parse_from([ - "openshell", - "provider", - "refresh", - "configure", - "my-graph", - "--credential-key", - "MS_GRAPH_ACCESS_TOKEN", - "--strategy", - "oauth2-client-credentials", - "--credential-expires-at", - "2026-01-01T00:00:00Z", - ]) - .expect("provider refresh configure should parse RFC3339 credential expiry"); - - assert!(matches!( - cli.command, - Some(Commands::Provider { - command: Some(ProviderCommands::Refresh( - ProviderRefreshCommands::Configure { - credential_expires_at: Some(1_767_225_600_000), - .. - } - )) - }) - )); - } - #[test] fn settings_set_global_parses_yes_flag() { let cli = Cli::try_parse_from([ diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2e3cb0531..30025a14a 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -9,7 +9,6 @@ use crate::tls::{ grpc_inference_client, require_tls_materials, }; use bytes::Bytes; -use chrono::DateTime; use dialoguer::{Confirm, Select, theme::ColorfulTheme}; use futures::StreamExt; use http_body_util::Full; @@ -32,24 +31,21 @@ use openshell_core::progress::{ use openshell_core::proto::ProviderProfileCategory; use openshell_core::proto::{ ApproveAllDraftChunksRequest, ApproveDraftChunkRequest, AttachSandboxProviderRequest, - ClearDraftChunksRequest, ConfigureProviderRefreshRequest, CreateProviderRequest, - CreateSandboxRequest, CreateSshSessionRequest, DeleteProviderProfileRequest, - DeleteProviderRefreshRequest, DeleteProviderRequest, DeleteSandboxRequest, + ClearDraftChunksRequest, CreateProviderRequest, CreateSandboxRequest, CreateSshSessionRequest, + DeleteProviderProfileRequest, DeleteProviderRequest, DeleteSandboxRequest, DeleteServiceRequest, DetachSandboxProviderRequest, ExecSandboxRequest, ExposeServiceRequest, GetClusterInferenceRequest, 
GetDraftHistoryRequest, GetDraftPolicyRequest, - GetGatewayConfigRequest, GetProviderProfileRequest, GetProviderRefreshStatusRequest, - GetProviderRequest, GetSandboxConfigRequest, GetSandboxLogsRequest, - GetSandboxPolicyStatusRequest, GetSandboxRequest, GetServiceRequest, HealthRequest, - ImportProviderProfilesRequest, LintProviderProfilesRequest, ListProviderProfilesRequest, - ListProvidersRequest, ListSandboxPoliciesRequest, ListSandboxProvidersRequest, - ListSandboxesRequest, ListServicesRequest, PlatformEvent, PolicySource, PolicyStatus, Provider, - ProviderCredentialRefreshStatus, ProviderCredentialRefreshStrategy, ProviderProfile, + GetGatewayConfigRequest, GetProviderProfileRequest, GetProviderRequest, + GetSandboxConfigRequest, GetSandboxLogsRequest, GetSandboxPolicyStatusRequest, + GetSandboxRequest, GetServiceRequest, HealthRequest, ImportProviderProfilesRequest, + LintProviderProfilesRequest, ListProviderProfilesRequest, ListProvidersRequest, + ListSandboxPoliciesRequest, ListSandboxProvidersRequest, ListSandboxesRequest, + ListServicesRequest, PlatformEvent, PolicySource, PolicyStatus, Provider, ProviderProfile, ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest, - RevokeSshSessionRequest, RotateProviderCredentialRequest, Sandbox, SandboxPhase, SandboxPolicy, - SandboxSpec, SandboxTemplate, ServiceEndpointResponse, SetClusterInferenceRequest, - SettingScope, SettingValue, TcpForwardFrame, TcpForwardInit, TcpRelayTarget, - UpdateConfigRequest, UpdateProviderRequest, WatchSandboxRequest, exec_sandbox_event, - setting_value, tcp_forward_init, + RevokeSshSessionRequest, Sandbox, SandboxPhase, SandboxPolicy, SandboxSpec, SandboxTemplate, + ServiceEndpointResponse, SetClusterInferenceRequest, SettingScope, SettingValue, + TcpForwardFrame, TcpForwardInit, TcpRelayTarget, UpdateConfigRequest, UpdateProviderRequest, + WatchSandboxRequest, exec_sandbox_event, setting_value, tcp_forward_init, }; use openshell_core::settings::{self, 
SettingValueKind}; use openshell_core::{ObjectId, ObjectName}; @@ -679,8 +675,8 @@ fn is_loopback_gateway_endpoint(endpoint: &str) -> bool { /// would serve this endpoint. /// /// Loopback endpoints (`localhost`, `127.0.0.1`, `::1`) resolve to the -/// `"openshell"` gateway name, matching the convention used by local -/// `openshell-gateway generate-certs` and the TLS cert resolver in `tls.rs`. +/// `"openshell"` gateway name, matching the convention used by +/// `init-pki.sh` and the TLS cert resolver in `tls.rs`. fn mtls_certs_exist_for_endpoint(name: &str, endpoint: &str) -> bool { let cert_name = if is_loopback_gateway_endpoint(endpoint) { "openshell" @@ -905,7 +901,7 @@ pub async fn gateway_add( // Derive a gateway name from the hostname when none is provided. // Loopback endpoints use the canonical "openshell" name, matching the - // convention in local cert generation and default_tls_dir. + // convention in init-pki.sh and default_tls_dir. let derived_name; let name = if let Some(n) = name { n @@ -2393,11 +2389,6 @@ pub async fn sandbox_get( println!(" {} {}", "Id:".dimmed(), id); println!(" {} {}", "Name:".dimmed(), name); println!(" {} {}", "Phase:".dimmed(), phase_name(sandbox.phase)); - println!( - " {} {}", - "Resource version:".dimmed(), - sandbox.metadata.as_ref().map_or(0, |m| m.resource_version) - ); // Display labels if present if let Some(metadata) = &sandbox.metadata @@ -3163,7 +3154,6 @@ fn sandbox_to_json(sandbox: &Sandbox) -> serde_json::Value { "id": sandbox.object_id(), "name": sandbox.object_name(), "labels": labels, - "resource_version": meta.map_or(0, |m| m.resource_version), "created_at": format_epoch_ms(meta.map_or(0, |m| m.created_at_ms)), "phase": phase_name(sandbox.phase), "current_policy_version": sandbox.current_policy_version, @@ -3196,38 +3186,14 @@ pub async fn sandbox_provider_attach( tls: &TlsOptions, ) -> Result<()> { let mut client = grpc_client(server, tls).await?; - - // Fetch current sandbox to get resource_version for 
CAS - let sandbox = client - .get_sandbox(GetSandboxRequest { - name: name.to_string(), - }) - .await - .into_diagnostic()? - .into_inner() - .sandbox - .ok_or_else(|| miette::miette!("sandbox not found"))?; - - let resource_version = sandbox.metadata.as_ref().map_or(0, |m| m.resource_version); - - let response = match client + let response = client .attach_sandbox_provider(AttachSandboxProviderRequest { sandbox_name: name.to_string(), provider_name: provider.to_string(), - expected_resource_version: resource_version, }) .await - { - Ok(response) => response.into_inner(), - Err(status) if status.code() == Code::Aborted => { - return Err(miette::miette!( - "Failed to attach provider: sandbox was modified by another operation.\n\ - Please retry the command." - ) - .with_source_code(status.message().to_string())); - } - Err(e) => return Err(e).into_diagnostic(), - }; + .into_diagnostic()? + .into_inner(); if response.attached { println!( @@ -3249,38 +3215,14 @@ pub async fn sandbox_provider_detach( tls: &TlsOptions, ) -> Result<()> { let mut client = grpc_client(server, tls).await?; - - // Fetch current sandbox to get resource_version for CAS - let sandbox = client - .get_sandbox(GetSandboxRequest { - name: name.to_string(), - }) - .await - .into_diagnostic()? - .into_inner() - .sandbox - .ok_or_else(|| miette::miette!("sandbox not found"))?; - - let resource_version = sandbox.metadata.as_ref().map_or(0, |m| m.resource_version); - - let response = match client + let response = client .detach_sandbox_provider(DetachSandboxProviderRequest { sandbox_name: name.to_string(), provider_name: provider.to_string(), - expected_resource_version: resource_version, }) .await - { - Ok(response) => response.into_inner(), - Err(status) if status.code() == Code::Aborted => { - return Err(miette::miette!( - "Failed to detach provider: sandbox was modified by another operation.\n\ - Please retry the command." 
- ) - .with_source_code(status.message().to_string())); - } - Err(e) => return Err(e).into_diagnostic(), - }; + .into_diagnostic()? + .into_inner(); if response.detached { println!( @@ -3427,7 +3369,7 @@ fn inferred_provider_type(command: &[String]) -> Option { /// passed through directly; the server validates they exist at sandbox creation. /// /// `inferred_types` are provider **types** inferred from the trailing command -/// (e.g. `claude` -> type `"claude-code"`). These are resolved to provider names via +/// (e.g. `claude` → type `"claude"`). These are resolved to provider names via /// a type→name lookup, and missing types may be auto-created interactively. /// /// Returns a deduplicated list of provider **names** suitable for @@ -3615,12 +3557,10 @@ async fn auto_create_provider( name: exact_name.to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials: discovered.credentials.clone(), config: discovered.config.clone(), - credential_expires_at_ms: HashMap::new(), }), }; @@ -3657,12 +3597,10 @@ async fn auto_create_provider( name: name.clone(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials: discovered.credentials.clone(), config: discovered.config.clone(), - credential_expires_at_ms: HashMap::new(), }), }; @@ -3759,72 +3697,6 @@ fn parse_credential_pairs(items: &[String]) -> Result> { Ok(map) } -pub fn parse_credential_expiry_cli_value(value: &str) -> std::result::Result { - parse_credential_expiry_value(value, None).map_err(|err| err.to_string()) -} - -fn credential_expiry_value_error(key: Option<&str>, detail: &str) -> miette::Report { - key.map_or_else( - || miette::miette!("--credential-expires-at value {detail}"), - |key| miette::miette!("--credential-expires-at value for '{key}' {detail}"), - ) -} - -fn parse_credential_expiry_value(value: &str, key: Option<&str>) -> Result { - let value = value.trim(); - if 
value.is_empty() { - return Err(credential_expiry_value_error(key, "cannot be empty")); - } - - if let Ok(value_ms) = value.parse::() { - if value_ms < 0 { - return Err(credential_expiry_value_error( - key, - "must be greater than or equal to 0", - )); - } - return Ok(value_ms); - } - - let parsed = DateTime::parse_from_rfc3339(value).map_err(|_| { - credential_expiry_value_error( - key, - "must be a Unix epoch millisecond timestamp or RFC3339 timestamp", - ) - })?; - let value_ms = parsed.timestamp_millis(); - if value_ms < 0 { - return Err(credential_expiry_value_error( - key, - "must be greater than or equal to 0", - )); - } - - Ok(value_ms) -} - -fn parse_credential_expiry_pairs(items: &[String]) -> Result> { - let mut map = HashMap::new(); - - for item in items { - let Some((key, value)) = item.split_once('=') else { - return Err(miette::miette!( - "--credential-expires-at expects KEY=TIMESTAMP, got '{item}'" - )); - }; - let key = key.trim(); - if key.is_empty() { - return Err(miette::miette!( - "--credential-expires-at key cannot be empty" - )); - } - let value = parse_credential_expiry_value(value, Some(key))?; - map.insert(key.to_string(), value); - } - - Ok(map) -} - pub async fn service_expose( server: &str, sandbox: &str, @@ -4123,20 +3995,10 @@ pub async fn provider_create( } if credential_map.is_empty() { - let allows_refresh_bootstrap = client - .get_provider_profile(GetProviderProfileRequest { - id: provider_type.clone(), - }) - .await - .ok() - .and_then(|response| response.into_inner().profile) - .is_some_and(|profile| provider_profile_allows_refresh_bootstrap(&profile)); - if !allows_refresh_bootstrap { - return Err(miette::miette!( - "no credentials resolved for provider type '{provider_type}'. \ - Use --credential KEY[=VALUE] or --from-existing with the appropriate env vars set." - )); - } + return Err(miette::miette!( + "no credentials resolved for provider type '{provider_type}'. 
\ + Use --credential KEY[=VALUE] or --from-existing with the appropriate env vars set." + )); } let response = client @@ -4147,12 +4009,10 @@ pub async fn provider_create( name: name.to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.clone(), credentials: credential_map, config: config_map, - credential_expires_at_ms: HashMap::new(), }), }) .await @@ -4171,30 +4031,6 @@ pub async fn provider_create( Ok(()) } -fn provider_profile_allows_refresh_bootstrap(profile: &ProviderProfile) -> bool { - let required_credentials = profile - .credentials - .iter() - .filter(|credential| credential.required) - .collect::>(); - !required_credentials.is_empty() - && required_credentials.iter().all(|credential| { - credential - .refresh - .as_ref() - .is_some_and(|refresh| is_gateway_mintable_refresh_strategy(refresh.strategy)) - }) -} - -fn is_gateway_mintable_refresh_strategy(strategy: i32) -> bool { - matches!( - ProviderCredentialRefreshStrategy::try_from(strategy), - Ok(ProviderCredentialRefreshStrategy::Oauth2RefreshToken - | ProviderCredentialRefreshStrategy::Oauth2ClientCredentials - | ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt) - ) -} - pub async fn provider_get(server: &str, name: &str, tls: &TlsOptions) -> Result<()> { let mut client = grpc_client(server, tls).await?; let response = client @@ -4217,11 +4053,6 @@ pub async fn provider_get(server: &str, name: &str, tls: &TlsOptions) -> Result< println!(" {} {}", "Id:".dimmed(), provider.object_id()); println!(" {} {}", "Name:".dimmed(), provider.object_name()); println!(" {} {}", "Type:".dimmed(), provider.r#type); - println!( - " {} {}", - "Resource version:".dimmed(), - provider.metadata.as_ref().map_or(0, |m| m.resource_version) - ); println!( " {} {}", "Credential keys:".dimmed(), @@ -4474,224 +4305,6 @@ pub async fn provider_profile_delete(server: &str, id: &str, tls: &TlsOptions) - Ok(()) } -pub async fn provider_refresh_status( - server: &str, - 
name: &str, - credential_key: Option<&str>, - tls: &TlsOptions, -) -> Result<()> { - let mut client = grpc_client(server, tls).await?; - let response = client - .get_provider_refresh_status(GetProviderRefreshStatusRequest { - provider: name.to_string(), - credential_key: credential_key.unwrap_or_default().to_string(), - }) - .await - .into_diagnostic()? - .into_inner(); - - if response.credentials.is_empty() { - if let Some(credential_key) = credential_key { - println!( - "No refresh configuration found for provider '{name}' credential '{credential_key}'." - ); - } else { - println!("No refresh configurations found for provider '{name}'."); - } - return Ok(()); - } - - println!("{}", refresh_status_header()); - for status in response.credentials { - print_refresh_status_row(&status); - } - Ok(()) -} - -fn refresh_status_header() -> String { - format!( - "{:<24} {:<28} {:<28} {:<18} {:<20} {:<20} {:<20} {}", - "PROVIDER".bold(), - "CREDENTIAL_KEY".bold(), - "STRATEGY".bold(), - "STATUS".bold(), - "EXPIRES_AT".bold(), - "NEXT_REFRESH".bold(), - "LAST_REFRESH".bold(), - "LAST_ERROR".bold(), - ) -} - -pub struct ProviderRefreshConfigInput<'a> { - pub name: &'a str, - pub credential_key: &'a str, - pub strategy: &'a str, - pub material: &'a [String], - pub secret_material_keys: &'a [String], - pub credential_expires_at_ms: Option, -} - -pub async fn provider_refresh_config( - server: &str, - input: ProviderRefreshConfigInput<'_>, - tls: &TlsOptions, -) -> Result<()> { - let strategy = provider_refresh_strategy(input.strategy)?; - let material = parse_key_value_pairs(input.material, "--material")?; - let mut client = grpc_client(server, tls).await?; - let status = client - .configure_provider_refresh(ConfigureProviderRefreshRequest { - provider: input.name.to_string(), - credential_key: input.credential_key.to_string(), - strategy: strategy as i32, - material, - secret_material_keys: input.secret_material_keys.to_vec(), - expires_at_ms: input.credential_expires_at_ms, - 
}) - .await - .into_diagnostic()? - .into_inner() - .status - .ok_or_else(|| miette!("provider refresh status missing from response"))?; - - println!( - "{} Configured refresh for {} {}", - "✓".green().bold(), - status.provider_name, - status.credential_key - ); - Ok(()) -} - -pub async fn provider_rotate( - server: &str, - name: &str, - credential_key: &str, - tls: &TlsOptions, -) -> Result<()> { - let mut client = grpc_client(server, tls).await?; - let status = client - .rotate_provider_credential(RotateProviderCredentialRequest { - provider: name.to_string(), - credential_key: credential_key.to_string(), - }) - .await - .into_diagnostic()? - .into_inner() - .status - .ok_or_else(|| miette!("provider refresh status missing from response"))?; - - if status.last_error.is_empty() { - println!( - "{} Rotation requested for {} {} ({})", - "✓".green().bold(), - status.provider_name, - status.credential_key, - status.status - ); - } else { - println!( - "Rotation request recorded for {} {} ({}): {}", - status.provider_name, status.credential_key, status.status, status.last_error - ); - } - Ok(()) -} - -pub async fn provider_refresh_delete( - server: &str, - name: &str, - credential_key: &str, - tls: &TlsOptions, -) -> Result<()> { - let mut client = grpc_client(server, tls).await?; - let response = client - .delete_provider_refresh(DeleteProviderRefreshRequest { - provider: name.to_string(), - credential_key: credential_key.to_string(), - }) - .await - .into_diagnostic()? 
- .into_inner(); - - if response.deleted { - println!( - "{} Deleted refresh config for {} {}", - "✓".green().bold(), - name, - credential_key - ); - } else { - println!("No refresh config found for provider '{name}' credential '{credential_key}'."); - } - Ok(()) -} - -fn provider_refresh_strategy(strategy: &str) -> Result { - match strategy { - "oauth2_refresh_token" => Ok(ProviderCredentialRefreshStrategy::Oauth2RefreshToken), - "oauth2_client_credentials" => { - Ok(ProviderCredentialRefreshStrategy::Oauth2ClientCredentials) - } - "google_service_account_jwt" => { - Ok(ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt) - } - _ => Err(miette!("unsupported provider refresh strategy: {strategy}")), - } -} - -fn print_refresh_status_row(status: &ProviderCredentialRefreshStatus) { - println!("{}", refresh_status_row(status)); -} - -fn refresh_status_row(status: &ProviderCredentialRefreshStatus) -> String { - let strategy = ProviderCredentialRefreshStrategy::try_from(status.strategy) - .unwrap_or(ProviderCredentialRefreshStrategy::Unspecified); - format!( - "{:<24} {:<28} {:<28} {:<18} {:<20} {:<20} {:<20} {}", - status.provider_name, - status.credential_key, - provider_refresh_strategy_name(strategy), - status.status, - format_optional_epoch_ms(status.expires_at_ms), - format_optional_epoch_ms(status.next_refresh_at_ms), - format_optional_epoch_ms(status.last_refresh_at_ms), - truncate_status_field(&status.last_error, 72), - ) -} - -fn format_optional_epoch_ms(ms: i64) -> String { - if ms > 0 { - format_epoch_ms(ms) - } else { - "-".to_string() - } -} - -fn truncate_status_field(value: &str, max_chars: usize) -> String { - if value.is_empty() { - return "-".to_string(); - } - let mut chars = value.chars(); - let truncated = chars.by_ref().take(max_chars).collect::(); - if chars.next().is_some() { - format!("{truncated}...") - } else { - truncated - } -} - -fn provider_refresh_strategy_name(strategy: ProviderCredentialRefreshStrategy) -> &'static str { - match 
strategy { - ProviderCredentialRefreshStrategy::Static => "static", - ProviderCredentialRefreshStrategy::External => "external", - ProviderCredentialRefreshStrategy::Oauth2RefreshToken => "oauth2_refresh_token", - ProviderCredentialRefreshStrategy::Oauth2ClientCredentials => "oauth2_client_credentials", - ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt => "google_service_account_jwt", - ProviderCredentialRefreshStrategy::Unspecified => "unspecified", - } -} - fn load_profile_import_items( file: Option<&Path>, from: Option<&Path>, @@ -4844,7 +4457,6 @@ pub async fn provider_update( from_existing: bool, credentials: &[String], config: &[String], - credential_expires_at: &[String], tls: &TlsOptions, ) -> Result<()> { if from_existing && !credentials.is_empty() { @@ -4857,7 +4469,6 @@ pub async fn provider_update( let mut credential_map = parse_credential_pairs(credentials)?; let mut config_map = parse_key_value_pairs(config, "--config")?; - let credential_expires_at_ms = parse_credential_expiry_pairs(credential_expires_at)?; if from_existing { // Fetch the existing provider to discover its type for credential lookup. @@ -4898,14 +4509,11 @@ pub async fn provider_update( name: name.to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: credential_map, config: config_map, - credential_expires_at_ms: HashMap::new(), }), - credential_expires_at_ms, }) .await .into_diagnostic()?; @@ -5455,7 +5063,6 @@ pub async fn sandbox_policy_set_global( delete_setting: false, global: true, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic()? @@ -5654,7 +5261,6 @@ pub async fn gateway_setting_set( delete_setting: false, global: true, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic()? 
@@ -5689,7 +5295,6 @@ pub async fn sandbox_setting_set( delete_setting: false, global: false, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic()? @@ -5724,7 +5329,6 @@ pub async fn gateway_setting_delete( delete_setting: true, global: true, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic()? @@ -5759,7 +5363,6 @@ pub async fn sandbox_setting_delete( delete_setting: true, global: false, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic()? @@ -5818,7 +5421,6 @@ pub async fn sandbox_policy_set( delete_setting: false, global: false, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic()?; @@ -5993,7 +5595,6 @@ pub async fn sandbox_policy_update( delete_setting: false, global: false, merge_operations: plan.merge_operations, - expected_resource_version: 0, }) .await .into_diagnostic()? @@ -6710,10 +6311,8 @@ mod tests { gateway_auth_label, gateway_env_override_warning, gateway_select_with, gateway_type_label, git_sync_files, http_health_check, image_requests_gpu, import_local_package_mtls_bundle, inferred_provider_type, package_managed_tls_dirs, parse_cli_setting_value, - parse_credential_expiry_cli_value, parse_credential_expiry_pairs, parse_credential_pairs, - plaintext_gateway_is_remote, progress_step_from_metadata, - provider_profile_allows_refresh_bootstrap, provisioning_timeout_message, - ready_false_condition_message, refresh_status_header, refresh_status_row, resolve_from, + parse_credential_pairs, plaintext_gateway_is_remote, progress_step_from_metadata, + provisioning_timeout_message, ready_false_condition_message, resolve_from, sandbox_should_persist, service_expose_status_error, service_url_for_gateway, }; use crate::TEST_ENV_LOCK; @@ -6733,9 +6332,7 @@ mod tests { PROGRESS_STEP_STARTING_SANDBOX, }; use openshell_core::proto::{ - Provider, ProviderCredentialRefresh, ProviderCredentialRefreshStatus, - 
ProviderCredentialRefreshStrategy, ProviderProfile, ProviderProfileCredential, - SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, + Provider, SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, }; struct EnvVarGuard { @@ -6839,48 +6436,6 @@ mod tests { )); } - #[test] - fn parse_credential_expiry_pairs_accepts_epoch_millis_and_rfc3339() { - let parsed = parse_credential_expiry_pairs(&[ - "API_TOKEN=1767225600000".to_string(), - "MS_GRAPH_ACCESS_TOKEN=2026-01-01T00:00:00Z".to_string(), - ]) - .expect("parse"); - - assert_eq!(parsed.get("API_TOKEN"), Some(&1_767_225_600_000)); - assert_eq!( - parsed.get("MS_GRAPH_ACCESS_TOKEN"), - Some(&1_767_225_600_000) - ); - } - - #[test] - fn parse_credential_expiry_pairs_accepts_zero_to_clear_expiry() { - let parsed = - parse_credential_expiry_pairs(&["API_TOKEN=0".to_string()]).expect("parse zero"); - - assert_eq!(parsed.get("API_TOKEN"), Some(&0)); - } - - #[test] - fn parse_credential_expiry_rejects_invalid_timestamp() { - let err = parse_credential_expiry_pairs(&["API_TOKEN=next-week".to_string()]) - .expect_err("invalid timestamp should error"); - - assert!( - err.to_string() - .contains("must be a Unix epoch millisecond timestamp or RFC3339 timestamp") - ); - } - - #[test] - fn parse_credential_expiry_cli_value_accepts_rfc3339_offsets() { - let parsed = parse_credential_expiry_cli_value("2026-01-01T01:00:00+01:00") - .expect("parse RFC3339 with offset"); - - assert_eq!(parsed, 1_767_225_600_000); - } - #[test] fn provider_attachment_table_formats_provider_counts() { let output = format_provider_attachment_table( @@ -6901,7 +6456,6 @@ mod tests { "https://api.custom.example".to_string(), )) .collect(), - credential_expires_at_ms: std::collections::HashMap::new(), }], false, ); @@ -6933,93 +6487,6 @@ mod tests { assert_eq!(progress_step_from_metadata("driver-private-step"), None); } - #[test] - fn refresh_status_table_includes_operational_fields() { - let header = refresh_status_header(); - 
assert!(header.contains("NEXT_REFRESH")); - assert!(header.contains("LAST_REFRESH")); - assert!(header.contains("LAST_ERROR")); - - let row = refresh_status_row(&ProviderCredentialRefreshStatus { - provider_name: "my-graph".to_string(), - provider_id: "provider-id".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - status: "error".to_string(), - expires_at_ms: 1_767_225_600_000, - next_refresh_at_ms: 1_767_225_660_000, - last_refresh_at_ms: 1_767_225_000_000, - last_error: "token endpoint returned a very long error message that should be truncated for table readability" - .to_string(), - }); - - assert!(row.contains("my-graph")); - assert!(row.contains("MS_GRAPH_ACCESS_TOKEN")); - assert!(row.contains("oauth2_client_credentials")); - assert!(row.contains("error")); - assert!(row.contains("2026-01-01 00:00:00")); - assert!(row.contains("...")); - } - - #[test] - fn refresh_bootstrap_requires_all_required_credentials_to_be_gateway_mintable() { - let refresh_token_profile = ProviderProfile { - credentials: vec![ProviderProfileCredential { - name: "MS_GRAPH_ACCESS_TOKEN".to_string(), - required: true, - refresh: Some(ProviderCredentialRefresh { - strategy: ProviderCredentialRefreshStrategy::Oauth2RefreshToken as i32, - ..Default::default() - }), - ..Default::default() - }], - ..Default::default() - }; - assert!(provider_profile_allows_refresh_bootstrap( - &refresh_token_profile - )); - - let mixed_static_profile = ProviderProfile { - credentials: vec![ - ProviderProfileCredential { - name: "ACCESS_TOKEN".to_string(), - required: true, - refresh: Some(ProviderCredentialRefresh { - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - ..Default::default() - }), - ..Default::default() - }, - ProviderProfileCredential { - name: "STATIC_API_KEY".to_string(), - required: true, - refresh: None, - ..Default::default() - }, - ], - ..Default::default() - }; 
- assert!(!provider_profile_allows_refresh_bootstrap( - &mixed_static_profile - )); - - let optional_refresh_profile = ProviderProfile { - credentials: vec![ProviderProfileCredential { - name: "OPTIONAL_TOKEN".to_string(), - required: false, - refresh: Some(ProviderCredentialRefresh { - strategy: ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt as i32, - ..Default::default() - }), - ..Default::default() - }], - ..Default::default() - }; - assert!(!provider_profile_allows_refresh_bootstrap( - &optional_refresh_profile - )); - } - #[cfg(feature = "dev-settings")] #[test] fn parse_cli_setting_value_parses_bool_aliases() { @@ -7115,7 +6582,7 @@ mod tests { #[test] fn inferred_provider_type_returns_type_for_known_command() { let result = inferred_provider_type(&["claude".to_string(), "--help".to_string()]); - assert_eq!(result, Some("claude-code".to_string())); + assert_eq!(result, Some("claude".to_string())); } #[test] @@ -7144,7 +6611,7 @@ mod tests { #[test] fn inferred_provider_type_handles_full_path() { let result = inferred_provider_type(&["/usr/local/bin/claude".to_string()]); - assert_eq!(result, Some("claude-code".to_string())); + assert_eq!(result, Some("claude".to_string())); } #[test] @@ -7703,7 +7170,7 @@ mod tests { }); // Loopback endpoints derive the canonical "openshell" gateway - // name, matching local cert generation and default_tls_dir conventions. + // name, matching init-pki.sh and default_tls_dir conventions. 
let metadata = load_gateway_metadata("openshell").expect("load stored gateway"); assert_eq!(metadata.auth_mode.as_deref(), Some("plaintext")); assert!(!metadata.is_remote); diff --git a/crates/openshell-cli/tests/ensure_providers_integration.rs b/crates/openshell-cli/tests/ensure_providers_integration.rs index fa2605ac2..bd4262b31 100644 --- a/crates/openshell-cli/tests/ensure_providers_integration.rs +++ b/crates/openshell-cli/tests/ensure_providers_integration.rs @@ -62,12 +62,10 @@ impl TestOpenShell { name: name.to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials: HashMap::new(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ); } @@ -320,19 +318,6 @@ impl OpenShell for TestOpenShell { } base }; - let merge_expiry = |mut base: HashMap, incoming: HashMap| { - if incoming.is_empty() { - return base; - } - for (k, v) in incoming { - if v <= 0 { - base.remove(&k); - } else { - base.insert(k, v); - } - } - base - }; let existing_metadata = existing.metadata.clone().unwrap_or_default(); let provider_metadata = provider.metadata.clone().unwrap_or_default(); let updated = Provider { @@ -341,15 +326,10 @@ impl OpenShell for TestOpenShell { name: provider_metadata.name, created_at_ms: existing_metadata.created_at_ms, labels: existing_metadata.labels, - resource_version: 0, }), r#type: existing.r#type, credentials: merge(existing.credentials, provider.credentials), config: merge(existing.config, provider.config), - credential_expires_at_ms: merge_expiry( - existing.credential_expires_at_ms, - provider.credential_expires_at_ms, - ), }; let updated_name = updated.object_name().to_string(); providers.insert(updated_name, updated.clone()); @@ -357,33 +337,6 @@ impl OpenShell for TestOpenShell { provider: Some(updated), })) } - async fn get_provider_refresh_status( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn 
configure_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn rotate_provider_credential( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn delete_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } async fn delete_provider( &self, @@ -732,19 +685,19 @@ async fn inferred_type_auto_creates_provider() { let result = run::ensure_required_providers( &mut client, &[], - &["claude-code".to_string()], + &["claude".to_string()], Some(true), // --auto-providers ) .await .expect("should auto-create the inferred provider"); - assert_eq!(result, vec!["claude-code".to_string()]); + assert_eq!(result, vec!["claude".to_string()]); let providers = ts.openshell.state.providers.lock().await; let provider = providers - .get("claude-code") - .expect("claude-code provider should exist"); - assert_eq!(provider.r#type, "claude-code"); + .get("claude") + .expect("claude provider should exist"); + assert_eq!(provider.r#type, "claude"); } /// When `--no-auto-providers` is set, missing explicit providers that would @@ -796,7 +749,7 @@ async fn explicit_and_inferred_providers_combined() { let result = run::ensure_required_providers( &mut client, &["nvidia".to_string()], - &["claude-code".to_string()], + &["claude".to_string()], Some(true), ) .await @@ -804,12 +757,12 @@ async fn explicit_and_inferred_providers_combined() { assert_eq!(result.len(), 2); assert!(result.contains(&"nvidia".to_string())); - assert!(result.contains(&"claude-code".to_string())); + assert!(result.contains(&"claude".to_string())); let providers = ts.openshell.state.providers.lock().await; assert_eq!(providers.len(), 2); assert!(providers.contains_key("nvidia")); - assert!(providers.contains_key("claude-code")); + assert!(providers.contains_key("claude")); } /// When an explicit provider name matches an inferred type, the 
provider diff --git a/crates/openshell-cli/tests/mtls_integration.rs b/crates/openshell-cli/tests/mtls_integration.rs index fd7a18b28..7102ed9b6 100644 --- a/crates/openshell-cli/tests/mtls_integration.rs +++ b/crates/openshell-cli/tests/mtls_integration.rs @@ -244,33 +244,6 @@ impl OpenShell for TestOpenShell { "update_provider not implemented in test", )) } - async fn get_provider_refresh_status( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn configure_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn rotate_provider_credential( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn delete_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } async fn delete_provider( &self, diff --git a/crates/openshell-cli/tests/provider_commands_integration.rs b/crates/openshell-cli/tests/provider_commands_integration.rs index cb2b3cb18..49b933e67 100644 --- a/crates/openshell-cli/tests/provider_commands_integration.rs +++ b/crates/openshell-cli/tests/provider_commands_integration.rs @@ -11,21 +11,17 @@ use openshell_cli::tls::TlsOptions; use openshell_core::proto::open_shell_server::{OpenShell, OpenShellServer}; use openshell_core::proto::{ AttachSandboxProviderRequest, AttachSandboxProviderResponse, CreateProviderRequest, - CreateSandboxRequest, CreateSshSessionRequest, CreateSshSessionResponse, - DeleteProviderRefreshRequest, DeleteProviderRefreshResponse, DeleteProviderRequest, + CreateSandboxRequest, CreateSshSessionRequest, CreateSshSessionResponse, DeleteProviderRequest, DeleteProviderResponse, DeleteSandboxRequest, DeleteSandboxResponse, DetachSandboxProviderRequest, DetachSandboxProviderResponse, ExecSandboxEvent, ExecSandboxInput, ExecSandboxRequest, GatewayMessage, GetGatewayConfigRequest, - 
GetGatewayConfigResponse, GetProviderRefreshStatusRequest, GetProviderRefreshStatusResponse, - GetProviderRequest, GetSandboxConfigRequest, GetSandboxConfigResponse, - GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, - HealthRequest, HealthResponse, ListProvidersRequest, ListProvidersResponse, - ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, - ListSandboxesResponse, Provider, ProviderCredentialRefresh, ProviderCredentialRefreshStatus, - ProviderCredentialRefreshStrategy, ProviderProfile, ProviderProfileCredential, - ProviderResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, - RotateProviderCredentialRequest, RotateProviderCredentialResponse, Sandbox, SandboxResponse, - SandboxStreamEvent, ServiceStatus, SupervisorMessage, UpdateProviderRequest, + GetGatewayConfigResponse, GetProviderRequest, GetSandboxConfigRequest, + GetSandboxConfigResponse, GetSandboxProviderEnvironmentRequest, + GetSandboxProviderEnvironmentResponse, GetSandboxRequest, HealthRequest, HealthResponse, + ListProvidersRequest, ListProvidersResponse, ListSandboxProvidersRequest, + ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, Provider, + ProviderProfile, ProviderResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, + SandboxResponse, SandboxStreamEvent, ServiceStatus, SupervisorMessage, UpdateProviderRequest, WatchSandboxRequest, }; use openshell_core::{ObjectId, ObjectName}; @@ -42,33 +38,10 @@ use tonic::{Response, Status}; struct ProviderState { providers: Arc>>, profiles: Arc>>, - refresh_statuses: Arc>>, - refresh_requests: Arc>>, sandbox_providers: Arc>>>, sandbox_provider_requests: Arc>>, } -#[derive(Clone, Debug, PartialEq, Eq)] -enum ProviderRefreshRequestLog { - Status { - provider_name: String, - credential_key: String, - }, - Configure { - provider_name: String, - credential_key: String, - expires_at_ms: Option, - }, - Rotate { - provider_name: String, - 
credential_key: String, - }, - Delete { - provider_name: String, - credential_key: String, - }, -} - #[derive(Clone, Debug, PartialEq, Eq)] enum SandboxProviderRequestLog { List { @@ -110,25 +83,9 @@ impl OpenShell for TestOpenShell { async fn get_sandbox( &self, - request: tonic::Request, + _request: tonic::Request, ) -> Result, Status> { - let name = request.into_inner().name; - // Return a minimal sandbox with metadata for CAS operations - Ok(Response::new(SandboxResponse { - sandbox: Some(Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: format!("sb-{name}"), - name, - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 1, - }), - spec: None, - status: None, - phase: 0, - current_policy_version: 0, - }), - })) + Ok(Response::new(SandboxResponse::default())) } async fn list_sandboxes( @@ -198,7 +155,7 @@ impl OpenShell for TestOpenShell { providers.push(request.provider_name.clone()); true }; - let sandbox = Sandbox { + let sandbox = openshell_core::proto::Sandbox { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { name: request.sandbox_name, ..Default::default() @@ -235,7 +192,7 @@ impl OpenShell for TestOpenShell { let before_len = providers.len(); providers.retain(|name| name != &request.provider_name); let detached = providers.len() != before_len; - let sandbox = Sandbox { + let sandbox = openshell_core::proto::Sandbox { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { name: request.sandbox_name, ..Default::default() @@ -482,19 +439,6 @@ impl OpenShell for TestOpenShell { } base }; - let merge_expiry = |mut base: HashMap, incoming: HashMap| { - if incoming.is_empty() { - return base; - } - for (k, v) in incoming { - if v <= 0 { - base.remove(&k); - } else { - base.insert(k, v); - } - } - base - }; let existing_metadata = existing.metadata.clone().unwrap_or_default(); let provider_metadata = provider.metadata.clone().unwrap_or_default(); let updated = Provider { @@ -503,15 +447,10 
@@ impl OpenShell for TestOpenShell { name: provider_metadata.name, created_at_ms: existing_metadata.created_at_ms, labels: existing_metadata.labels, - resource_version: 0, }), r#type: existing.r#type, credentials: merge(existing.credentials, provider.credentials), config: merge(existing.config, provider.config), - credential_expires_at_ms: merge_expiry( - existing.credential_expires_at_ms, - provider.credential_expires_at_ms, - ), }; let updated_name = updated.object_name().to_string(); providers.insert(updated_name, updated.clone()); @@ -519,125 +458,6 @@ impl OpenShell for TestOpenShell { provider: Some(updated), })) } - async fn get_provider_refresh_status( - &self, - request: tonic::Request, - ) -> Result, Status> { - let request = request.into_inner(); - self.state - .refresh_requests - .lock() - .await - .push(ProviderRefreshRequestLog::Status { - provider_name: request.provider.clone(), - credential_key: request.credential_key.clone(), - }); - let refresh_statuses = self.state.refresh_statuses.lock().await; - let credentials = if request.credential_key.is_empty() { - refresh_statuses - .values() - .filter(|status| status.provider_name == request.provider) - .cloned() - .collect() - } else { - refresh_statuses - .get(&(request.provider, request.credential_key)) - .cloned() - .into_iter() - .collect() - }; - Ok(Response::new(GetProviderRefreshStatusResponse { - credentials, - })) - } - - async fn configure_provider_refresh( - &self, - request: tonic::Request, - ) -> Result, Status> { - let request = request.into_inner(); - self.state - .refresh_requests - .lock() - .await - .push(ProviderRefreshRequestLog::Configure { - provider_name: request.provider.clone(), - credential_key: request.credential_key.clone(), - expires_at_ms: request.expires_at_ms, - }); - let providers = self.state.providers.lock().await; - let provider = providers - .get(&request.provider) - .ok_or_else(|| Status::not_found("provider not found"))?; - let status = 
ProviderCredentialRefreshStatus { - provider_name: request.provider.clone(), - provider_id: provider.object_id().to_string(), - credential_key: request.credential_key.clone(), - strategy: request.strategy, - status: "configured".to_string(), - expires_at_ms: request.expires_at_ms.unwrap_or_default(), - next_refresh_at_ms: 0, - last_refresh_at_ms: 0, - last_error: String::new(), - }; - drop(providers); - self.state - .refresh_statuses - .lock() - .await - .insert((request.provider, request.credential_key), status.clone()); - Ok(Response::new( - openshell_core::proto::ConfigureProviderRefreshResponse { - status: Some(status), - }, - )) - } - - async fn rotate_provider_credential( - &self, - request: tonic::Request, - ) -> Result, Status> { - let request = request.into_inner(); - self.state - .refresh_requests - .lock() - .await - .push(ProviderRefreshRequestLog::Rotate { - provider_name: request.provider.clone(), - credential_key: request.credential_key.clone(), - }); - let mut refresh_statuses = self.state.refresh_statuses.lock().await; - let status = refresh_statuses - .get_mut(&(request.provider, request.credential_key)) - .ok_or_else(|| Status::not_found("provider refresh state not found"))?; - status.status = "rotation_requested".to_string(); - Ok(Response::new(RotateProviderCredentialResponse { - status: Some(status.clone()), - })) - } - - async fn delete_provider_refresh( - &self, - request: tonic::Request, - ) -> Result, Status> { - let request = request.into_inner(); - self.state - .refresh_requests - .lock() - .await - .push(ProviderRefreshRequestLog::Delete { - provider_name: request.provider.clone(), - credential_key: request.credential_key.clone(), - }); - let deleted = self - .state - .refresh_statuses - .lock() - .await - .remove(&(request.provider, request.credential_key)) - .is_some(); - Ok(Response::new(DeleteProviderRefreshResponse { deleted })) - } async fn delete_provider( &self, @@ -916,7 +736,6 @@ async fn 
provider_cli_run_functions_support_full_crud_flow() { false, &["API_KEY=rotated".to_string()], &["profile=prod".to_string()], - &[], &ts.tls, ) .await @@ -936,115 +755,6 @@ async fn provider_list_profiles_cli_uses_profile_browsing_rpc() { .expect("provider list-profiles"); } -#[tokio::test] -async fn provider_refresh_cli_run_functions_wire_requests() { - let ts = run_server().await; - - run::provider_create( - &ts.endpoint, - "my-graph", - "outlook", - false, - &["MS_GRAPH_ACCESS_TOKEN=token".to_string()], - &[], - &ts.tls, - ) - .await - .expect("provider create"); - - run::provider_refresh_config( - &ts.endpoint, - run::ProviderRefreshConfigInput { - name: "my-graph", - credential_key: "MS_GRAPH_ACCESS_TOKEN", - strategy: "oauth2_client_credentials", - material: &["tenant_id=tenant".to_string()], - secret_material_keys: &["client_secret".to_string()], - credential_expires_at_ms: Some(1_767_225_600_000), - }, - &ts.tls, - ) - .await - .expect("provider refresh configure"); - run::provider_refresh_status( - &ts.endpoint, - "my-graph", - Some("MS_GRAPH_ACCESS_TOKEN"), - &ts.tls, - ) - .await - .expect("provider refresh status"); - run::provider_rotate(&ts.endpoint, "my-graph", "MS_GRAPH_ACCESS_TOKEN", &ts.tls) - .await - .expect("provider refresh rotate"); - run::provider_refresh_delete(&ts.endpoint, "my-graph", "MS_GRAPH_ACCESS_TOKEN", &ts.tls) - .await - .expect("provider refresh delete"); - - let requests = ts.state.refresh_requests.lock().await.clone(); - assert_eq!( - requests, - vec![ - ProviderRefreshRequestLog::Configure { - provider_name: "my-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - expires_at_ms: Some(1_767_225_600_000), - }, - ProviderRefreshRequestLog::Status { - provider_name: "my-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - }, - ProviderRefreshRequestLog::Rotate { - provider_name: "my-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - }, - 
ProviderRefreshRequestLog::Delete { - provider_name: "my-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - }, - ] - ); -} - -#[tokio::test] -async fn provider_create_allows_empty_credentials_for_gateway_refresh_profiles() { - let ts = run_server().await; - ts.state.profiles.lock().await.insert( - "custom-refresh".to_string(), - ProviderProfile { - id: "custom-refresh".to_string(), - display_name: "Custom Refresh".to_string(), - credentials: vec![ProviderProfileCredential { - name: "ACCESS_TOKEN".to_string(), - required: true, - refresh: Some(ProviderCredentialRefresh { - strategy: ProviderCredentialRefreshStrategy::Oauth2RefreshToken as i32, - ..Default::default() - }), - ..Default::default() - }], - ..Default::default() - }, - ); - - run::provider_create( - &ts.endpoint, - "custom-refresh-provider", - "custom-refresh", - false, - &[], - &[], - &ts.tls, - ) - .await - .expect("provider create"); - - let stored = ts.state.providers.lock().await; - let provider = stored.get("custom-refresh-provider").expect("provider"); - assert_eq!(provider.r#type, "custom-refresh"); - assert!(provider.credentials.is_empty()); -} - #[tokio::test] async fn sandbox_provider_cli_run_functions_wire_requests_and_idempotent_results() { let ts = run_server().await; diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 8e606beea..1ad00dd6e 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -85,7 +85,6 @@ impl OpenShell for TestOpenShell { name: sandbox_name, created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), phase: SandboxPhase::Provisioning as i32, ..Sandbox::default() @@ -105,7 +104,6 @@ impl OpenShell for TestOpenShell { name, created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), phase: SandboxPhase::Ready as i32, 
..Sandbox::default() @@ -291,33 +289,6 @@ impl OpenShell for TestOpenShell { ) -> Result, Status> { Ok(Response::new(ProviderResponse::default())) } - async fn get_provider_refresh_status( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn configure_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn rotate_provider_credential( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn delete_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } async fn delete_provider( &self, @@ -353,7 +324,6 @@ impl OpenShell for TestOpenShell { name: sandbox_id.trim_start_matches("id-").to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), phase: SandboxPhase::Provisioning as i32, ..Sandbox::default() diff --git a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs index 44393fb2f..531599dcf 100644 --- a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs +++ b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs @@ -77,7 +77,6 @@ impl OpenShell for TestOpenShell { name, created_at_ms: 0, labels: std::collections::HashMap::new(), - resource_version: 0, }), ..Default::default() }), @@ -261,33 +260,6 @@ impl OpenShell for TestOpenShell { ) -> Result, Status> { Ok(Response::new(ProviderResponse::default())) } - async fn get_provider_refresh_status( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn configure_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn rotate_provider_credential( - &self, - _: tonic::Request, - ) -> Result, Status> { - 
Err(Status::unimplemented("unused")) - } - - async fn delete_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } async fn delete_provider( &self, diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index b03fb1494..78c87d54c 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -20,6 +20,8 @@ serde = { workspace = true } serde_json = { workspace = true } url = { workspace = true } ipnet = "2" +chrono = { version = "0.4", default-features = false, features = ["clock", "std"] } +reqwest = { workspace = true, features = ["blocking", "rustls-tls-webpki-roots"] } [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index dbd8dfb8a..e045d0a52 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -22,7 +22,7 @@ use std::str::FromStr; pub const DEFAULT_SSH_PORT: u16 = 2222; /// Default gateway server port. -pub const DEFAULT_SERVER_PORT: u16 = 17670; +pub const DEFAULT_SERVER_PORT: u16 = 8080; /// Default container stop timeout in seconds (SIGTERM → SIGKILL). pub const DEFAULT_STOP_TIMEOUT_SECS: u32 = 10; @@ -34,7 +34,7 @@ pub const DEFAULT_DOCKER_NETWORK_NAME: &str = "openshell-docker"; pub const DEFAULT_SERVICE_ROUTING_DOMAIN: &str = "openshell.localhost"; /// Default OCI image for the openshell-sandbox supervisor binary. -pub const DEFAULT_SUPERVISOR_IMAGE: &str = "ghcr.io/nvidia/openshell/supervisor:latest"; +pub const DEFAULT_SUPERVISOR_IMAGE: &str = "openshell/supervisor:latest"; /// CDI device identifier for requesting all NVIDIA GPUs. 
pub const CDI_GPU_DEVICE_ALL: &str = "nvidia.com/gpu=all"; @@ -451,7 +451,7 @@ impl Default for ServiceRoutingConfig { } fn default_bind_address() -> SocketAddr { - "127.0.0.1:17670".parse().expect("valid default address") + "127.0.0.1:8080".parse().expect("valid default address") } fn default_service_routing_domains() -> Vec { @@ -557,7 +557,7 @@ mod tests { #[test] fn config_defaults_to_loopback_bind_address() { - let expected: SocketAddr = "127.0.0.1:17670".parse().expect("valid address"); + let expected: SocketAddr = "127.0.0.1:8080".parse().expect("valid address"); assert_eq!(Config::new(None).bind_address, expected); } diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index d0225c471..8c0379f8f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -24,11 +24,12 @@ pub mod progress; pub mod proto; pub mod sandbox_env; pub mod settings; +pub mod telemetry; pub mod time; pub use config::{ComputeDriverKind, Config, OidcConfig, TlsConfig}; pub use error::{ComputeDriverError, Error, Result}; -pub use metadata::{GetResourceVersion, ObjectId, ObjectLabels, ObjectName, SetResourceVersion}; +pub use metadata::{ObjectId, ObjectLabels, ObjectName}; /// Build version string derived from git metadata. /// diff --git a/crates/openshell-core/src/metadata.rs b/crates/openshell-core/src/metadata.rs index 78533e1e0..6f7b7b0a4 100644 --- a/crates/openshell-core/src/metadata.rs +++ b/crates/openshell-core/src/metadata.rs @@ -7,7 +7,7 @@ use crate::proto::{ InferenceRoute, ObjectForTest, Provider, Sandbox, ServiceEndpoint, SshSession, - StoredProviderCredentialRefreshState, StoredProviderProfile, + StoredProviderProfile, }; use std::collections::HashMap; @@ -26,16 +26,6 @@ pub trait ObjectLabels { fn object_labels(&self) -> Option>; } -/// Provides mutable access to set the object's resource version from persistence. 
-pub trait SetResourceVersion { - fn set_resource_version(&mut self, version: u64); -} - -/// Provides read access to the object's current resource version. -pub trait GetResourceVersion { - fn get_resource_version(&self) -> u64; -} - // Implementations for Sandbox impl ObjectId for Sandbox { fn object_id(&self) -> &str { @@ -55,20 +45,6 @@ impl ObjectLabels for Sandbox { } } -impl SetResourceVersion for Sandbox { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) = self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for Sandbox { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - // Implementations for Provider impl ObjectId for Provider { fn object_id(&self) -> &str { @@ -88,20 +64,6 @@ impl ObjectLabels for Provider { } } -impl SetResourceVersion for Provider { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) = self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for Provider { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - // Implementations for StoredProviderProfile impl ObjectId for StoredProviderProfile { fn object_id(&self) -> &str { @@ -121,53 +83,6 @@ impl ObjectLabels for StoredProviderProfile { } } -impl SetResourceVersion for StoredProviderProfile { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) = self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for StoredProviderProfile { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - -// Implementations for StoredProviderCredentialRefreshState -impl ObjectId for StoredProviderCredentialRefreshState { - fn object_id(&self) -> &str { - self.metadata.as_ref().map_or("", |m| m.id.as_str()) - } -} - -impl ObjectName for 
StoredProviderCredentialRefreshState { - fn object_name(&self) -> &str { - self.metadata.as_ref().map_or("", |m| m.name.as_str()) - } -} - -impl ObjectLabels for StoredProviderCredentialRefreshState { - fn object_labels(&self) -> Option> { - self.metadata.as_ref().map(|m| m.labels.clone()) - } -} - -impl SetResourceVersion for StoredProviderCredentialRefreshState { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) = self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for StoredProviderCredentialRefreshState { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - // Implementations for SshSession impl ObjectId for SshSession { fn object_id(&self) -> &str { @@ -187,20 +102,6 @@ impl ObjectLabels for SshSession { } } -impl SetResourceVersion for SshSession { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) = self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for SshSession { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - // Implementations for ServiceEndpoint impl ObjectId for ServiceEndpoint { fn object_id(&self) -> &str { @@ -220,20 +121,6 @@ impl ObjectLabels for ServiceEndpoint { } } -impl SetResourceVersion for ServiceEndpoint { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) = self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for ServiceEndpoint { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - // Implementations for InferenceRoute impl ObjectId for InferenceRoute { fn object_id(&self) -> &str { @@ -253,20 +140,6 @@ impl ObjectLabels for InferenceRoute { } } -impl SetResourceVersion for InferenceRoute { - fn set_resource_version(&mut self, version: u64) { - if let Some(meta) 
= self.metadata.as_mut() { - meta.resource_version = version; - } - } -} - -impl GetResourceVersion for InferenceRoute { - fn get_resource_version(&self) -> u64 { - self.metadata.as_ref().map_or(0, |m| m.resource_version) - } -} - // Implementations for ObjectForTest (test-only proto type) impl ObjectId for ObjectForTest { fn object_id(&self) -> &str { @@ -285,16 +158,3 @@ impl ObjectLabels for ObjectForTest { None } } - -impl SetResourceVersion for ObjectForTest { - fn set_resource_version(&mut self, _version: u64) { - // ObjectForTest doesn't have metadata, so this is a no-op - } -} - -impl GetResourceVersion for ObjectForTest { - fn get_resource_version(&self) -> u64 { - // ObjectForTest doesn't have metadata - 0 - } -} diff --git a/crates/openshell-core/src/paths.rs b/crates/openshell-core/src/paths.rs index 65000c6cf..00104f3c2 100644 --- a/crates/openshell-core/src/paths.rs +++ b/crates/openshell-core/src/paths.rs @@ -29,24 +29,6 @@ pub fn openshell_config_dir() -> Result { Ok(xdg_config_dir()?.join("openshell")) } -/// Resolve the XDG state base directory. -/// -/// Returns `$XDG_STATE_HOME` if set, otherwise `$HOME/.local/state`. -pub fn xdg_state_dir() -> Result { - if let Ok(path) = std::env::var("XDG_STATE_HOME") { - return Ok(PathBuf::from(path)); - } - let home = std::env::var("HOME") - .into_diagnostic() - .wrap_err("HOME is not set")?; - Ok(PathBuf::from(home).join(".local").join("state")) -} - -/// The top-level `OpenShell` state directory: `$XDG_STATE_HOME/openshell/`. -pub fn openshell_state_dir() -> Result { - Ok(xdg_state_dir()?.join("openshell")) -} - /// Resolve the XDG data base directory. /// /// Returns `$XDG_DATA_HOME` if set, otherwise `$HOME/.local/share`. 
@@ -148,15 +130,6 @@ mod tests { ); } - #[test] - fn openshell_state_dir_appends_openshell() { - let dir = openshell_state_dir().unwrap(); - assert!( - dir.ends_with("openshell"), - "expected path ending with 'openshell', got: {dir:?}" - ); - } - #[cfg(unix)] #[test] fn create_dir_restricted_sets_0o700() { diff --git a/crates/openshell-core/src/telemetry.rs b/crates/openshell-core/src/telemetry.rs new file mode 100644 index 000000000..0a1745d99 --- /dev/null +++ b/crates/openshell-core/src/telemetry.rs @@ -0,0 +1,266 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Best-effort anonymous telemetry emission helpers. + +use chrono::{SecondsFormat, Utc}; +use reqwest::blocking::Client; +use serde_json::{Value, json}; +use std::thread; +use std::time::Duration; + +const CLIENT_ID: &str = "415437562476676"; +const DEFAULT_ENDPOINT: &str = "https://events.telemetry.data-uat.nvidia.com/v1.1/events/json"; +const EVENT_SCHEMA_VERSION: &str = "2.0"; +const EVENT_PROTOCOL_VERSION: &str = "1.6"; +const EVENT_SYSTEM_VERSION: &str = "openshell-telemetry/1.0"; +const HTTP_TIMEOUT: Duration = Duration::from_secs(5); +const SOURCE: &str = "openshell"; + +fn telemetry_enabled() -> bool { + telemetry_enabled_from(std::env::var("OPENSHELL_TELEMETRY_ENABLED").ok().as_deref()) +} + +fn telemetry_enabled_from(value: Option<&str>) -> bool { + let value = value.unwrap_or("true"); + !matches!( + value.trim().to_ascii_lowercase().as_str(), + "0" | "false" | "no" | "off" + ) +} + +fn telemetry_endpoint() -> Option { + telemetry_endpoint_from( + std::env::var("OPENSHELL_TELEMETRY_ENDPOINT") + .ok() + .as_deref(), + ) +} + +fn telemetry_endpoint_from(endpoint: Option<&str>) -> Option { + let endpoint = endpoint.unwrap_or(DEFAULT_ENDPOINT); + let endpoint = endpoint.trim(); + if endpoint.is_empty() { + None + } else { + Some(endpoint.to_string()) + } +} + +fn timestamp() -> String { + 
Utc::now().to_rfc3339_opts(SecondsFormat::Millis, true) +} + +fn build_payload(name: &str, event: Value, ts: &str) -> Value { + json!({ + "browserType": "undefined", + "clientId": CLIENT_ID, + "clientType": "Native", + "clientVariant": "Release", + "clientVer": std::env::var("OPENSHELL_SOURCE_CLIENT_VERSION") + .unwrap_or_else(|_| "undefined".to_string()), + "cpuArchitecture": std::env::consts::ARCH, + "deviceGdprBehOptIn": "None", + "deviceGdprFuncOptIn": "None", + "deviceGdprTechOptIn": "None", + "deviceId": "undefined", + "deviceMake": "undefined", + "deviceModel": "undefined", + "deviceOS": "undefined", + "deviceOSVersion": "undefined", + "deviceType": "undefined", + "eventProtocol": EVENT_PROTOCOL_VERSION, + "eventSchemaVer": EVENT_SCHEMA_VERSION, + "eventSysVer": EVENT_SYSTEM_VERSION, + "externalUserId": "undefined", + "gdprBehOptIn": "None", + "gdprFuncOptIn": "None", + "gdprTechOptIn": "None", + "idpId": "undefined", + "integrationId": "undefined", + "productName": "undefined", + "productVersion": "undefined", + "sentTs": ts, + "sessionId": "undefined", + "userId": "undefined", + "events": [ + { + "ts": ts, + "parameters": event, + "name": name, + } + ], + }) +} + +fn publish_payload(endpoint: &str, payload: Value) -> Result<(), reqwest::Error> { + Client::builder() + .use_rustls_tls() + .tls_built_in_root_certs(true) + .timeout(HTTP_TIMEOUT) + .build()? + .post(endpoint) + .json(&payload) + .send()? 
+ .error_for_status()?; + Ok(()) +} + +fn emit_event(name: &'static str, event: Value) { + if !telemetry_enabled() { + return; + } + let Some(endpoint) = telemetry_endpoint() else { + return; + }; + + thread::spawn(move || { + let payload = build_payload(name, event, &timestamp()); + let _ = publish_payload(&endpoint, payload); + }); +} + +pub fn emit_lifecycle(resource: &str, operation: &str, outcome: &str) { + emit_event( + "openshell_lifecycle_event", + json!({ + "nvidiaSource": SOURCE, + "resource": resource, + "operation": operation, + "outcome": outcome, + }), + ); +} + +pub fn emit_provider_lifecycle(operation: &str, outcome: &str, provider_profile: &str) { + emit_event( + "openshell_provider_lifecycle_event", + json!({ + "nvidiaSource": SOURCE, + "operation": operation, + "outcome": outcome, + "providerProfile": provider_profile, + }), + ); +} + +pub fn emit_sandbox_create( + outcome: &str, + requested_gpu: bool, + provider_count: u64, + has_custom_policy: bool, + template_source: &str, +) { + emit_event( + "openshell_sandbox_create_event", + json!({ + "nvidiaSource": SOURCE, + "outcome": outcome, + "requestedGpu": requested_gpu, + "providerCount": provider_count, + "hasCustomPolicy": has_custom_policy, + "templateSource": template_source, + }), + ); +} + +pub fn emit_policy_decision(operation: &str, outcome: &str, rule_count: u64) { + emit_event( + "openshell_policy_decision_event", + json!({ + "nvidiaSource": SOURCE, + "operation": operation, + "outcome": outcome, + "ruleCount": rule_count, + }), + ); +} + +pub fn emit_sandbox_activity_summary( + network_activity_count: u64, + denied_action_count: u64, + denial_rate_pct: f64, + denials_by_group: I, +) where + I: IntoIterator, + S: Into, +{ + let mut rows: Vec = denials_by_group + .into_iter() + .map(|(group, count)| { + json!({ + "denyGroup": group.into(), + "deniedCount": count, + }) + }) + .collect(); + rows.sort_by(|left, right| { + left["denyGroup"] + .as_str() + .unwrap_or_default() +
.cmp(right["denyGroup"].as_str().unwrap_or_default()) + }); + emit_event( + "openshell_sandbox_activity_summary_event", + json!({ + "nvidiaSource": SOURCE, + "networkActivityCount": network_activity_count, + "deniedActionCount": denied_action_count, + "denialRatePct": denial_rate_pct, + "denialsByGroup": rows, + }), + ); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn telemetry_enabled_defaults_true() { + assert!(telemetry_enabled_from(None)); + } + + #[test] + fn telemetry_enabled_honors_false_values() { + assert!(!telemetry_enabled_from(Some("off"))); + assert!(!telemetry_enabled_from(Some("false"))); + assert!(telemetry_enabled_from(Some("yes"))); + } + + #[test] + fn telemetry_endpoint_empty_disables_publish() { + assert_eq!(telemetry_endpoint_from(Some(" ")), None); + assert_eq!( + telemetry_endpoint_from(None), + Some(DEFAULT_ENDPOINT.to_string()) + ); + } + + #[test] + fn build_payload_matches_schema_metadata() { + let payload = build_payload( + "openshell_sandbox_create_event", + json!({ + "nvidiaSource": SOURCE, + "outcome": "success", + "requestedGpu": false, + "providerCount": 1, + "hasCustomPolicy": true, + "templateSource": "default", + }), + "2026-05-18T00:00:00.000Z", + ); + + assert_eq!(payload["clientId"], CLIENT_ID); + assert_eq!(payload["eventSchemaVer"], EVENT_SCHEMA_VERSION); + assert_eq!(payload["deviceId"], "undefined"); + assert_eq!(payload["userId"], "undefined"); + assert_eq!( + payload["events"][0]["name"], + "openshell_sandbox_create_event" + ); + assert_eq!(payload["events"][0]["parameters"]["nvidiaSource"], SOURCE); + assert_eq!(payload["events"][0]["ts"], "2026-05-18T00:00:00.000Z"); + } +} diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 3a0772217..30507422b 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -252,6 +252,12 @@ impl DockerComputeDriver { docker_config: &DockerComputeConfig, 
supervisor_readiness: Arc, ) -> CoreResult { + if docker_config.grpc_endpoint.trim().is_empty() { + return Err(Error::config( + "grpc_endpoint is required when using the docker compute driver", + )); + } + let docker = Docker::connect_with_local_defaults() .map_err(|err| Error::execution(format!("failed to create Docker client: {err}")))?; let version = docker.version().await.map_err(|err| { @@ -275,24 +281,14 @@ impl DockerComputeDriver { let host_gateway_ip = parse_optional_host_gateway_ip(&docker_config.host_gateway_ip)?; let gateway_route = docker_gateway_route(&info, bridge_gateway_ip, gateway_port, host_gateway_ip); - let mut docker_config = docker_config.clone(); - if docker_config.grpc_endpoint.trim().is_empty() { - let scheme = if docker_guest_tls_configured(&docker_config) { - "https" - } else { - "http" - }; - docker_config.grpc_endpoint = - format!("{scheme}://{HOST_OPENSHELL_INTERNAL}:{gateway_port}"); - } let grpc_endpoint = docker_container_openshell_endpoint( &docker_config.grpc_endpoint, HOST_OPENSHELL_INTERNAL, gateway_port, ); let daemon_arch = normalize_docker_arch(version.arch.as_deref().unwrap_or_default()); - let supervisor_bin = resolve_supervisor_bin(&docker, &docker_config, &daemon_arch).await?; - let guest_tls = docker_guest_tls_paths(&docker_config)?; + let supervisor_bin = resolve_supervisor_bin(&docker, docker_config, &daemon_arch).await?; + let guest_tls = docker_guest_tls_paths(docker_config)?; let driver = Self { docker: Arc::new(docker), @@ -2013,12 +2009,6 @@ pub(crate) fn validate_linux_elf_binary(path: &Path) -> CoreResult<()> { Ok(()) } -fn docker_guest_tls_configured(docker_config: &DockerComputeConfig) -> bool { - docker_config.guest_tls_ca.is_some() - && docker_config.guest_tls_cert.is_some() - && docker_config.guest_tls_key.is_some() -} - pub(crate) fn docker_guest_tls_paths( docker_config: &DockerComputeConfig, ) -> CoreResult> { diff --git a/crates/openshell-driver-docker/src/tests.rs 
b/crates/openshell-driver-docker/src/tests.rs index 2ac2da1ee..62a6b89e4 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -74,7 +74,7 @@ fn container_visible_endpoint_rewrites_loopback_hosts() { HOST_OPENSHELL_INTERNAL, DEFAULT_SERVER_PORT, ), - "https://host.openshell.internal:17670/" + "https://host.openshell.internal:8080/" ); assert_eq!( docker_container_openshell_endpoint( @@ -82,7 +82,7 @@ fn container_visible_endpoint_rewrites_loopback_hosts() { HOST_OPENSHELL_INTERNAL, DEFAULT_SERVER_PORT, ), - "http://host.openshell.internal:17670/" + "http://host.openshell.internal:8080/" ); assert_eq!( docker_container_openshell_endpoint( @@ -90,7 +90,7 @@ fn container_visible_endpoint_rewrites_loopback_hosts() { HOST_OPENSHELL_INTERNAL, DEFAULT_SERVER_PORT, ), - "https://host.openshell.internal:17670/" + "https://host.openshell.internal:8080/" ); } @@ -273,7 +273,7 @@ fn docker_gateway_route_uses_bridge_gateway_for_linux_docker() { assert_eq!( route, DockerGatewayRoute::Bridge { - bind_address: "172.18.0.1:17670".parse().unwrap(), + bind_address: "172.18.0.1:8080".parse().unwrap(), host_alias_ip: IpAddr::V4(Ipv4Addr::new(172, 18, 0, 1)), } ); @@ -303,7 +303,7 @@ fn docker_gateway_route_prefers_configured_host_gateway_ip() { assert_eq!( route, DockerGatewayRoute::Bridge { - bind_address: "172.20.0.4:17670".parse().unwrap(), + bind_address: "172.20.0.4:8080".parse().unwrap(), host_alias_ip: IpAddr::V4(Ipv4Addr::new(172, 20, 0, 4)), } ); diff --git a/crates/openshell-driver-podman/NETWORKING.md b/crates/openshell-driver-podman/NETWORKING.md index 2cb6e35d2..d7f5ed6be 100644 --- a/crates/openshell-driver-podman/NETWORKING.md +++ b/crates/openshell-driver-podman/NETWORKING.md @@ -178,7 +178,7 @@ Namespace 2: Rootless Podman network namespace, managed by pasta Namespace 3: Inner sandbox netns, created by supervisor | veth pair, such as 10.200.0.1 <-> 10.200.0.2 - nftables forces ordinary traffic through proxy + 
iptables forces ordinary traffic through proxy user workload runs here ``` @@ -270,7 +270,7 @@ Container on the Podman bridge | user code runs here | - nftables rules: + iptables rules: ACCEPT -> proxy TCP ACCEPT -> loopback ACCEPT -> established/related @@ -337,7 +337,7 @@ User code in inner netns HTTP_PROXY points at the local sandbox proxy | 2. TCP connect to proxy - allowed by nftables as the only ordinary egress destination + allowed by iptables as the only ordinary egress destination | 3. HTTP CONNECT api.example.com:443 | @@ -357,9 +357,9 @@ Supervisor proxy in container netns The Podman driver auto-detects the callback endpoint scheme based on whether TLS client certificates are configured. When the RPM's auto-generated PKI is in -place, the endpoint is `https://host.containers.internal:17670` and the +place, the endpoint is `https://host.containers.internal:8080` and the supervisor connects with mTLS. Without TLS configuration, it falls back to -`http://host.containers.internal:`. +`http://host.containers.internal:8080`. ```text Supervisor in container netns @@ -382,9 +382,10 @@ Gateway 9. Same gRPC channel reused for RelayStream calls ``` -The gateway binds to `127.0.0.1:17670` by default in the RPM packaging. Client -certificates are auto-generated by `openshell-gateway generate-certs` on first -start and bind-mounted into sandbox containers by the Podman driver. +The gateway binds to `0.0.0.0` by default in the RPM packaging. mTLS prevents +unauthenticated access even though the gateway is reachable from the network. +Client certificates are auto-generated by `init-pki.sh` on first start and +bind-mounted into sandbox containers by the Podman driver. ## Differences from the Kubernetes Driver @@ -397,7 +398,7 @@ start and bind-mounted into sandbox containers by the Podman driver. | Port publishing | Not needed for relay | Ephemeral host port remains in the container spec for compatibility and debug paths. 
| | TLS | mTLS via Kubernetes secrets | mTLS via mounted client files, RPM defaults, or explicit configuration. | | DNS | Kubernetes CoreDNS | Podman bridge DNS through aardvark-dns when DNS is enabled. | -| Network policy | Kubernetes network policy for pod ingress plus supervisor policy | nftables inside inner sandbox netns plus supervisor policy. | +| Network policy | Kubernetes network policy for pod ingress plus supervisor policy | iptables inside inner sandbox netns plus supervisor policy. | | Supervisor delivery | Kubernetes driver managed pod image or template | OCI image volume mount. | | Secrets | Kubernetes Secret volume and env vars | Mounted TLS client materials from a Podman secret. | @@ -411,7 +412,7 @@ published ports, or the supervisor relay. | Port | Component | Purpose | |---|---|---| -| `17670` | Gateway | Default local gRPC and HTTP multiplexed server port. | +| `8080` | Gateway | gRPC and HTTP multiplexed default server port. | | `2222` | Sandbox | Container port mapping default for the SSH compatibility port. | | `3128` | Sandbox proxy | HTTP CONNECT proxy inside the sandbox network model. | | `0` | Host | Ephemeral host port requested for the container SSH compatibility port. | diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index dbf508c03..1906bd912 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -55,7 +55,7 @@ The restricted agent child does not retain these supervisor privileges. | Capability | Purpose | |---|---| | `SYS_ADMIN` | seccomp filter installation, namespace creation, and Landlock setup. | -| `NET_ADMIN` | Network namespace veth setup, IP address assignment, routes, and nftables. | +| `NET_ADMIN` | Network namespace veth setup, IP address assignment, routes, and iptables. | | `SYS_PTRACE` | Reading `/proc//exe` and walking process ancestry for binary identity. 
| | `SYSLOG` | Reading `/dev/kmsg` for bypass-detection diagnostics. | | `DAC_READ_SEARCH` | Reading `/proc//fd/` across UIDs so the proxy can resolve the binary responsible for a connection. | @@ -120,10 +120,10 @@ connection back to the gateway. On SELinux systems, the bind mounts include Podman's shared relabel option so the container process can read the files. The RPM packaging auto-generates a self-signed PKI on first start via -`openshell-gateway generate-certs`. Client certs are placed in the CLI -auto-discovery directory (`~/.config/openshell/gateways/openshell/mtls/`) so -the CLI connects with mTLS without manual configuration. See -`deploy/rpm/CONFIGURATION.md` for the full RPM configuration reference. +`init-pki.sh`. Client certs are placed in the CLI auto-discovery directory +(`~/.config/openshell/gateways/openshell/mtls/`) so the CLI connects with mTLS +without manual configuration. See `deploy/rpm/CONFIGURATION.md` for the full +RPM configuration reference. ## Network Model @@ -134,7 +134,7 @@ the supervisor for sandbox process isolation. ```mermaid graph TB subgraph Host - GW["Gateway Server
127.0.0.1:17670"] + GW["Gateway Server
127.0.0.1:8080"] PS["Podman Socket"] end @@ -289,11 +289,11 @@ Podman resources after out-of-band container removal or label drift. | `OPENSHELL_SANDBOX_IMAGE` | `--sandbox-image` | From gateway config | Default OCI image for sandboxes. | | `OPENSHELL_SANDBOX_IMAGE_PULL_POLICY` | `--sandbox-image-pull-policy` | `missing` | Pull policy: `always`, `missing`, `never`, or `newer`. | | `OPENSHELL_GRPC_ENDPOINT` | `--grpc-endpoint` | Auto-detected via `host.containers.internal` | Gateway gRPC endpoint for sandbox callbacks. | -| `OPENSHELL_GATEWAY_PORT` | `--gateway-port` | `17670` | Gateway port used for endpoint auto-detection by the standalone binary. | +| `OPENSHELL_GATEWAY_PORT` | `--gateway-port` | `8080` | Gateway port used for endpoint auto-detection by the standalone binary. | | `OPENSHELL_NETWORK_NAME` | `--network-name` | `openshell` | Podman bridge network name. | | `OPENSHELL_SANDBOX_SSH_SOCKET_PATH` | `--sandbox-ssh-socket-path` | `/run/openshell/ssh.sock` | Supervisor Unix socket path in `PodmanComputeConfig`. | | `OPENSHELL_STOP_TIMEOUT` | `--stop-timeout` | `10` | Container stop timeout in seconds. | -| `OPENSHELL_SUPERVISOR_IMAGE` | `--supervisor-image` | `ghcr.io/nvidia/openshell/supervisor:latest` through the gateway, required standalone | OCI image containing the supervisor binary. | +| `OPENSHELL_SUPERVISOR_IMAGE` | `--supervisor-image` | `openshell/supervisor:latest` through the gateway, required standalone | OCI image containing the supervisor binary. | | `OPENSHELL_PODMAN_TLS_CA` | `--podman-tls-ca` | unset | Host path to the CA certificate mounted for sandbox mTLS. | | `OPENSHELL_PODMAN_TLS_CERT` | `--podman-tls-cert` | unset | Host path to the client certificate mounted for sandbox mTLS. | | `OPENSHELL_PODMAN_TLS_KEY` | `--podman-tls-key` | unset | Host path to the client private key mounted for sandbox mTLS. 
| diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index c3f2c3282..1cb58e338 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -1036,7 +1036,7 @@ mod tests { let vol = &image_volumes[0]; assert_eq!( vol["source"].as_str(), - Some("ghcr.io/nvidia/openshell/supervisor:latest"), + Some("openshell/supervisor:latest"), "image volume source should be the supervisor image" ); assert_eq!( diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index 8da0b96a4..f581d8766 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -207,25 +207,6 @@ RUST_LOG=openshell_server=debug,openshell_driver_vm=debug \ The VM guest's serial console is appended to `//console.log`. Sandbox IDs must match `[A-Za-z0-9._-]{1,128}` before the driver uses them in host paths. The gateway-owned compute-driver socket lives at `/run/compute-driver.sock`; OpenShell creates `run/` with owner-only permissions, removes same-owner stale sockets, and the gateway removes the socket on clean shutdown via `ManagedDriverProcess::drop`. UDS clients must match the driver UID and provide the expected gateway process PID by default. Standalone same-UID UDS mode requires the explicit `--allow-same-uid-peer` development flag. TCP mode is disabled by default because it is unauthenticated; use `--allow-unauthenticated-tcp --bind-address 127.0.0.1:50061` only for local development. -## Host-side nftables rules - -The VM driver creates a per-VM nftables table on the host (`openshell_vm_vmtap_`) with three chains. These rules serve two purposes: NAT infrastructure (required for VM connectivity) and defense-in-depth host isolation. Primary security enforcement — proxy-only egress and bypass detection — is handled by the sandbox supervisor's own nftables rules inside the VM guest. 
- -**`postrouting` (NAT):** Masquerades outbound VM traffic so it can be routed from the VM's private subnet to the external network. This chain handles forwarded traffic (VM → internet), not traffic destined for the host. - -**`forward` (defense-in-depth):** Accepts all outbound traffic from the VM (security enforcement happens guest-side) and accepts established/related response traffic back to the VM. Drops unsolicited inbound connections to the VM from the broader network. This chain handles forwarded traffic only — packets transiting the host between the TAP interface and other interfaces. - -**`input` (defense-in-depth):** Accepts traffic from the VM to the gateway port on the host. Drops all other traffic from the VM destined for the host itself. This limits what a compromised guest can reach on the host to the gateway service only. - -The `input` and `postrouting` chains handle different traffic paths: `input` covers packets addressed to the host (VM → host), while `postrouting` covers packets the host is forwarding on behalf of the VM (VM → internet). A packet from the VM goes through one path or the other, never both. - -All chains use `policy accept`, so non-TAP traffic is unaffected. Because nftables evaluates multiple base chains on the same hook independently, host firewalls interact with these rules as follows: - -- **Open host (no other firewall):** Our chains are the only filter. The defense-in-depth drop rules block unsolicited inbound and non-gateway host access. Non-TAP traffic passes through. -- **Restrictive host firewall (e.g. firewalld):** The host firewall's chains may additionally drop TAP traffic that our chains accept. A `drop` verdict from any chain is final — our `accept` cannot override it. If VM connectivity fails, verify that the host firewall allows forwarding and input for `vmtap-*` interfaces. - -Each table is created atomically via `nft -f` on VM start and torn down atomically via `nft delete table` when the VM is destroyed. 
- ## Prerequisites - macOS on Apple Silicon, or Linux on aarch64/x86_64 with KVM diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index 8725984f9..07f55ecc7 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -595,13 +595,6 @@ run_post_overlay_setup() { mount -t cgroup2 cgroup2 "$(root_path /sys/fs/cgroup)" 2>/dev/null & wait - # Allow nftables LOG rules to work in non-init network namespaces. - # Without this, the kernel's nf_log_syslog silently suppresses output - # from the sandbox's network namespace. - if [ -f /proc/sys/net/netfilter/nf_log_all_netns ]; then - echo 1 > /proc/sys/net/netfilter/nf_log_all_netns 2>/dev/null || true - fi - setup_sandbox_workdir configure_hostname diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 5b2ddc2bc..194dde43c 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -5,7 +5,6 @@ pub mod driver; mod embedded_runtime; mod ffi; pub mod gpu; -mod nft_ruleset; pub mod procguard; mod rootfs; mod runtime; diff --git a/crates/openshell-driver-vm/src/nft_ruleset.rs b/crates/openshell-driver-vm/src/nft_ruleset.rs deleted file mode 100644 index fe3e86c90..000000000 --- a/crates/openshell-driver-vm/src/nft_ruleset.rs +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -use std::fmt::Write; - -/// Sanitize a TAP device name for use as an nftables table name suffix. -/// Assumes device names match `vmtap-[a-f0-9]+` (driver-controlled). -fn sanitize_table_name(device: &str) -> String { - device.replace('-', "_") -} - -/// Return the nftables table name for a TAP device. 
-pub fn teardown_table_name(device: &str) -> String { - format!("openshell_vm_{}", sanitize_table_name(device)) -} - -/// Generate the nftables ruleset for VM TAP networking. -pub fn generate_tap_ruleset(tap_device: &str, subnet: &str, gateway_port: u16) -> String { - let table_name = teardown_table_name(tap_device); - let mut ruleset = String::with_capacity(512); - - writeln!(ruleset, "table ip {table_name} {{").unwrap(); - writeln!(ruleset, " chain postrouting {{").unwrap(); - writeln!( - ruleset, - " type nat hook postrouting priority 100; policy accept;" - ) - .unwrap(); - writeln!(ruleset, " ip saddr {subnet} masquerade").unwrap(); - writeln!(ruleset, " }}").unwrap(); - writeln!(ruleset, " chain forward {{").unwrap(); - writeln!( - ruleset, - " type filter hook forward priority 0; policy accept;" - ) - .unwrap(); - writeln!(ruleset, " iifname \"{tap_device}\" accept").unwrap(); - writeln!( - ruleset, - " oifname \"{tap_device}\" ct state related,established accept" - ) - .unwrap(); - writeln!(ruleset, " oifname \"{tap_device}\" drop").unwrap(); - writeln!(ruleset, " }}").unwrap(); - writeln!(ruleset, " chain input {{").unwrap(); - writeln!( - ruleset, - " type filter hook input priority 0; policy accept;" - ) - .unwrap(); - writeln!( - ruleset, - " iifname \"{tap_device}\" tcp dport {gateway_port} accept" - ) - .unwrap(); - writeln!(ruleset, " iifname \"{tap_device}\" drop").unwrap(); - writeln!(ruleset, " }}").unwrap(); - writeln!(ruleset, "}}").unwrap(); - - ruleset -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn generates_tap_setup_ruleset() { - let ruleset = generate_tap_ruleset("vmtap-abcd", "10.0.128.0/30", 8080); - assert!(ruleset.contains("table ip openshell_vm_vmtap_abcd {")); - assert!(ruleset.contains("type nat hook postrouting priority 100; policy accept;")); - assert!(ruleset.contains("ip saddr 10.0.128.0/30 masquerade")); - assert!(ruleset.contains("type filter hook forward priority 0; policy accept;")); - 
assert!(ruleset.contains("iifname \"vmtap-abcd\" accept")); - assert!(ruleset.contains("oifname \"vmtap-abcd\" ct state related,established accept")); - assert!(ruleset.contains("oifname \"vmtap-abcd\" drop")); - assert!(ruleset.contains("type filter hook input priority 0; policy accept;")); - assert!(ruleset.contains("iifname \"vmtap-abcd\" tcp dport 8080 accept")); - } - - #[test] - fn table_name_sanitizes_device_name() { - let ruleset = generate_tap_ruleset("vmtap-abc-123", "10.0.128.0/30", 8080); - assert!(ruleset.contains("table ip openshell_vm_vmtap_abc_123 {")); - } - - #[test] - fn teardown_command_targets_correct_table() { - let cmd = teardown_table_name("vmtap-abcd"); - assert_eq!(cmd, "openshell_vm_vmtap_abcd"); - } -} diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 1ce6fb26b..4a9053c46 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -10,7 +10,7 @@ use std::ptr; use std::sync::atomic::{AtomicI32, Ordering}; use std::time::{Duration, Instant}; -use crate::{embedded_runtime, ffi, nft_ruleset, procguard, rootfs}; +use crate::{embedded_runtime, ffi, procguard, rootfs}; pub const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; @@ -413,12 +413,6 @@ fn setup_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) -> R enable_ip_forwarding()?; let subnet = tap_subnet_from_host_ip(host_ip); - let table_name = nft_ruleset::teardown_table_name(tap_device); - - // Delete any stale nftables table from a previous driver run. - let _ = run_cmd("nft", &["delete", "table", "ip", &table_name]); - - // Clean up legacy iptables rules from older driver versions. 
let _ = run_cmd( "iptables", &[ @@ -432,10 +426,27 @@ fn setup_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) -> R "MASQUERADE", ], ); + run_cmd( + "iptables", + &[ + "-t", + "nat", + "-A", + "POSTROUTING", + "-s", + &subnet, + "-j", + "MASQUERADE", + ], + )?; let _ = run_cmd( "iptables", &["-D", "FORWARD", "-i", tap_device, "-j", "ACCEPT"], ); + run_cmd( + "iptables", + &["-A", "FORWARD", "-i", tap_device, "-j", "ACCEPT"], + )?; let _ = run_cmd( "iptables", &[ @@ -451,6 +462,25 @@ fn setup_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) -> R "ACCEPT", ], ); + run_cmd( + "iptables", + &[ + "-A", + "FORWARD", + "-o", + tap_device, + "-m", + "state", + "--state", + "RELATED,ESTABLISHED", + "-j", + "ACCEPT", + ], + )?; + // Allow guest → host traffic only to the gateway gRPC port. + // Previous versions accepted ALL inbound traffic from the TAP + // interface; scope to the specific port so the guest cannot reach + // other host services. let port_str = gateway_port.to_string(); let _ = run_cmd( "iptables", @@ -458,24 +488,17 @@ fn setup_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) -> R "-D", "INPUT", "-i", tap_device, "-p", "tcp", "--dport", &port_str, "-j", "ACCEPT", ], ); - let _ = run_cmd( + run_cmd( "iptables", - &["-D", "INPUT", "-i", tap_device, "-j", "ACCEPT"], - ); - - // Load nftables ruleset atomically. - let ruleset = nft_ruleset::generate_tap_ruleset(tap_device, &subnet, gateway_port); - run_nft_stdin(&ruleset)?; + &[ + "-A", "INPUT", "-i", tap_device, "-p", "tcp", "--dport", &port_str, "-j", "ACCEPT", + ], + )?; Ok(()) } fn teardown_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) { - // Delete the entire nftables table — single atomic operation. - let table_name = nft_ruleset::teardown_table_name(tap_device); - let _ = run_cmd("nft", &["delete", "table", "ip", &table_name]); - - // Clean up legacy iptables rules from older driver versions. 
let subnet = tap_subnet_from_host_ip(host_ip); let _ = run_cmd( "iptables", @@ -496,6 +519,8 @@ fn teardown_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) { "iptables", &["-D", "FORWARD", "-i", tap_device, "-j", "ACCEPT"], ); + // Remove the port-scoped INPUT rule. Also try the legacy blanket + // rule so stale rules from older driver versions are cleaned up. if gateway_port > 0 { let port_str = gateway_port.to_string(); let _ = run_cmd( @@ -522,7 +547,6 @@ fn teardown_tap_networking(tap_device: &str, host_ip: &str, gateway_port: u16) { "MASQUERADE", ], ); - let _ = run_cmd("ip", &["link", "set", tap_device, "down"]); let _ = run_cmd("ip", &["tuntap", "del", "dev", tap_device, "mode", "tap"]); } @@ -559,35 +583,6 @@ fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), String> { } } -fn run_nft_stdin(ruleset: &str) -> Result<(), String> { - use std::io::Write; - - let mut child = StdCommand::new("nft") - .args(["-f", "-"]) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .map_err(|e| format!("failed to run nft: {e}"))?; - - if let Some(mut stdin) = child.stdin.take() { - stdin - .write_all(ruleset.as_bytes()) - .map_err(|e| format!("failed to write nft ruleset: {e}"))?; - } - - let output = child - .wait_with_output() - .map_err(|e| format!("failed to wait for nft: {e}"))?; - - if output.status.success() { - Ok(()) - } else { - let stderr = String::from_utf8_lossy(&output.stderr); - Err(format!("nft -f - failed: {stderr}")) - } -} - /// RAII guard that tears down TAP networking on drop. struct TapGuard { tap_device: String, @@ -720,7 +715,7 @@ fn run_libkrun_vm(config: &VmLaunchConfig) -> Result<(), String> { // on its own service ports (DNS:53, DHCP, HTTP API:80). 
// // That network plane is also what the sandbox supervisor's - // per-sandbox netns (veth pair + nftables, see + // per-sandbox netns (veth pair + iptables, see // `openshell-sandbox/src/sandbox/linux/netns.rs`) branches off of; // libkrun's built-in TSI socket impersonation would not satisfy // those kernel-level primitives. @@ -1486,17 +1481,4 @@ mod tests { assert_ne!(first, second); } - - #[test] - fn tap_subnet_from_host_ip_calculates_slash30_base() { - assert_eq!(tap_subnet_from_host_ip("10.0.128.1"), "10.0.128.0/30"); - assert_eq!(tap_subnet_from_host_ip("10.0.128.2"), "10.0.128.0/30"); - assert_eq!(tap_subnet_from_host_ip("10.0.128.5"), "10.0.128.4/30"); - } - - #[test] - fn tap_subnet_from_host_ip_handles_invalid_ip() { - let result = tap_subnet_from_host_ip("not-an-ip"); - assert_eq!(result, "not-an-ip/30"); - } } diff --git a/crates/openshell-ocsf/src/events/network_activity.rs b/crates/openshell-ocsf/src/events/network_activity.rs index 92450bbe8..6cd125fdc 100644 --- a/crates/openshell-ocsf/src/events/network_activity.rs +++ b/crates/openshell-ocsf/src/events/network_activity.rs @@ -11,7 +11,7 @@ use crate::objects::{Actor, ConnectionInfo, Endpoint, FirewallRule}; /// OCSF Network Activity Event [4001]. /// -/// Proxy CONNECT tunnel events and nftables bypass detection. +/// Proxy CONNECT tunnel events and iptables-level bypass detection. #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] pub struct NetworkActivityEvent { /// Common base event fields. 
diff --git a/crates/openshell-ocsf/src/format/shorthand.rs b/crates/openshell-ocsf/src/format/shorthand.rs index 0e50fc6c5..08b413429 100644 --- a/crates/openshell-ocsf/src/format/shorthand.rs +++ b/crates/openshell-ocsf/src/format/shorthand.rs @@ -456,7 +456,7 @@ mod tests { actor: Some(Actor { process: Process::new("node", 1234), }), - firewall_rule: Some(FirewallRule::new("bypass-detect", "nftables")), + firewall_rule: Some(FirewallRule::new("bypass-detect", "iptables")), connection_info: Some(ConnectionInfo::new("tcp")), action: Some(ActionId::Denied), disposition: Some(DispositionId::Blocked), @@ -467,7 +467,7 @@ mod tests { let shorthand = event.format_shorthand(); assert_eq!( shorthand, - "NET:REFUSE [MED] DENIED node(1234) -> 93.184.216.34:443/tcp [policy:bypass-detect engine:nftables]" + "NET:REFUSE [MED] DENIED node(1234) -> 93.184.216.34:443/tcp [policy:bypass-detect engine:iptables]" ); } diff --git a/crates/openshell-ocsf/src/objects/firewall_rule.rs b/crates/openshell-ocsf/src/objects/firewall_rule.rs index 2e242225b..fa8829275 100644 --- a/crates/openshell-ocsf/src/objects/firewall_rule.rs +++ b/crates/openshell-ocsf/src/objects/firewall_rule.rs @@ -11,7 +11,7 @@ pub struct FirewallRule { /// Rule name (e.g., "default-egress", "bypass-detect"). pub name: String, - /// Rule type / engine (e.g., "mechanistic", "opa", "nftables"). + /// Rule type / engine (e.g., "mechanistic", "opa", "iptables"). /// /// Kept as `String` because this is a project-specific extension field /// (not OCSF-enumerated) with runtime-dynamic values from the policy engine. 
diff --git a/crates/openshell-providers/src/lib.rs b/crates/openshell-providers/src/lib.rs index 6fa44d147..3b28030ca 100644 --- a/crates/openshell-providers/src/lib.rs +++ b/crates/openshell-providers/src/lib.rs @@ -18,10 +18,9 @@ pub use openshell_core::proto::Provider; pub use context::{DiscoveryContext, RealDiscoveryContext}; pub use discovery::discover_with_spec; pub use profiles::{ - CredentialRefreshProfile, ProfileError, ProfileValidationDiagnostic, ProviderTypeProfile, - default_profiles, get_default_profile, normalize_profile_id, parse_profile_json, - parse_profile_yaml, profile_to_json, profile_to_yaml, profiles_to_json, profiles_to_yaml, - validate_profile_set, + ProfileError, ProfileValidationDiagnostic, ProviderTypeProfile, default_profiles, + get_default_profile, normalize_profile_id, parse_profile_json, parse_profile_yaml, + profile_to_json, profile_to_yaml, profiles_to_json, profiles_to_yaml, validate_profile_set, }; #[derive(Debug, thiserror::Error)] @@ -144,7 +143,7 @@ impl ProviderRegistry { pub fn normalize_provider_type(input: &str) -> Option<&'static str> { let normalized = input.trim().to_ascii_lowercase(); match normalized.as_str() { - "claude" | "claude-code" | "claude_code" => Some("claude-code"), + "claude" => Some("claude"), "codex" => Some("codex"), "copilot" => Some("copilot"), "opencode" => Some("opencode"), @@ -178,8 +177,7 @@ mod tests { assert_eq!(normalize_provider_type("gitlab"), Some("gitlab")); assert_eq!(normalize_provider_type("glab"), Some("gitlab")); assert_eq!(normalize_provider_type("gh"), Some("github")); - assert_eq!(normalize_provider_type("CLAUDE"), Some("claude-code")); - assert_eq!(normalize_provider_type("claude-code"), Some("claude-code")); + assert_eq!(normalize_provider_type("CLAUDE"), Some("claude")); assert_eq!(normalize_provider_type("generic"), Some("generic")); assert_eq!(normalize_provider_type("openai"), Some("openai")); assert_eq!(normalize_provider_type("anthropic"), Some("anthropic")); @@ -192,7 
+190,7 @@ mod tests { fn detects_provider_from_command_token() { assert_eq!( detect_provider_from_command(&["claude".to_string()]), - Some("claude-code") + Some("claude") ); assert_eq!( detect_provider_from_command(&["/usr/bin/glab".to_string()]), diff --git a/crates/openshell-providers/src/profiles.rs b/crates/openshell-providers/src/profiles.rs index 65dd33bea..588e77702 100644 --- a/crates/openshell-providers/src/profiles.rs +++ b/crates/openshell-providers/src/profiles.rs @@ -7,9 +7,7 @@ use openshell_core::proto::{ GraphqlOperation, L7Allow, L7DenyRule, L7QueryMatcher, L7Rule, NetworkBinary, NetworkEndpoint, - NetworkPolicyRule, ProviderCredentialRefresh, ProviderCredentialRefreshMaterial, - ProviderCredentialRefreshStrategy, ProviderProfile, ProviderProfileCategory, - ProviderProfileCredential, + NetworkPolicyRule, ProviderProfile, ProviderProfileCategory, ProviderProfileCredential, }; use serde::ser::SerializeStruct; use serde::{Deserialize, Deserializer, Serialize, Serializer, de}; @@ -17,9 +15,16 @@ use std::collections::{HashMap, HashSet}; use std::sync::OnceLock; const BUILT_IN_PROFILE_YAMLS: &[&str] = &[ - include_str!("../../../providers/claude-code.yaml"), + include_str!("../../../providers/anthropic.yaml"), + include_str!("../../../providers/claude.yaml"), + include_str!("../../../providers/codex.yaml"), + include_str!("../../../providers/copilot.yaml"), include_str!("../../../providers/github.yaml"), + include_str!("../../../providers/gitlab.yaml"), include_str!("../../../providers/nvidia.yaml"), + include_str!("../../../providers/openai.yaml"), + include_str!("../../../providers/opencode.yaml"), + include_str!("../../../providers/outlook.yaml"), ]; #[derive(Debug, thiserror::Error)] @@ -79,39 +84,6 @@ pub struct CredentialProfile { pub header_name: String, #[serde(default)] pub query_param: String, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub refresh: Option, -} - -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] 
-pub struct CredentialRefreshProfile { - #[serde( - default = "default_refresh_strategy", - deserialize_with = "deserialize_refresh_strategy", - serialize_with = "serialize_refresh_strategy" - )] - pub strategy: ProviderCredentialRefreshStrategy, - #[serde(default, skip_serializing_if = "String::is_empty")] - pub token_url: String, - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub scopes: Vec, - #[serde(default, skip_serializing_if = "is_zero_i64")] - pub refresh_before_seconds: i64, - #[serde(default, skip_serializing_if = "is_zero_i64")] - pub max_lifetime_seconds: i64, - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub material: Vec, -} - -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] -pub struct CredentialRefreshMaterialProfile { - pub name: String, - #[serde(default, skip_serializing_if = "String::is_empty")] - pub description: String, - #[serde(default)] - pub required: bool, - #[serde(default)] - pub secret: bool, } // These YAML/JSON DTOs mirror the network policy protos intentionally. 
Keep @@ -268,10 +240,6 @@ impl ProviderTypeProfile { auth_style: credential.auth_style.clone(), header_name: credential.header_name.clone(), query_param: credential.query_param.clone(), - refresh: credential - .refresh - .as_ref() - .map(credential_refresh_from_proto), }) .collect(), endpoints: profile.endpoints.iter().map(endpoint_from_proto).collect(), @@ -311,7 +279,6 @@ impl ProviderTypeProfile { auth_style: credential.auth_style.clone(), header_name: credential.header_name.clone(), query_param: credential.query_param.clone(), - refresh: credential.refresh.as_ref().map(credential_refresh_to_proto), }) .collect(), endpoints: self.endpoints.iter().map(endpoint_to_proto).collect(), @@ -391,15 +358,6 @@ fn is_zero(value: &u32) -> bool { *value == 0 } -#[allow(clippy::trivially_copy_pass_by_ref)] -fn is_zero_i64(value: &i64) -> bool { - *value == 0 -} - -fn default_refresh_strategy() -> ProviderCredentialRefreshStrategy { - ProviderCredentialRefreshStrategy::Unspecified -} - fn deserialize_category<'de, D>(deserializer: D) -> Result where D: Deserializer<'de>, @@ -420,28 +378,6 @@ where serializer.serialize_str(provider_profile_category_to_yaml(*category)) } -fn deserialize_refresh_strategy<'de, D>( - deserializer: D, -) -> Result -where - D: Deserializer<'de>, -{ - let raw = String::deserialize(deserializer)?; - provider_refresh_strategy_from_yaml(&raw) - .ok_or_else(|| de::Error::custom(format!("unsupported provider refresh strategy: {raw}"))) -} - -#[allow(clippy::trivially_copy_pass_by_ref)] -fn serialize_refresh_strategy( - strategy: &ProviderCredentialRefreshStrategy, - serializer: S, -) -> Result -where - S: Serializer, -{ - serializer.serialize_str(provider_refresh_strategy_to_yaml(*strategy)) -} - #[must_use] pub fn provider_profile_category_from_yaml(raw: &str) -> Option { match raw.trim().to_ascii_lowercase().replace('-', "_").as_str() { @@ -469,78 +405,6 @@ pub fn provider_profile_category_to_yaml(category: ProviderProfileCategory) -> & } } -#[must_use] 
-pub fn provider_refresh_strategy_from_yaml(raw: &str) -> Option { - match raw.trim().to_ascii_lowercase().replace('-', "_").as_str() { - "" => Some(ProviderCredentialRefreshStrategy::Unspecified), - "static" => Some(ProviderCredentialRefreshStrategy::Static), - "external" => Some(ProviderCredentialRefreshStrategy::External), - "oauth2_refresh_token" => Some(ProviderCredentialRefreshStrategy::Oauth2RefreshToken), - "oauth2_client_credentials" => { - Some(ProviderCredentialRefreshStrategy::Oauth2ClientCredentials) - } - "google_service_account_jwt" => { - Some(ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt) - } - _ => None, - } -} - -#[must_use] -pub fn provider_refresh_strategy_to_yaml( - strategy: ProviderCredentialRefreshStrategy, -) -> &'static str { - match strategy { - ProviderCredentialRefreshStrategy::Static => "static", - ProviderCredentialRefreshStrategy::External => "external", - ProviderCredentialRefreshStrategy::Oauth2RefreshToken => "oauth2_refresh_token", - ProviderCredentialRefreshStrategy::Oauth2ClientCredentials => "oauth2_client_credentials", - ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt => "google_service_account_jwt", - ProviderCredentialRefreshStrategy::Unspecified => "unspecified", - } -} - -fn credential_refresh_from_proto(refresh: &ProviderCredentialRefresh) -> CredentialRefreshProfile { - CredentialRefreshProfile { - strategy: ProviderCredentialRefreshStrategy::try_from(refresh.strategy) - .unwrap_or(ProviderCredentialRefreshStrategy::Unspecified), - token_url: refresh.token_url.clone(), - scopes: refresh.scopes.clone(), - refresh_before_seconds: refresh.refresh_before_seconds, - max_lifetime_seconds: refresh.max_lifetime_seconds, - material: refresh - .material - .iter() - .map(|material| CredentialRefreshMaterialProfile { - name: material.name.clone(), - description: material.description.clone(), - required: material.required, - secret: material.secret, - }) - .collect(), - } -} - -fn 
credential_refresh_to_proto(refresh: &CredentialRefreshProfile) -> ProviderCredentialRefresh { - ProviderCredentialRefresh { - strategy: refresh.strategy as i32, - token_url: refresh.token_url.clone(), - scopes: refresh.scopes.clone(), - refresh_before_seconds: refresh.refresh_before_seconds, - max_lifetime_seconds: refresh.max_lifetime_seconds, - material: refresh - .material - .iter() - .map(|material| ProviderCredentialRefreshMaterial { - name: material.name.clone(), - description: material.description.clone(), - required: material.required, - secret: material.secret, - }) - .collect(), - } -} - fn endpoint_to_proto(endpoint: &EndpointProfile) -> NetworkEndpoint { NetworkEndpoint { host: endpoint.host.clone(), @@ -925,52 +789,6 @@ pub fn validate_profile_set( format!("unsupported auth_style: {}", credential.auth_style), )), } - - if let Some(refresh) = credential.refresh.as_ref() { - if refresh.strategy == ProviderCredentialRefreshStrategy::Unspecified { - diagnostics.push(ProfileValidationDiagnostic::error( - source, - profile_id, - "credentials.refresh.strategy", - "refresh strategy is required", - )); - } - if refresh.refresh_before_seconds < 0 { - diagnostics.push(ProfileValidationDiagnostic::error( - source, - profile_id, - "credentials.refresh.refresh_before_seconds", - "refresh_before_seconds must be greater than or equal to 0", - )); - } - if refresh.max_lifetime_seconds < 0 { - diagnostics.push(ProfileValidationDiagnostic::error( - source, - profile_id, - "credentials.refresh.max_lifetime_seconds", - "max_lifetime_seconds must be greater than or equal to 0", - )); - } - let mut material_names = HashSet::new(); - for material in &refresh.material { - let name = material.name.trim(); - if name.is_empty() { - diagnostics.push(ProfileValidationDiagnostic::error( - source, - profile_id, - "credentials.refresh.material.name", - "refresh material name is required", - )); - } else if !material_names.insert(name.to_string()) { - 
diagnostics.push(ProfileValidationDiagnostic::error( - source, - profile_id, - "credentials.refresh.material.name", - format!("duplicate refresh material name: {name}"), - )); - } - } - } } for (index, endpoint) in profile.endpoints.iter().enumerate() { @@ -1067,10 +885,10 @@ mod tests { #[test] fn credential_env_vars_are_deduplicated_in_profile_order() { - let profile = get_default_profile("claude-code").expect("claude-code profile"); + let profile = get_default_profile("copilot").expect("copilot profile"); assert_eq!( profile.credential_env_vars(), - vec!["ANTHROPIC_API_KEY", "CLAUDE_API_KEY"] + vec!["COPILOT_GITHUB_TOKEN", "GH_TOKEN", "GITHUB_TOKEN"] ); } @@ -1092,48 +910,6 @@ credentials: assert_eq!(profile.credential_env_vars(), vec!["EXAMPLE_API_KEY"]); } - #[test] - fn profile_refresh_metadata_round_trips_through_proto_and_yaml() { - let profile = parse_profile_yaml( - r" -id: ms-graph -display_name: Microsoft Graph -credentials: - - name: access_token - env_vars: [MS_GRAPH_ACCESS_TOKEN] - refresh: - strategy: oauth2_client_credentials - token_url: https://login.microsoftonline.com/common/oauth2/v2.0/token - scopes: [https://graph.microsoft.com/.default] - refresh_before_seconds: 300 - material: - - name: tenant_id - required: true - - name: client_secret - required: true - secret: true -", - ) - .expect("profile should parse"); - - let refresh = profile.credentials[0].refresh.as_ref().expect("refresh"); - assert_eq!( - refresh.token_url, - "https://login.microsoftonline.com/common/oauth2/v2.0/token" - ); - assert_eq!(refresh.material.len(), 2); - - let from_proto = ProviderTypeProfile::from_proto(&profile.to_proto()); - assert_eq!( - from_proto.credentials[0].refresh, - profile.credentials[0].refresh - ); - - let exported = profile_to_yaml(&from_proto).expect("yaml"); - assert!(exported.contains("oauth2_client_credentials")); - assert!(exported.contains("client_secret")); - } - #[test] fn profile_json_round_trip_preserves_compact_dto_shape() { let profile = 
get_default_profile("github").expect("github profile"); diff --git a/crates/openshell-providers/src/providers/claude.rs b/crates/openshell-providers/src/providers/claude.rs index 64944fc9e..576b30e38 100644 --- a/crates/openshell-providers/src/providers/claude.rs +++ b/crates/openshell-providers/src/providers/claude.rs @@ -8,7 +8,7 @@ use crate::{ pub struct ClaudeProvider; pub const SPEC: ProviderDiscoverySpec = ProviderDiscoverySpec { - id: "claude-code", + id: "claude", credential_env_vars: &["ANTHROPIC_API_KEY", "CLAUDE_API_KEY"], }; diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index b90a9221b..29919ede4 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -85,7 +85,6 @@ libc = "0.2" [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.4" seccompiler = "0.5" -tempfile = "3" uuid = { version = "1", features = ["v4"] } [dev-dependencies] diff --git a/crates/openshell-sandbox/src/activity_aggregator.rs b/crates/openshell-sandbox/src/activity_aggregator.rs new file mode 100644 index 000000000..335de51a6 --- /dev/null +++ b/crates/openshell-sandbox/src/activity_aggregator.rs @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Anonymous sandbox network activity counter aggregation. 
+ +use std::collections::HashMap; +use std::future::Future; +use tokio::sync::mpsc; +use tracing::debug; + +#[derive(Debug, Clone)] +pub struct ActivityEvent { + pub denied: bool, + pub deny_group: &'static str, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlushableActivitySummary { + pub network_activity_count: u32, + pub denied_action_count: u32, + pub denials_by_group: Vec<(String, u32)>, +} + +pub struct ActivityAggregator { + rx: mpsc::UnboundedReceiver, + network_activity_count: u32, + denied_action_count: u32, + denials_by_group: HashMap, + flush_interval_secs: u64, +} + +impl ActivityAggregator { + pub fn new(rx: mpsc::UnboundedReceiver, flush_interval_secs: u64) -> Self { + Self { + rx, + network_activity_count: 0, + denied_action_count: 0, + denials_by_group: HashMap::new(), + flush_interval_secs, + } + } + + pub async fn run(mut self, flush_callback: F) + where + F: Fn(FlushableActivitySummary) -> Fut, + Fut: Future, + { + let mut flush_interval = + tokio::time::interval(std::time::Duration::from_secs(self.flush_interval_secs)); + flush_interval.tick().await; + + loop { + tokio::select! 
{ + event = self.rx.recv() => { + if let Some(event) = event { + self.ingest(event); + } else { + if let Some(summary) = self.drain() { + flush_callback(summary).await; + } + debug!("ActivityAggregator: channel closed, exiting"); + return; + } + } + _ = flush_interval.tick() => { + if let Some(summary) = self.drain() { + debug!( + count = summary.network_activity_count, + denied = summary.denied_action_count, + "ActivityAggregator: flushing anonymous activity summary" + ); + flush_callback(summary).await; + } + } + } + } + } + + fn ingest(&mut self, event: ActivityEvent) { + self.network_activity_count = self.network_activity_count.saturating_add(1); + if event.denied { + self.denied_action_count = self.denied_action_count.saturating_add(1); + let group = sanitize_deny_group(event.deny_group).to_string(); + let count = self.denials_by_group.entry(group).or_default(); + *count = count.saturating_add(1); + } + } + + fn drain(&mut self) -> Option { + if self.network_activity_count == 0 { + return None; + } + let mut denials_by_group: Vec<(String, u32)> = self.denials_by_group.drain().collect(); + denials_by_group.sort_by(|left, right| left.0.cmp(&right.0)); + let summary = FlushableActivitySummary { + network_activity_count: self.network_activity_count, + denied_action_count: self.denied_action_count, + denials_by_group, + }; + self.network_activity_count = 0; + self.denied_action_count = 0; + Some(summary) + } +} + +pub fn sanitize_deny_group(raw: &str) -> &'static str { + match raw { + "connect_policy" | "connect" | "l4_deny" => "connect_policy", + "forward_policy" | "forward" => "forward_policy", + "l7_policy" | "l7" | "l7_deny" | "forward-l7-deny" => "l7_policy", + "l7_parse_rejection" | "parse_rejection" => "l7_parse_rejection", + "ssrf" => "ssrf", + "bypass" => "bypass", + "policy_stale" => "policy_stale", + _ => "unknown", + } +} + +#[cfg(test)] +fn denial_rate_pct(network_activity_count: u32, denied_action_count: u32) -> f64 { + if network_activity_count == 0 
{ + return 0.0; + } + ((f64::from(denied_action_count) / f64::from(network_activity_count)) * 100.0).clamp(0.0, 100.0) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_float_eq(actual: f64, expected: f64) { + assert!((actual - expected).abs() <= f64::EPSILON); + } + + #[test] + fn deny_group_sanitization_uses_allowlist() { + assert_eq!(sanitize_deny_group("connect"), "connect_policy"); + assert_eq!(sanitize_deny_group("forward-l7-deny"), "l7_policy"); + assert_eq!(sanitize_deny_group("host=example.test/path"), "unknown"); + assert_eq!(sanitize_deny_group("acme.internal:443"), "unknown"); + assert_eq!( + sanitize_deny_group("binary=/usr/local/bin/private"), + "unknown" + ); + } + + #[test] + fn denial_rate_handles_zero_and_clamps() { + assert_float_eq(denial_rate_pct(0, 10), 0.0); + assert_float_eq(denial_rate_pct(4, 1), 25.0); + assert_float_eq(denial_rate_pct(4, 10), 100.0); + } +} diff --git a/crates/openshell-sandbox/src/bypass_monitor.rs b/crates/openshell-sandbox/src/bypass_monitor.rs index 9e37ef27c..bc945c01a 100644 --- a/crates/openshell-sandbox/src/bypass_monitor.rs +++ b/crates/openshell-sandbox/src/bypass_monitor.rs @@ -5,17 +5,18 @@ //! detect and report direct connection attempts that bypass the HTTP CONNECT //! proxy. //! -//! When the sandbox network namespace has nftables log rules installed (see +//! When the sandbox network namespace has iptables LOG rules installed (see //! `NetworkNamespace::install_bypass_rules`), the kernel writes a log line for -//! each dropped packet. This module reads those messages, parses the nftables +//! each dropped packet. This module reads those messages, parses the iptables //! LOG format, and emits structured tracing events + denial aggregator entries. //! //! ## Graceful degradation //! //! If `/dev/kmsg` cannot be opened (e.g., restricted container environment), -//! the monitor logs a one-time warning and returns. The nftables reject rules +//! the monitor logs a one-time warning and returns. 
The iptables REJECT rules //! still provide fast-fail UX — the monitor only adds diagnostic visibility. +use crate::activity_aggregator::ActivityEvent; use crate::denial_aggregator::DenialEvent; use openshell_ocsf::{ ActionId, ActivityId, ConfidenceId, DetectionFindingBuilder, DispositionId, Endpoint, @@ -26,7 +27,7 @@ use std::sync::atomic::{AtomicU32, Ordering}; use tokio::sync::mpsc; use tracing::debug; -/// A parsed nftables log entry from `/dev/kmsg`. +/// A parsed iptables LOG entry from `/dev/kmsg`. #[derive(Debug, Clone, PartialEq, Eq)] pub struct BypassEvent { /// Destination IP address. @@ -41,7 +42,7 @@ pub struct BypassEvent { pub uid: Option, } -/// Parse a nftables log line from `/dev/kmsg`. +/// Parse an iptables LOG line from `/dev/kmsg`. /// /// Expected format (from the kernel LOG target): /// ```text @@ -74,7 +75,7 @@ pub fn parse_kmsg_line(line: &str, namespace_prefix: &str) -> Option &'static str { /// Spawn the bypass monitor as a background tokio task. /// -/// Uses `dmesg --follow` to tail the kernel ring buffer for nftables log +/// Uses `dmesg --follow` to tail the kernel ring buffer for iptables LOG /// entries matching the given namespace. Falls back gracefully if `dmesg` /// is not available. 
/// @@ -118,6 +119,7 @@ pub fn spawn( namespace_name: String, entrypoint_pid: Arc, denial_tx: Option>, + activity_tx: Option>, ) -> Option> { use std::io::BufRead; use std::process::{Command, Stdio}; @@ -221,7 +223,7 @@ pub fn spawn( .severity(SeverityId::Medium) .dst_endpoint(dst_ep.clone()) .actor_process(Process::from_bypass(&binary, &binary_pid, &ancestors)) - .firewall_rule("bypass-detect", "nftables") + .firewall_rule("bypass-detect", "iptables") .observation_point(3) .message(format!( "BYPASS_DETECT {}:{} proto={} binary={binary} action=reject reason={reason}", @@ -277,6 +279,12 @@ pub fn spawn( l7_path: None, }); } + if let Some(ref tx) = activity_tx { + let _ = tx.send(ActivityEvent { + denied: true, + deny_group: "bypass", + }); + } } // Clean up the dmesg child process. diff --git a/crates/openshell-sandbox/src/child_env.rs b/crates/openshell-sandbox/src/child_env.rs index 32eecbee3..e764afdfe 100644 --- a/crates/openshell-sandbox/src/child_env.rs +++ b/crates/openshell-sandbox/src/child_env.rs @@ -24,12 +24,11 @@ pub fn proxy_env_vars(proxy_url: &str) -> [(&'static str, String); 9] { pub fn tls_env_vars( ca_cert_path: &Path, combined_bundle_path: &Path, -) -> [(&'static str, String); 6] { +) -> [(&'static str, String); 5] { let ca_cert_path = ca_cert_path.display().to_string(); let combined_bundle_path = combined_bundle_path.display().to_string(); [ - ("NODE_EXTRA_CA_CERTS", ca_cert_path.clone()), - ("DENO_CERT", ca_cert_path), + ("NODE_EXTRA_CA_CERTS", ca_cert_path), ("SSL_CERT_FILE", combined_bundle_path.clone()), ("REQUESTS_CA_BUNDLE", combined_bundle_path.clone()), ("CURL_CA_BUNDLE", combined_bundle_path.clone()), @@ -82,7 +81,6 @@ mod tests { let stdout = String::from_utf8(output.stdout).expect("utf8"); assert!(stdout.contains("NODE_EXTRA_CA_CERTS=/etc/openshell-tls/openshell-ca.pem")); - assert!(stdout.contains("DENO_CERT=/etc/openshell-tls/openshell-ca.pem")); assert!(stdout.contains("SSL_CERT_FILE=/etc/openshell-tls/ca-bundle.pem")); 
assert!(stdout.contains("REQUESTS_CA_BUNDLE=/etc/openshell-tls/ca-bundle.pem")); assert!(stdout.contains("CURL_CA_BUNDLE=/etc/openshell-tls/ca-bundle.pem")); diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 3fccb680f..71ab4924a 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -10,10 +10,10 @@ use std::time::Duration; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, - GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, PolicyChunk, PolicySource, - PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, - SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, - inference_client::InferenceClient, open_shell_client::OpenShellClient, + GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, NetworkActivitySummary, + PolicyChunk, PolicySource, PolicyStatus, ReportPolicyStatusRequest, + SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, + UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; use tracing::debug; @@ -155,7 +155,6 @@ async fn sync_policy_with_client( delete_setting: false, global: false, merge_operations: vec![], - expected_resource_version: 0, }) .await .into_diagnostic() @@ -228,7 +227,6 @@ pub async fn fetch_provider_environment( Ok(ProviderEnvironmentResult { environment: inner.environment, provider_env_revision: inner.provider_env_revision, - credential_expires_at_ms: inner.credential_expires_at_ms, }) } @@ -258,7 +256,6 @@ pub struct SettingsPollResult { pub struct ProviderEnvironmentResult { pub environment: HashMap, pub provider_env_revision: u64, - pub credential_expires_at_ms: 
HashMap, } impl CachedOpenShellClient { @@ -310,6 +307,7 @@ impl CachedOpenShellClient { sandbox_name: &str, summaries: Vec, proposed_chunks: Vec, + network_activity_summaries: Vec, analysis_mode: &str, ) -> Result { let response = self @@ -319,6 +317,7 @@ impl CachedOpenShellClient { name: sandbox_name.to_string(), summaries, proposed_chunks, + network_activity_summaries, analysis_mode: analysis_mode.to_string(), }) .await diff --git a/crates/openshell-sandbox/src/l7/graphql.rs b/crates/openshell-sandbox/src/l7/graphql.rs index 5d0746d01..2ff502d1c 100644 --- a/crates/openshell-sandbox/src/l7/graphql.rs +++ b/crates/openshell-sandbox/src/l7/graphql.rs @@ -801,6 +801,7 @@ network_policies: ancestors: Vec::new(), cmdline_paths: Vec::new(), secret_resolver: None, + activity_tx: None, }; let request_info = crate::l7::L7RequestInfo { action: req.action, diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 6d271af21..32283f538 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -7,6 +7,7 @@ //! Parses each request within the tunnel, evaluates it against OPA policy, //! and either forwards or denies the request. +use crate::activity_aggregator::ActivityEvent; use crate::l7::provider::{L7Provider, RelayOutcome}; use crate::l7::rest::WebSocketExtensionMode; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; @@ -19,6 +20,7 @@ use openshell_ocsf::{ }; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio::sync::mpsc; use tracing::{debug, warn}; /// Context for L7 request policy evaluation. @@ -37,6 +39,8 @@ pub struct L7EvalContext { pub cmdline_paths: Vec, /// Supervisor-only placeholder resolver for outbound headers. pub(crate) secret_resolver: Option>, + /// Anonymous activity counter channel. 
+ pub(crate) activity_tx: Option>, } #[derive(Default)] @@ -119,6 +123,7 @@ fn emit_parse_rejection(ctx: &L7EvalContext, detail: &str, engine_type: &str) { .status_detail(detail) .build(); ocsf_emit!(event); + emit_activity(ctx, true, "l7_parse_rejection"); } /// Run protocol-aware L7 inspection on a tunnel. @@ -448,6 +453,13 @@ fn emit_l7_request_log( )) .build(); ocsf_emit!(event); + emit_activity(ctx, decision_str == "deny", "l7_policy"); +} + +fn emit_activity(ctx: &L7EvalContext, denied: bool, deny_group: &'static str) { + if let Some(tx) = &ctx.activity_tx { + let _ = tx.send(ActivityEvent { denied, deny_group }); + } } /// Handle an upgraded connection (101 Switching Protocols). @@ -1371,6 +1383,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; let request = L7RequestInfo { action: "WEBSOCKET_TEXT".into(), @@ -1426,6 +1439,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; let (mut app, mut relay_client) = tokio::io::duplex(8192); @@ -1530,6 +1544,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: resolver.map(Arc::new), + activity_tx: None, }; let (mut app, mut relay_client) = tokio::io::duplex(8192); @@ -1647,6 +1662,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: resolver.map(Arc::new), + activity_tx: None, }; let (mut app, mut relay_client) = tokio::io::duplex(8192); @@ -1817,6 +1833,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; let (mut app, mut relay_client) = tokio::io::duplex(8192); @@ -1904,6 +1921,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; let (mut app, mut relay_client) = tokio::io::duplex(8192); diff --git a/crates/openshell-sandbox/src/l7/websocket.rs b/crates/openshell-sandbox/src/l7/websocket.rs index 2dc1b25c3..89a6e6c51 
100644 --- a/crates/openshell-sandbox/src/l7/websocket.rs +++ b/crates/openshell-sandbox/src/l7/websocket.rs @@ -1270,6 +1270,7 @@ network_policies: ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; let (mut client_write, mut relay_read) = tokio::io::duplex(MAX_TEXT_MESSAGE_BYTES + 1024); let (mut relay_write, mut upstream_read) = tokio::io::duplex(MAX_TEXT_MESSAGE_BYTES + 1024); diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index ded56ce9e..fe21ec096 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -5,6 +5,7 @@ //! //! This crate provides process sandboxing and monitoring capabilities. +mod activity_aggregator; pub mod bypass_monitor; mod child_env; pub mod denial_aggregator; @@ -351,7 +352,7 @@ pub async fn run_sandbox( // Fetch provider environment variables from the server. // This is done after loading the policy so the sandbox can still start // even if provider env fetch fails (graceful degradation). 
- let (provider_env_revision, provider_env, provider_credential_expires_at_ms) = + let (provider_env_revision, provider_env) = if let (Some(id), Some(endpoint)) = (&sandbox_id, &openshell_endpoint) { match grpc_client::fetch_provider_environment(endpoint, id).await { Ok(result) => { @@ -366,11 +367,7 @@ pub async fn run_sandbox( )) .build() ); - ( - result.provider_env_revision, - result.environment, - result.credential_expires_at_ms, - ) + (result.provider_env_revision, result.environment) } Err(e) => { ocsf_emit!( @@ -383,25 +380,16 @@ pub async fn run_sandbox( )) .build() ); - ( - 0, - std::collections::HashMap::new(), - std::collections::HashMap::new(), - ) + (0, std::collections::HashMap::new()) } } } else { - ( - 0, - std::collections::HashMap::new(), - std::collections::HashMap::new(), - ) + (0, std::collections::HashMap::new()) }; let provider_credentials = provider_credentials::ProviderCredentialState::from_environment( provider_env_revision, provider_env, - provider_credential_expires_at_ms, ); let provider_env = provider_credentials.snapshot().child_env.clone(); @@ -522,7 +510,7 @@ pub async fn run_sandbox( let netns = if matches!(policy.network.mode, NetworkMode::Proxy) { match NetworkNamespace::create() { Ok(ns) => { - // Install bypass detection rules (nftables log + reject). + // Install bypass detection rules (iptables LOG + REJECT). // This provides fast-fail UX and diagnostic logging for direct // connection attempts that bypass the HTTP CONNECT proxy. let proxy_port = policy @@ -563,7 +551,7 @@ pub async fn run_sandbox( let _netns: Option<()> = None; // Install the supervisor seccomp prelude after privileged startup helpers - // (network namespace setup, nftables probes) complete, but before the SSH + // (network namespace setup, iptables probes) complete, but before the SSH // listener and workload process are exposed. 
apply_supervisor_startup_hardening()?; @@ -571,69 +559,87 @@ pub async fn run_sandbox( // the entrypoint process's /proc/net/tcp for identity binding. let entrypoint_pid = Arc::new(AtomicU32::new(0)); - let (_proxy, denial_rx, bypass_denial_tx) = if matches!(policy.network.mode, NetworkMode::Proxy) - { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!("Network mode is set to proxy but no proxy configuration was provided") - })?; + let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = + if matches!(policy.network.mode, NetworkMode::Proxy) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!( + "Network mode is set to proxy but no proxy configuration was provided" + ) + })?; - let engine = opa_engine.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - })?; + let engine = opa_engine.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an identity cache (OPA engine must be configured)") - })?; + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!( + "Proxy mode requires an identity cache (OPA engine must be configured)" + ) + })?; - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { - let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. 
+ #[cfg(target_os = "linux")] + let bind_addr = netns.as_ref().map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = build_inference_context( - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = build_inference_context( + sandbox_id.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). + // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) + } else { + (None, None, None) + }; + let (activity_tx, activity_rx, bypass_activity_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) + } else { + (None, None, None) + }; - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. 
- let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + activity_tx, + ) + .await?; + ( + Some(proxy_handle), + denial_rx, + bypass_denial_tx, + activity_rx, + bypass_activity_tx, + ) } else { - (None, None, None) + (None, None, None, None, None) }; - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - ) - .await?; - (Some(proxy_handle), denial_rx, bypass_denial_tx) - } else { - (None, None, None) - }; - // Spawn bypass detection monitor (Linux only, proxy mode only). - // Reads /dev/kmsg for nftables log entries and emits structured + // Reads /dev/kmsg for iptables LOG entries and emits structured // tracing events for direct connection attempts that bypass the proxy. #[cfg(target_os = "linux")] let _bypass_monitor = netns.as_ref().and_then(|ns| { @@ -641,12 +647,15 @@ pub async fn run_sandbox( ns.name().to_string(), entrypoint_pid.clone(), bypass_denial_tx, + bypass_activity_tx, ) }); // On non-Linux, bypass_denial_tx is unused (no /dev/kmsg). #[cfg(not(target_os = "linux"))] drop(bypass_denial_tx); + #[cfg(not(target_os = "linux"))] + drop(bypass_activity_tx); // Compute the proxy URL and netns fd for SSH sessions. 
// SSH shell processes need both to enforce network policy: @@ -996,6 +1005,31 @@ pub async fn run_sandbox( .await; }); } + if let Some(rx) = activity_rx { + let agg_name = sandbox_name_for_agg.clone().unwrap_or_else(|| id.clone()); + let agg_endpoint = endpoint.clone(); + let flush_interval_secs: u64 = std::env::var("OPENSHELL_ACTIVITY_FLUSH_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + let aggregator = activity_aggregator::ActivityAggregator::new(rx, flush_interval_secs); + + tokio::spawn(async move { + aggregator + .run(|summary| { + let endpoint = agg_endpoint.clone(); + let sandbox_name = agg_name.clone(); + async move { + if let Err(e) = + flush_activity_to_gateway(&endpoint, &sandbox_name, summary).await + { + warn!(error = %e, "Failed to flush activity summary to gateway"); + } + } + }) + .await; + }); + } } // Wait for process with optional timeout @@ -2278,12 +2312,45 @@ async fn flush_proposals_to_gateway( ); client - .submit_policy_analysis(sandbox_name, proto_summaries, proposals, "mechanistic") + .submit_policy_analysis( + sandbox_name, + proto_summaries, + proposals, + vec![], + "mechanistic", + ) .await?; Ok(()) } +async fn flush_activity_to_gateway( + endpoint: &str, + sandbox_name: &str, + summary: activity_aggregator::FlushableActivitySummary, +) -> Result<()> { + use crate::grpc_client::CachedOpenShellClient; + use openshell_core::proto::{DenialGroupCount, NetworkActivitySummary}; + + let client = CachedOpenShellClient::connect(endpoint).await?; + let summary = NetworkActivitySummary { + network_activity_count: summary.network_activity_count, + denied_action_count: summary.denied_action_count, + denials_by_group: summary + .denials_by_group + .into_iter() + .map(|(deny_group, denied_count)| DenialGroupCount { + deny_group, + denied_count, + }) + .collect(), + }; + client + .submit_policy_analysis(sandbox_name, vec![], vec![], vec![summary], "telemetry") + .await?; + Ok(()) +} + /// `reload_from_proto_with_pid()`. 
Reports load success/failure back to the /// server. On failure, the previous engine is untouched (LKG behavior). /// @@ -2372,7 +2439,6 @@ async fn run_policy_poll_loop(ctx: PolicyPollLoopContext) -> Result<()> { let env_count = ctx.provider_credentials.install_environment( env_result.provider_env_revision, env_result.environment, - env_result.credential_expires_at_ms, ); current_provider_env_revision = env_result.provider_env_revision; ocsf_emit!( diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs index 657fd760f..4006028cf 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-sandbox/src/policy_local.rs @@ -494,7 +494,7 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j let audit_summaries: Vec = chunks.iter().map(summarize_chunk_for_audit).collect(); let response = match client - .submit_policy_analysis(sandbox_name, vec![], chunks, "agent_authored") + .submit_policy_analysis(sandbox_name, vec![], chunks, vec![], "agent_authored") .await { Ok(response) => response, diff --git a/crates/openshell-sandbox/src/provider_credentials.rs b/crates/openshell-sandbox/src/provider_credentials.rs index ae91e8d6e..829e1b226 100644 --- a/crates/openshell-sandbox/src/provider_credentials.rs +++ b/crates/openshell-sandbox/src/provider_credentials.rs @@ -29,17 +29,9 @@ pub struct ProviderCredentialState { } impl ProviderCredentialState { - pub fn from_environment( - revision: u64, - env: HashMap, - credential_expires_at_ms: HashMap, - ) -> Self { + pub fn from_environment(revision: u64, env: HashMap) -> Self { let (child_env, generation_resolver, current_resolver) = - SecretResolver::from_provider_env_for_current_revision( - env, - credential_expires_at_ms, - revision, - ); + SecretResolver::from_provider_env_for_current_revision(env, revision); let snapshot = Arc::new(ProviderCredentialSnapshot { revision, child_env, @@ -74,18 +66,9 @@ impl ProviderCredentialState { 
.clone() } - pub fn install_environment( - &self, - revision: u64, - env: HashMap, - credential_expires_at_ms: HashMap, - ) -> usize { + pub fn install_environment(&self, revision: u64, env: HashMap) -> usize { let (child_env, generation_resolver, current_resolver) = - SecretResolver::from_provider_env_for_current_revision( - env, - credential_expires_at_ms, - revision, - ); + SecretResolver::from_provider_env_for_current_revision(env, revision); let mut inner = self .inner .write() @@ -131,7 +114,6 @@ mod tests { let state = ProviderCredentialState::from_environment( 10, HashMap::from([("GITHUB_TOKEN".to_string(), "old".to_string())]), - HashMap::new(), ); let first = state.snapshot(); assert_eq!( @@ -142,7 +124,6 @@ mod tests { state.install_environment( 11, HashMap::from([("GITHUB_TOKEN".to_string(), "new".to_string())]), - HashMap::new(), ); let second = state.snapshot(); assert_eq!( @@ -174,10 +155,9 @@ mod tests { let state = ProviderCredentialState::from_environment( 10, HashMap::from([("GITHUB_TOKEN".to_string(), "old".to_string())]), - HashMap::new(), ); - state.install_environment(11, HashMap::new(), HashMap::new()); + state.install_environment(11, HashMap::new()); assert!(state.snapshot().child_env.is_empty()); let resolver = state.resolver().expect("old resolver retained"); @@ -194,36 +174,4 @@ mod tests { None ); } - - #[test] - fn expired_retained_generation_does_not_resolve() { - let now_ms = i64::try_from( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_millis(), - ) - .unwrap(); - let state = ProviderCredentialState::from_environment( - 10, - HashMap::from([("GITHUB_TOKEN".to_string(), "old".to_string())]), - HashMap::from([("GITHUB_TOKEN".to_string(), now_ms - 1_000)]), - ); - - state.install_environment( - 11, - HashMap::from([("GITHUB_TOKEN".to_string(), "new".to_string())]), - HashMap::from([("GITHUB_TOKEN".to_string(), now_ms + 60_000)]), - ); - - let resolver = state.resolver().expect("resolver"); - 
assert_eq!( - resolver.resolve_placeholder("openshell:resolve:env:v10_GITHUB_TOKEN"), - None - ); - assert_eq!( - resolver.resolve_placeholder("openshell:resolve:env:v11_GITHUB_TOKEN"), - Some("new") - ); - } } diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 81ed0322f..1845218a0 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -3,6 +3,7 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. +use crate::activity_aggregator::ActivityEvent; use crate::denial_aggregator::DenialEvent; use crate::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; @@ -52,22 +53,14 @@ const CLOUD_METADATA_IPS: &[IpAddr] = &[ ]; /// Maximum total bytes for a streaming inference response body (32 MiB). -#[cfg(not(test))] const MAX_STREAMING_BODY: usize = 32 * 1024 * 1024; -// Keep unit tests deterministic without pushing tens of MiB through loopback. -#[cfg(test)] -const MAX_STREAMING_BODY: usize = 1024; /// Idle timeout per chunk when relaying streaming inference responses. /// /// Reasoning models (e.g. nemotron-3-super, o1, o3) can pause for 60+ seconds /// between "thinking" and output phases. 120s provides headroom while still /// catching genuinely stuck streams. -#[cfg(not(test))] const CHUNK_IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(120); -// Exercise idle-timeout truncation without slowing the full package test suite. -#[cfg(test)] -const CHUNK_IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(100); /// Result of a proxy CONNECT policy decision. struct ConnectDecision { @@ -186,6 +179,7 @@ impl ProxyHandle { provider_credentials: Option, policy_local_ctx: Option>, denial_tx: Option>, + activity_tx: Option>, ) -> Result { // Use override bind_addr, fall back to policy http_addr, then default // to loopback:3128. 
The default allows the proxy to function when no @@ -241,6 +235,7 @@ impl ProxyHandle { .as_ref() .and_then(ProviderCredentialState::resolver); let dtx = denial_tx.clone(); + let atx = activity_tx.clone(); tokio::spawn(async move { if let Err(err) = handle_tcp_connection( stream, @@ -253,6 +248,7 @@ impl ProxyHandle { gw, resolver, dtx, + atx, ) .await { @@ -298,6 +294,26 @@ impl Drop for ProxyHandle { } } +fn emit_activity( + tx: &Option>, + denied: bool, + deny_group: &'static str, +) { + if let Some(tx) = tx { + let _ = tx.send(ActivityEvent { denied, deny_group }); + } +} + +fn emit_activity_simple( + tx: Option<&mpsc::UnboundedSender>, + denied: bool, + deny_group: &'static str, +) { + if let Some(tx) = tx { + let _ = tx.send(ActivityEvent { denied, deny_group }); + } +} + /// Emit a denial event to the aggregator channel (if configured). /// Used by `handle_tcp_connection` which owns `Option`. fn emit_denial( @@ -371,6 +387,7 @@ async fn handle_tcp_connection( trusted_host_gateway: Arc>, secret_resolver: Option>, denial_tx: Option>, + activity_tx: Option>, ) -> Result<()> { let mut buf = vec![0u8; MAX_HEADER_BYTES]; let mut used = 0usize; @@ -417,6 +434,7 @@ async fn handle_tcp_connection( trusted_host_gateway, secret_resolver, denial_tx.as_ref(), + activity_tx.as_ref(), ) .await; } @@ -435,6 +453,7 @@ async fn handle_tcp_connection( ) .await?; if let InferenceOutcome::Denied { reason } = outcome { + emit_activity(&activity_tx, true, "forward_policy"); let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) .activity(ActivityId::Open) .action(ActionId::Denied) @@ -539,6 +558,7 @@ async fn handle_tcp_connection( &deny_reason, "connect", ); + emit_activity(&activity_tx, true, "connect_policy"); respond( &mut client, &build_json_error_response( @@ -612,6 +632,7 @@ async fn handle_tcp_connection( &reason, "ssrf", ); + emit_activity(&activity_tx, true, "ssrf"); respond( &mut client, &build_json_error_response( @@ -667,6 +688,7 @@ async fn handle_tcp_connection( 
&reason, "ssrf", ); + emit_activity(&activity_tx, true, "ssrf"); respond( &mut client, &build_json_error_response( @@ -714,6 +736,7 @@ async fn handle_tcp_connection( &reason, "ssrf", ); + emit_activity(&activity_tx, true, "ssrf"); respond( &mut client, &build_json_error_response( @@ -764,6 +787,7 @@ async fn handle_tcp_connection( &reason, "ssrf", ); + emit_activity(&activity_tx, true, "ssrf"); respond( &mut client, &build_json_error_response( @@ -818,6 +842,7 @@ async fn handle_tcp_connection( .build(); ocsf_emit!(event); } + emit_activity(&activity_tx, false, "unknown"); // Determine effective TLS mode. Check the raw endpoint config for // `tls: skip` independently of L7 config (which requires `protocol`). @@ -845,6 +870,7 @@ async fn handle_tcp_connection( .map(|p| p.to_string_lossy().into_owned()) .collect(), secret_resolver: secret_resolver.clone(), + activity_tx: activity_tx.clone(), }; if effective_tls_skip { @@ -2699,6 +2725,7 @@ async fn handle_forward_proxy( trusted_host_gateway: Arc>, secret_resolver: Option>, denial_tx: Option<&mpsc::UnboundedSender>, + activity_tx: Option<&mpsc::UnboundedSender>, ) -> Result<()> { // 1. Parse the absolute-form URI. `path` is marked `mut` so that, when an // L7 config applies, the canonicalized form produced below replaces it @@ -2861,6 +2888,7 @@ async fn handle_forward_proxy( reason, "forward", ); + emit_activity_simple(activity_tx, true, "forward_policy"); respond( client, &build_json_error_response( @@ -2880,6 +2908,7 @@ async fn handle_forward_proxy( Ok(guard) => guard, Err(e) => { emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); + emit_activity_simple(activity_tx, true, "policy_stale"); respond( client, &build_json_error_response( @@ -2923,7 +2952,9 @@ async fn handle_forward_proxy( .map(|p| p.to_string_lossy().into_owned()) .collect(), secret_resolver: secret_resolver.clone(), + activity_tx: activity_tx.cloned(), }; + let mut l7_activity_recorded = false; // 4b. 
If the endpoint has L7 config, evaluate the request against // L7 policy. The forward proxy handles exactly one request per @@ -2941,6 +2972,7 @@ async fn handle_forward_proxy( route.generation, ), ); + emit_activity_simple(activity_tx, true, "policy_stale"); respond( client, &build_json_error_response( @@ -2957,6 +2989,7 @@ async fn handle_forward_proxy( Ok(engine) => engine, Err(e) => { emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); + emit_activity_simple(activity_tx, true, "policy_stale"); respond( client, &build_json_error_response( @@ -3013,6 +3046,7 @@ async fn handle_forward_proxy( )) .build(); ocsf_emit!(event); + emit_activity_simple(activity_tx, true, "l7_parse_rejection"); respond( client, &build_json_error_response( @@ -3027,6 +3061,7 @@ async fn handle_forward_proxy( } }; let Some(l7_config) = select_l7_config_for_path(&route.configs, &path) else { + emit_activity_simple(activity_tx, true, "l7_policy"); respond( client, &build_json_error_response( @@ -3079,6 +3114,7 @@ async fn handle_forward_proxy( .message(format!("FORWARD_GRAPHQL_L7 request rejected: {e}")) .build(); ocsf_emit!(event); + emit_activity_simple(activity_tx, true, "l7_parse_rejection"); respond( client, &build_json_error_response( @@ -3184,6 +3220,8 @@ async fn handle_forward_proxy( let effectively_denied = force_deny || (!allowed && l7_config.config.enforcement == crate::l7::EnforcementMode::Enforce); + emit_activity_simple(activity_tx, effectively_denied, "l7_policy"); + l7_activity_recorded = true; if effectively_denied { emit_denial_simple( @@ -3267,6 +3305,7 @@ async fn handle_forward_proxy( &reason, "ssrf", ); + emit_activity_simple(activity_tx, true, "ssrf"); respond( client, &build_json_error_response( @@ -3323,6 +3362,7 @@ async fn handle_forward_proxy( &reason, "ssrf", ); + emit_activity_simple(activity_tx, true, "ssrf"); respond( client, &build_json_error_response( @@ -3374,6 +3414,7 @@ async fn handle_forward_proxy( &reason, "ssrf", ); + 
emit_activity_simple(activity_tx, true, "ssrf"); respond( client, &build_json_error_response( @@ -3428,6 +3469,7 @@ async fn handle_forward_proxy( &reason, "ssrf", ); + emit_activity_simple(activity_tx, true, "ssrf"); respond( client, &build_json_error_response( @@ -3445,6 +3487,7 @@ async fn handle_forward_proxy( if let Err(e) = forward_generation_guard.ensure_current() { emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); + emit_activity_simple(activity_tx, true, "policy_stale"); respond( client, &build_json_error_response( @@ -3518,6 +3561,9 @@ async fn handle_forward_proxy( .build(); ocsf_emit!(event); } + if !l7_activity_recorded { + emit_activity_simple(activity_tx, false, "unknown"); + } // 9. Rewrite request and forward to upstream let rewritten = match rewrite_forward_request( @@ -3679,11 +3725,8 @@ fn is_benign_relay_error(err: &miette::Report) -> bool { )] mod tests { use super::*; - use std::future::Future; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; use std::sync::Arc; - use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use tokio::net::{TcpListener, TcpStream}; fn websocket_l7_config( protocol: crate::l7::L7Protocol, @@ -3828,6 +3871,7 @@ mod tests { ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; (config, tunnel_engine, ctx) } @@ -3993,6 +4037,7 @@ mod tests { ancestors: vec![], cmdline_paths: vec![], secret_resolver: resolver, + activity_tx: None, }; let query_params = std::collections::HashMap::new(); @@ -4033,6 +4078,7 @@ mod tests { ancestors: vec![], cmdline_paths: vec![], secret_resolver: None, + activity_tx: None, }; let query_params = std::collections::HashMap::new(); let config = websocket_l7_config(crate::l7::L7Protocol::Rest, false); @@ -5011,184 +5057,6 @@ network_policies: assert!(!forwarded_lc.contains("cookie:")); } - fn streaming_inference_route(endpoint: String) -> openshell_router::config::ResolvedRoute { - openshell_router::config::ResolvedRoute { - name: 
"inference.local".to_string(), - endpoint, - model: "meta/llama-3.1-8b-instruct".to_string(), - api_key: "test-api-key".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - auth: openshell_router::config::AuthHeader::Bearer, - default_headers: vec![], - passthrough_headers: vec![], - timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, - } - } - - async fn read_forwarded_inference_request(stream: &mut S) { - use crate::l7::inference::{ParseResult, try_parse_http_request}; - - let mut buf = Vec::new(); - let mut chunk = [0u8; 4096]; - loop { - let n = stream.read(&mut chunk).await.unwrap(); - assert!(n > 0, "upstream request closed before completion"); - buf.extend_from_slice(&chunk[..n]); - - match try_parse_http_request(&buf) { - ParseResult::Complete(_, _) => return, - ParseResult::Incomplete => continue, - ParseResult::Invalid(reason) => { - panic!("forwarded request should parse cleanly: {reason}"); - } - } - } - } - - async fn run_live_streaming_inference(serve_upstream: F) -> String - where - F: FnOnce(TcpStream) -> Fut + Send + 'static, - Fut: Future + Send + 'static, - { - let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); - let upstream_addr = listener.local_addr().unwrap(); - let upstream_task = tokio::spawn(async move { - let (mut upstream, _) = listener.accept().await.unwrap(); - read_forwarded_inference_request(&mut upstream).await; - serve_upstream(upstream).await; - }); - - let router = openshell_router::Router::new().unwrap(); - let patterns = crate::l7::inference::default_patterns(); - let ctx = InferenceContext::new( - patterns, - router, - vec![streaming_inference_route(format!("http://{upstream_addr}"))], - vec![], - ); - - let body = r#"{"model":"ignored","messages":[{"role":"user","content":"hi"}]}"#; - let request = format!( - "POST /v1/chat/completions HTTP/1.1\r\n\ - Host: inference.local\r\n\ - Content-Type: application/json\r\n\ - Accept: text/event-stream\r\n\ - Content-Length: {}\r\n\r\n{}", - 
body.len(), - body, - ); - - let (client, mut server) = tokio::io::duplex(65536); - let (mut client_read, mut client_write) = tokio::io::split(client); - let server_task = - tokio::spawn(async move { process_inference_keepalive(&mut server, &ctx, 443).await }); - - client_write.write_all(request.as_bytes()).await.unwrap(); - client_write.shutdown().await.unwrap(); - - let mut response = Vec::new(); - client_read.read_to_end(&mut response).await.unwrap(); - - let outcome = server_task.await.unwrap().unwrap(); - assert!( - matches!(outcome, InferenceOutcome::Routed), - "expected Routed outcome, got: {outcome:?}" - ); - upstream_task.await.unwrap(); - - String::from_utf8(response).unwrap() - } - - fn assert_streaming_sse_error(response: &str, message: &str) { - assert!( - response.starts_with("HTTP/1.1 200 OK\r\n"), - "expected successful streaming response, got: {response}" - ); - assert!( - response - .to_ascii_lowercase() - .contains("transfer-encoding: chunked"), - "expected chunked streaming response, got: {response}" - ); - assert!( - response.contains("\"type\":\"proxy_stream_error\""), - "expected proxy_stream_error SSE event, got: {response}" - ); - assert!( - response.contains(&format!("\"message\":\"{message}\"")), - "expected SSE message {message:?}, got: {response}" - ); - assert!( - response.ends_with("0\r\n\r\n"), - "streaming response must end with chunked terminator, got: {response}" - ); - } - - #[tokio::test] - async fn inference_stream_byte_limit_injects_sse_error() { - let response = run_live_streaming_inference(|mut upstream| async move { - use crate::l7::inference::{format_chunk, format_chunk_terminator}; - - upstream - .write_all( - b"HTTP/1.1 200 OK\r\n\ - Content-Type: text/event-stream\r\n\ - Transfer-Encoding: chunked\r\n\r\n", - ) - .await - .unwrap(); - let body = vec![b'a'; MAX_STREAMING_BODY + 1]; - let _ = upstream.write_all(&format_chunk(&body)).await; - let _ = upstream.write_all(format_chunk_terminator()).await; - }) - .await; - - 
assert_streaming_sse_error( - &response, - "response truncated: exceeded maximum streaming body size", - ); - } - - #[tokio::test] - async fn inference_stream_upstream_read_error_injects_sse_error() { - let response = run_live_streaming_inference(|mut upstream| async move { - upstream - .write_all( - b"HTTP/1.1 200 OK\r\n\ - Content-Type: text/event-stream\r\n\ - Content-Length: 64\r\n\r\n\ - partial", - ) - .await - .unwrap(); - }) - .await; - - assert!( - response.contains("partial"), - "expected initial upstream bytes before truncation, got: {response}" - ); - assert_streaming_sse_error(&response, "response truncated: upstream read error"); - } - - #[tokio::test] - async fn inference_stream_idle_timeout_injects_sse_error() { - let response = run_live_streaming_inference(|mut upstream| async move { - upstream - .write_all( - b"HTTP/1.1 200 OK\r\n\ - Content-Type: text/event-stream\r\n\ - Transfer-Encoding: chunked\r\n\r\n", - ) - .await - .unwrap(); - tokio::time::sleep(CHUNK_IDLE_TIMEOUT + std::time::Duration::from_millis(50)).await; - }) - .await; - - assert_streaming_sse_error(&response, "response truncated: chunk idle timeout exceeded"); - } - // -- router_error_to_http -- #[test] diff --git a/crates/openshell-sandbox/src/sandbox/linux/mod.rs b/crates/openshell-sandbox/src/sandbox/linux/mod.rs index a3a32c77a..848ab1e3b 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/mod.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/mod.rs @@ -5,7 +5,6 @@ mod landlock; pub mod netns; -mod nft_ruleset; mod seccomp; use crate::policy::SandboxPolicy; diff --git a/crates/openshell-sandbox/src/sandbox/linux/netns.rs b/crates/openshell-sandbox/src/sandbox/linux/netns.rs index 433f70b1c..019036e53 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/netns.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/netns.rs @@ -242,7 +242,7 @@ impl NetworkNamespace { self.ns_fd } - /// Install nftables rules for bypass detection inside the namespace. 
+ /// Install iptables rules for bypass detection inside the namespace. /// /// Sets up OUTPUT chain rules that: /// 1. ACCEPT traffic destined for the proxy (`host_ip:proxy_port`) @@ -253,21 +253,22 @@ impl NetworkNamespace { /// This provides two benefits: /// - **Fast-fail UX**: applications get immediate ECONNREFUSED instead of /// a 30-second timeout when they bypass the proxy - /// - **Diagnostics**: nftables LOG entries are picked up by the bypass + /// - **Diagnostics**: iptables LOG entries are picked up by the bypass /// monitor to emit structured tracing events /// - /// Degrades gracefully if `nft` is not available — the namespace + /// Degrades gracefully if `iptables` is not available — the namespace /// still provides isolation via routing, just without fast-fail and /// diagnostic logging. pub fn install_bypass_rules(&self, proxy_port: u16) -> Result<()> { - let Some(nft_path) = find_nft() else { + // Check if iptables is available before attempting to install rules. + let Some(iptables_path) = find_iptables() else { openshell_ocsf::ocsf_emit!( openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) .severity(openshell_ocsf::SeverityId::Medium) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Disabled, "degraded") .message(format!( - "nft not found; bypass detection rules will not be installed [ns:{}]", + "iptables not found; bypass detection rules will not be installed [ns:{}]", self.name )) .build() @@ -276,53 +277,49 @@ impl NetworkNamespace { }; let host_ip_str = self.host_ip.to_string(); + let proxy_port_str = proxy_port.to_string(); let log_prefix = format!("openshell:bypass:{}:", &self.name); - // The kernel's nf_log_syslog module suppresses log output from - // non-init network namespaces by default. Enable it so the bypass - // monitor can see log entries from the sandbox namespace. - enable_nf_log_all_netns(); + // "Installing bypass detection rules" is a transient step — skip OCSF. 
+ // The completion event below covers the outcome. - // Try combined ruleset with log rules first. Log rules must appear - // before reject rules in the chain so packets are logged before being - // rejected. If the kernel lacks nft_log support, fall back to the - // reject-only ruleset. - let ruleset_with_log = super::nft_ruleset::generate_bypass_ruleset( + // Install IPv4 rules + if let Err(e) = self.install_bypass_rules_for( + &iptables_path, &host_ip_str, - proxy_port, - Some(&log_prefix), - ); + &proxy_port_str, + &log_prefix, + ) { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(openshell_ocsf::SeverityId::Medium) + .status(openshell_ocsf::StatusId::Failure) + .state(openshell_ocsf::StateId::Disabled, "failed") + .message(format!( + "Failed to install IPv4 bypass detection rules [ns:{}]: {e}", + self.name + )) + .build() + ); + return Err(e); + } - if let Err(e) = run_nft_netns(&self.name, &nft_path, &ruleset_with_log) { + // Install IPv6 rules — best-effort. + // Skip the proxy ACCEPT rule for IPv6 since the proxy address is IPv4. 
+ if let Some(ip6_path) = find_ip6tables(&iptables_path) + && let Err(e) = self.install_bypass_rules_for_v6(&ip6_path, &log_prefix) + { openshell_ocsf::ocsf_emit!( openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) .severity(openshell_ocsf::SeverityId::Low) .status(openshell_ocsf::StatusId::Failure) .state(openshell_ocsf::StateId::Other, "degraded") .message(format!( - "Failed to install bypass log rules (non-fatal), falling back to reject-only [ns:{}]: {e}", + "Failed to install IPv6 bypass detection rules (non-fatal) [ns:{}]: {e}", self.name )) .build() ); - - let ruleset_no_log = - super::nft_ruleset::generate_bypass_ruleset(&host_ip_str, proxy_port, None); - - if let Err(e) = run_nft_netns(&self.name, &nft_path, &ruleset_no_log) { - openshell_ocsf::ocsf_emit!( - openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) - .severity(openshell_ocsf::SeverityId::Medium) - .status(openshell_ocsf::StatusId::Failure) - .state(openshell_ocsf::StateId::Disabled, "failed") - .message(format!( - "Failed to install bypass detection rules [ns:{}]: {e}", - self.name - )) - .build() - ); - return Err(e); - } } openshell_ocsf::ocsf_emit!( @@ -339,6 +336,297 @@ impl NetworkNamespace { Ok(()) } + + /// Install bypass detection rules for a specific iptables variant (iptables or ip6tables). 
+ fn install_bypass_rules_for( + &self, + iptables_cmd: &str, + host_ip: &str, + proxy_port: &str, + log_prefix: &str, + ) -> Result<()> { + // Rule 1: ACCEPT traffic to the proxy + run_iptables_netns( + &self.name, + iptables_cmd, + &[ + "-A", + "OUTPUT", + "-d", + &format!("{host_ip}/32"), + "-p", + "tcp", + "--dport", + proxy_port, + "-j", + "ACCEPT", + ], + )?; + + // Rule 2: ACCEPT loopback traffic + run_iptables_netns( + &self.name, + iptables_cmd, + &["-A", "OUTPUT", "-o", "lo", "-j", "ACCEPT"], + )?; + + // Rule 3: ACCEPT established/related connections (response packets) + run_iptables_netns( + &self.name, + iptables_cmd, + &[ + "-A", + "OUTPUT", + "-m", + "conntrack", + "--ctstate", + "ESTABLISHED,RELATED", + "-j", + "ACCEPT", + ], + )?; + + // Rule 4: LOG TCP SYN bypass attempts (rate-limited) + // LOG rule failure is non-fatal — the REJECT rule still provides fast-fail. + if let Err(e) = run_iptables_netns( + &self.name, + iptables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "tcp", + "--syn", + "-m", + "limit", + "--limit", + "5/sec", + "--limit-burst", + "10", + "-j", + "LOG", + "--log-prefix", + log_prefix, + "--log-uid", + ], + ) { + openshell_ocsf::ocsf_emit!(openshell_ocsf::ConfigStateChangeBuilder::new( + crate::ocsf_ctx() + ) + .severity(openshell_ocsf::SeverityId::Low) + .status(openshell_ocsf::StatusId::Failure) + .state(openshell_ocsf::StateId::Other, "degraded") + .message(format!( + "Failed to install LOG rule for TCP (xt_LOG module may not be loaded) [ns:{}]: {e}", + self.name + )) + .build()); + } + + // Rule 5: REJECT TCP bypass attempts (fast-fail) + run_iptables_netns( + &self.name, + iptables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "tcp", + "-j", + "REJECT", + "--reject-with", + "icmp-port-unreachable", + ], + )?; + + // Rule 6: LOG UDP bypass attempts (rate-limited, covers DNS bypass) + if let Err(e) = run_iptables_netns( + &self.name, + iptables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "udp", + "-m", + "limit", + "--limit", + "5/sec", + 
"--limit-burst", + "10", + "-j", + "LOG", + "--log-prefix", + log_prefix, + "--log-uid", + ], + ) { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(openshell_ocsf::SeverityId::Low) + .status(openshell_ocsf::StatusId::Failure) + .state(openshell_ocsf::StateId::Other, "degraded") + .message(format!( + "Failed to install LOG rule for UDP [ns:{}]: {e}", + self.name + )) + .build() + ); + } + + // Rule 7: REJECT UDP bypass attempts (covers DNS bypass) + run_iptables_netns( + &self.name, + iptables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "udp", + "-j", + "REJECT", + "--reject-with", + "icmp-port-unreachable", + ], + )?; + + Ok(()) + } + + /// Install IPv6 bypass detection rules. + /// + /// Similar to `install_bypass_rules_for` but omits the proxy ACCEPT rule + /// (the proxy listens on an IPv4 address) and uses IPv6-appropriate + /// REJECT types. + fn install_bypass_rules_for_v6(&self, ip6tables_cmd: &str, log_prefix: &str) -> Result<()> { + // ACCEPT loopback traffic + run_iptables_netns( + &self.name, + ip6tables_cmd, + &["-A", "OUTPUT", "-o", "lo", "-j", "ACCEPT"], + )?; + + // ACCEPT established/related connections + run_iptables_netns( + &self.name, + ip6tables_cmd, + &[ + "-A", + "OUTPUT", + "-m", + "conntrack", + "--ctstate", + "ESTABLISHED,RELATED", + "-j", + "ACCEPT", + ], + )?; + + // LOG TCP SYN bypass attempts (rate-limited) + if let Err(e) = run_iptables_netns( + &self.name, + ip6tables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "tcp", + "--syn", + "-m", + "limit", + "--limit", + "5/sec", + "--limit-burst", + "10", + "-j", + "LOG", + "--log-prefix", + log_prefix, + "--log-uid", + ], + ) { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(openshell_ocsf::SeverityId::Low) + .status(openshell_ocsf::StatusId::Failure) + .state(openshell_ocsf::StateId::Other, "degraded") + .message(format!( + "Failed to install IPv6 LOG rule for TCP [ns:{}]: 
{e}", + self.name + )) + .build() + ); + } + + // REJECT TCP bypass attempts + run_iptables_netns( + &self.name, + ip6tables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "tcp", + "-j", + "REJECT", + "--reject-with", + "icmp6-port-unreachable", + ], + )?; + + // LOG UDP bypass attempts (rate-limited) + if let Err(e) = run_iptables_netns( + &self.name, + ip6tables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "udp", + "-m", + "limit", + "--limit", + "5/sec", + "--limit-burst", + "10", + "-j", + "LOG", + "--log-prefix", + log_prefix, + "--log-uid", + ], + ) { + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(openshell_ocsf::SeverityId::Low) + .status(openshell_ocsf::StatusId::Failure) + .state(openshell_ocsf::StateId::Other, "degraded") + .message(format!( + "Failed to install IPv6 LOG rule for UDP [ns:{}]: {e}", + self.name + )) + .build() + ); + } + + // REJECT UDP bypass attempts + run_iptables_netns( + &self.name, + ip6tables_cmd, + &[ + "-A", + "OUTPUT", + "-p", + "udp", + "-j", + "REJECT", + "--reject-with", + "icmp6-port-unreachable", + ], + )?; + + Ok(()) + } } impl Drop for NetworkNamespace { @@ -444,43 +732,34 @@ fn run_ip_netns(netns: &str, args: &[&str]) -> Result<()> { Ok(()) } -/// Load an nftables ruleset inside a network namespace via `nsenter --net=`. +/// Run an iptables command inside a network namespace via `nsenter --net=`. /// -/// Writes the ruleset to a temp file and loads it with `nft -f `. -/// A temp file is used instead of piping to stdin (`nft -f -`) because -/// `nft` resolves `-` to `/dev/stdin`, which may not exist in minimal -/// VM guest environments (e.g. virtiofs rootfs without /proc mounted -/// at nft invocation time). 
-fn run_nft_netns(netns: &str, nft_cmd: &str, ruleset: &str) -> Result<()> { - use std::io::Write; - let mut tmp = tempfile::Builder::new() - .prefix("openshell-nft-") - .suffix(".conf") - .tempfile() - .into_diagnostic()?; - tmp.write_all(ruleset.as_bytes()).into_diagnostic()?; - let ruleset_path = tmp.path().to_string_lossy().to_string(); - +/// Uses `nsenter` instead of `ip netns exec` to avoid the sysfs remount +/// that fails in rootless container runtimes. See `run_ip_netns` for details. +fn run_iptables_netns(netns: &str, iptables_cmd: &str, args: &[&str]) -> Result<()> { let nsenter_path = find_trusted_binary("nsenter", NSENTER_SEARCH_PATHS)?; let ns_path = format!("/var/run/netns/{netns}"); let net_flag = format!("--net={ns_path}"); + let mut full_args = vec![net_flag.as_str(), "--", iptables_cmd]; + full_args.extend(args); + debug!( - command = %format!("{nsenter_path} {net_flag} -- {nft_cmd} -f {ruleset_path}"), - "Loading nftables ruleset in namespace" + command = %format!("{nsenter_path} {}", full_args.join(" ")), + "Running iptables in namespace via nsenter" ); let output = Command::new(nsenter_path) - .args([net_flag.as_str(), "--", nft_cmd, "-f", &ruleset_path]) + .args(&full_args) .output() .into_diagnostic()?; - drop(tmp); - if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); return Err(miette::miette!( - "nft ruleset load failed in netns {netns}: {}", + "{nsenter_path} --net={} {} failed: {}", + ns_path, + iptables_cmd, stderr.trim() )); } @@ -488,35 +767,11 @@ fn run_nft_netns(netns: &str, nft_cmd: &str, ruleset: &str) -> Result<()> { Ok(()) } -const NF_LOG_ALL_NETNS_PATH: &str = "/proc/sys/net/netfilter/nf_log_all_netns"; - -/// Enable nftables logging from non-init network namespaces. -/// -/// The kernel's `nf_log_syslog` module silently suppresses log output from -/// non-init network namespaces unless `net.netfilter.nf_log_all_netns` is -/// set to 1. 
Since sandbox bypass rules live in a per-sandbox network -/// namespace, the bypass monitor can't see log entries without this. -fn enable_nf_log_all_netns() { - use std::path::Path; - if !Path::new(NF_LOG_ALL_NETNS_PATH).exists() { - debug!("nf_log_all_netns sysctl not available (may already be set by init)"); - return; - } - match std::fs::write(NF_LOG_ALL_NETNS_PATH, "1") { - Ok(()) => { - debug!("Enabled nf_log_all_netns for non-init namespace logging"); - } - Err(e) => { - debug!( - error = %e, - "Could not enable nf_log_all_netns; bypass log rules may not produce output" - ); - } - } -} - -/// Well-known paths where nft may be installed. -const NFT_SEARCH_PATHS: &[&str] = &["/usr/sbin/nft", "/sbin/nft", "/usr/bin/nft"]; +/// Well-known paths where iptables may be installed. +/// The sandbox container PATH often excludes `/usr/sbin`, so we probe +/// explicit paths rather than relying on `which`. +const IPTABLES_SEARCH_PATHS: &[&str] = + &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"]; fn find_trusted_binary<'a>(name: &str, paths: &'a [&str]) -> Result<&'a str> { paths @@ -534,11 +789,100 @@ fn find_trusted_binary<'a>(name: &str, paths: &'a [&str]) -> Result<&'a str> { }) } -/// Find the nft binary path, checking well-known locations. -fn find_nft() -> Option { - find_trusted_binary("nft", NFT_SEARCH_PATHS) - .ok() - .map(String::from) +/// Returns true if xt extension modules (e.g. `xt_comment`) cannot be used +/// via the given iptables binary. +/// +/// Some kernels have `nf_tables` but lack the `nft_compat` bridge that allows +/// xt extension modules to be used through the `nf_tables` path (e.g. Jetson +/// Linux 5.15-tegra). This probe detects that condition by attempting to +/// insert a rule using the `xt_comment` extension. If it fails, xt extensions +/// are unavailable and the caller should fall back to iptables-legacy. +fn xt_extensions_unavailable(iptables_path: &str) -> bool { + // Create a temporary probe chain. 
If this fails (e.g. no CAP_NET_ADMIN), + // we can't determine availability — assume extensions are available. + let created = Command::new(iptables_path) + .args(["-t", "filter", "-N", "_xt_probe"]) + .output() + .is_ok_and(|o| o.status.success()); + + if !created { + return false; + } + + // Attempt to insert a rule using xt_comment. Failure means nft_compat + // cannot bridge xt extension modules on this kernel. + let probe_ok = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-A", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output() + .is_ok_and(|o| o.status.success()); + + // Clean up — best-effort, ignore failures. + let _ = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-D", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output(); + let _ = Command::new(iptables_path) + .args(["-t", "filter", "-X", "_xt_probe"]) + .output(); + + !probe_ok +} + +/// Find the iptables binary path, checking well-known locations. +/// +/// If xt extension modules are unavailable via the standard binary and +/// `iptables-legacy` is available alongside it, the legacy binary is returned +/// instead. This ensures bypass-detection rules can be installed on kernels +/// where `nft_compat` is unavailable (e.g. Jetson Linux 5.15-tegra). +fn find_iptables() -> Option { + let standard_path = IPTABLES_SEARCH_PATHS + .iter() + .find(|path| Path::new(path).exists()) + .copied()?; + + if xt_extensions_unavailable(standard_path) { + let legacy_path = standard_path.replace("iptables", "iptables-legacy"); + if Path::new(&legacy_path).exists() { + debug!( + legacy = legacy_path, + "xt extensions unavailable; using iptables-legacy" + ); + return Some(legacy_path); + } + } + + Some(standard_path.to_string()) +} + +/// Find the ip6tables binary path, deriving it from the iptables location. 
+fn find_ip6tables(iptables_path: &str) -> Option { + let ip6_path = iptables_path.replace("iptables", "ip6tables"); + if Path::new(&ip6_path).exists() { + Some(ip6_path) + } else { + None + } } #[cfg(test)] @@ -570,16 +914,6 @@ mod tests { assert!(err.to_string().contains("trusted nsenter helper not found")); } - #[test] - fn nft_search_paths_are_absolute() { - for path in NFT_SEARCH_PATHS { - assert!( - path.starts_with('/'), - "NFT_SEARCH_PATHS entry must be absolute: {path}" - ); - } - } - #[test] #[ignore = "requires root privileges"] fn test_create_and_drop_namespace() { diff --git a/crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs b/crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs deleted file mode 100644 index ba7aeb936..000000000 --- a/crates/openshell-sandbox/src/sandbox/linux/nft_ruleset.rs +++ /dev/null @@ -1,148 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! nftables ruleset generation for sandbox network bypass enforcement. -//! -//! This module provides pure functions to generate nftables rulesets that enforce -//! the sandbox network policy: all traffic must go through the proxy, with bypass -//! attempts logged and rejected. - -/// Generate a complete nftables ruleset for sandbox network bypass enforcement. -/// -/// Creates an `inet` family table (handles both IPv4 and IPv6) with rules that: -/// 1. Accept traffic to the proxy (IPv4 only) -/// 2. Accept loopback traffic -/// 3. Accept established/related connections -/// 4. Reject TCP and UDP bypass attempts (both IPv4 and IPv6) -/// -/// If `log_prefix` is provided, log rules are inserted before each reject rule -/// so that bypass attempts are recorded in the kernel ring buffer before being -/// rejected. The `log` expression requires kernel `nft_log` module support; -/// pass `None` for `log_prefix` as a fallback when that module is unavailable. 
-pub fn generate_bypass_ruleset(host_ip: &str, proxy_port: u16, log_prefix: Option<&str>) -> String { - let log_tcp = log_prefix - .map(|p| { - format!( - "\n tcp flags syn limit rate 5/second burst 10 packets log prefix \"{p}\" flags skuid" - ) - }) - .unwrap_or_default(); - let log_udp = log_prefix - .map(|p| { - format!( - "\n meta l4proto udp limit rate 5/second burst 10 packets log prefix \"{p}\" flags skuid" - ) - }) - .unwrap_or_default(); - - format!( - r#"table inet openshell_bypass {{ - chain output {{ - type filter hook output priority 0; policy accept; - - ip daddr {host_ip} tcp dport {proxy_port} accept - oifname "lo" accept - ct state established,related accept{log_tcp} - meta nfproto ipv4 meta l4proto tcp reject with icmp type port-unreachable - meta nfproto ipv6 meta l4proto tcp reject with icmpv6 type port-unreachable{log_udp} - meta nfproto ipv4 meta l4proto udp reject with icmp type port-unreachable - meta nfproto ipv6 meta l4proto udp reject with icmpv6 type port-unreachable - }} -}} -"# - ) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn generates_bypass_ruleset_with_proxy_rule() { - let ruleset = generate_bypass_ruleset("10.0.2.2", 8080, None); - assert!(ruleset.contains("table inet openshell_bypass")); - assert!(ruleset.contains("chain output")); - assert!(ruleset.contains("ip daddr 10.0.2.2 tcp dport 8080 accept")); - } - - #[test] - fn ruleset_has_inet_family_table_and_output_chain() { - let ruleset = generate_bypass_ruleset("192.168.1.1", 3128, None); - assert!(ruleset.contains("table inet openshell_bypass")); - assert!(ruleset.contains("type filter hook output priority 0; policy accept;")); - } - - #[test] - fn proxy_accept_rule_uses_provided_ip_and_port() { - let ruleset = generate_bypass_ruleset("172.16.0.1", 9999, None); - assert!(ruleset.contains("ip daddr 172.16.0.1 tcp dport 9999 accept")); - } - - #[test] - fn rules_are_ordered_accept_then_reject() { - let ruleset = generate_bypass_ruleset("10.0.2.2", 8080, None); 
- let proxy_pos = ruleset.find("ip daddr").unwrap(); - let lo_pos = ruleset.find("oifname \"lo\"").unwrap(); - let ct_pos = ruleset.find("ct state established,related").unwrap(); - let reject_pos = ruleset.find("reject with icmp type").unwrap(); - - assert!(proxy_pos < lo_pos); - assert!(lo_pos < ct_pos); - assert!(ct_pos < reject_pos); - } - - #[test] - fn both_ipv4_and_ipv6_reject_types_are_present() { - let ruleset = generate_bypass_ruleset("10.0.2.2", 8080, None); - let icmp_count = ruleset - .matches("reject with icmp type port-unreachable") - .count(); - let icmpv6_count = ruleset - .matches("reject with icmpv6 type port-unreachable") - .count(); - assert_eq!(icmp_count, 2, "need IPv4 ICMP rejects for TCP + UDP"); - assert_eq!(icmpv6_count, 2, "need IPv6 ICMPv6 rejects for TCP + UDP"); - } - - #[test] - fn no_log_ruleset_omits_log_rules() { - let ruleset = generate_bypass_ruleset("10.0.2.2", 8080, None); - assert!( - !ruleset.contains("log prefix"), - "no-log ruleset must not contain log rules" - ); - } - - #[test] - fn log_ruleset_contains_prefix_for_tcp_and_udp() { - let ruleset = generate_bypass_ruleset("10.0.2.2", 8080, Some("openshell:bypass:test:")); - let count = ruleset - .matches("log prefix \"openshell:bypass:test:\"") - .count(); - assert_eq!(count, 2, "need log rules for both TCP and UDP"); - assert!(ruleset.contains("tcp flags syn limit rate 5/second burst 10 packets")); - assert!(ruleset.contains("meta l4proto udp limit rate 5/second burst 10 packets")); - } - - #[test] - fn log_rules_appear_before_reject_rules() { - let ruleset = generate_bypass_ruleset("10.0.2.2", 8080, Some("openshell:bypass:test:")); - let tcp_log_pos = ruleset.find("tcp flags syn").unwrap(); - let tcp_reject_pos = ruleset - .find("meta nfproto ipv4 meta l4proto tcp reject") - .unwrap(); - let udp_log_pos = ruleset.find("meta l4proto udp limit rate").unwrap(); - let udp_reject_pos = ruleset - .find("meta nfproto ipv4 meta l4proto udp reject") - .unwrap(); - - assert!( - 
tcp_log_pos < tcp_reject_pos, - "TCP log rule must come before TCP reject rule" - ); - assert!( - udp_log_pos < udp_reject_pos, - "UDP log rule must come before UDP reject rule" - ); - } -} diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs index de7804393..1363a5d09 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-sandbox/src/secrets.rs @@ -26,13 +26,6 @@ fn contains_raw_reserved_marker(value: &str) -> bool { value.contains(PLACEHOLDER_PREFIX) || value.contains(PROVIDER_ALIAS_MARKER) } -fn current_time_ms() -> i64 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .map(|duration| i64::try_from(duration.as_millis()).unwrap_or(i64::MAX)) - .unwrap_or_default() -} - pub fn contains_reserved_credential_marker(value: &str) -> bool { if contains_raw_reserved_marker(value) { return true; @@ -91,13 +84,7 @@ pub struct RewriteTargetResult { #[derive(Clone, Default)] pub struct SecretResolver { - by_placeholder: HashMap, -} - -#[derive(Clone)] -struct SecretValue { - value: String, - expires_at_ms: i64, + by_placeholder: HashMap, } // Manual `Debug` impl: the auto-derived `Debug` would format the @@ -120,49 +107,34 @@ impl SecretResolver { pub(crate) fn from_provider_env( provider_env: HashMap, ) -> (HashMap, Option) { - Self::from_provider_env_for_revision(provider_env, HashMap::new(), 0) + Self::from_provider_env_for_revision(provider_env, 0) } pub(crate) fn from_provider_env_for_revision( provider_env: HashMap, - credential_expires_at_ms: HashMap, revision: u64, ) -> (HashMap, Option) { - Self::from_provider_env_for_revision_with_current_aliases( - provider_env, - credential_expires_at_ms, - revision, - false, - ) + Self::from_provider_env_for_revision_with_current_aliases(provider_env, revision, false) } pub(crate) fn from_provider_env_for_current_revision( provider_env: HashMap, - credential_expires_at_ms: HashMap, revision: u64, ) -> (HashMap, Option, Option) { if revision 
== 0 { let (child_env, current_resolver) = - Self::from_provider_env_for_revision_with_current_aliases( - provider_env, - credential_expires_at_ms, - 0, - true, - ); + Self::from_provider_env_for_revision_with_current_aliases(provider_env, 0, true); return (child_env, None, current_resolver); } let provider_env_for_current = provider_env.clone(); - let credential_expires_at_ms_for_current = credential_expires_at_ms.clone(); let (child_env, revision_resolver) = Self::from_provider_env_for_revision_with_current_aliases( provider_env, - credential_expires_at_ms, revision, false, ); let (_, current_resolver) = Self::from_provider_env_for_revision_with_current_aliases( provider_env_for_current, - credential_expires_at_ms_for_current, revision, true, ); @@ -171,7 +143,6 @@ impl SecretResolver { fn from_provider_env_for_revision_with_current_aliases( provider_env: HashMap, - credential_expires_at_ms: HashMap, revision: u64, include_current_aliases: bool, ) -> (HashMap, Option) { @@ -184,17 +155,10 @@ impl SecretResolver { for (key, value) in provider_env { let placeholder = placeholder_for_env_key_for_revision(&key, revision); - let secret = SecretValue { - value, - expires_at_ms: credential_expires_at_ms - .get(&key) - .copied() - .unwrap_or_default(), - }; child_env.insert(key.clone(), placeholder.clone()); - by_placeholder.insert(placeholder, secret.clone()); + by_placeholder.insert(placeholder, value.clone()); if include_current_aliases && revision != 0 { - by_placeholder.insert(placeholder_for_env_key(&key), secret.clone()); + by_placeholder.insert(placeholder_for_env_key(&key), value.clone()); } } @@ -219,20 +183,13 @@ impl SecretResolver { /// contains prohibited control characters (CRLF, null byte). 
pub(crate) fn resolve_placeholder(&self, value: &str) -> Option<&str> { let secret = if let Some(secret) = self.by_placeholder.get(value) { - secret + secret.as_str() } else { let key = alias_env_key(value)?; let canonical = placeholder_for_env_key(key); - self.by_placeholder.get(&canonical)? + self.by_placeholder.get(&canonical).map(String::as_str)? }; - if secret.expires_at_ms > 0 && secret.expires_at_ms <= current_time_ms() { - tracing::warn!( - location = "resolve_placeholder", - "credential resolution rejected: credential is expired" - ); - return None; - } - match validate_resolved_secret(&secret.value) { + match validate_resolved_secret(secret) { Ok(s) => Some(s), Err(reason) => { tracing::warn!( diff --git a/crates/openshell-server/migrations/postgres/005_add_resource_version.sql b/crates/openshell-server/migrations/postgres/005_add_resource_version.sql deleted file mode 100644 index e6a294d62..000000000 --- a/crates/openshell-server/migrations/postgres/005_add_resource_version.sql +++ /dev/null @@ -1,5 +0,0 @@ --- Add resource_version column for optimistic concurrency control -ALTER TABLE objects ADD COLUMN resource_version BIGINT NOT NULL DEFAULT 1; - --- Backfill existing rows with resource_version = 1 --- (DEFAULT clause handles this automatically for existing rows in PostgreSQL) diff --git a/crates/openshell-server/migrations/sqlite/005_add_resource_version.sql b/crates/openshell-server/migrations/sqlite/005_add_resource_version.sql deleted file mode 100644 index 50aacb99d..000000000 --- a/crates/openshell-server/migrations/sqlite/005_add_resource_version.sql +++ /dev/null @@ -1,5 +0,0 @@ --- Add resource_version column for optimistic concurrency control -ALTER TABLE objects ADD COLUMN resource_version INTEGER NOT NULL DEFAULT 1; - --- Backfill existing rows with resource_version = 1 --- (DEFAULT clause handles this automatically for existing rows in SQLite) diff --git a/crates/openshell-server/src/auth/authz.rs 
b/crates/openshell-server/src/auth/authz.rs index 832687c14..2c29517f9 100644 --- a/crates/openshell-server/src/auth/authz.rs +++ b/crates/openshell-server/src/auth/authz.rs @@ -22,9 +22,6 @@ const ADMIN_METHODS: &[&str] = &[ "/openshell.v1.OpenShell/CreateProvider", "/openshell.v1.OpenShell/UpdateProvider", "/openshell.v1.OpenShell/DeleteProvider", - "/openshell.v1.OpenShell/ConfigureProviderRefresh", - "/openshell.v1.OpenShell/RotateProviderCredential", - "/openshell.v1.OpenShell/DeleteProviderRefresh", // Global config and policy "/openshell.v1.OpenShell/UpdateConfig", // Draft policy approvals @@ -80,26 +77,10 @@ const SCOPED_METHODS: &[(&str, &str)] = &[ // provider:read ("/openshell.v1.OpenShell/GetProvider", "provider:read"), ("/openshell.v1.OpenShell/ListProviders", "provider:read"), - ( - "/openshell.v1.OpenShell/GetProviderRefreshStatus", - "provider:read", - ), // provider:write ("/openshell.v1.OpenShell/CreateProvider", "provider:write"), ("/openshell.v1.OpenShell/UpdateProvider", "provider:write"), ("/openshell.v1.OpenShell/DeleteProvider", "provider:write"), - ( - "/openshell.v1.OpenShell/ConfigureProviderRefresh", - "provider:write", - ), - ( - "/openshell.v1.OpenShell/RotateProviderCredential", - "provider:write", - ), - ( - "/openshell.v1.OpenShell/DeleteProviderRefresh", - "provider:write", - ), // config:read ("/openshell.v1.OpenShell/GetGatewayConfig", "config:read"), ("/openshell.v1.OpenShell/GetSandboxConfig", "config:read"), @@ -519,49 +500,6 @@ mod tests { assert!(err.message().contains("sandbox:write")); } - #[test] - fn provider_refresh_methods_require_provider_scopes_and_admin_for_writes() { - let policy = scoped_policy(); - let reader = identity_with_roles_and_scopes(&["openshell-user"], &["provider:read"]); - assert!( - policy - .check(&reader, "/openshell.v1.OpenShell/GetProviderRefreshStatus") - .is_ok() - ); - - let writer_without_admin = - identity_with_roles_and_scopes(&["openshell-user"], &["provider:write"]); - let err = policy - 
.check( - &writer_without_admin, - "/openshell.v1.OpenShell/ConfigureProviderRefresh", - ) - .unwrap_err(); - assert_eq!(err.code(), tonic::Code::PermissionDenied); - assert!(err.message().contains("openshell-admin")); - - let admin_without_scope = - identity_with_roles_and_scopes(&["openshell-admin"], &["provider:read"]); - let err = policy - .check( - &admin_without_scope, - "/openshell.v1.OpenShell/RotateProviderCredential", - ) - .unwrap_err(); - assert_eq!(err.code(), tonic::Code::PermissionDenied); - assert!(err.message().contains("provider:write")); - - let admin_writer = - identity_with_roles_and_scopes(&["openshell-admin"], &["provider:write"]); - for method in [ - "/openshell.v1.OpenShell/ConfigureProviderRefresh", - "/openshell.v1.OpenShell/RotateProviderCredential", - "/openshell.v1.OpenShell/DeleteProviderRefresh", - ] { - assert!(policy.check(&admin_writer, method).is_ok(), "{method}"); - } - } - #[test] fn get_sandbox_config_requires_config_read_scope() { let policy = scoped_policy(); diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs index 683170aad..c72dcd6dd 100644 --- a/crates/openshell-server/src/certgen.rs +++ b/crates/openshell-server/src/certgen.rs @@ -8,9 +8,9 @@ //! - **Kubernetes mode** (default): create two `kubernetes.io/tls` Secrets //! in the supplied namespace. Used by the Helm pre-install hook. Requires //! `--namespace`, `--server-secret-name`, `--client-secret-name`. -//! - **Local mode** (`--output-dir `): write PEMs to the local package -//! filesystem layout. Used by systemd units' `ExecStartPre`. Also copies -//! client materials to +//! - **Local mode** (`--output-dir `): write PEMs to a filesystem layout +//! used by the RPM systemd unit's `ExecStartPre`. Also copies client +//! materials to //! `$XDG_CONFIG_HOME/openshell/gateways/openshell/mtls/` so the local CLI //! picks them up automatically. //! 
diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 8d4e094c4..a2cfacde5 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -16,7 +16,6 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; use crate::config_file::{self, ConfigFile, GatewayFileSection}; -use crate::defaults::{self, LocalTlsPaths}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. @@ -88,9 +87,9 @@ struct RunArgs { /// Database URL for persistence. /// - /// When unset, the gateway stores state under the `XDG` state - /// directory. Kept as an Option at the clap layer so the `generate-certs` - /// subcommand can run without gateway runtime defaults. + /// Required when running the gateway. Validated at the call site rather + /// than as a clap-level requirement so the `generate-certs` subcommand + /// (which does not need a database) can run without it. #[arg(long, env = "OPENSHELL_DB_URL")] db_url: Option, @@ -202,11 +201,10 @@ pub async fn run_cli() -> Result<()> { } async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { - // Load TOML when explicitly requested, or from the default XDG location - // when that file exists. Missing default config is not an error: runtime - // defaults and OPENSHELL_* env vars are enough for package-managed starts. - let config_path = resolve_config_path(&args)?; - let file: Option = if let Some(path) = config_path { + // Load TOML file when --config / OPENSHELL_GATEWAY_CONFIG is set. + // File values are applied below for any argument that is still at its + // built-in default — CLI flags and OPENSHELL_* env vars always win. + let file: Option = if let Some(path) = args.config.clone() { Some(config_file::load(&path).map_err(|e| miette::miette!("{e}"))?) 
} else { None @@ -215,8 +213,6 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { merge_file_into_args(&mut args, &file.openshell.gateway, &matches); } - let local_tls = apply_runtime_defaults(&mut args)?; - let tracing_log_bus = TracingLogBus::new(); tracing_log_bus.install_subscriber( EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), @@ -255,7 +251,7 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { let db_url = args .db_url .clone() - .expect("runtime defaults populate db_url"); + .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?; let mut config = openshell_core::Config::new(tls) .with_bind_address(bind) @@ -336,13 +332,8 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { }); } - let vm_config = build_vm_config( - file.as_ref(), - local_tls.as_ref(), - args.disable_tls, - args.port, - )?; - let docker_config = build_docker_config(file.as_ref(), local_tls.as_ref())?; + let vm_config = build_vm_config(file.as_ref())?; + let docker_config = build_docker_config(file.as_ref())?; if args.disable_tls { warn!("TLS disabled — listening on plaintext HTTP"); @@ -381,40 +372,6 @@ fn parse_compute_driver(value: &str) -> std::result::Result Result> { - if let Some(path) = args.config.clone() { - return Ok(Some(path)); - } - - let default_path = defaults::default_gateway_config_path()?; - Ok(default_path.is_file().then_some(default_path)) -} - -fn apply_runtime_defaults(args: &mut RunArgs) -> Result> { - let local_tls = if args.disable_tls { - None - } else { - defaults::complete_local_tls_paths()? 
- }; - - if args.db_url.is_none() { - args.db_url = Some(defaults::default_database_url()?); - } - - if !args.disable_tls - && args.tls_cert.is_none() - && args.tls_key.is_none() - && args.tls_client_ca.is_none() - && let Some(paths) = &local_tls - { - args.tls_cert = Some(paths.server_cert.clone()); - args.tls_key = Some(paths.server_key.clone()); - args.tls_client_ca = Some(paths.ca.clone()); - } - - Ok(local_tls) -} - /// Returns `true` when an argument's value came from clap's built-in default /// (or was never supplied at all). When the predicate is `true`, the loader /// is free to replace the value with one read from the TOML config file. @@ -543,12 +500,7 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: /// Build [`VmComputeConfig`] from the `[openshell.drivers.vm]` table /// inherited from `[openshell.gateway]`. -fn build_vm_config( - file: Option<&ConfigFile>, - local_tls: Option<&LocalTlsPaths>, - disable_tls: bool, - gateway_port: u16, -) -> Result { +fn build_vm_config(file: Option<&ConfigFile>) -> Result { let mut cfg = if let Some(file) = file { let merged = config_file::driver_table( ComputeDriverKind::Vm, @@ -565,61 +517,23 @@ fn build_vm_config( if cfg.state_dir.as_os_str().is_empty() { cfg.state_dir = VmComputeConfig::default_state_dir(); } - if cfg.grpc_endpoint.trim().is_empty() && (disable_tls || local_tls.is_some()) { - let scheme = if disable_tls { "http" } else { "https" }; - cfg.grpc_endpoint = format!("{scheme}://127.0.0.1:{gateway_port}"); - } - apply_guest_tls_defaults( - &mut cfg.guest_tls_ca, - &mut cfg.guest_tls_cert, - &mut cfg.guest_tls_key, - local_tls, - ); Ok(cfg) } /// Build [`DockerComputeConfig`] using the same inheritance pattern as /// [`build_vm_config`]. 
-fn build_docker_config( - file: Option<&ConfigFile>, - local_tls: Option<&LocalTlsPaths>, -) -> Result { - let mut cfg = if let Some(file) = file { +fn build_docker_config(file: Option<&ConfigFile>) -> Result { + if let Some(file) = file { let merged = config_file::driver_table( ComputeDriverKind::Docker, &file.openshell.gateway, file.openshell.drivers.get("docker"), ); - merged + return merged .try_into::() - .map_err(|e| miette::miette!("invalid [openshell.drivers.docker] table: {e}"))? - } else { - DockerComputeConfig::default() - }; - apply_guest_tls_defaults( - &mut cfg.guest_tls_ca, - &mut cfg.guest_tls_cert, - &mut cfg.guest_tls_key, - local_tls, - ); - Ok(cfg) -} - -fn apply_guest_tls_defaults( - ca: &mut Option, - cert: &mut Option, - key: &mut Option, - local_tls: Option<&LocalTlsPaths>, -) { - if ca.is_none() - && cert.is_none() - && key.is_none() - && let Some(paths) = local_tls - { - *ca = Some(paths.ca.clone()); - *cert = Some(paths.client_cert.clone()); - *key = Some(paths.client_key.clone()); + .map_err(|e| miette::miette!("invalid [openshell.drivers.docker] table: {e}")); } + Ok(DockerComputeConfig::default()) } #[cfg(test)] @@ -868,10 +782,11 @@ mod tests { } #[test] - fn bare_invocation_with_no_db_url_parses_for_runtime_defaults() { + fn bare_invocation_with_no_db_url_errors_at_runtime_not_parse_time() { // db_url is Option at the clap level so subcommand parsing - // does not require it. The Run path fills a default URL from XDG - // state when neither CLI nor env supplied one. + // does not require it. The Run path validates it inside + // run_from_args. This test asserts the parse step succeeds with no + // --db-url, mirroring what the runtime check sees. 
let _lock = ENV_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); @@ -904,99 +819,6 @@ mod tests { toml::from_str(toml).expect("valid TOML in test fixture") } - #[test] - fn default_config_path_is_loaded_only_when_present() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let tmp = tempfile::tempdir().unwrap(); - let _g1 = EnvVarGuard::remove("OPENSHELL_GATEWAY_CONFIG"); - let _g2 = EnvVarGuard::set("XDG_CONFIG_HOME", tmp.path().to_str().unwrap()); - - let (args, _) = parse_with_args(&["openshell-gateway"]); - assert_eq!(super::resolve_config_path(&args).unwrap(), None); - - let config = tmp.path().join("openshell").join("gateway.toml"); - std::fs::create_dir_all(config.parent().unwrap()).unwrap(); - std::fs::write(&config, "[openshell]\nversion = 1\n").unwrap(); - - assert_eq!(super::resolve_config_path(&args).unwrap(), Some(config)); - } - - #[test] - fn explicit_config_path_is_returned_even_when_missing() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let _g = EnvVarGuard::remove("OPENSHELL_GATEWAY_CONFIG"); - - let (args, _) = parse_with_args(&["openshell-gateway", "--config", "/tmp/missing.toml"]); - - assert_eq!( - super::resolve_config_path(&args).unwrap(), - Some(std::path::PathBuf::from("/tmp/missing.toml")) - ); - } - - #[test] - fn runtime_defaults_populate_database_url_from_xdg_state() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let tmp = tempfile::tempdir().unwrap(); - let _g1 = EnvVarGuard::remove("OPENSHELL_DB_URL"); - let _g2 = EnvVarGuard::set("XDG_STATE_HOME", tmp.path().to_str().unwrap()); - - let (mut args, _) = parse_with_args(&["openshell-gateway", "--disable-tls"]); - let local_tls = super::apply_runtime_defaults(&mut args).unwrap(); - - let expected = format!( - "sqlite:{}", - tmp.path().join("openshell/gateway/openshell.db").display() - ); - assert!(local_tls.is_none()); - 
assert_eq!(args.db_url.as_deref(), Some(expected.as_str())); - assert!(tmp.path().join("openshell/gateway").is_dir()); - } - - #[test] - fn runtime_defaults_use_complete_local_tls_bundle() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let state = tempfile::tempdir().unwrap(); - let tls = tempfile::tempdir().unwrap(); - let _g1 = EnvVarGuard::remove("OPENSHELL_DB_URL"); - let _g2 = EnvVarGuard::remove("OPENSHELL_TLS_CERT"); - let _g3 = EnvVarGuard::remove("OPENSHELL_TLS_KEY"); - let _g4 = EnvVarGuard::remove("OPENSHELL_TLS_CLIENT_CA"); - let _g5 = EnvVarGuard::remove("OPENSHELL_DISABLE_TLS"); - let _g6 = EnvVarGuard::set("XDG_STATE_HOME", state.path().to_str().unwrap()); - let _g7 = EnvVarGuard::set("OPENSHELL_LOCAL_TLS_DIR", tls.path().to_str().unwrap()); - - std::fs::create_dir_all(tls.path().join("server")).unwrap(); - std::fs::create_dir_all(tls.path().join("client")).unwrap(); - for rel in [ - "ca.crt", - "server/tls.crt", - "server/tls.key", - "client/tls.crt", - "client/tls.key", - ] { - std::fs::write(tls.path().join(rel), "pem").unwrap(); - } - - let (mut args, _) = parse_with_args(&["openshell-gateway"]); - let local_tls = super::apply_runtime_defaults(&mut args) - .unwrap() - .expect("complete bundle should be returned"); - - assert_eq!(args.tls_cert, Some(tls.path().join("server/tls.crt"))); - assert_eq!(args.tls_key, Some(tls.path().join("server/tls.key"))); - assert_eq!(args.tls_client_ca, Some(tls.path().join("ca.crt"))); - assert_eq!(local_tls.client_cert, tls.path().join("client/tls.crt")); - } - #[test] fn file_value_applies_when_cli_uses_default() { let _lock = ENV_LOCK diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index a69231dea..d8e823df9 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -9,7 +9,7 @@ pub use openshell_driver_docker::DockerComputeConfig; pub use vm::VmComputeConfig; use 
crate::grpc::policy::SANDBOX_SETTINGS_OBJECT_TYPE; -use crate::persistence::{ObjectId, ObjectName, ObjectRecord, ObjectType, Store, WriteCondition}; +use crate::persistence::{ObjectId, ObjectName, ObjectRecord, ObjectType, Store}; use crate::sandbox_index::SandboxIndex; use crate::sandbox_watch::SandboxWatchBus; use crate::supervisor_session::SupervisorSessionRegistry; @@ -422,35 +422,23 @@ impl ComputeRuntime { } pub async fn create_sandbox(&self, sandbox: Sandbox) -> Result { - let sandbox_id = sandbox.object_id().to_string(); + let existing = self + .store + .get_message_by_name::(sandbox.object_name()) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?; + if existing.is_some() { + return Err(Status::already_exists(format!( + "sandbox '{}' already exists", + sandbox.object_name() + ))); + } - // Create with MustCreate condition to prevent duplicate creation race self.sandbox_index.update_from_sandbox(&sandbox); - let mut sandbox = sandbox; - let result = self - .store - .put_if( - Sandbox::object_type(), - &sandbox_id, - sandbox.object_name(), - &sandbox.encode_to_vec(), - None, - WriteCondition::MustCreate, - ) + self.store + .put_message(&sandbox) .await - .map_err(|e| { - if matches!( - e, - crate::persistence::PersistenceError::UniqueViolation { .. 
} - ) { - Status::already_exists(format!( - "sandbox '{}' already exists", - sandbox.object_name() - )) - } else { - Status::internal(format!("persist sandbox failed: {e}")) - } - })?; + .map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?; let driver_sandbox = driver_sandbox_from_public(&sandbox); match self @@ -462,9 +450,6 @@ impl ComputeRuntime { { Ok(_) => { self.sandbox_watch_bus.notify(sandbox.object_id()); - if let Some(metadata) = sandbox.metadata.as_mut() { - metadata.resource_version = result.resource_version; - } Ok(sandbox) } Err(status) if status.code() == Code::AlreadyExists => { @@ -498,31 +483,22 @@ impl ComputeRuntime { } pub async fn delete_sandbox(&self, name: &str) -> Result { - // Resolve sandbox ID from name let sandbox = self .store .get_message_by_name::(name) .await .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?; - let Some(sandbox) = sandbox else { + let Some(mut sandbox) = sandbox else { return Err(Status::not_found("sandbox not found")); }; let id = sandbox.object_id().to_string(); - - // Use CAS to set phase to Deleting - // TODO: Accept expected_version from DeleteSandboxRequest for proper client-driven CAS - let sandbox = self - .store - .update_message_cas::(&id, 0, |s| { - s.phase = SandboxPhase::Deleting as i32; - }) + sandbox.phase = SandboxPhase::Deleting as i32; + self.store + .put_message(&sandbox) .await - .map_err(|e| { - crate::grpc::persistence_error_to_status(e, "set sandbox phase to Deleting") - })?; - + .map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?; self.sandbox_index.update_from_sandbox(&sandbox); self.sandbox_watch_bus.notify(&id); self.cleanup_sandbox_owned_records(&sandbox).await; @@ -667,40 +643,30 @@ impl ComputeRuntime { async fn mark_sandbox_error(&self, sandbox: &Sandbox, reason: &str, message: &str) { let _guard = self.sync_lock.lock().await; - let sandbox_id = sandbox.object_id().to_string(); - let reason = reason.to_string(); - let message = 
message.to_string(); - match self - .store - .update_message_cas::(&sandbox_id, 0, |s| { - s.phase = SandboxPhase::Error as i32; - let name = s.object_name().to_string(); - upsert_ready_condition( - &mut s.status, - &name, - SandboxCondition { - r#type: "Ready".to_string(), - status: "False".to_string(), - reason: reason.clone(), - message: message.clone(), - last_transition_time: String::new(), - }, - ); - }) - .await - { - Ok(updated) => { - self.sandbox_index.update_from_sandbox(&updated); - self.sandbox_watch_bus.notify(&sandbox_id); - } - Err(err) => { - warn!( - sandbox_id = %sandbox_id, - error = %err, - "Failed to persist sandbox error state during startup resume" - ); - } + let mut updated = sandbox.clone(); + updated.phase = SandboxPhase::Error as i32; + let updated_name = updated.object_name().to_string(); + upsert_ready_condition( + &mut updated.status, + &updated_name, + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: reason.to_string(), + message: message.to_string(), + last_transition_time: String::new(), + }, + ); + self.sandbox_index.update_from_sandbox(&updated); + if let Err(err) = self.store.put_message(&updated).await { + warn!( + sandbox_id = %sandbox.object_id(), + error = %err, + "Failed to persist sandbox error state during startup resume" + ); + return; } + self.sandbox_watch_bus.notify(sandbox.object_id()); } async fn watch_loop(self: Arc) { @@ -845,136 +811,85 @@ impl ComputeRuntime { .as_ref() .map(decode_sandbox_record) .transpose()?; + let previous = existing.clone(); - // If no existing record, create initial sandbox (first watch event for this sandbox) - if existing.is_none() { - use crate::persistence::WriteCondition; - let now_ms = openshell_core::time::now_ms(); - - let mut status = incoming.status.as_ref().map(public_status_from_driver); - rewrite_user_facing_conditions(&mut status, None); - - let session_connected = self.supervisor_sessions.has_session(&incoming.id); - let mut phase = 
derive_phase(incoming.status.as_ref()); - - let sandbox_name = incoming.name.clone(); - if session_connected - && matches!(phase, SandboxPhase::Provisioning | SandboxPhase::Unknown) - { - ensure_supervisor_ready_status(&mut status, &sandbox_name); - phase = SandboxPhase::Ready; - } + let mut status = incoming.status.as_ref().map(public_status_from_driver); + rewrite_user_facing_conditions( + &mut status, + existing.as_ref().and_then(|sandbox| sandbox.spec.as_ref()), + ); - let sandbox = Sandbox { + let session_connected = self.supervisor_sessions.has_session(&incoming.id); + let mut phase = derive_phase(incoming.status.as_ref()); + let mut sandbox = existing.unwrap_or_else(|| { + let now_ms = openshell_core::time::now_ms(); + Sandbox { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { id: incoming.id.clone(), - name: sandbox_name, + name: incoming.name.clone(), created_at_ms: now_ms, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: None, - status, - phase: phase as i32, + status: None, + phase: SandboxPhase::Unknown as i32, current_policy_version: 0, - }; - - self.store - .put_if( - Sandbox::object_type(), - &incoming.id, - sandbox.object_name(), - &sandbox.encode_to_vec(), - None, - WriteCondition::MustCreate, - ) - .await - .map_err(|e| match e { - crate::persistence::PersistenceError::Conflict { - current_resource_version, - } => format!( - "concurrent modification detected during sandbox creation (current resource_version: {})", - current_resource_version - .map_or_else(|| "unknown".to_string(), |v| v.to_string()) - ), - other => other.to_string(), - })?; + } + }); - self.sandbox_index.update_from_sandbox(&sandbox); - self.sandbox_watch_bus.notify(sandbox.object_id()); - return Ok(()); + if session_connected && matches!(phase, SandboxPhase::Provisioning | SandboxPhase::Unknown) + { + ensure_supervisor_ready_status(&mut status, sandbox.object_name()); + phase = SandboxPhase::Ready; } - // Single-attempt CAS: on conflict, 
the next watch event will naturally retry - let session_connected = self.supervisor_sessions.has_session(&incoming.id); - let sandbox_name = incoming.name.clone(); - - let sandbox = self - .store - .update_message_cas::(&incoming.id, 0, |sandbox| { - let mut status = incoming.status.as_ref().map(public_status_from_driver); - rewrite_user_facing_conditions(&mut status, sandbox.spec.as_ref()); + let old_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); + if old_phase != phase { + info!( + sandbox_id = %incoming.id, + sandbox_name = %incoming.name, + old_phase = ?old_phase, + new_phase = ?phase, + "Sandbox phase changed" + ); + } - let mut phase = derive_phase(incoming.status.as_ref()); - if session_connected - && matches!(phase, SandboxPhase::Provisioning | SandboxPhase::Unknown) + if phase == SandboxPhase::Error + && let Some(ref status) = status + { + for condition in &status.conditions { + if condition.r#type == "Ready" + && condition.status.eq_ignore_ascii_case("false") + && is_terminal_failure_reason(&condition.reason) { - ensure_supervisor_ready_status(&mut status, &sandbox_name); - phase = SandboxPhase::Ready; - } - - let old_phase = - SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); - if old_phase != phase { - info!( + warn!( sandbox_id = %incoming.id, - sandbox_name = %sandbox_name, - old_phase = ?old_phase, - new_phase = ?phase, - "Sandbox phase changed" + sandbox_name = %incoming.name, + reason = %condition.reason, + message = %condition.message, + "Sandbox failed to become ready" ); } + } + } - if phase == SandboxPhase::Error - && let Some(ref status) = status - { - for condition in &status.conditions { - if condition.r#type == "Ready" - && condition.status.eq_ignore_ascii_case("false") - && is_terminal_failure_reason(&condition.reason) - { - warn!( - sandbox_id = %incoming.id, - sandbox_name = %sandbox_name, - reason = %condition.reason, - message = %condition.message, - "Sandbox failed to become 
ready" - ); - } - } - } + // Update metadata fields + if let Some(metadata) = sandbox.metadata.as_mut() { + metadata.name = incoming.name; + } + // Note: namespace field removed from public Sandbox API - it remains internal to DriverSandbox + sandbox.status = status; + sandbox.phase = phase as i32; - // Update metadata fields - if let Some(metadata) = sandbox.metadata.as_mut() { - metadata.name.clone_from(&sandbox_name); - } - // Note: namespace field removed from public Sandbox API - it remains internal to DriverSandbox - sandbox.status = status; - sandbox.phase = phase as i32; - }) - .await - .map_err(|e| match e { - crate::persistence::PersistenceError::Conflict { - current_resource_version, - } => format!( - "concurrent modification detected during sandbox reconciliation (current resource_version: {})", - current_resource_version - .map_or_else(|| "unknown".to_string(), |v| v.to_string()) - ), - other => other.to_string(), - })?; + if previous.as_ref() == Some(&sandbox) { + return Ok(()); + } self.sandbox_index.update_from_sandbox(&sandbox); + self.store + .put_message(&sandbox) + .await + .map_err(|e| e.to_string())?; self.sandbox_watch_bus.notify(sandbox.object_id()); Ok(()) } @@ -993,51 +908,38 @@ impl ComputeRuntime { connected: bool, ) -> Result<(), String> { let _guard = self.sync_lock.lock().await; - - // Use CAS to update sandbox phase based on supervisor session state - let result = self + let Some(record) = self .store - .update_message_cas::(sandbox_id, 0, |sandbox| { - let current_phase = - SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); + .get(Sandbox::object_type(), sandbox_id) + .await + .map_err(|e| e.to_string())? 
+ else { + return Ok(()); + }; - // Skip if sandbox is in terminal state - if current_phase == SandboxPhase::Deleting || current_phase == SandboxPhase::Error { - return; - } + let mut sandbox = decode_sandbox_record(&record)?; + let current_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); - let sandbox_name = sandbox.object_name().to_string(); - if connected { - ensure_supervisor_ready_status(&mut sandbox.status, &sandbox_name); - sandbox.phase = SandboxPhase::Ready as i32; - } else if current_phase == SandboxPhase::Ready { - ensure_supervisor_not_ready_status(&mut sandbox.status, &sandbox_name); - sandbox.phase = SandboxPhase::Provisioning as i32; - } - }) - .await; + if current_phase == SandboxPhase::Deleting || current_phase == SandboxPhase::Error { + return Ok(()); + } - // Handle not found gracefully (sandbox may have been deleted) - let sandbox = match result { - Ok(s) => s, - Err(crate::persistence::PersistenceError::Database(ref msg)) - if msg.contains("not found") => - { - return Ok(()); - } - Err(crate::persistence::PersistenceError::Conflict { - current_resource_version, - }) => { - return Err(format!( - "concurrent modification detected (current resource_version: {})", - current_resource_version - .map_or_else(|| "unknown".to_string(), |v| v.to_string()) - )); - } - Err(e) => return Err(e.to_string()), - }; + let sandbox_name = sandbox.object_name().to_string(); + if connected { + ensure_supervisor_ready_status(&mut sandbox.status, &sandbox_name); + sandbox.phase = SandboxPhase::Ready as i32; + } else if current_phase == SandboxPhase::Ready { + ensure_supervisor_not_ready_status(&mut sandbox.status, &sandbox_name); + sandbox.phase = SandboxPhase::Provisioning as i32; + } else { + return Ok(()); + } self.sandbox_index.update_from_sandbox(&sandbox); + self.store + .put_message(&sandbox) + .await + .map_err(|e| e.to_string())?; self.sandbox_watch_bus.notify(sandbox_id); Ok(()) } @@ -1918,7 +1820,6 @@ mod tests { name: 
name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), phase: phase as i32, ..Default::default() @@ -1932,7 +1833,6 @@ mod tests { name: format!("session-{id}"), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), sandbox_id: sandbox_id.to_string(), token: format!("token-{id}"), @@ -2847,105 +2747,4 @@ mod tests { "unset user_namespaces must not produce host_users" ); } - - #[tokio::test] - async fn create_sandbox_returns_resource_version_one() { - let runtime = test_runtime(Arc::new(TestDriver::default())).await; - - let mut sandbox = sandbox_record("sb-new", "test-sandbox", SandboxPhase::Provisioning); - // Clear metadata to simulate incoming request - sandbox.metadata = Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sb-new".to_string(), - name: "test-sandbox".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }); - - let created = runtime.create_sandbox(sandbox).await.unwrap(); - - assert_eq!( - created.metadata.as_ref().unwrap().resource_version, - 1, - "create_sandbox should return resource_version: 1 after insert" - ); - - // Verify database also has resource_version: 1 - let created_id = created.metadata.as_ref().unwrap().id.clone(); - let stored = runtime - .store - .get_message::(&created_id) - .await - .unwrap() - .unwrap(); - assert_eq!( - stored.metadata.as_ref().unwrap().resource_version, - 1, - "database should have resource_version: 1 after create" - ); - } - - #[tokio::test] - async fn concurrent_create_sandbox_rejects_duplicate() { - let runtime = Arc::new(test_runtime(Arc::new(TestDriver::default())).await); - - let sandbox = sandbox_record( - "sb-concurrent", - "test-concurrent", - SandboxPhase::Provisioning, - ); - - // Spawn two concurrent creation attempts for the same sandbox - let runtime1 = runtime.clone(); - let sandbox1 = sandbox.clone(); - let handle1 = tokio::spawn(async move { runtime1.create_sandbox(sandbox1).await 
}); - - let runtime2 = runtime.clone(); - let sandbox2 = sandbox.clone(); - let handle2 = tokio::spawn(async move { runtime2.create_sandbox(sandbox2).await }); - - // Wait for both to complete - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // Exactly one should succeed, one should fail with AlreadyExists - let success_count = [&result1, &result2].iter().filter(|r| r.is_ok()).count(); - let already_exists_count = [&result1, &result2] - .iter() - .filter(|r| { - r.as_ref() - .err() - .is_some_and(|e| e.code() == Code::AlreadyExists) - }) - .count(); - - assert_eq!( - success_count, 1, - "exactly one creation should succeed, got results: {result1:?} {result2:?}" - ); - assert_eq!( - already_exists_count, 1, - "exactly one creation should fail with AlreadyExists, got results: {result1:?} {result2:?}" - ); - - // Verify the successful sandbox can be retrieved by name - let created_sandbox = [result1, result2] - .into_iter() - .find_map(Result::ok) - .expect("should have one successful creation"); - let retrieved = runtime - .store - .get_message_by_name::("test-concurrent") - .await - .unwrap(); - assert!( - retrieved.is_some(), - "created sandbox should be retrievable by name" - ); - assert_eq!( - retrieved.unwrap().object_id(), - created_sandbox.object_id(), - "retrieved sandbox should match created sandbox" - ); - } } diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index 82b9cb3fd..c4db1428c 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -105,10 +105,7 @@ impl VmComputeConfig { /// Default working directory for VM driver state. #[must_use] pub fn default_state_dir() -> PathBuf { - openshell_core::paths::openshell_state_dir().map_or_else( - |_| PathBuf::from("target/openshell-vm-driver"), - |dir| dir.join("vm-driver"), - ) + PathBuf::from("target/openshell-vm-driver") } /// Default libkrun log level. 
@@ -232,21 +229,7 @@ pub fn resolve_compute_driver_bin(vm_config: &VmComputeConfig) -> Result Vec { vm_config.driver_dir.clone().map_or_else( - || { - let mut dirs = Vec::new(); - if let Ok(current_exe) = std::env::current_exe() - && let Some(prefix) = current_exe.parent().and_then(Path::parent) - { - push_unique_path(&mut dirs, prefix.join("libexec")); - push_unique_path(&mut dirs, prefix.join("libexec").join("openshell")); - } - for dir in VmComputeConfig::default_driver_search_dirs( - std::env::var_os("HOME").map(PathBuf::from), - ) { - push_unique_path(&mut dirs, dir); - } - dirs - }, + || VmComputeConfig::default_driver_search_dirs(std::env::var_os("HOME").map(PathBuf::from)), |dir| vec![dir], ) } diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index db0dcd684..2a1320a55 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -515,47 +515,4 @@ version = 2 .expect_err("missing file must be io error"); assert!(matches!(err, ConfigFileError::Io { .. })); } - - /// Contract test: the RPM default config template must parse against the - /// current schema and must pin the settings that Podman deployments require. 
- /// - /// This test loads `deploy/rpm/gateway.toml.default` through the same - /// `load()` path that the gateway uses at runtime, catching: - /// - template corruption or unknown fields (`deny_unknown_fields`) - /// - schema drift (version bump or field renames) - /// - accidental changes to the bind address or compute driver list - #[test] - fn rpm_default_config_parses_and_has_podman_defaults() { - let path = - Path::new(env!("CARGO_MANIFEST_DIR")).join("../../deploy/rpm/gateway.toml.default"); - let config = - load(&path).expect("deploy/rpm/gateway.toml.default must parse against current schema"); - let gw = &config.openshell.gateway; - - let addr = gw - .bind_address - .expect("bind_address must be explicitly set in the RPM default config"); - assert!( - addr.ip().is_unspecified(), - "RPM default bind_address must be 0.0.0.0 so Podman sandbox containers \ - can reach the gateway over the host network bridge, got {addr}" - ); - assert_eq!( - addr.port(), - openshell_core::config::DEFAULT_SERVER_PORT, - "RPM default port must match DEFAULT_SERVER_PORT ({})", - openshell_core::config::DEFAULT_SERVER_PORT - ); - - let drivers = gw - .compute_drivers - .as_ref() - .expect("compute_drivers must be explicitly set in the RPM default config"); - assert_eq!( - drivers, - &[ComputeDriverKind::Podman], - "RPM default must pin compute_drivers to [podman] to prevent unexpected \ - driver selection when Docker is also installed" - ); - } } diff --git a/crates/openshell-server/src/defaults.rs b/crates/openshell-server/src/defaults.rs deleted file mode 100644 index 9705c4e9b..000000000 --- a/crates/openshell-server/src/defaults.rs +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Runtime defaults for local gateway installs. 
- -use miette::Result; -use std::path::{Path, PathBuf}; - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct LocalTlsPaths { - pub ca: PathBuf, - pub server_cert: PathBuf, - pub server_key: PathBuf, - pub client_cert: PathBuf, - pub client_key: PathBuf, -} - -impl LocalTlsPaths { - fn resolve(dir: &Path) -> Self { - Self { - ca: dir.join("ca.crt"), - server_cert: dir.join("server").join("tls.crt"), - server_key: dir.join("server").join("tls.key"), - client_cert: dir.join("client").join("tls.crt"), - client_key: dir.join("client").join("tls.key"), - } - } - - fn files(&self) -> [&Path; 5] { - [ - &self.ca, - &self.server_cert, - &self.server_key, - &self.client_cert, - &self.client_key, - ] - } -} - -pub fn default_gateway_config_path() -> Result { - Ok(openshell_core::paths::openshell_config_dir()?.join("gateway.toml")) -} - -pub fn default_database_url() -> Result { - let path = openshell_core::paths::openshell_state_dir()? - .join("gateway") - .join("openshell.db"); - openshell_core::paths::ensure_parent_dir_restricted(&path)?; - Ok(format!("sqlite:{}", path.display())) -} - -fn default_local_tls_dir() -> Result { - if let Some(path) = std::env::var_os("OPENSHELL_LOCAL_TLS_DIR") { - return Ok(PathBuf::from(path)); - } - Ok(openshell_core::paths::openshell_state_dir()?.join("tls")) -} - -pub fn complete_local_tls_paths() -> Result> { - let dir = default_local_tls_dir()?; - let paths = LocalTlsPaths::resolve(&dir); - let present = paths.files().iter().filter(|path| path.is_file()).count(); - match present { - 0 => Ok(None), - 5 => Ok(Some(paths)), - _ => Err(miette::miette!( - "partial local TLS state in {}: expected ca.crt, server/tls.crt, server/tls.key, client/tls.crt, and client/tls.key", - dir.display() - )), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::{LazyLock, Mutex}; - - static ENV_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); - - struct EnvVarGuard { - key: &'static str, - original: Option, - } - - impl EnvVarGuard { - 
#[allow(unsafe_code)] - fn set(key: &'static str, value: &Path) -> Self { - let original = std::env::var(key).ok(); - // SAFETY: tests serialize environment mutation with ENV_LOCK. - unsafe { std::env::set_var(key, value) }; - Self { key, original } - } - } - - impl Drop for EnvVarGuard { - #[allow(unsafe_code)] - fn drop(&mut self) { - match self.original.as_deref() { - // SAFETY: tests serialize environment mutation with ENV_LOCK. - Some(value) => unsafe { std::env::set_var(self.key, value) }, - // SAFETY: tests serialize environment mutation with ENV_LOCK. - None => unsafe { std::env::remove_var(self.key) }, - } - } - } - - #[test] - fn complete_local_tls_paths_returns_none_when_bundle_absent() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let tmp = tempfile::tempdir().unwrap(); - let _guard = EnvVarGuard::set("OPENSHELL_LOCAL_TLS_DIR", tmp.path()); - - assert!(complete_local_tls_paths().unwrap().is_none()); - } - - #[test] - fn complete_local_tls_paths_rejects_partial_bundle() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let tmp = tempfile::tempdir().unwrap(); - let _guard = EnvVarGuard::set("OPENSHELL_LOCAL_TLS_DIR", tmp.path()); - std::fs::write(tmp.path().join("ca.crt"), "ca").unwrap(); - - let err = complete_local_tls_paths().unwrap_err(); - assert!(err.to_string().contains("partial local TLS state")); - } - - #[test] - fn complete_local_tls_paths_returns_full_bundle() { - let _lock = ENV_LOCK - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner); - let tmp = tempfile::tempdir().unwrap(); - let _guard = EnvVarGuard::set("OPENSHELL_LOCAL_TLS_DIR", tmp.path()); - std::fs::create_dir_all(tmp.path().join("server")).unwrap(); - std::fs::create_dir_all(tmp.path().join("client")).unwrap(); - for rel in [ - "ca.crt", - "server/tls.crt", - "server/tls.key", - "client/tls.crt", - "client/tls.key", - ] { - std::fs::write(tmp.path().join(rel), "pem").unwrap(); - } - - 
let paths = complete_local_tls_paths().unwrap().unwrap(); - assert_eq!(paths.ca, tmp.path().join("ca.crt")); - assert_eq!(paths.server_cert, tmp.path().join("server/tls.crt")); - assert_eq!(paths.client_key, tmp.path().join("client/tls.key")); - } -} diff --git a/crates/openshell-server/src/grpc/mod.rs b/crates/openshell-server/src/grpc/mod.rs index 8f70c20bb..db3d2350d 100644 --- a/crates/openshell-server/src/grpc/mod.rs +++ b/crates/openshell-server/src/grpc/mod.rs @@ -4,7 +4,7 @@ //! gRPC service implementation. pub mod policy; -pub mod provider; +mod provider; mod sandbox; mod service; mod validation; @@ -12,17 +12,15 @@ mod validation; use openshell_core::proto::{ ApproveAllDraftChunksRequest, ApproveAllDraftChunksResponse, ApproveDraftChunkRequest, ApproveDraftChunkResponse, AttachSandboxProviderRequest, AttachSandboxProviderResponse, - ClearDraftChunksRequest, ClearDraftChunksResponse, ConfigureProviderRefreshRequest, - ConfigureProviderRefreshResponse, CreateProviderRequest, CreateSandboxRequest, + ClearDraftChunksRequest, ClearDraftChunksResponse, CreateProviderRequest, CreateSandboxRequest, CreateSshSessionRequest, CreateSshSessionResponse, DeleteProviderProfileRequest, - DeleteProviderProfileResponse, DeleteProviderRefreshRequest, DeleteProviderRefreshResponse, - DeleteProviderRequest, DeleteProviderResponse, DeleteSandboxRequest, DeleteSandboxResponse, - DeleteServiceRequest, DeleteServiceResponse, DetachSandboxProviderRequest, - DetachSandboxProviderResponse, EditDraftChunkRequest, EditDraftChunkResponse, ExecSandboxEvent, - ExecSandboxInput, ExecSandboxRequest, ExposeServiceRequest, GatewayMessage, - GetDraftHistoryRequest, GetDraftHistoryResponse, GetDraftPolicyRequest, GetDraftPolicyResponse, - GetGatewayConfigRequest, GetGatewayConfigResponse, GetProviderProfileRequest, - GetProviderRefreshStatusRequest, GetProviderRefreshStatusResponse, GetProviderRequest, + DeleteProviderProfileResponse, DeleteProviderRequest, DeleteProviderResponse, + 
DeleteSandboxRequest, DeleteSandboxResponse, DeleteServiceRequest, DeleteServiceResponse, + DetachSandboxProviderRequest, DetachSandboxProviderResponse, EditDraftChunkRequest, + EditDraftChunkResponse, ExecSandboxEvent, ExecSandboxInput, ExecSandboxRequest, + ExposeServiceRequest, GatewayMessage, GetDraftHistoryRequest, GetDraftHistoryResponse, + GetDraftPolicyRequest, GetDraftPolicyResponse, GetGatewayConfigRequest, + GetGatewayConfigResponse, GetProviderProfileRequest, GetProviderRequest, GetSandboxConfigRequest, GetSandboxConfigResponse, GetSandboxLogsRequest, GetSandboxLogsResponse, GetSandboxPolicyStatusRequest, GetSandboxPolicyStatusResponse, GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, @@ -34,11 +32,11 @@ use openshell_core::proto::{ ListSandboxesResponse, ListServicesRequest, ListServicesResponse, ProviderProfileResponse, ProviderResponse, PushSandboxLogsRequest, PushSandboxLogsResponse, RejectDraftChunkRequest, RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, ReportPolicyStatusResponse, - RevokeSshSessionRequest, RevokeSshSessionResponse, RotateProviderCredentialRequest, - RotateProviderCredentialResponse, SandboxResponse, SandboxStreamEvent, ServiceEndpointResponse, - ServiceStatus, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, SupervisorMessage, - TcpForwardFrame, UndoDraftChunkRequest, UndoDraftChunkResponse, UpdateConfigRequest, - UpdateConfigResponse, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::OpenShell, + RevokeSshSessionRequest, RevokeSshSessionResponse, SandboxResponse, SandboxStreamEvent, + ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, + SubmitPolicyAnalysisResponse, SupervisorMessage, TcpForwardFrame, UndoDraftChunkRequest, + UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest, + WatchSandboxRequest, open_shell_server::OpenShell, }; use serde::{Deserialize, Serialize}; use 
std::collections::BTreeMap; @@ -67,29 +65,6 @@ pub fn clamp_limit(raw: u32, default: u32, max: u32) -> u32 { if raw == 0 { default } else { raw.min(max) } } -/// Map a `PersistenceError` to an appropriate gRPC `Status`. -/// -/// CAS conflicts (optimistic concurrency failures) are mapped to `ABORTED` -/// to signal that the client should retry with fresh data. Other persistence -/// errors are mapped to `INTERNAL`. -pub fn persistence_error_to_status( - err: crate::persistence::PersistenceError, - operation: &str, -) -> Status { - use crate::persistence::PersistenceError; - - match err { - PersistenceError::Conflict { - current_resource_version, - } => Status::aborted(format!( - "{} failed due to concurrent modification (current resource_version: {})", - operation, - current_resource_version.map_or_else(|| "unknown".to_string(), |v| v.to_string()) - )), - other => Status::internal(format!("{operation} failed: {other}")), - } -} - // --------------------------------------------------------------------------- // Field-level size limits (shared across submodules) // --------------------------------------------------------------------------- @@ -129,10 +104,6 @@ const MAX_PROVIDER_CONFIG_ENTRIES: usize = 64; struct StoredSettings { revision: u64, settings: BTreeMap, - /// Database `resource_version` for CAS. Not persisted in the JSON payload; - /// loaded from `ObjectRecord` and used for optimistic concurrency control. 
- #[serde(skip)] - resource_version: u64, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -393,34 +364,6 @@ impl OpenShell for OpenShellService { provider::handle_update_provider(&self.state, request).await } - async fn get_provider_refresh_status( - &self, - request: Request, - ) -> Result, Status> { - provider::handle_get_provider_refresh_status(&self.state, request).await - } - - async fn configure_provider_refresh( - &self, - request: Request, - ) -> Result, Status> { - provider::handle_configure_provider_refresh(&self.state, request).await - } - - async fn rotate_provider_credential( - &self, - request: Request, - ) -> Result, Status> { - provider::handle_rotate_provider_credential(&self.state, request).await - } - - async fn delete_provider_refresh( - &self, - request: Request, - ) -> Result, Status> { - provider::handle_delete_provider_refresh(&self.state, request).await - } - async fn delete_provider( &self, request: Request, diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 412febb96..c5ce4d435 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -76,6 +76,34 @@ const GLOBAL_POLICY_SANDBOX_ID: &str = "__global__"; /// Maximum number of optimistic retry attempts for policy version conflicts. 
const MERGE_RETRY_LIMIT: usize = 5; +fn emit_sandbox_policy_update_success() { + openshell_core::telemetry::emit_lifecycle("sandbox_policy", "update", "success"); +} + +fn should_emit_config_update_policy_telemetry(sandbox_caller: bool) -> bool { + !sandbox_caller +} + +fn emit_config_update_policy_success(sandbox_caller: bool) { + if should_emit_config_update_policy_telemetry(sandbox_caller) { + emit_sandbox_policy_update_success(); + } +} + +fn should_emit_full_policy_update_telemetry(sandbox_caller: bool, next_version: i64) -> bool { + !sandbox_caller && next_version > 1 +} + +fn emit_full_policy_update_success(sandbox_caller: bool, next_version: i64) { + if should_emit_full_policy_update_telemetry(sandbox_caller, next_version) { + emit_sandbox_policy_update_success(); + } +} + +fn emit_policy_decision_success(operation: &str, rule_count: u64) { + openshell_core::telemetry::emit_policy_decision(operation, "success", rule_count); +} + fn emit_gateway_policy_audit_log( sandbox_id: &str, sandbox_name: &str, @@ -522,12 +550,6 @@ pub(super) async fn compute_provider_env_revision( for key in credential_keys { hasher.update(key.as_bytes()); } - let mut expiry_keys: Vec<_> = provider.credential_expires_at_ms.keys().collect(); - expiry_keys.sort(); - for key in expiry_keys { - hasher.update(key.as_bytes()); - hasher.update(provider.credential_expires_at_ms[key].to_le_bytes()); - } } None => { hasher.update(b"missing"); @@ -631,22 +653,21 @@ pub(super) async fn handle_get_sandbox_provider_environment( let provider_names = spec.providers; let provider_env_revision = compute_provider_env_revision(state.store.as_ref(), &provider_names).await?; - let provider_environment = + let environment = super::provider::resolve_provider_environment(state.store.as_ref(), &provider_names) .await?; info!( sandbox_id = %sandbox_id, provider_count = provider_names.len(), - env_count = provider_environment.environment.len(), + env_count = environment.len(), provider_env_revision, 
"GetSandboxProviderEnvironment request completed successfully" ); Ok(Response::new(GetSandboxProviderEnvironmentResponse { - environment: provider_environment.environment, + environment, provider_env_revision, - credential_expires_at_ms: provider_environment.credential_expires_at_ms, })) } @@ -981,6 +1002,7 @@ pub(super) async fn handle_update_config( operation_count = merge_ops.len(), "UpdateConfig: merged incremental policy operations" ); + emit_config_update_policy_success(sandbox_caller); return Ok(Response::new(UpdateConfigResponse { version: u32::try_from(version).unwrap_or(0), @@ -1013,29 +1035,32 @@ pub(super) async fn handle_update_config( validate_static_fields_unchanged(baseline_policy, &new_policy)?; validate_policy_safety(&new_policy)?; } else { - // Backfill spec.policy using CAS (first-time policy discovery) let _sandbox_sync_guard = state.compute.sandbox_sync_guard().await; - let sandbox_id = sandbox.object_id().to_string(); - let new_policy_clone = new_policy.clone(); - state + let mut sandbox = state .store - .update_message_cas::( - &sandbox_id, - req.expected_resource_version, - |sandbox| { - if let Some(ref mut spec) = sandbox.spec - && spec.policy.is_none() - { - spec.policy = Some(new_policy_clone.clone()); - } - }, - ) + .get_message::(&sandbox_id) .await - .map_err(|e| super::persistence_error_to_status(e, "backfill spec.policy"))?; - info!( - sandbox_id = %sandbox_id, - "UpdateConfig: backfilled spec.policy from sandbox-discovered policy" - ); + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
+ .ok_or_else(|| Status::not_found("sandbox not found"))?; + let spec = sandbox + .spec + .as_mut() + .ok_or_else(|| Status::internal("sandbox has no spec"))?; + if let Some(baseline_policy) = spec.policy.as_ref() { + validate_static_fields_unchanged(baseline_policy, &new_policy)?; + validate_policy_safety(&new_policy)?; + } else { + spec.policy = Some(new_policy.clone()); + state + .store + .put_message(&sandbox) + .await + .map_err(|e| Status::internal(format!("backfill spec.policy failed: {e}")))?; + info!( + sandbox_id = %sandbox_id, + "UpdateConfig: backfilled spec.policy from sandbox-discovered policy" + ); + } } let latest = state @@ -1080,6 +1105,7 @@ pub(super) async fn handle_update_config( policy_hash = %hash, "UpdateConfig: new policy version persisted" ); + emit_full_policy_update_success(sandbox_caller, next_version); Ok(Response::new(UpdateConfigResponse { version: u32::try_from(next_version).unwrap_or(0), @@ -1232,19 +1258,11 @@ pub(super) async fn handle_report_policy_status( .store .supersede_older_policies(&req.sandbox_id, version) .await; - - // Update current_policy_version using CAS - // TODO: Accept expected_version from UpdateConfigRequest for proper client-driven CAS let _sandbox_sync_guard = state.compute.sandbox_sync_guard().await; - let version_to_set = req.version; - state - .store - .update_message_cas::(&req.sandbox_id, 0, |sandbox| { - sandbox.current_policy_version = version_to_set; - }) - .await - .map_err(|e| super::persistence_error_to_status(e, "update current_policy_version"))?; - + if let Ok(Some(mut sandbox)) = state.store.get_message::(&req.sandbox_id).await { + sandbox.current_policy_version = req.version; + let _ = state.store.put_message(&sandbox).await; + } state.sandbox_watch_bus.notify(&req.sandbox_id); } @@ -1359,6 +1377,22 @@ pub(super) async fn handle_submit_policy_analysis( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
.ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + for summary in &req.network_activity_summaries { + state + .telemetry + .record_network_activity(&sandbox_id, summary); + } + if req.proposed_chunks.is_empty() + && req.summaries.is_empty() + && !req.network_activity_summaries.is_empty() + { + return Ok(Response::new(SubmitPolicyAnalysisResponse { + accepted_chunks: 0, + rejected_chunks: 0, + rejection_reasons: Vec::new(), + accepted_chunk_ids: Vec::new(), + })); + } let current_version = state .store @@ -1608,6 +1642,8 @@ pub(super) async fn handle_approve_draft_chunk( policy_hash = %hash, "ApproveDraftChunk: rule merged successfully" ); + emit_sandbox_policy_update_success(); + emit_policy_decision_success("approve", 1); Ok(Response::new(ApproveDraftChunkResponse { policy_version: u32::try_from(version).unwrap_or(0), @@ -1677,6 +1713,7 @@ pub(super) async fn handle_reject_draft_chunk( version, &hash, ); + emit_sandbox_policy_update_success(); } let now_ms = current_time_ms(); @@ -1695,6 +1732,7 @@ pub(super) async fn handle_reject_draft_chunk( .map_err(|e| Status::internal(format!("update chunk status failed: {e}")))?; state.sandbox_watch_bus.notify(&sandbox_id); + emit_policy_decision_success("reject", 1); Ok(Response::new(RejectDraftChunkResponse {})) } @@ -1784,6 +1822,7 @@ pub(super) async fn handle_approve_all_draft_chunks( &last_hash, ); chunks_approved += 1; + emit_sandbox_policy_update_success(); } state.sandbox_watch_bus.notify(&sandbox_id); @@ -1806,6 +1845,7 @@ pub(super) async fn handle_approve_all_draft_chunks( policy_hash = %last_hash, "ApproveAllDraftChunks: bulk approval complete" ); + emit_policy_decision_success("approve_all", u64::from(chunks_approved)); Ok(Response::new(ApproveAllDraftChunksResponse { policy_version: u32::try_from(last_version).unwrap_or(0), @@ -1944,6 +1984,8 @@ pub(super) async fn handle_undo_draft_chunk( policy_hash = %hash, "UndoDraftChunk: rule removed, chunk 
reverted to pending" ); + emit_sandbox_policy_update_success(); + emit_policy_decision_success("undo", 1); Ok(Response::new(UndoDraftChunkResponse { policy_version: u32::try_from(version).unwrap_or(0), @@ -2679,11 +2721,8 @@ async fn load_settings_record( .await .map_err(|e| Status::internal(format!("fetch settings failed: {e}")))?; if let Some(record) = record { - let mut settings = serde_json::from_slice::(&record.payload) - .map_err(|e| Status::internal(format!("decode settings payload failed: {e}")))?; - // Populate resource_version from database record for CAS - settings.resource_version = record.resource_version; - Ok(settings) + serde_json::from_slice::(&record.payload) + .map_err(|e| Status::internal(format!("decode settings payload failed: {e}"))) } else { Ok(StoredSettings::default()) } @@ -2695,43 +2734,18 @@ async fn save_settings_record( name: &str, settings: &StoredSettings, ) -> Result<(), Status> { - use crate::persistence::WriteCondition; - let payload = serde_json::to_vec(settings) .map_err(|e| Status::internal(format!("encode settings payload failed: {e}")))?; - - let (id, condition) = if settings.resource_version == 0 { - // Create new settings (resource_version 0 means never persisted) - (uuid::Uuid::new_v4().to_string(), WriteCondition::MustCreate) - } else { - // Update existing with CAS on the version from when it was loaded - // Fetch the record to get the stable ID - let existing = store - .get_by_name(object_type, name) - .await - .map_err(|e| Status::internal(format!("fetch settings for CAS failed: {e}")))? 
- .ok_or_else(|| Status::not_found("settings disappeared since load"))?; - - ( - existing.id, - WriteCondition::MatchResourceVersion(settings.resource_version), - ) - }; - - // Single-attempt CAS write store - .put_if(object_type, &id, name, &payload, None, condition) + .put( + object_type, + &uuid::Uuid::new_v4().to_string(), + name, + &payload, + None, + ) .await - .map_err(|e| match e { - crate::persistence::PersistenceError::Conflict { .. } => { - Status::aborted("settings were modified concurrently; please retry") - } - crate::persistence::PersistenceError::UniqueViolation { .. } => { - Status::aborted("settings were created concurrently; please retry") - } - other => super::persistence_error_to_status(other, "persist settings"), - })?; - + .map_err(|e| Status::internal(format!("persist settings failed: {e}")))?; Ok(()) } @@ -2876,6 +2890,19 @@ mod tests { assert!(is_sandbox_caller(&req)); } + #[test] + fn sandbox_caller_policy_sync_does_not_emit_policy_update_telemetry() { + assert!(!should_emit_config_update_policy_telemetry(true)); + assert!(should_emit_config_update_policy_telemetry(false)); + } + + #[test] + fn first_policy_revision_does_not_emit_policy_update_telemetry() { + assert!(!should_emit_full_policy_update_telemetry(false, 1)); + assert!(!should_emit_full_policy_update_telemetry(true, 2)); + assert!(should_emit_full_policy_update_telemetry(false, 2)); + } + // ---- Sandbox without policy ---- #[tokio::test] @@ -2890,7 +2917,6 @@ mod tests { name: "no-policy-sandbox".to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -2916,13 +2942,11 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials: std::iter::once(("GITHUB_TOKEN".to_string(), "ghp-test".to_string())) .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), } } @@ 
-2959,7 +2983,6 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: Some(policy), @@ -2979,7 +3002,6 @@ mod tests { StoredSettingValue::Bool(true), )) .collect(), - ..Default::default() }; save_global_settings(state.store.as_ref(), &global_settings) .await @@ -3029,7 +3051,6 @@ mod tests { name: "generic".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), profile: Some(openshell_core::proto::ProviderProfile { id: "generic".to_string(), @@ -3071,7 +3092,6 @@ mod tests { name: "custom-api".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), profile: Some(openshell_core::proto::ProviderProfile { id: "custom-api".to_string(), @@ -3134,7 +3154,6 @@ mod tests { name: "custom-api".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), profile: Some(openshell_core::proto::ProviderProfile { id: "custom-api".to_string(), @@ -3561,7 +3580,6 @@ mod tests { Request::new(AttachSandboxProviderRequest { sandbox_name: "attach-lifecycle".to_string(), provider_name: "work-github".to_string(), - expected_resource_version: 0, }), ) .await @@ -3597,7 +3615,6 @@ mod tests { Request::new(DetachSandboxProviderRequest { sandbox_name: "attach-lifecycle".to_string(), provider_name: "work-github".to_string(), - expected_resource_version: 0, }), ) .await @@ -3721,7 +3738,6 @@ mod tests { Request::new(AttachSandboxProviderRequest { sandbox_name: "custom-attach-lifecycle".to_string(), provider_name: "work-custom".to_string(), - expected_resource_version: 0, }), ) .await @@ -3760,7 +3776,6 @@ mod tests { Request::new(DetachSandboxProviderRequest { sandbox_name: "custom-attach-lifecycle".to_string(), provider_name: "work-custom".to_string(), - expected_resource_version: 0, }), ) .await @@ -3824,7 +3839,6 @@ mod tests { name: "global-profile-sandbox".to_string(), created_at_ms: 1_000_000, 
labels: HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: Some(sandbox_policy), @@ -3866,7 +3880,6 @@ mod tests { ] .into_iter() .collect(), - ..Default::default() }; save_global_settings(state.store.as_ref(), &global_settings) .await @@ -3913,7 +3926,6 @@ mod tests { name: "backfill-sandbox".to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -3975,7 +3987,6 @@ mod tests { name: "draft-flow".to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -4186,7 +4197,6 @@ mod tests { name: sandbox_name.clone(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -4284,7 +4294,6 @@ mod tests { name: sandbox_name.clone(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -4398,7 +4407,6 @@ mod tests { name: sandbox_name.clone(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -4498,7 +4506,6 @@ mod tests { name: sandbox_name.clone(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -4604,7 +4611,6 @@ mod tests { name: "draft-owner".to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -4619,7 +4625,6 @@ mod tests { name: "draft-other".to_string(), created_at_ms: 1_000_001, labels: std::collections::HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -5355,7 +5360,6 @@ mod tests { revision: 1, settings: std::iter::once(("policy".to_string(), StoredSettingValue::Bytes(encoded))) .collect(), - 
..Default::default() }; let decoded = decode_policy_from_global_settings(&global) @@ -5438,7 +5442,6 @@ mod tests { ] .into_iter() .collect(), - ..Default::default() }; let sandbox = StoredSettings { revision: 1, @@ -5451,7 +5454,6 @@ mod tests { ] .into_iter() .collect(), - ..Default::default() }; let merged = merge_effective_settings(&global, &sandbox).unwrap(); @@ -5481,7 +5483,6 @@ mod tests { )] .into_iter() .collect(), - ..Default::default() }; let merged = merge_effective_settings(&global, &sandbox).unwrap(); @@ -5511,7 +5512,6 @@ mod tests { StoredSettingValue::Bytes("deadbeef".to_string()), )) .collect(), - ..Default::default() }; let sandbox = StoredSettings { revision: 1, @@ -5520,7 +5520,6 @@ mod tests { StoredSettingValue::Bytes("cafebabe".to_string()), )) .collect(), - ..Default::default() }; let merged = merge_effective_settings(&global, &sandbox).unwrap(); @@ -5813,53 +5812,24 @@ mod tests { .settings .insert(format!("key_{i}"), StoredSettingValue::Int(i as i64)); settings.revision = settings.revision.wrapping_add(1); - save_global_settings(&store, &settings).await + save_global_settings(&store, &settings).await.unwrap(); })); } - let mut succeeded = 0; - let mut cas_conflicts = 0; for h in handles { - match h.await.unwrap() { - Ok(()) => succeeded += 1, - Err(e) if e.code() == Code::Aborted => cas_conflicts += 1, - Err(e) => panic!("unexpected error: {e}"), - } + h.await.unwrap(); } let final_settings = load_global_settings(&store).await.unwrap(); - - // With single-attempt CAS (no retry), concurrent modifications are properly detected: - // - All tasks read initial state (revision=0, resource_version=0) - // - First write succeeds with resource_version=1 - // - Subsequent writes fail with ABORTED (CAS conflict) because they all have stale resource_version=0 - // - Only the first write succeeds; all others are rejected - // - // This demonstrates that single-attempt CAS prevents lost writes by rejecting stale updates. 
- // The caller must retry from a fresh read to incorporate concurrent changes. - assert!( - cas_conflicts > 0, - "most concurrent writes should fail with CAS conflict (succeeded={succeeded}, conflicts={cas_conflicts})" - ); - assert!( - succeeded < n, - "not all writes should succeed due to conflicts (succeeded={succeeded}, total={n})" - ); - assert_eq!( - final_settings.revision as usize, succeeded, - "final revision should match number of successful writes" - ); - assert_eq!( - final_settings.settings.len(), - succeeded, - "final settings should contain exactly the keys from successful writes" - ); - - eprintln!( - "unlocked CAS test: {succeeded} succeeded, {cas_conflicts} CAS conflicts, \ - final revision={} (matches succeeded count, demonstrating proper conflict detection)", - final_settings.revision - ); + let lost = (n as u64).saturating_sub(final_settings.revision); + if lost == 0 { + eprintln!( + "note: no lost writes detected in unlocked test (sequential scheduling); \ + the locked test is the authoritative correctness check" + ); + } else { + eprintln!("unlocked test: {lost} lost writes out of {n} (expected behavior)"); + } } // ---- Conflict guard tests ---- @@ -5906,7 +5876,6 @@ mod tests { .await .unwrap(); - // Create initial global settings let mut global = StoredSettings::default(); global.settings.insert( "log_level".to_string(), @@ -5918,8 +5887,6 @@ mod tests { let loaded = load_global_settings(&store).await.unwrap(); assert!(loaded.settings.contains_key("log_level")); - // Load fresh to get current resource_version before updating - let mut global = load_global_settings(&store).await.unwrap(); global.settings.remove("log_level"); global.revision = 2; save_global_settings(&store, &global).await.unwrap(); @@ -5963,330 +5930,4 @@ mod tests { assert_eq!(err.code(), Code::InvalidArgument); assert!(err.message().contains("unknown setting key")); } - - #[tokio::test] - async fn save_settings_detects_concurrent_modification() { - let store = 
Store::connect("sqlite::memory:").await.unwrap(); - - // Create initial settings - let mut settings = StoredSettings { - revision: 1, - settings: std::iter::once(( - "initial_key".to_string(), - StoredSettingValue::String("initial_value".to_string()), - )) - .collect(), - ..Default::default() - }; - save_global_settings(&store, &settings).await.unwrap(); - - // Load settings (simulating first client read) - let loaded = load_global_settings(&store).await.unwrap(); - assert_eq!(loaded.revision, 1); - - // Simulate concurrent modification: another client updates the settings - let mut concurrent_update = loaded.clone(); - concurrent_update.settings.insert( - "concurrent_key".to_string(), - StoredSettingValue::String("concurrent_value".to_string()), - ); - concurrent_update.revision = 2; - save_global_settings(&store, &concurrent_update) - .await - .unwrap(); - - // Now attempt to save our original modification (which is based on stale revision 1) - settings.settings.insert( - "our_key".to_string(), - StoredSettingValue::String("our_value".to_string()), - ); - settings.revision = 2; // We think we're updating to revision 2 - - let result = save_global_settings(&store, &settings).await; - - // Should fail with ABORTED due to concurrent modification - assert!(result.is_err(), "save with stale revision should fail"); - let err = result.unwrap_err(); - assert_eq!( - err.code(), - Code::Aborted, - "should fail with ABORTED due to version mismatch" - ); - assert!( - err.message().contains("concurrently"), - "error should mention concurrent modification: {}", - err.message() - ); - - // Verify the database contains the concurrent update, not our stale update - let final_settings = load_global_settings(&store).await.unwrap(); - assert_eq!(final_settings.revision, 2); - assert!( - final_settings.settings.contains_key("concurrent_key"), - "concurrent update should be preserved" - ); - assert!( - !final_settings.settings.contains_key("our_key"), - "stale update should NOT be in 
database" - ); - } - - // ---- CAS (Client-driven optimistic concurrency) tests for UpdateConfig ---- - // These test the policy backfill path where spec.policy is None and UpdateConfig - // uses update_message_cas to atomically set it. - - #[tokio::test] - async fn update_config_policy_backfill_cas_succeeds_with_correct_version() { - use openshell_core::proto::{SandboxPhase, SandboxSpec}; - - let state = test_server_state().await; - - // Create a sandbox WITHOUT a policy (spec.policy = None) - // This simulates a sandbox before the supervisor has discovered and synced a policy - let sandbox = Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sb-1".to_string(), - name: "test-sandbox".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - policy: None, // No policy yet - will be backfilled - providers: Vec::new(), - ..Default::default() - }), - phase: SandboxPhase::Provisioning as i32, - ..Default::default() - }; - state.store.put_message(&sandbox).await.unwrap(); - - // Fetch the sandbox to get its current resource_version - let current = state - .store - .get_message_by_name::("test-sandbox") - .await - .unwrap() - .unwrap(); - let current_version = current.metadata.as_ref().unwrap().resource_version; - - // Backfill the policy with correct expected_resource_version - let new_policy = ProtoSandboxPolicy::default(); - - let response = handle_update_config( - &state, - Request::new(UpdateConfigRequest { - name: "test-sandbox".to_string(), - policy: Some(new_policy), - setting_key: String::new(), - setting_value: None, - delete_setting: false, - global: false, - merge_operations: vec![], - expected_resource_version: current_version, - }), - ) - .await - .unwrap() - .into_inner(); - - // UpdateConfigResponse contains the policy version - assert_eq!(response.version, 1); - - // Verify the resource_version incremented and policy was backfilled - let updated_sandbox = 
state - .store - .get_message_by_name::("test-sandbox") - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_sandbox.metadata.as_ref().unwrap().resource_version, - current_version + 1, - "resource_version should increment during CAS backfill" - ); - assert!( - updated_sandbox.spec.as_ref().unwrap().policy.is_some(), - "policy should be backfilled" - ); - } - - #[tokio::test] - async fn update_config_policy_backfill_cas_rejects_stale_version() { - use openshell_core::proto::{SandboxPhase, SandboxSpec}; - - let state = test_server_state().await; - - // Create a sandbox WITHOUT a policy - let sandbox = Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sb-1".to_string(), - name: "test-sandbox".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - policy: None, - providers: Vec::new(), - ..Default::default() - }), - phase: SandboxPhase::Provisioning as i32, - ..Default::default() - }; - state.store.put_message(&sandbox).await.unwrap(); - - // Get current version - let current = state - .store - .get_message_by_name::("test-sandbox") - .await - .unwrap() - .unwrap(); - let current_version = current.metadata.as_ref().unwrap().resource_version; - - // Try to backfill with a stale version - let new_policy = ProtoSandboxPolicy::default(); - - let err = handle_update_config( - &state, - Request::new(UpdateConfigRequest { - name: "test-sandbox".to_string(), - policy: Some(new_policy), - setting_key: String::new(), - setting_value: None, - delete_setting: false, - global: false, - merge_operations: vec![], - expected_resource_version: 99, // stale version - }), - ) - .await - .unwrap_err(); - - // Should get ABORTED status for CAS conflict - assert_eq!(err.code(), Code::Aborted); - assert!( - err.message().contains("modified concurrently") - || err.message().contains("resource_version"), - "error message should mention concurrency conflict: {}", - err.message() - ); - 
- // Verify the sandbox was not modified (policy still None) - let unchanged = state - .store - .get_message_by_name::("test-sandbox") - .await - .unwrap() - .unwrap(); - assert_eq!( - unchanged.metadata.as_ref().unwrap().resource_version, - current_version, - "resource_version should not change when CAS fails" - ); - assert!( - unchanged.spec.as_ref().unwrap().policy.is_none(), - "policy should still be None after failed backfill" - ); - } - - #[tokio::test] - async fn update_config_policy_backfill_concurrent_with_stale_versions() { - use openshell_core::proto::{SandboxPhase, SandboxSpec}; - use std::sync::Arc; - - let state = Arc::new(test_server_state().await); - - // Create a sandbox WITHOUT a policy - let sandbox = Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sb-1".to_string(), - name: "test-sandbox".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - policy: None, - providers: Vec::new(), - ..Default::default() - }), - phase: SandboxPhase::Provisioning as i32, - ..Default::default() - }; - state.store.put_message(&sandbox).await.unwrap(); - - // All three clients fetch the sandbox and see the same version - let initial = state - .store - .get_message_by_name::("test-sandbox") - .await - .unwrap() - .unwrap(); - let initial_version = initial.metadata.as_ref().unwrap().resource_version; - - // Launch 3 concurrent policy backfill attempts, all using the same initial version - let mut handles = vec![]; - for _i in 0..3 { - let state_clone = Arc::clone(&state); - let new_policy = ProtoSandboxPolicy::default(); - - let handle = tokio::spawn(async move { - handle_update_config( - &state_clone, - Request::new(UpdateConfigRequest { - name: "test-sandbox".to_string(), - policy: Some(new_policy), - setting_key: String::new(), - setting_value: None, - delete_setting: false, - global: false, - merge_operations: vec![], - expected_resource_version: 
initial_version, - }), - ) - .await - }); - handles.push(handle); - } - - let results: Vec<_> = futures::future::join_all(handles) - .await - .into_iter() - .map(|r| r.unwrap()) - .collect(); - - // Only one should succeed; others should get ABORTED - let successes = results.iter().filter(|r| r.is_ok()).count(); - let aborted_conflicts = results - .iter() - .filter(|r| r.as_ref().err().is_some_and(|e| e.code() == Code::Aborted)) - .count(); - - assert_eq!( - successes, 1, - "exactly one backfill should succeed with client-driven CAS" - ); - assert_eq!( - aborted_conflicts, 2, - "two backfills should fail with ABORTED due to stale version" - ); - - // Final sandbox should have resource_version = initial_version + 1 and policy backfilled - let final_sandbox = state - .store - .get_message_by_name::("test-sandbox") - .await - .unwrap() - .unwrap(); - assert_eq!( - final_sandbox.metadata.as_ref().unwrap().resource_version, - initial_version + 1 - ); - assert!( - final_sandbox.spec.as_ref().unwrap().policy.is_some(), - "policy should be backfilled after one success" - ); - } } diff --git a/crates/openshell-server/src/grpc/provider.rs b/crates/openshell-server/src/grpc/provider.rs index cd85e31b1..a13fa3fe0 100644 --- a/crates/openshell-server/src/grpc/provider.rs +++ b/crates/openshell-server/src/grpc/provider.rs @@ -5,18 +5,14 @@ #![allow(clippy::result_large_err)] // gRPC handlers return Result, Status> -use crate::persistence::{ - ObjectId, ObjectLabels, ObjectName, ObjectType, Store, WriteCondition, generate_name, -}; +use crate::persistence::{ObjectName, ObjectType, Store, generate_name}; use openshell_core::proto::{Provider, Sandbox}; use prost::Message; use tonic::Status; use tracing::warn; use super::validation::validate_provider_fields; -use super::{ - MAX_MAP_KEY_LEN, MAX_MAP_VALUE_LEN, MAX_PAGE_SIZE, MAX_PROVIDER_CONFIG_ENTRIES, clamp_limit, -}; +use super::{MAX_PAGE_SIZE, clamp_limit}; // 
--------------------------------------------------------------------------- // CRUD helpers @@ -33,29 +29,6 @@ fn redact_provider_credentials(mut provider: Provider) -> Provider { provider } -#[derive(Debug, Clone, Default, PartialEq, Eq)] -pub(super) struct ProviderEnvironment { - pub environment: std::collections::HashMap, - pub credential_expires_at_ms: std::collections::HashMap, -} - -impl ProviderEnvironment { - #[cfg(test)] - fn is_empty(&self) -> bool { - self.environment.is_empty() - } - - #[cfg(test)] - fn get(&self, key: &str) -> Option<&String> { - self.environment.get(key) - } - - #[cfg(test)] - fn contains_key(&self, key: &str) -> bool { - self.environment.contains_key(key) - } -} - pub(super) async fn create_provider_record( store: &Store, mut provider: Provider, @@ -70,7 +43,6 @@ pub(super) async fn create_provider_record( name: generate_name(), created_at_ms: now_ms, labels: std::collections::HashMap::new(), - resource_version: 0, }); } @@ -90,9 +62,7 @@ pub(super) async fn create_provider_record( if provider.r#type.trim().is_empty() { return Err(Status::invalid_argument("provider.type is required")); } - if provider.credentials.is_empty() - && !provider_type_allows_empty_credentials_for_refresh(store, &provider.r#type).await? - { + if provider.credentials.is_empty() { return Err(Status::invalid_argument( "provider.credentials must not be empty", )); @@ -101,39 +71,20 @@ pub(super) async fn create_provider_record( // Validate field sizes before any I/O. 
validate_provider_fields(&provider)?; - // Generate UUID for database row and update metadata.id to match - let provider_id = uuid::Uuid::new_v4().to_string(); - let mut provider = provider; - if let Some(metadata) = provider.metadata.as_mut() { - metadata.id.clone_from(&provider_id); - } - - // Create with MustCreate condition to prevent duplicate creation race - let result = store - .put_if( - Provider::object_type(), - &provider_id, - provider.object_name(), - &provider.encode_to_vec(), - None, - WriteCondition::MustCreate, - ) + let existing = store + .get_message_by_name::(provider.object_name()) .await - .map_err(|e| { - if matches!( - e, - crate::persistence::PersistenceError::UniqueViolation { .. } - ) { - Status::already_exists("provider already exists") - } else { - Status::internal(format!("persist provider failed: {e}")) - } - })?; + .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))?; - if let Some(metadata) = provider.metadata.as_mut() { - metadata.resource_version = result.resource_version; + if existing.is_some() { + return Err(Status::already_exists("provider already exists")); } + store + .put_message(&provider) + .await + .map_err(|e| Status::internal(format!("persist provider failed: {e}")))?; + Ok(redact_provider_credentials(provider)) } @@ -155,31 +106,31 @@ pub(super) async fn list_provider_records( limit: u32, offset: u32, ) -> Result, Status> { - let providers: Vec = store - .list_messages(limit, offset) + let records = store + .list(Provider::object_type(), limit, offset) .await .map_err(|e| Status::internal(format!("list providers failed: {e}")))?; - Ok(providers - .into_iter() - .map(redact_provider_credentials) - .collect()) + let mut providers = Vec::with_capacity(records.len()); + for record in records { + let provider = Provider::decode(record.payload.as_slice()) + .map_err(|e| Status::internal(format!("decode provider failed: {e}")))?; + providers.push(redact_provider_credentials(provider)); + } + + Ok(providers) 
} pub(super) async fn update_provider_record( store: &Store, provider: Provider, ) -> Result { - use crate::persistence::{ObjectId, ObjectName}; + use crate::persistence::ObjectName; if provider.object_name().is_empty() { return Err(Status::invalid_argument("provider.name is required")); } - // Extract expected version from provider metadata - let expected_resource_version = provider.metadata.as_ref().map_or(0, |m| m.resource_version); - - // Resolve provider ID from name for CAS update let existing = store .get_message_by_name::(provider.object_name()) .await @@ -198,75 +149,24 @@ pub(super) async fn update_provider_record( )); } - let current_version = existing.metadata.as_ref().map_or(0, |m| m.resource_version); - - let cas_version = if expected_resource_version == 0 { - current_version - } else { - expected_resource_version + let updated = Provider { + metadata: existing.metadata, + r#type: existing.r#type, + credentials: merge_map(existing.credentials, provider.credentials), + config: merge_map(existing.config, provider.config), }; - // Apply merge to create candidate - let mut candidate = existing.clone(); - candidate.credentials = merge_map(candidate.credentials, provider.credentials); - candidate.config = merge_map(candidate.config, provider.config); - candidate.credential_expires_at_ms = merge_i64_map( - candidate.credential_expires_at_ms, - provider.credential_expires_at_ms, - ); + // Ensure metadata is valid (defense in depth - existing.metadata should always be valid) + super::validation::validate_object_metadata(updated.metadata.as_ref(), "provider")?; - // Validate BEFORE writing to prevent persisting invalid state - super::validation::validate_object_metadata(candidate.metadata.as_ref(), "provider")?; - validate_provider_fields(&candidate)?; - validate_provider_update_against_attached_sandboxes(store, &candidate).await?; - - // Serialize labels for storage - let labels_map = candidate.object_labels(); - let labels_json = if labels_map - .as_ref() - 
.is_none_or(std::collections::HashMap::is_empty) - { - None - } else { - Some( - serde_json::to_string(&labels_map) - .map_err(|e| Status::internal(format!("serialize labels failed: {e}")))?, - ) - }; + validate_provider_fields(&updated)?; - // Write validated candidate with CAS condition - let result = store - .put_if( - Provider::object_type(), - candidate.object_id(), - candidate.object_name(), - &candidate.encode_to_vec(), - labels_json.as_deref(), - WriteCondition::MatchResourceVersion(cas_version), - ) + store + .put_message(&updated) .await - .map_err(|e| { - if matches!(e, crate::persistence::PersistenceError::Conflict { .. }) { - Status::aborted(format!( - "provider was modified concurrently (current resource_version: {})", - match e { - crate::persistence::PersistenceError::Conflict { - current_resource_version, - } => current_resource_version.unwrap_or(0), - _ => 0, - } - )) - } else { - Status::internal(format!("update provider failed: {e}")) - } - })?; + .map_err(|e| Status::internal(format!("persist provider failed: {e}")))?; - // Update resource_version from successful write - if let Some(metadata) = candidate.metadata.as_mut() { - metadata.resource_version = result.resource_version; - } - - Ok(redact_provider_credentials(candidate)) + Ok(redact_provider_credentials(updated)) } pub(super) async fn delete_provider_record(store: &Store, name: &str) -> Result { @@ -274,14 +174,6 @@ pub(super) async fn delete_provider_record(store: &Store, name: &str) -> Result< return Err(Status::invalid_argument("name is required")); } - let Some(provider) = store - .get_message_by_name::(name) - .await - .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))? 
- else { - return Ok(false); - }; - let blocking_sandboxes = sandboxes_using_provider(store, name).await?; if !blocking_sandboxes.is_empty() { return Err(Status::failed_precondition(format!( @@ -290,9 +182,6 @@ pub(super) async fn delete_provider_record(store: &Store, name: &str) -> Result< ))); } - crate::provider_refresh::delete_refresh_states_for_provider(store, provider.object_id()) - .await?; - store .delete_by_name(Provider::object_type(), name) .await @@ -336,41 +225,6 @@ async fn sandboxes_using_provider( Ok(blocking) } -async fn sandboxes_using_provider_records( - store: &Store, - provider_name: &str, -) -> Result, Status> { - let mut sandboxes = Vec::new(); - let mut offset = 0; - loop { - let records = store - .list(Sandbox::object_type(), 1000, offset) - .await - .map_err(|e| Status::internal(format!("list sandboxes failed: {e}")))?; - if records.is_empty() { - break; - } - offset = offset - .checked_add( - u32::try_from(records.len()) - .map_err(|_| Status::internal("sandbox page size exceeded u32"))?, - ) - .ok_or_else(|| Status::internal("sandbox pagination offset overflow"))?; - - for record in records { - let sandbox = Sandbox::decode(record.payload.as_slice()) - .map_err(|e| Status::internal(format!("decode sandbox failed: {e}")))?; - let Some(spec) = sandbox.spec.as_ref() else { - continue; - }; - if spec.providers.iter().any(|name| name == provider_name) { - sandboxes.push(sandbox); - } - } - } - Ok(sandboxes) -} - /// Merge an incoming map into an existing map. /// /// - If `incoming` is empty, return `existing` unchanged (no-op). 
@@ -393,23 +247,6 @@ fn merge_map( existing } -fn merge_i64_map( - mut existing: std::collections::HashMap, - incoming: std::collections::HashMap, -) -> std::collections::HashMap { - if incoming.is_empty() { - return existing; - } - for (key, value) in incoming { - if value <= 0 { - existing.remove(&key); - } else { - existing.insert(key, value); - } - } - existing -} - // --------------------------------------------------------------------------- // Provider environment resolution // --------------------------------------------------------------------------- @@ -418,20 +255,17 @@ fn merge_i64_map( /// /// For each provider name in the list, fetches the provider from the store and /// collects credential key-value pairs. Returns a map of environment variables -/// to inject into the sandbox. Credential keys must be unique across attached -/// providers so one provider cannot silently overwrite another provider's token. +/// to inject into the sandbox. When duplicate keys appear across providers, the +/// first provider's value wins. 
pub(super) async fn resolve_provider_environment( store: &Store, provider_names: &[String], -) -> Result { +) -> Result, Status> { if provider_names.is_empty() { - return Ok(ProviderEnvironment::default()); + return Ok(std::collections::HashMap::new()); } let mut env = std::collections::HashMap::new(); - let mut expires = std::collections::HashMap::new(); - let now_ms = crate::persistence::current_time_ms(); - validate_provider_environment_keys_unique_at(store, provider_names, None, now_ms).await?; for name in provider_names { let provider = store @@ -442,23 +276,6 @@ pub(super) async fn resolve_provider_environment( for (key, value) in &provider.credentials { if is_valid_env_key(key) { - let expires_at_ms = provider - .credential_expires_at_ms - .get(key) - .copied() - .unwrap_or_default(); - if expires_at_ms > 0 && expires_at_ms <= now_ms { - warn!( - provider_name = %name, - key = %key, - expires_at_ms, - "skipping expired provider credential" - ); - continue; - } - if expires_at_ms > 0 { - expires.entry(key.clone()).or_insert(expires_at_ms); - } env.entry(key.clone()).or_insert_with(|| value.clone()); } else { warn!( @@ -470,133 +287,7 @@ pub(super) async fn resolve_provider_environment( } } - Ok(ProviderEnvironment { - environment: env, - credential_expires_at_ms: expires, - }) -} - -pub async fn validate_provider_environment_keys_unique( - store: &Store, - provider_names: &[String], -) -> Result<(), Status> { - validate_provider_environment_keys_unique_at( - store, - provider_names, - None, - crate::persistence::current_time_ms(), - ) - .await -} - -pub async fn validate_provider_credential_key_available_for_attached_sandboxes( - store: &Store, - provider: &Provider, - credential_key: &str, -) -> Result<(), Status> { - let mut candidate = provider.clone(); - candidate - .credentials - .entry(credential_key.to_string()) - .or_insert_with(|| "pending".to_string()); - candidate.credential_expires_at_ms.remove(credential_key); - 
validate_provider_update_against_attached_sandboxes(store, &candidate).await -} - -pub async fn validate_provider_update_against_attached_sandboxes( - store: &Store, - provider: &Provider, -) -> Result<(), Status> { - let provider_name = provider.object_name().to_string(); - for sandbox in sandboxes_using_provider_records(store, &provider_name).await? { - let sandbox_name = sandbox.object_name().to_string(); - let Some(spec) = sandbox.spec.as_ref() else { - continue; - }; - validate_provider_environment_keys_unique_at( - store, - &spec.providers, - Some(provider), - crate::persistence::current_time_ms(), - ) - .await - .map_err(|err| { - Status::failed_precondition(format!( - "provider update would create credential env key conflict on sandbox '{sandbox_name}': {}", - err.message() - )) - })?; - } - Ok(()) -} - -async fn validate_provider_environment_keys_unique_at( - store: &Store, - provider_names: &[String], - candidate_provider: Option<&Provider>, - now_ms: i64, -) -> Result<(), Status> { - let mut seen = std::collections::HashMap::::new(); - for name in provider_names { - let provider = match candidate_provider { - Some(candidate) if candidate.object_name() == name.as_str() => candidate.clone(), - _ => store - .get_message_by_name::(name) - .await - .map_err(|e| Status::internal(format!("failed to fetch provider '{name}': {e}")))? - .ok_or_else(|| { - Status::failed_precondition(format!("provider '{name}' not found")) - })?, - }; - let provider_name = provider.object_name().to_string(); - for key in active_provider_environment_keys(store, &provider, now_ms).await? 
{ - if let Some(first_provider) = seen.get(&key) { - if first_provider != &provider_name { - return Err(Status::failed_precondition(format!( - "credential env key '{key}' is provided by both provider '{first_provider}' and provider '{provider_name}'; use provider-specific env names" - ))); - } - } else { - seen.insert(key, provider_name.clone()); - } - } - } - Ok(()) -} - -async fn active_provider_environment_keys( - store: &Store, - provider: &Provider, - now_ms: i64, -) -> Result, Status> { - let mut keys = active_provider_credential_keys(provider, now_ms); - if !provider.object_id().is_empty() { - keys.extend( - crate::provider_refresh::list_refresh_states_for_provider(store, provider.object_id()) - .await? - .into_iter() - .map(|state| state.credential_key) - .filter(|key| is_valid_env_key(key)), - ); - } - keys.sort(); - keys.dedup(); - Ok(keys) -} - -fn active_provider_credential_keys(provider: &Provider, now_ms: i64) -> Vec { - provider - .credentials - .keys() - .filter(|key| is_valid_env_key(key)) - .filter(|key| { - provider - .credential_expires_at_ms - .get(*key) - .is_none_or(|expires_at_ms| *expires_at_ms <= 0 || *expires_at_ms > now_ms) - }) - .cloned() - .collect() + Ok(env) } pub(super) fn is_valid_env_key(key: &str) -> bool { @@ -626,21 +317,17 @@ impl ObjectType for Provider { use crate::ServerState; use openshell_core::proto::{ - ConfigureProviderRefreshRequest, ConfigureProviderRefreshResponse, CreateProviderRequest, - DeleteProviderProfileRequest, DeleteProviderProfileResponse, DeleteProviderRefreshRequest, - DeleteProviderRefreshResponse, DeleteProviderRequest, DeleteProviderResponse, - GetProviderProfileRequest, GetProviderRefreshStatusRequest, GetProviderRefreshStatusResponse, - GetProviderRequest, ImportProviderProfilesRequest, ImportProviderProfilesResponse, - LintProviderProfilesRequest, LintProviderProfilesResponse, ListProviderProfilesRequest, - ListProviderProfilesResponse, ListProvidersRequest, ListProvidersResponse, - 
ProviderCredentialRefreshStrategy, ProviderProfile, ProviderProfileDiagnostic, - ProviderProfileImportItem, ProviderProfileResponse, ProviderResponse, - RotateProviderCredentialRequest, RotateProviderCredentialResponse, StoredProviderProfile, + CreateProviderRequest, DeleteProviderProfileRequest, DeleteProviderProfileResponse, + DeleteProviderRequest, DeleteProviderResponse, GetProviderProfileRequest, GetProviderRequest, + ImportProviderProfilesRequest, ImportProviderProfilesResponse, LintProviderProfilesRequest, + LintProviderProfilesResponse, ListProviderProfilesRequest, ListProviderProfilesResponse, + ListProvidersRequest, ListProvidersResponse, ProviderProfile, ProviderProfileDiagnostic, + ProviderProfileImportItem, ProviderProfileResponse, ProviderResponse, StoredProviderProfile, UpdateProviderRequest, }; use openshell_providers::{ - CredentialRefreshProfile, ProfileValidationDiagnostic, ProviderTypeProfile, default_profiles, - get_default_profile, normalize_profile_id, validate_profile_set, + ProfileValidationDiagnostic, ProviderTypeProfile, default_profiles, get_default_profile, + normalize_profile_id, normalize_provider_type, validate_profile_set, }; use std::sync::Arc; use tonic::{Request, Response}; @@ -654,6 +341,7 @@ pub(super) async fn handle_create_provider( .provider .ok_or_else(|| Status::invalid_argument("provider is required"))?; let provider = create_provider_record(state.store.as_ref(), provider).await?; + emit_provider_lifecycle(&provider.r#type, "create", "success"); Ok(Response::new(ProviderResponse { provider: Some(provider), @@ -747,14 +435,7 @@ pub(super) async fn handle_import_provider_profiles( let stored = stored_provider_profile(profile.to_proto()); state .store - .put_if( - StoredProviderProfile::object_type(), - stored.object_id(), - stored.object_name(), - &stored.encode_to_vec(), - None, - WriteCondition::MustCreate, - ) + .put_message(&stored) .await .map_err(|e| Status::internal(format!("persist provider profile failed: {e}")))?; 
imported.push(stored.profile.unwrap_or_default()); @@ -841,72 +522,6 @@ pub(super) async fn get_provider_type_profile( Ok(profile) } -async fn provider_refresh_defaults( - store: &Store, - provider: &Provider, - credential_key: &str, -) -> Result, Status> { - let Some(profile) = get_provider_type_profile(store, &provider.r#type).await? else { - return Ok(None); - }; - Ok(profile - .credentials - .iter() - .find(|credential| { - credential.name == credential_key - || credential - .env_vars - .iter() - .any(|env_var| env_var == credential_key) - }) - .and_then(|credential| credential.refresh.clone())) -} - -fn validate_refresh_material( - material: &std::collections::HashMap, - refresh_defaults: Option<&CredentialRefreshProfile>, -) -> Result<(), Status> { - let Some(refresh_defaults) = refresh_defaults else { - return Ok(()); - }; - for required in refresh_defaults - .material - .iter() - .filter(|item| item.required) - { - if material - .get(&required.name) - .is_none_or(|value| value.trim().is_empty()) - { - return Err(Status::invalid_argument(format!( - "{} material is required by the provider profile", - required.name - ))); - } - } - Ok(()) -} - -async fn provider_type_allows_empty_credentials_for_refresh( - store: &Store, - provider_type: &str, -) -> Result { - let Some(profile) = get_provider_type_profile(store, provider_type).await? 
else { - return Ok(false); - }; - let required_credentials = profile - .credentials - .iter() - .filter(|credential| credential.required) - .collect::>(); - Ok(!required_credentials.is_empty() - && required_credentials.iter().all(|credential| { - credential.refresh.as_ref().is_some_and(|refresh| { - crate::provider_refresh::is_gateway_mintable_strategy(refresh.strategy) - }) - })) -} - async fn merged_provider_profiles(store: &Store) -> Result, Status> { let mut profiles = default_profiles().to_vec(); profiles.extend( @@ -920,10 +535,17 @@ async fn merged_provider_profiles(store: &Store) -> Result Result, Status> { - let profiles: Vec = store - .list_messages(10_000, 0) + let records = store + .list(StoredProviderProfile::object_type(), 10_000, 0) .await .map_err(|e| Status::internal(format!("list provider profiles failed: {e}")))?; + + let mut profiles = Vec::with_capacity(records.len()); + for record in records { + let profile = StoredProviderProfile::decode(record.payload.as_slice()) + .map_err(|e| Status::internal(format!("decode provider profile failed: {e}")))?; + profiles.push(profile); + } Ok(profiles) } @@ -995,6 +617,18 @@ async fn profile_conflict_diagnostics( }); continue; } + if let Some(provider_type) = normalize_provider_type(&id) { + diagnostics.push(ProfileValidationDiagnostic { + source: source.clone(), + profile_id: id.clone(), + field: "id".to_string(), + message: format!( + "provider profile id '{id}' is reserved for legacy provider type '{provider_type}'" + ), + severity: "error".to_string(), + }); + continue; + } if store .get_message_by_name::(&id) .await @@ -1022,7 +656,6 @@ fn stored_provider_profile(profile: ProviderProfile) -> StoredProviderProfile { name: profile.id.clone(), created_at_ms: now_ms, labels: std::collections::HashMap::new(), - resource_version: 0, }), profile: Some(profile), } @@ -1093,358 +726,50 @@ pub(super) async fn handle_update_provider( request: Request, ) -> Result, Status> { let req = request.into_inner(); - let 
mut provider = req + let provider = req .provider .ok_or_else(|| Status::invalid_argument("provider is required"))?; - provider - .credential_expires_at_ms - .extend(req.credential_expires_at_ms); let provider = update_provider_record(state.store.as_ref(), provider).await?; + emit_provider_lifecycle(&provider.r#type, "update", "success"); Ok(Response::new(ProviderResponse { provider: Some(provider), })) } -pub(super) async fn handle_get_provider_refresh_status( - state: &Arc, - request: Request, -) -> Result, Status> { - let request = request.into_inner(); - if request.provider.trim().is_empty() { - return Err(Status::invalid_argument("provider is required")); - } - let provider = state - .store - .get_message_by_name::(&request.provider) - .await - .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))? - .ok_or_else(|| Status::not_found("provider not found"))?; - - let states = if request.credential_key.trim().is_empty() { - crate::provider_refresh::list_refresh_states_for_provider( - state.store.as_ref(), - provider.object_id(), - ) - .await? - } else { - crate::provider_refresh::get_refresh_state( - state.store.as_ref(), - provider.object_id(), - request.credential_key.trim(), - ) - .await? 
- .into_iter() - .collect() - }; - - Ok(Response::new(GetProviderRefreshStatusResponse { - credentials: states - .iter() - .map(crate::provider_refresh::refresh_status_from_state) - .collect(), - })) -} - -pub(super) async fn handle_configure_provider_refresh( +pub(super) async fn handle_delete_provider( state: &Arc, - request: Request, -) -> Result, Status> { - let request = request.into_inner(); - let provider_name = request.provider.trim(); - let credential_key = request.credential_key.trim(); - if provider_name.is_empty() { - return Err(Status::invalid_argument("provider is required")); - } - if credential_key.is_empty() { - return Err(Status::invalid_argument("credential_key is required")); - } - if !is_valid_env_key(credential_key) { - return Err(Status::invalid_argument( - "credential_key must be a valid environment variable name", - )); - } - let strategy = ProviderCredentialRefreshStrategy::try_from(request.strategy) - .unwrap_or(ProviderCredentialRefreshStrategy::Unspecified); - if strategy == ProviderCredentialRefreshStrategy::Unspecified { - return Err(Status::invalid_argument("refresh strategy is required")); - } - if !crate::provider_refresh::is_gateway_mintable_strategy(strategy) { - return Err(Status::invalid_argument(format!( - "refresh strategy '{}' is not gateway-mintable; update current credentials with provider update instead", - crate::provider_refresh::refresh_strategy_name(strategy as i32) - ))); - } - if request.material.len() > MAX_PROVIDER_CONFIG_ENTRIES { - return Err(Status::invalid_argument(format!( - "material exceeds maximum entries ({} > {MAX_PROVIDER_CONFIG_ENTRIES})", - request.material.len() - ))); - } - for (key, value) in &request.material { - if key.len() > MAX_MAP_KEY_LEN { - return Err(Status::invalid_argument(format!( - "material key exceeds maximum length ({} > {MAX_MAP_KEY_LEN})", - key.len() - ))); - } - if value.len() > MAX_MAP_VALUE_LEN { - return Err(Status::invalid_argument(format!( - "material value exceeds maximum 
length ({} > {MAX_MAP_VALUE_LEN})", - value.len() - ))); - } - } - if request.secret_material_keys.len() > MAX_PROVIDER_CONFIG_ENTRIES { - return Err(Status::invalid_argument(format!( - "secret_material_keys exceeds maximum entries ({} > {MAX_PROVIDER_CONFIG_ENTRIES})", - request.secret_material_keys.len() - ))); - } - for key in &request.secret_material_keys { - if key.len() > MAX_MAP_KEY_LEN { - return Err(Status::invalid_argument(format!( - "secret_material_keys entry exceeds maximum length ({} > {MAX_MAP_KEY_LEN})", - key.len() - ))); - } - } - if request - .material - .get("token_url") - .is_some_and(|value| !value.trim().is_empty()) - || request - .material - .get("token_uri") - .is_some_and(|value| !value.trim().is_empty()) - { - return Err(Status::invalid_argument( - "refresh token endpoints must be defined by the provider profile, not material", - )); - } - if request - .expires_at_ms - .is_some_and(|expires_at_ms| expires_at_ms < 0) - { - return Err(Status::invalid_argument( - "expires_at_ms must be greater than or equal to 0", - )); - } - - let provider = state - .store - .get_message_by_name::(provider_name) - .await - .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))? 
- .ok_or_else(|| Status::not_found("provider not found"))?; - validate_provider_credential_key_available_for_attached_sandboxes( - state.store.as_ref(), - &provider, - credential_key, - ) - .await?; - let refresh_defaults = - provider_refresh_defaults(state.store.as_ref(), &provider, credential_key).await?; - validate_refresh_material(&request.material, refresh_defaults.as_ref())?; - let material_scopes = crate::provider_refresh::material_scopes(&request.material); - let token_url = refresh_defaults - .as_ref() - .map(|refresh| refresh.token_url.clone()) - .unwrap_or_default(); - let scopes = if material_scopes.is_empty() { - refresh_defaults - .as_ref() - .map(|refresh| refresh.scopes.clone()) - .unwrap_or_default() - } else { - material_scopes - }; - let refresh_before_seconds = - crate::provider_refresh::parse_material_i64(&request.material, "refresh_before_seconds")? - .or_else(|| { - refresh_defaults - .as_ref() - .map(|refresh| refresh.refresh_before_seconds) - }) - .unwrap_or_default(); - let max_lifetime_seconds = - crate::provider_refresh::parse_material_i64(&request.material, "max_lifetime_seconds")? 
- .or_else(|| { - refresh_defaults - .as_ref() - .map(|refresh| refresh.max_lifetime_seconds) - }) - .unwrap_or_default(); - if refresh_before_seconds < 0 { - return Err(Status::invalid_argument( - "refresh_before_seconds material must be greater than or equal to 0", - )); - } - if max_lifetime_seconds < 0 { - return Err(Status::invalid_argument( - "max_lifetime_seconds material must be greater than or equal to 0", - )); - } - let existing_refresh_state = crate::provider_refresh::get_refresh_state( - state.store.as_ref(), - provider.object_id(), - credential_key, - ) - .await?; - let expires_at_ms = request.expires_at_ms.unwrap_or_else(|| { - existing_refresh_state - .as_ref() - .map(|state| state.expires_at_ms) - .unwrap_or_default() - }); - let mut state_record = crate::provider_refresh::new_refresh_state( - &provider, - credential_key, - crate::provider_refresh::NewRefreshStateConfig { - strategy, - material: request.material, - secret_material_keys: request.secret_material_keys, - expires_at_ms, - token_url, - scopes, - refresh_before_seconds, - max_lifetime_seconds, - }, - )?; - if let Some(existing) = existing_refresh_state { - state_record.metadata = existing.metadata; - state_record.last_refresh_at_ms = existing.last_refresh_at_ms; - } - crate::provider_refresh::put_refresh_state(state.store.as_ref(), &state_record).await?; - - if let Some(expires_at_ms) = request.expires_at_ms { - let updated = Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: provider_name.to_string(), - created_at_ms: 0, - labels: std::collections::HashMap::new(), - resource_version: 0, - }), - r#type: String::new(), - credentials: std::collections::HashMap::new(), - config: std::collections::HashMap::new(), - credential_expires_at_ms: std::collections::HashMap::from([( - credential_key.to_string(), - expires_at_ms, - )]), - }; - update_provider_record(state.store.as_ref(), updated).await?; + request: Request, +) -> Result, 
Status> { + let name = request.into_inner().name; + let provider_profile = provider_profile_for_name(state.store.as_ref(), &name).await; + let deleted = delete_provider_record(state.store.as_ref(), &name).await?; + if deleted && let Some(provider_profile) = provider_profile { + openshell_core::telemetry::emit_provider_lifecycle("delete", "success", &provider_profile); } - Ok(Response::new(ConfigureProviderRefreshResponse { - status: Some(crate::provider_refresh::refresh_status_from_state( - &state_record, - )), - })) + Ok(Response::new(DeleteProviderResponse { deleted })) } -pub(super) async fn handle_rotate_provider_credential( - state: &Arc, - request: Request, -) -> Result, Status> { - let request = request.into_inner(); - let provider_name = request.provider.trim(); - let credential_key = request.credential_key.trim(); - if provider_name.is_empty() { - return Err(Status::invalid_argument("provider is required")); - } - if credential_key.is_empty() { - return Err(Status::invalid_argument("credential_key is required")); - } - let refresh_state = crate::provider_refresh::refresh_provider_credential( - state.store.as_ref(), - provider_name, - credential_key, - ) - .await?; - - Ok(Response::new(RotateProviderCredentialResponse { - status: Some(crate::provider_refresh::refresh_status_from_state( - &refresh_state, - )), - })) +fn emit_provider_lifecycle(provider_type: &str, operation: &str, outcome: &str) { + let provider_profile = telemetry_provider_profile(provider_type); + openshell_core::telemetry::emit_provider_lifecycle(operation, outcome, &provider_profile); } -pub(super) async fn handle_delete_provider_refresh( - state: &Arc, - request: Request, -) -> Result, Status> { - let request = request.into_inner(); - let provider_name = request.provider.trim(); - let credential_key = request.credential_key.trim(); - if provider_name.is_empty() { - return Err(Status::invalid_argument("provider is required")); - } - if credential_key.is_empty() { - return 
Err(Status::invalid_argument("credential_key is required")); - } - let provider = state - .store - .get_message_by_name::(provider_name) +async fn provider_profile_for_name(store: &Store, name: &str) -> Option { + store + .get_message_by_name::(name) .await - .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))? - .ok_or_else(|| Status::not_found("provider not found"))?; - let existing_refresh_state = crate::provider_refresh::get_refresh_state( - state.store.as_ref(), - provider.object_id(), - credential_key, - ) - .await?; - let deleted_refresh_state = crate::provider_refresh::delete_refresh_state( - state.store.as_ref(), - provider.object_id(), - credential_key, - ) - .await?; - - let refresh_owned_expiry = existing_refresh_state - .as_ref() - .is_some_and(|refresh_state| { - refresh_state.expires_at_ms > 0 - && provider - .credential_expires_at_ms - .get(credential_key) - .is_some_and(|expires_at_ms| *expires_at_ms == refresh_state.expires_at_ms) - }); - if refresh_owned_expiry { - let updated = Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: provider_name.to_string(), - created_at_ms: 0, - labels: std::collections::HashMap::new(), - resource_version: 0, - }), - r#type: String::new(), - credentials: std::collections::HashMap::new(), - config: std::collections::HashMap::new(), - credential_expires_at_ms: std::collections::HashMap::from([( - credential_key.to_string(), - 0, - )]), - }; - update_provider_record(state.store.as_ref(), updated).await?; - } - - Ok(Response::new(DeleteProviderRefreshResponse { - deleted: deleted_refresh_state, - })) + .ok() + .flatten() + .map(|provider| telemetry_provider_profile(&provider.r#type)) } -pub(super) async fn handle_delete_provider( - state: &Arc, - request: Request, -) -> Result, Status> { - let name = request.into_inner().name; - let deleted = delete_provider_record(state.store.as_ref(), &name).await?; - - Ok(Response::new(DeleteProviderResponse 
{ deleted })) +fn telemetry_provider_profile(provider_type: &str) -> String { + normalize_provider_type(provider_type) + .filter(|profile| get_default_profile(profile).is_some()) + .unwrap_or("custom") + .to_string() } // --------------------------------------------------------------------------- @@ -1459,9 +784,8 @@ mod tests { use openshell_core::proto::{ DeleteProviderProfileRequest, GetProviderProfileRequest, ImportProviderProfilesRequest, L7Allow, L7Rule, LintProviderProfilesRequest, ListProviderProfilesRequest, NetworkBinary, - NetworkEndpoint, ProviderCredentialRefresh, ProviderCredentialRefreshMaterial, - ProviderProfile, ProviderProfileCategory, ProviderProfileCredential, - ProviderProfileImportItem, Sandbox, SandboxSpec, + NetworkEndpoint, ProviderProfile, ProviderProfileCategory, ProviderProfileImportItem, + Sandbox, SandboxSpec, }; use openshell_core::{ObjectId, ObjectName}; use std::collections::HashMap; @@ -1484,6 +808,19 @@ mod tests { assert!(!is_valid_env_key("X;rm -rf /")); } + #[test] + fn telemetry_provider_profile_maps_unknown_to_custom() { + assert_eq!(telemetry_provider_profile("CLAUDE"), "claude"); + assert_eq!(telemetry_provider_profile("github"), "github"); + assert_eq!(telemetry_provider_profile("gh"), "github"); + assert_eq!(telemetry_provider_profile("glab"), "gitlab"); + assert_eq!(telemetry_provider_profile("outlook"), "outlook"); + assert_eq!(telemetry_provider_profile("generic"), "custom"); + assert_eq!(telemetry_provider_profile("unknown-private"), "custom"); + assert_eq!(telemetry_provider_profile("acme-internal"), "custom"); + assert_eq!(telemetry_provider_profile("corp-llm-prod"), "custom"); + } + fn provider_with_values(name: &str, provider_type: &str) -> Provider { Provider { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { @@ -1491,7 +828,6 @@ mod tests { name: name.to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials: [ @@ -1506,7 
+842,6 @@ mod tests { ] .into_iter() .collect(), - credential_expires_at_ms: HashMap::new(), } } @@ -1533,75 +868,6 @@ mod tests { profile } - fn refreshable_credential(name: &str, env_var: &str) -> ProviderProfileCredential { - ProviderProfileCredential { - name: name.to_string(), - description: String::new(), - env_vars: vec![env_var.to_string()], - required: true, - auth_style: "bearer".to_string(), - header_name: "authorization".to_string(), - query_param: String::new(), - refresh: Some(ProviderCredentialRefresh { - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - token_url: "https://auth.example.com/token".to_string(), - scopes: Vec::new(), - refresh_before_seconds: 300, - max_lifetime_seconds: 3600, - material: vec![ - ProviderCredentialRefreshMaterial { - name: "client_id".to_string(), - description: String::new(), - required: true, - secret: false, - }, - ProviderCredentialRefreshMaterial { - name: "client_secret".to_string(), - description: String::new(), - required: true, - secret: true, - }, - ], - }), - } - } - - async fn import_test_refresh_profile(state: &Arc, id: &str, credential_key: &str) { - let mut profile = custom_profile(id); - profile.category = ProviderProfileCategory::Messaging as i32; - profile.credentials = vec![refreshable_credential("access_token", credential_key)]; - handle_import_provider_profiles( - state, - Request::new(ImportProviderProfilesRequest { - profiles: vec![ProviderProfileImportItem { - profile: Some(profile), - source: format!("{id}.yaml"), - }], - }), - ) - .await - .unwrap(); - } - - const TEST_GRAPH_PROVIDER_TYPE: &str = "test-msgraph"; - - async fn import_test_graph_refresh_profile(state: &Arc) { - import_test_refresh_profile(state, TEST_GRAPH_PROVIDER_TYPE, "MS_GRAPH_ACCESS_TOKEN").await; - } - - fn static_credential(name: &str, env_var: &str, required: bool) -> ProviderProfileCredential { - ProviderProfileCredential { - name: name.to_string(), - description: String::new(), - env_vars: 
vec![env_var.to_string()], - required, - auth_style: "bearer".to_string(), - header_name: "authorization".to_string(), - query_param: String::new(), - refresh: None, - } - } - #[tokio::test] async fn list_provider_profiles_returns_built_in_profile_categories() { let state = test_server_state().await; @@ -1616,13 +882,6 @@ mod tests { .unwrap() .into_inner(); - let ids = response - .profiles - .iter() - .map(|profile| profile.id.as_str()) - .collect::>(); - assert_eq!(ids, vec!["claude-code", "github", "nvidia"]); - let github = response .profiles .iter() @@ -1632,6 +891,13 @@ mod tests { github.category, ProviderProfileCategory::SourceControl as i32 ); + assert!( + response + .profiles + .iter() + .all(|profile| profile.id != "generic"), + "generic remains a legacy provider type without a v2 profile" + ); } #[tokio::test] @@ -1741,14 +1007,14 @@ mod tests { } #[tokio::test] - async fn import_provider_profile_allows_legacy_provider_type_ids_without_built_in_profiles() { + async fn import_provider_profile_rejects_legacy_provider_type_ids() { let state = test_server_state().await; let response = handle_import_provider_profiles( &state, Request::new(ImportProviderProfilesRequest { profiles: vec![ProviderProfileImportItem { - profile: Some(custom_profile("codex")), - source: "codex.yaml".to_string(), + profile: Some(custom_profile("generic")), + source: "generic.yaml".to_string(), }], }), ) @@ -1756,21 +1022,23 @@ mod tests { .unwrap() .into_inner(); - assert!(response.imported); - assert!(response.diagnostics.is_empty()); + assert!(!response.imported); + assert!( + response + .diagnostics + .iter() + .any(|diagnostic| diagnostic.message.contains("reserved")) + ); - let imported = handle_get_provider_profile( + let missing = handle_get_provider_profile( &state, Request::new(GetProviderProfileRequest { - id: "codex".to_string(), + id: "generic".to_string(), }), ) .await - .unwrap() - .into_inner() - .profile - .expect("codex profile should be returned"); - 
assert_eq!(imported.id, "codex"); + .unwrap_err(); + assert_eq!(missing.code(), Code::NotFound); } #[tokio::test] @@ -2051,7 +1319,6 @@ mod tests { name: "sandbox-using-custom".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { providers: vec!["custom-provider".to_string()], @@ -2075,86 +1342,24 @@ mod tests { } #[tokio::test] - async fn configure_provider_refresh_stores_scoped_status_and_provider_expiry() { + async fn delete_provider_profile_removes_unused_custom_profile() { let state = test_server_state().await; - import_test_graph_refresh_profile(&state).await; - create_provider_record( - state.store.as_ref(), - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "msgraph".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: TEST_GRAPH_PROVIDER_TYPE.to_string(), - credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "token".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - - let expires_at_ms = crate::persistence::current_time_ms() + 60_000; - let response = handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([ - ("tenant_id".to_string(), "tenant".to_string()), - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: Some(expires_at_ms), - }), - ) - .await - .unwrap() - .into_inner() - .status - .expect("status"); - assert_eq!(response.credential_key, "MS_GRAPH_ACCESS_TOKEN"); - - let status = handle_get_provider_refresh_status( + 
handle_import_provider_profiles( &state, - Request::new(GetProviderRefreshStatusRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), + Request::new(ImportProviderProfilesRequest { + profiles: vec![ProviderProfileImportItem { + profile: Some(custom_profile("custom-api")), + source: "custom-api.yaml".to_string(), + }], }), ) .await - .unwrap() - .into_inner(); - assert_eq!(status.credentials.len(), 1); - assert_eq!(status.credentials[0].expires_at_ms, expires_at_ms); - - let provider = state - .store - .get_message_by_name::("msgraph") - .await - .unwrap() - .expect("provider"); - assert_eq!( - provider - .credential_expires_at_ms - .get("MS_GRAPH_ACCESS_TOKEN"), - Some(&expires_at_ms) - ); + .unwrap(); - let deleted = handle_delete_provider_refresh( + let deleted = handle_delete_provider_profile( &state, - Request::new(DeleteProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), + Request::new(DeleteProviderProfileRequest { + id: "custom-api".to_string(), }), ) .await @@ -2162,927 +1367,188 @@ mod tests { .into_inner(); assert!(deleted.deleted); - let status_after_delete = handle_get_provider_refresh_status( + let missing = handle_get_provider_profile( &state, - Request::new(GetProviderRefreshStatusRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), + Request::new(GetProviderProfileRequest { + id: "custom-api".to_string(), }), ) .await - .unwrap() - .into_inner(); - assert!(status_after_delete.credentials.is_empty()); - - let provider_after_delete = state - .store - .get_message_by_name::("msgraph") - .await - .unwrap() - .expect("provider"); - assert!( - !provider_after_delete - .credential_expires_at_ms - .contains_key("MS_GRAPH_ACCESS_TOKEN") - ); + .unwrap_err(); + assert_eq!(missing.code(), Code::NotFound); } #[tokio::test] - async fn delete_provider_refresh_preserves_manually_updated_expiry() { - let state = 
test_server_state().await; - import_test_graph_refresh_profile(&state).await; - create_provider_record( - state.store.as_ref(), + async fn provider_crud_round_trip_and_semantics() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(); + + let created = provider_with_values("gitlab-local", "gitlab"); + let persisted = create_provider_record(&store, created.clone()) + .await + .unwrap(); + assert_eq!(persisted.object_name(), "gitlab-local"); + assert_eq!(persisted.r#type, "gitlab"); + assert!(!persisted.object_id().is_empty()); + let provider_id = persisted.object_id().to_string(); + + let duplicate_err = create_provider_record(&store, created).await.unwrap_err(); + assert_eq!(duplicate_err.code(), Code::AlreadyExists); + + let loaded = get_provider_record(&store, "gitlab-local").await.unwrap(); + assert_eq!(loaded.object_id(), provider_id); + + let listed = list_provider_records(&store, 100, 0).await.unwrap(); + assert_eq!(listed.len(), 1); + assert_eq!(listed[0].object_name(), "gitlab-local"); + + let updated = update_provider_record( + &store, Provider { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { id: String::new(), - name: "msgraph".to_string(), - created_at_ms: 0, + name: "gitlab-local".to_string(), + created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), - r#type: TEST_GRAPH_PROVIDER_TYPE.to_string(), + r#type: "gitlab".to_string(), credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "token".to_string(), + "API_TOKEN".to_string(), + "rotated-token".to_string(), )) .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), + config: std::iter::once(("endpoint".to_string(), "https://gitlab.com".to_string())) + .collect(), }, ) .await .unwrap(); + assert_eq!(updated.object_id(), provider_id); + assert_eq!(updated.credentials.len(), 2); + assert_eq!( + updated.credentials.get("API_TOKEN"), + Some(&"REDACTED".to_string()), + "credential values must be 
redacted in gRPC responses" + ); + assert_eq!( + updated.credentials.get("SECONDARY"), + Some(&"REDACTED".to_string()), + ); + let stored: Provider = store + .get_message_by_name("gitlab-local") + .await + .unwrap() + .unwrap(); + assert_eq!( + stored.credentials.get("API_TOKEN"), + Some(&"rotated-token".to_string()) + ); + assert_eq!( + stored.credentials.get("SECONDARY"), + Some(&"secondary-token".to_string()) + ); + assert_eq!( + updated.config.get("endpoint"), + Some(&"https://gitlab.com".to_string()) + ); + assert_eq!(updated.config.get("region"), Some(&"us-west".to_string())); - let refresh_expires_at_ms = crate::persistence::current_time_ms() + 60_000; - handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([ - ("tenant_id".to_string(), "tenant".to_string()), - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: Some(refresh_expires_at_ms), - }), - ) - .await - .unwrap(); + let deleted = delete_provider_record(&store, "gitlab-local") + .await + .unwrap(); + assert!(deleted); - let manual_expires_at_ms = refresh_expires_at_ms + 60_000; - update_provider_record( - state.store.as_ref(), - Provider { + let deleted_again = delete_provider_record(&store, "gitlab-local") + .await + .unwrap(); + assert!(!deleted_again); + + let missing = get_provider_record(&store, "gitlab-local") + .await + .unwrap_err(); + assert_eq!(missing.code(), Code::NotFound); + } + + #[tokio::test] + async fn delete_provider_rejects_attached_provider() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(); + + create_provider_record(&store, provider_with_values("gitlab-local", "gitlab")) + 
.await + .unwrap(); + store + .put_message(&Sandbox { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "msgraph".to_string(), + id: "sandbox-id".to_string(), + name: "attached-sandbox".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), - r#type: String::new(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::from([( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - manual_expires_at_ms, - )]), - }, - ) - .await - .unwrap(); - - let deleted = handle_delete_provider_refresh( - &state, - Request::new(DeleteProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - }), - ) - .await - .unwrap() - .into_inner(); - assert!(deleted.deleted); + spec: Some(SandboxSpec { + providers: vec!["gitlab-local".to_string()], + ..Default::default() + }), + ..Default::default() + }) + .await + .unwrap(); - let provider_after_delete = state - .store - .get_message_by_name::("msgraph") + let err = delete_provider_record(&store, "gitlab-local") .await - .unwrap() - .expect("provider"); - assert_eq!( - provider_after_delete - .credential_expires_at_ms - .get("MS_GRAPH_ACCESS_TOKEN"), - Some(&manual_expires_at_ms) + .unwrap_err(); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!( + err.message().contains("attached-sandbox"), + "error should identify blocking sandbox: {}", + err.message() ); } #[tokio::test] - async fn configure_provider_refresh_rejects_credential_key_collision_for_attached_sandbox() { - let state = test_server_state().await; - import_test_graph_refresh_profile(&state).await; - create_provider_record( - state.store.as_ref(), - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "existing-graph".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: TEST_GRAPH_PROVIDER_TYPE.to_string(), - 
credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "existing-token".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - create_provider_record( - state.store.as_ref(), - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "refreshing-graph".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: TEST_GRAPH_PROVIDER_TYPE.to_string(), - credentials: std::iter::once(("OTHER_TOKEN".to_string(), "other".to_string())) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - state - .store - .put_message(&Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sandbox-collision".to_string(), - name: "collision".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - providers: vec!["existing-graph".to_string(), "refreshing-graph".to_string()], - ..SandboxSpec::default() - }), - ..Default::default() - }) - .await - .unwrap(); - - let err = handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "refreshing-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([ - ("tenant_id".to_string(), "tenant".to_string()), - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: None, - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), Code::FailedPrecondition); - assert!(err.message().contains("collision")); - assert!(err.message().contains("MS_GRAPH_ACCESS_TOKEN")); - let states = 
crate::provider_refresh::list_all_refresh_states(state.store.as_ref()) - .await - .unwrap(); - assert!(states.is_empty()); - } - - #[tokio::test] - async fn configure_provider_refresh_treats_existing_refresh_state_keys_as_reserved() { - let state = test_server_state().await; - import_test_graph_refresh_profile(&state).await; - for name in ["first-graph", "second-graph"] { - create_provider_record( - state.store.as_ref(), - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: name.to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: TEST_GRAPH_PROVIDER_TYPE.to_string(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - } - state - .store - .put_message(&Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sandbox-refresh-collision".to_string(), - name: "refresh-collision".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - providers: vec!["first-graph".to_string(), "second-graph".to_string()], - ..SandboxSpec::default() - }), - ..Default::default() - }) - .await - .unwrap(); - - handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "first-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([ - ("tenant_id".to_string(), "tenant".to_string()), - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: None, - }), - ) - .await - .unwrap(); - - let err = handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: 
"second-graph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([ - ("tenant_id".to_string(), "tenant".to_string()), - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: None, - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), Code::FailedPrecondition); - assert!(err.message().contains("collision")); - assert!(err.message().contains("MS_GRAPH_ACCESS_TOKEN")); - assert!(err.message().contains("first-graph")); - assert!(err.message().contains("second-graph")); - } - - #[tokio::test] - async fn configure_provider_refresh_rejects_profile_endpoint_override_and_missing_material() { - let state = test_server_state().await; - import_test_graph_refresh_profile(&state).await; - create_provider_record( - state.store.as_ref(), - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "msgraph".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: TEST_GRAPH_PROVIDER_TYPE.to_string(), - credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "token".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - - let endpoint_override = handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([ - ("tenant_id".to_string(), "tenant".to_string()), - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ( - "token_url".to_string(), - 
"https://attacker.example/token".to_string(), - ), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: None, - }), - ) - .await - .unwrap_err(); - assert_eq!(endpoint_override.code(), Code::InvalidArgument); - assert!(endpoint_override.message().contains("provider profile")); - - let missing_material = handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32, - material: HashMap::from([("tenant_id".to_string(), "tenant".to_string())]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: None, - }), - ) - .await - .unwrap_err(); - assert_eq!(missing_material.code(), Code::InvalidArgument); - assert!(missing_material.message().contains("client_id material")); - } - - #[tokio::test] - async fn configure_provider_refresh_rejects_non_gateway_mintable_strategies() { - let state = test_server_state().await; - create_provider_record( - state.store.as_ref(), - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "msgraph".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "outlook".to_string(), - credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "token".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - - for strategy in [ - ProviderCredentialRefreshStrategy::Static, - ProviderCredentialRefreshStrategy::External, - ] { - let err = handle_configure_provider_refresh( - &state, - Request::new(ConfigureProviderRefreshRequest { - provider: "msgraph".to_string(), - credential_key: "MS_GRAPH_ACCESS_TOKEN".to_string(), - strategy: strategy as i32, - material: HashMap::new(), - secret_material_keys: Vec::new(), - 
expires_at_ms: None, - }), - ) - .await - .unwrap_err(); - assert_eq!(err.code(), Code::InvalidArgument); - assert!( - err.message().contains("not gateway-mintable"), - "unexpected error: {}", - err.message() - ); - } - - let refresh_states = crate::provider_refresh::list_all_refresh_states(state.store.as_ref()) - .await - .unwrap(); - assert!(refresh_states.is_empty()); - } - - #[tokio::test] - async fn delete_provider_profile_removes_unused_custom_profile() { - let state = test_server_state().await; - handle_import_provider_profiles( - &state, - Request::new(ImportProviderProfilesRequest { - profiles: vec![ProviderProfileImportItem { - profile: Some(custom_profile("custom-api")), - source: "custom-api.yaml".to_string(), - }], - }), - ) - .await - .unwrap(); - - let deleted = handle_delete_provider_profile( - &state, - Request::new(DeleteProviderProfileRequest { - id: "custom-api".to_string(), - }), - ) - .await - .unwrap() - .into_inner(); - assert!(deleted.deleted); - - let missing = handle_get_provider_profile( - &state, - Request::new(GetProviderProfileRequest { - id: "custom-api".to_string(), - }), - ) - .await - .unwrap_err(); - assert_eq!(missing.code(), Code::NotFound); - } - - #[tokio::test] - async fn provider_crud_round_trip_and_semantics() { - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - let created = provider_with_values("gitlab-local", "gitlab"); - let persisted = create_provider_record(&store, created.clone()) - .await - .unwrap(); - assert_eq!(persisted.object_name(), "gitlab-local"); - assert_eq!(persisted.r#type, "gitlab"); - assert!(!persisted.object_id().is_empty()); - let provider_id = persisted.object_id().to_string(); - - let duplicate_err = create_provider_record(&store, created).await.unwrap_err(); - assert_eq!(duplicate_err.code(), Code::AlreadyExists); - - let loaded = get_provider_record(&store, "gitlab-local").await.unwrap(); - assert_eq!(loaded.object_id(), provider_id); - - let listed = 
list_provider_records(&store, 100, 0).await.unwrap(); - assert_eq!(listed.len(), 1); - assert_eq!(listed[0].object_name(), "gitlab-local"); - - let updated = update_provider_record( - &store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "gitlab-local".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "gitlab".to_string(), - credentials: std::iter::once(( - "API_TOKEN".to_string(), - "rotated-token".to_string(), - )) - .collect(), - config: std::iter::once(("endpoint".to_string(), "https://gitlab.com".to_string())) - .collect(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - assert_eq!(updated.object_id(), provider_id); - assert_eq!(updated.credentials.len(), 2); - assert_eq!( - updated.credentials.get("API_TOKEN"), - Some(&"REDACTED".to_string()), - "credential values must be redacted in gRPC responses" - ); - assert_eq!( - updated.credentials.get("SECONDARY"), - Some(&"REDACTED".to_string()), - ); - let stored: Provider = store - .get_message_by_name("gitlab-local") - .await - .unwrap() - .unwrap(); - assert_eq!( - stored.credentials.get("API_TOKEN"), - Some(&"rotated-token".to_string()) - ); - assert_eq!( - stored.credentials.get("SECONDARY"), - Some(&"secondary-token".to_string()) - ); - assert_eq!( - updated.config.get("endpoint"), - Some(&"https://gitlab.com".to_string()) - ); - assert_eq!(updated.config.get("region"), Some(&"us-west".to_string())); - - let deleted = delete_provider_record(&store, "gitlab-local") - .await - .unwrap(); - assert!(deleted); - - let deleted_again = delete_provider_record(&store, "gitlab-local") - .await - .unwrap(); - assert!(!deleted_again); - - let missing = get_provider_record(&store, "gitlab-local") - .await - .unwrap_err(); - assert_eq!(missing.code(), Code::NotFound); - } - - #[tokio::test] - async fn delete_provider_removes_scoped_refresh_states() { - let store = 
Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - let provider = create_provider_record( - &store, - Provider { - credential_expires_at_ms: HashMap::from([("API_TOKEN".to_string(), 123_456)]), - ..provider_with_values("gitlab-local", "gitlab") - }, - ) - .await - .unwrap(); - let refresh_state = crate::provider_refresh::new_refresh_state( - &provider, - "API_TOKEN", - crate::provider_refresh::NewRefreshStateConfig { - strategy: ProviderCredentialRefreshStrategy::External, - material: HashMap::from([( - "endpoint".to_string(), - "https://refresh.example.com".to_string(), - )]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: 123_456, - token_url: "https://refresh.example.com/token".to_string(), - scopes: Vec::new(), - refresh_before_seconds: 300, - max_lifetime_seconds: 3600, - }, - ) - .unwrap(); - crate::provider_refresh::put_refresh_state(&store, &refresh_state) - .await - .unwrap(); - - let deleted = delete_provider_record(&store, "gitlab-local") - .await - .unwrap(); - assert!(deleted); - - let refresh_states = - crate::provider_refresh::list_refresh_states_for_provider(&store, provider.object_id()) - .await - .unwrap(); - assert!(refresh_states.is_empty()); - } - - #[tokio::test] - async fn delete_provider_rejects_attached_provider() { - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - create_provider_record(&store, provider_with_values("gitlab-local", "gitlab")) - .await - .unwrap(); - store - .put_message(&Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sandbox-id".to_string(), - name: "attached-sandbox".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - providers: vec!["gitlab-local".to_string()], - ..Default::default() - }), - ..Default::default() - }) - .await - .unwrap(); - - let err = delete_provider_record(&store, "gitlab-local") - .await - .unwrap_err(); - 
assert_eq!(err.code(), Code::FailedPrecondition); - assert!( - err.message().contains("attached-sandbox"), - "error should identify blocking sandbox: {}", - err.message() - ); - } - - #[tokio::test] - async fn provider_create_and_update_return_correct_resource_version() { - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - // Create provider and verify resource_version: 1 in response - let created = provider_with_values("test-provider", "openai"); - let persisted = create_provider_record(&store, created).await.unwrap(); - assert_eq!( - persisted.metadata.as_ref().unwrap().resource_version, - 1, - "create_provider_record should return resource_version: 1 after insert" - ); - - // Update provider and verify resource_version: 2 in response - let updated = update_provider_record( - &store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "test-provider".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "openai".to_string(), - credentials: std::iter::once(( - "OPENAI_API_KEY".to_string(), - "updated-key".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - assert_eq!( - updated.metadata.as_ref().unwrap().resource_version, - 2, - "update_provider_record should return resource_version: 2 after first update" - ); - - // Update again and verify resource_version: 3 - let updated_again = update_provider_record( - &store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "test-provider".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "openai".to_string(), - credentials: std::iter::once(( - "OPENAI_API_KEY".to_string(), - "third-key".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: 
HashMap::new(), - }, - ) - .await - .unwrap(); - assert_eq!( - updated_again.metadata.as_ref().unwrap().resource_version, - 3, - "update_provider_record should return resource_version: 3 after second update" - ); - } - - #[tokio::test] - async fn provider_validation_errors() { - let state = test_server_state().await; - let store = state.store.as_ref(); - - let create_missing_type = create_provider_record( - store, + async fn provider_validation_errors() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(); + + let create_missing_type = create_provider_record( + &store, Provider { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { id: String::new(), name: "bad-provider".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: HashMap::new(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await .unwrap_err(); assert_eq!(create_missing_type.code(), Code::InvalidArgument); - let create_missing_credentials = create_provider_record( - store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "gitlab-no-creds".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "gitlab".to_string(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap_err(); - assert_eq!(create_missing_credentials.code(), Code::InvalidArgument); - - handle_import_provider_profiles( - &state, - Request::new(ImportProviderProfilesRequest { - profiles: vec![ProviderProfileImportItem { - profile: Some(ProviderProfile { - id: "delegated-refresh-api".to_string(), - display_name: "Delegated Refresh API".to_string(), - description: String::new(), - category: ProviderProfileCategory::Messaging as i32, - credentials: vec![ProviderProfileCredential { - name: "access_token".to_string(), - 
description: String::new(), - env_vars: vec!["DELEGATED_ACCESS_TOKEN".to_string()], - required: true, - auth_style: "bearer".to_string(), - header_name: "authorization".to_string(), - query_param: String::new(), - refresh: Some(ProviderCredentialRefresh { - strategy: ProviderCredentialRefreshStrategy::Oauth2RefreshToken - as i32, - token_url: "https://login.example/token".to_string(), - scopes: vec!["https://example.test/.default".to_string()], - refresh_before_seconds: 300, - max_lifetime_seconds: 3600, - material: vec![ - ProviderCredentialRefreshMaterial { - name: "client_id".to_string(), - description: String::new(), - required: true, - secret: false, - }, - ProviderCredentialRefreshMaterial { - name: "refresh_token".to_string(), - description: String::new(), - required: true, - secret: true, - }, - ], - }), - }], - endpoints: vec![], - binaries: vec![], - inference_capable: false, - }), - source: "delegated-refresh-api.yaml".to_string(), - }], - }), - ) - .await - .unwrap(); - let delegated_refresh_bootstrap_provider = create_provider_record( - store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "delegated-refresh-no-token-yet".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "delegated-refresh-api".to_string(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - assert!(delegated_refresh_bootstrap_provider.credentials.is_empty()); - - let mut mixed_required_profile = custom_profile("mixed-required-api"); - mixed_required_profile.credentials = vec![ - refreshable_credential("access_token", "MIXED_ACCESS_TOKEN"), - static_credential("static_token", "MIXED_STATIC_TOKEN", true), - ]; - handle_import_provider_profiles( - &state, - Request::new(ImportProviderProfilesRequest { - profiles: vec![ProviderProfileImportItem { - profile: Some(mixed_required_profile), - 
source: "mixed-required-api.yaml".to_string(), - }], - }), - ) - .await - .unwrap(); - let mixed_required_empty = create_provider_record( - store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "mixed-required-no-token-yet".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "mixed-required-api".to_string(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap_err(); - assert_eq!(mixed_required_empty.code(), Code::InvalidArgument); - - let mut optional_static_profile = custom_profile("optional-static-api"); - optional_static_profile.credentials = vec![ - refreshable_credential("access_token", "OPTIONAL_ACCESS_TOKEN"), - static_credential("static_token", "OPTIONAL_STATIC_TOKEN", false), - ]; - handle_import_provider_profiles( - &state, - Request::new(ImportProviderProfilesRequest { - profiles: vec![ProviderProfileImportItem { - profile: Some(optional_static_profile), - source: "optional-static-api.yaml".to_string(), - }], - }), - ) - .await - .unwrap(); - let optional_static_empty = create_provider_record( - store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "optional-static-no-token-yet".to_string(), - created_at_ms: 1_000_000, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "optional-static-api".to_string(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - assert!(optional_static_empty.credentials.is_empty()); - - let get_err = get_provider_record(store, "").await.unwrap_err(); + let get_err = get_provider_record(&store, "").await.unwrap_err(); assert_eq!(get_err.code(), Code::InvalidArgument); - let delete_err = delete_provider_record(store, "").await.unwrap_err(); + let delete_err = 
delete_provider_record(&store, "").await.unwrap_err(); assert_eq!(delete_err.code(), Code::InvalidArgument); let update_missing_err = update_provider_record( - store, + &store, Provider { metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { id: String::new(), name: "missing".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: HashMap::new(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3107,12 +1573,10 @@ mod tests { name: "noop-test".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: HashMap::new(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3156,12 +1620,10 @@ mod tests { name: "delete-key-test".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: std::iter::once(("SECONDARY".to_string(), String::new())).collect(), config: std::iter::once(("region".to_string(), String::new())).collect(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3209,12 +1671,10 @@ mod tests { name: "type-preserve-test".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: HashMap::new(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3240,12 +1700,10 @@ mod tests { name: "type-change-test".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: "openai".to_string(), credentials: HashMap::new(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3273,12 +1731,10 @@ mod tests { name: "validate-merge-test".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: String::new(), credentials: std::iter::once((oversized_key, "value".to_string())).collect(), config: HashMap::new(), - 
credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3288,85 +1744,43 @@ mod tests { } #[tokio::test] - async fn resolve_provider_env_empty_list_returns_empty() { - let store = Store::connect("sqlite::memory:").await.unwrap(); - let result = resolve_provider_environment(&store, &[]).await.unwrap(); - assert!(result.is_empty()); - } - - #[tokio::test] - async fn resolve_provider_env_injects_credentials() { - let store = Store::connect("sqlite::memory:").await.unwrap(); - let provider = Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "claude-local".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "claude".to_string(), - credentials: [ - ("ANTHROPIC_API_KEY".to_string(), "sk-abc".to_string()), - ("CLAUDE_API_KEY".to_string(), "sk-abc".to_string()), - ] - .into_iter() - .collect(), - config: std::iter::once(( - "endpoint".to_string(), - "https://api.anthropic.com".to_string(), - )) - .collect(), - credential_expires_at_ms: HashMap::new(), - }; - create_provider_record(&store, provider).await.unwrap(); - - let result = resolve_provider_environment(&store, &["claude-local".to_string()]) - .await - .unwrap(); - assert_eq!(result.get("ANTHROPIC_API_KEY"), Some(&"sk-abc".to_string())); - assert_eq!(result.get("CLAUDE_API_KEY"), Some(&"sk-abc".to_string())); - assert!(!result.contains_key("endpoint")); + async fn resolve_provider_env_empty_list_returns_empty() { + let store = Store::connect("sqlite::memory:").await.unwrap(); + let result = resolve_provider_environment(&store, &[]).await.unwrap(); + assert!(result.is_empty()); } #[tokio::test] - async fn resolve_provider_env_skips_expired_credentials_and_returns_expiry_metadata() { + async fn resolve_provider_env_injects_credentials() { let store = Store::connect("sqlite::memory:").await.unwrap(); - let now_ms = crate::persistence::current_time_ms(); let provider = Provider { metadata: 
Some(openshell_core::proto::datamodel::v1::ObjectMeta { id: String::new(), - name: "expiring-provider".to_string(), + name: "claude-local".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), - r#type: "test".to_string(), + r#type: "claude".to_string(), credentials: [ - ("FRESH_TOKEN".to_string(), "fresh".to_string()), - ("STALE_TOKEN".to_string(), "stale".to_string()), + ("ANTHROPIC_API_KEY".to_string(), "sk-abc".to_string()), + ("CLAUDE_API_KEY".to_string(), "sk-abc".to_string()), ] .into_iter() .collect(), - config: HashMap::new(), - credential_expires_at_ms: [ - ("FRESH_TOKEN".to_string(), now_ms + 60_000), - ("STALE_TOKEN".to_string(), now_ms - 60_000), - ] - .into_iter() + config: std::iter::once(( + "endpoint".to_string(), + "https://api.anthropic.com".to_string(), + )) .collect(), }; create_provider_record(&store, provider).await.unwrap(); - let result = resolve_provider_environment(&store, &["expiring-provider".to_string()]) + let result = resolve_provider_environment(&store, &["claude-local".to_string()]) .await .unwrap(); - assert_eq!(result.get("FRESH_TOKEN"), Some(&"fresh".to_string())); - assert!(!result.contains_key("STALE_TOKEN")); - assert_eq!( - result.credential_expires_at_ms.get("FRESH_TOKEN"), - Some(&(now_ms + 60_000)) - ); + assert_eq!(result.get("ANTHROPIC_API_KEY"), Some(&"sk-abc".to_string())); + assert_eq!(result.get("CLAUDE_API_KEY"), Some(&"sk-abc".to_string())); + assert!(!result.contains_key("endpoint")); } #[tokio::test] @@ -3388,7 +1802,6 @@ mod tests { name: "test-provider".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: "test".to_string(), credentials: [ @@ -3399,7 +1812,6 @@ mod tests { .into_iter() .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }; create_provider_record(&store, provider).await.unwrap(); @@ -3422,7 +1834,6 @@ mod tests { name: "claude-local".to_string(), created_at_ms: 0, labels: HashMap::new(), - 
resource_version: 0, }), r#type: "claude".to_string(), credentials: std::iter::once(( @@ -3431,7 +1842,6 @@ mod tests { )) .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3444,13 +1854,11 @@ mod tests { name: "gitlab-local".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: "gitlab".to_string(), credentials: std::iter::once(("GITLAB_TOKEN".to_string(), "glpat-xyz".to_string())) .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3467,7 +1875,7 @@ mod tests { } #[tokio::test] - async fn resolve_provider_env_rejects_duplicate_credential_keys() { + async fn resolve_provider_env_first_credential_wins_on_duplicate_key() { let store = Store::connect("sqlite::memory:").await.unwrap(); create_provider_record( &store, @@ -3477,13 +1885,11 @@ mod tests { name: "provider-a".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: "claude".to_string(), credentials: std::iter::once(("SHARED_KEY".to_string(), "first-value".to_string())) .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3496,7 +1902,6 @@ mod tests { name: "provider-b".to_string(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: "gitlab".to_string(), credentials: std::iter::once(( @@ -3505,113 +1910,18 @@ mod tests { )) .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await .unwrap(); - let err = resolve_provider_environment( + let result = resolve_provider_environment( &store, &["provider-a".to_string(), "provider-b".to_string()], ) .await - .unwrap_err(); - assert_eq!(err.code(), Code::FailedPrecondition); - assert!(err.message().contains("SHARED_KEY")); - assert!(err.message().contains("provider-a")); - assert!(err.message().contains("provider-b")); - } - - #[tokio::test] - async fn 
update_provider_rejects_credential_key_collision_for_attached_sandbox() { - let store = Store::connect("sqlite::memory:").await.unwrap(); - create_provider_record( - &store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "provider-a".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "outlook".to_string(), - credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "graph-token".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap(); - create_provider_record( - &store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "provider-b".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: "google-drive".to_string(), - credentials: std::iter::once(( - "GOOGLE_ACCESS_TOKEN".to_string(), - "google-token".to_string(), - )) - .collect(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await .unwrap(); - let sandbox = Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "sandbox-collision".to_string(), - name: "collision".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - providers: vec!["provider-a".to_string(), "provider-b".to_string()], - ..SandboxSpec::default() - }), - ..Default::default() - }; - store.put_message(&sandbox).await.unwrap(); - - let err = update_provider_record( - &store, - Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "provider-b".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: String::new(), - credentials: std::iter::once(( - "MS_GRAPH_ACCESS_TOKEN".to_string(), - "wrong-token".to_string(), - )) - .collect(), - 
config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - }, - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), Code::FailedPrecondition); - assert!(err.message().contains("collision")); - assert!(err.message().contains("MS_GRAPH_ACCESS_TOKEN")); + assert_eq!(result.get("SHARED_KEY"), Some(&"first-value".to_string())); } #[tokio::test] @@ -3628,7 +1938,6 @@ mod tests { name: "my-claude".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: "claude".to_string(), credentials: std::iter::once(( @@ -3637,7 +1946,6 @@ mod tests { )) .collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), }, ) .await @@ -3649,7 +1957,6 @@ mod tests { name: "test-sandbox".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec { providers: vec!["my-claude".to_string()], @@ -3686,7 +1993,6 @@ mod tests { name: "empty-sandbox".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), spec: Some(SandboxSpec::default()), status: None, @@ -3716,365 +2022,4 @@ mod tests { let result = store.get_message::("nonexistent").await.unwrap(); assert!(result.is_none()); } - - #[tokio::test] - async fn update_provider_validates_before_write() { - let store = Arc::new(Store::connect("sqlite::memory:").await.unwrap()); - - // Create a valid provider - let provider = provider_with_values("test-validate-provider", "test-type"); - let created = create_provider_record(&store, provider.clone()) - .await - .unwrap(); - - // Build update request with just the name and new credentials - let mut update_req = Provider { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: String::new(), - name: "test-validate-provider".to_string(), - created_at_ms: 0, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: String::new(), // Empty type is ignored in update - credentials: HashMap::new(), - config: HashMap::new(), - 
credential_expires_at_ms: HashMap::new(), - }; - - // Attempt to update with an oversized credential key (exceeds MAX_MAP_KEY_LEN) - update_req.credentials.insert( - "k".repeat(MAX_MAP_KEY_LEN + 1), - "oversized-key-value".to_string(), - ); - - let result = update_provider_record(&store, update_req).await; - - // Update should fail with InvalidArgument due to oversized key - assert!(result.is_err(), "update with invalid data should fail"); - let err = result.unwrap_err(); - assert_eq!( - err.code(), - Code::InvalidArgument, - "should fail validation with InvalidArgument" - ); - assert!( - err.message().contains("key"), - "error message should mention key: {}", - err.message() - ); - - // Verify database still contains the ORIGINAL valid provider (not the invalid one) - let stored = store - .get_message_by_name::("test-validate-provider") - .await - .unwrap() - .expect("provider should still exist"); - - assert_eq!( - stored.object_id(), - created.object_id(), - "stored provider ID should match original" - ); - assert_eq!( - stored.credentials.len(), - created.credentials.len(), - "credentials count should not have changed" - ); - assert!( - !stored - .credentials - .contains_key(&"k".repeat(MAX_MAP_KEY_LEN + 1)), - "oversized key should NOT be in database" - ); - } - - #[tokio::test] - async fn concurrent_create_provider_rejects_duplicate() { - let store = Arc::new( - Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(), - ); - - let provider = provider_with_values("test-concurrent-provider", "test-type"); - - // Spawn two concurrent creation attempts for the same provider - let store1 = store.clone(); - let provider1 = provider.clone(); - let handle1 = tokio::spawn(async move { create_provider_record(&store1, provider1).await }); - - let store2 = store.clone(); - let provider2 = provider.clone(); - let handle2 = tokio::spawn(async move { create_provider_record(&store2, provider2).await }); - - // Wait for both to complete - let result1 = 
handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // Exactly one should succeed, one should fail with AlreadyExists - let success_count = [&result1, &result2].iter().filter(|r| r.is_ok()).count(); - let already_exists_count = [&result1, &result2] - .iter() - .filter(|r| { - r.as_ref() - .err() - .is_some_and(|e| e.code() == Code::AlreadyExists) - }) - .count(); - - assert_eq!( - success_count, 1, - "exactly one creation should succeed, got results: {result1:?} {result2:?}" - ); - assert_eq!( - already_exists_count, 1, - "exactly one creation should fail with AlreadyExists, got results: {result1:?} {result2:?}" - ); - - // Verify the successful provider can be retrieved by name - let created_provider = [result1, result2] - .into_iter() - .find_map(Result::ok) - .expect("should have one successful creation"); - let retrieved = store - .get_message_by_name::("test-concurrent-provider") - .await - .unwrap(); - assert!( - retrieved.is_some(), - "created provider should be retrievable by name" - ); - assert_eq!( - retrieved.unwrap().object_id(), - created_provider.object_id(), - "retrieved provider should match created provider" - ); - } - - // ---- CAS (Client-driven optimistic concurrency) tests for UpdateProvider ---- - - #[tokio::test] - async fn update_provider_client_driven_cas_succeeds_with_correct_version() { - let state = test_server_state().await; - - // Create a provider - let mut provider = provider_with_values("test-provider", "generic"); - provider.metadata.as_mut().unwrap().id = String::new(); - handle_create_provider( - &state, - Request::new(CreateProviderRequest { - provider: Some(provider.clone()), - }), - ) - .await - .unwrap(); - - // Fetch the provider to get its current resource_version - let current = state - .store - .get_message_by_name::("test-provider") - .await - .unwrap() - .unwrap(); - let current_version = current.metadata.as_ref().unwrap().resource_version; - - // Prepare an update with the correct resource_version - let 
mut updated_provider = current.clone(); - updated_provider - .credentials - .insert("NEW_KEY".to_string(), "new-value".to_string()); - updated_provider.metadata.as_mut().unwrap().resource_version = current_version; - - // Update should succeed - let response = handle_update_provider( - &state, - Request::new(UpdateProviderRequest { - provider: Some(updated_provider.clone()), - credential_expires_at_ms: HashMap::new(), - }), - ) - .await - .unwrap() - .into_inner(); - - assert_eq!( - response.provider.as_ref().unwrap().object_name(), - "test-provider" - ); - assert_eq!( - response - .provider - .as_ref() - .unwrap() - .metadata - .as_ref() - .unwrap() - .resource_version, - current_version + 1 - ); - assert!( - response - .provider - .unwrap() - .credentials - .contains_key("NEW_KEY") - ); - } - - #[tokio::test] - async fn update_provider_client_driven_cas_rejects_stale_version() { - let state = test_server_state().await; - - // Create a provider - let mut provider = provider_with_values("test-provider", "generic"); - provider.metadata.as_mut().unwrap().id = String::new(); - handle_create_provider( - &state, - Request::new(CreateProviderRequest { - provider: Some(provider.clone()), - }), - ) - .await - .unwrap(); - - // Fetch the current state - let current = state - .store - .get_message_by_name::("test-provider") - .await - .unwrap() - .unwrap(); - let current_version = current.metadata.as_ref().unwrap().resource_version; - - // Prepare an update with a stale resource_version - let mut stale_provider = current.clone(); - stale_provider - .credentials - .insert("NEW_KEY".to_string(), "new-value".to_string()); - stale_provider.metadata.as_mut().unwrap().resource_version = 99; // stale version - - // Update should fail with ABORTED - let err = handle_update_provider( - &state, - Request::new(UpdateProviderRequest { - provider: Some(stale_provider), - credential_expires_at_ms: HashMap::new(), - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), 
Code::Aborted); - assert!( - err.message().contains("modified concurrently") - || err.message().contains("resource_version"), - "error message should mention concurrency conflict: {}", - err.message() - ); - - // Verify the provider was not modified - let unchanged = state - .store - .get_message_by_name::("test-provider") - .await - .unwrap() - .unwrap(); - assert_eq!( - unchanged.metadata.as_ref().unwrap().resource_version, - current_version - ); - assert!(!unchanged.credentials.contains_key("NEW_KEY")); - } - - #[tokio::test] - async fn update_provider_concurrent_updates_with_stale_versions() { - use std::sync::Arc; - - let state = Arc::new(test_server_state().await); - - // Create a provider - let mut provider = provider_with_values("test-provider", "generic"); - provider.metadata.as_mut().unwrap().id = String::new(); - handle_create_provider( - &state, - Request::new(CreateProviderRequest { - provider: Some(provider.clone()), - }), - ) - .await - .unwrap(); - - // All three clients fetch the provider and see the same version - let initial = state - .store - .get_message_by_name::("test-provider") - .await - .unwrap() - .unwrap(); - let initial_version = initial.metadata.as_ref().unwrap().resource_version; - - // Launch 3 concurrent updates, all using the same initial version - let mut handles = vec![]; - for i in 0..3 { - let state_clone = Arc::clone(&state); - let mut updated = initial.clone(); - updated - .credentials - .insert(format!("KEY_{i}"), format!("value-{i}")); - updated.metadata.as_mut().unwrap().resource_version = initial_version; - - let handle = tokio::spawn(async move { - handle_update_provider( - &state_clone, - Request::new(UpdateProviderRequest { - provider: Some(updated), - credential_expires_at_ms: HashMap::new(), - }), - ) - .await - }); - handles.push(handle); - } - - let results: Vec<_> = futures::future::join_all(handles) - .await - .into_iter() - .map(|r| r.unwrap()) - .collect(); - - // Only one should succeed; others should get 
ABORTED - let successes = results.iter().filter(|r| r.is_ok()).count(); - let aborted_conflicts = results - .iter() - .filter(|r| r.as_ref().err().is_some_and(|e| e.code() == Code::Aborted)) - .count(); - - assert_eq!( - successes, 1, - "exactly one update should succeed with client-driven CAS" - ); - assert_eq!( - aborted_conflicts, 2, - "two updates should fail with ABORTED due to stale version" - ); - - // Final provider should have exactly 1 new credential key and resource_version = initial_version + 1 - let final_provider = state - .store - .get_message_by_name::("test-provider") - .await - .unwrap() - .unwrap(); - assert_eq!( - final_provider.metadata.as_ref().unwrap().resource_version, - initial_version + 1 - ); - - // Exactly one of KEY_0, KEY_1, or KEY_2 should be present - let new_keys_count = (0..3) - .filter(|i| final_provider.credentials.contains_key(&format!("KEY_{i}"))) - .count(); - assert_eq!(new_keys_count, 1); - } } diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 4978687ed..05fea9b7c 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -10,8 +10,9 @@ #![allow(clippy::cast_possible_wrap)] // Intentional u32->i32 conversions for proto compat use crate::ServerState; -use crate::persistence::{ObjectType, WriteCondition, generate_name}; +use crate::persistence::{ObjectType, generate_name}; use futures::future; +use openshell_core::ObjectId; use openshell_core::proto::{ AttachSandboxProviderRequest, AttachSandboxProviderResponse, CreateSandboxRequest, CreateSshSessionRequest, CreateSshSessionResponse, DeleteSandboxRequest, DeleteSandboxResponse, @@ -23,12 +24,10 @@ use openshell_core::proto::{ TcpRelayTarget, WatchSandboxRequest, relay_open, tcp_forward_init, }; use openshell_core::proto::{Sandbox, SandboxPhase, SandboxTemplate, SshSession}; -use openshell_core::{ObjectId, ObjectName}; use prost::Message; use std::net::IpAddr; use 
std::pin::Pin; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; use tokio::net::{TcpListener, TcpStream}; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; @@ -38,9 +37,7 @@ use tracing::{debug, info, warn}; use russh::ChannelMsg; use russh::client::AuthResult; -use super::provider::{ - get_provider_record, is_valid_env_key, validate_provider_environment_keys_unique, -}; +use super::provider::{get_provider_record, is_valid_env_key}; use super::validation::{ level_matches, source_matches, validate_exec_request_fields, validate_policy_safety, validate_sandbox_spec, @@ -56,6 +53,42 @@ const TCP_FORWARD_CHUNK_SIZE: usize = 64 * 1024; pub(super) async fn handle_create_sandbox( state: &Arc, request: Request, +) -> Result, Status> { + let create_request = request.get_ref().clone(); + let result = handle_create_sandbox_inner(state, request).await; + emit_sandbox_create_telemetry( + &create_request, + if result.is_ok() { "success" } else { "failure" }, + ); + result +} + +fn emit_sandbox_create_telemetry(request: &CreateSandboxRequest, outcome: &str) { + let Some(spec) = request.spec.as_ref() else { + openshell_core::telemetry::emit_sandbox_create(outcome, false, 0, false, "undefined"); + return; + }; + let template_source = if spec + .template + .as_ref() + .is_some_and(|template| !template.image.trim().is_empty()) + { + "image" + } else { + "default" + }; + openshell_core::telemetry::emit_sandbox_create( + outcome, + spec.gpu, + spec.providers.len() as u64, + spec.policy.is_some(), + template_source, + ); +} + +async fn handle_create_sandbox_inner( + state: &Arc, + request: Request, ) -> Result, Status> { use crate::persistence::current_time_ms; @@ -82,7 +115,6 @@ pub(super) async fn handle_create_sandbox( .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))? 
.ok_or_else(|| Status::failed_precondition(format!("provider '{name}' not found")))?; } - validate_provider_environment_keys_unique(state.store.as_ref(), &spec.providers).await?; // Ensure the template always carries the resolved image. let mut spec = spec; @@ -113,7 +145,6 @@ pub(super) async fn handle_create_sandbox( name: name.clone(), created_at_ms: now_ms, labels: request.labels.clone(), - resource_version: 0, }), spec: Some(spec), status: None, @@ -173,21 +204,34 @@ pub(super) async fn handle_list_sandboxes( let request = request.into_inner(); let limit = clamp_limit(request.limit, 100, MAX_PAGE_SIZE); - let sandboxes: Vec = if request.label_selector.is_empty() { + // If no label selector is provided, use the unfiltered list path + let records = if request.label_selector.is_empty() { state .store - .list_messages(limit, request.offset) + .list(Sandbox::object_type(), limit, request.offset) .await .map_err(|e| Status::internal(format!("list sandboxes failed: {e}")))? } else { crate::grpc::validation::validate_label_selector(&request.label_selector)?; state .store - .list_messages_with_selector(&request.label_selector, limit, request.offset) + .list_with_selector( + Sandbox::object_type(), + &request.label_selector, + limit, + request.offset, + ) .await .map_err(|e| Status::internal(format!("list sandboxes with selector failed: {e}")))? 
}; + let mut sandboxes = Vec::with_capacity(records.len()); + for record in records { + let sandbox = Sandbox::decode(record.payload.as_slice()) + .map_err(|e| Status::internal(format!("decode sandbox failed: {e}")))?; + sandboxes.push(sandbox); + } + Ok(Response::new(ListSandboxesResponse { sandboxes })) } @@ -209,16 +253,6 @@ pub(super) async fn handle_attach_sandbox_provider( return Err(Status::invalid_argument("provider_name is required")); } - // Validate provider name would not violate sandbox spec constraints if added - // (pre-validation ensures CAS mutations preserve invariants) - if request.provider_name.len() > super::MAX_NAME_LEN { - return Err(Status::invalid_argument(format!( - "provider_name exceeds maximum length ({} > {})", - request.provider_name.len(), - super::MAX_NAME_LEN - ))); - } - get_provider_record(state.store.as_ref(), &request.provider_name) .await .map_err(|err| { @@ -233,73 +267,39 @@ pub(super) async fn handle_attach_sandbox_provider( })?; let _sandbox_sync_guard = state.compute.sandbox_sync_guard().await; - let sandbox = sandbox_by_name(state, &request.sandbox_name).await?; - let sandbox_id = sandbox + let mut sandbox = sandbox_by_name(state, &request.sandbox_name).await?; + let sandbox_name = sandbox .metadata .as_ref() - .ok_or_else(|| Status::internal("sandbox metadata is missing"))? 
- .id - .clone(); - - // Pre-check: fail fast if sandbox spec is missing (invariant violation) + .map_or_else(String::new, |metadata| metadata.name.clone()); let spec = sandbox .spec - .as_ref() - .ok_or_else(|| Status::internal("sandbox spec is missing"))?; - - // Pre-check: fail fast if already at MAX_PROVIDERS limit (avoid spurious CAS conflicts) - // Note: This is an optimization; the CAS closure rechecks after dedupe in case of races - if spec.providers.len() >= MAX_PROVIDERS - && !spec - .providers - .iter() - .any(|name| name == &request.provider_name) - { - return Err(Status::invalid_argument(format!( - "providers list exceeds maximum ({MAX_PROVIDERS})" - ))); - } - let mut candidate_spec = spec.clone(); - dedupe_provider_names(&mut candidate_spec.providers); - if !candidate_spec + .as_mut() + .ok_or_else(|| Status::failed_precondition("sandbox spec is missing"))?; + + dedupe_provider_names(&mut spec.providers); + let attached = if spec .providers .iter() .any(|name| name == &request.provider_name) { - candidate_spec.providers.push(request.provider_name.clone()); - } - validate_sandbox_spec(&request.sandbox_name, &candidate_spec)?; - validate_provider_environment_keys_unique(state.store.as_ref(), &candidate_spec.providers) - .await?; - - let provider_name = request.provider_name.clone(); - let attached = Arc::new(AtomicBool::new(false)); - let attached_clone = attached.clone(); + false + } else { + if spec.providers.len() >= MAX_PROVIDERS { + return Err(Status::invalid_argument(format!( + "providers list exceeds maximum ({MAX_PROVIDERS})" + ))); + } + spec.providers.push(request.provider_name.clone()); + true + }; + validate_sandbox_spec(&sandbox_name, spec)?; - let sandbox = state + state .store - .update_message_cas::( - &sandbox_id, - request.expected_resource_version, - |sandbox| { - let Some(ref mut spec) = sandbox.spec else { - // Spec should always exist post-creation; if missing, fail CAS to surface error - return; - }; - - 
dedupe_provider_names(&mut spec.providers); - if !spec.providers.iter().any(|name| name == &provider_name) - && spec.providers.len() < MAX_PROVIDERS - { - spec.providers.push(provider_name.clone()); - attached_clone.store(true, Ordering::Relaxed); - } - }, - ) + .put_message(&sandbox) .await - .map_err(|e| super::persistence_error_to_status(e, "attach sandbox provider"))?; - - let attached = attached.load(Ordering::Relaxed); + .map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?; info!( sandbox_name = %request.sandbox_name, @@ -323,58 +323,28 @@ pub(super) async fn handle_detach_sandbox_provider( return Err(Status::invalid_argument("provider_name is required")); } - // Validate provider name (pre-validation ensures CAS mutations preserve invariants) - if request.provider_name.len() > super::MAX_NAME_LEN { - return Err(Status::invalid_argument(format!( - "provider_name exceeds maximum length ({} > {})", - request.provider_name.len(), - super::MAX_NAME_LEN - ))); - } - let _sandbox_sync_guard = state.compute.sandbox_sync_guard().await; - let sandbox = sandbox_by_name(state, &request.sandbox_name).await?; - let sandbox_id = sandbox + let mut sandbox = sandbox_by_name(state, &request.sandbox_name).await?; + let sandbox_name = sandbox .metadata .as_ref() - .ok_or_else(|| Status::internal("sandbox metadata is missing"))? 
- .id - .clone(); - - // Pre-check: fail fast if sandbox spec is missing (invariant violation) - let _spec = sandbox + .map_or_else(String::new, |metadata| metadata.name.clone()); + let spec = sandbox .spec - .as_ref() - .ok_or_else(|| Status::internal("sandbox spec is missing"))?; + .as_mut() + .ok_or_else(|| Status::failed_precondition("sandbox spec is missing"))?; - let provider_name = request.provider_name.clone(); - let detached = Arc::new(AtomicBool::new(false)); - let detached_clone = detached.clone(); + let before_len = spec.providers.len(); + spec.providers.retain(|name| name != &request.provider_name); + let detached = spec.providers.len() != before_len; + dedupe_provider_names(&mut spec.providers); + validate_sandbox_spec(&sandbox_name, spec)?; - let sandbox = state + state .store - .update_message_cas::( - &sandbox_id, - request.expected_resource_version, - |sandbox| { - let Some(ref mut spec) = sandbox.spec else { - // Spec should always exist post-creation; if missing, fail CAS to surface error - return; - }; - - let before_len = spec.providers.len(); - spec.providers.retain(|name| name != &provider_name); - if spec.providers.len() != before_len { - detached_clone.store(true, Ordering::Relaxed); - // Only dedupe after making a change - dedupe_provider_names(&mut spec.providers); - } - }, - ) + .put_message(&sandbox) .await - .map_err(|e| super::persistence_error_to_status(e, "detach sandbox provider"))?; - - let detached = detached.load(Ordering::Relaxed); + .map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?; info!( sandbox_name = %request.sandbox_name, @@ -392,13 +362,36 @@ pub(super) async fn handle_detach_sandbox_provider( pub(super) async fn handle_delete_sandbox( state: &Arc, request: Request, +) -> Result, Status> { + let result = handle_delete_sandbox_inner(state, request).await; + let outcome = match &result { + Ok(response) if response.get_ref().deleted => "success", + _ => "failure", + }; + 
openshell_core::telemetry::emit_lifecycle("sandbox", "delete", outcome); + result +} + +async fn handle_delete_sandbox_inner( + state: &Arc, + request: Request, ) -> Result, Status> { let name = request.into_inner().name; if name.is_empty() { return Err(Status::invalid_argument("name is required")); } + let sandbox_id = state + .store + .get_message_by_name::(&name) + .await + .ok() + .flatten() + .map(|sandbox| sandbox.object_id().to_string()); let deleted = state.compute.delete_sandbox(&name).await?; + if deleted && let Some(sandbox_id) = sandbox_id { + state.telemetry.end_sandbox_session(&sandbox_id); + } info!(sandbox_name = %name, "DeleteSandbox request completed successfully"); Ok(Response::new(DeleteSandboxResponse { deleted })) } @@ -1262,7 +1255,6 @@ pub(super) async fn handle_create_ssh_session( name: generate_name(), created_at_ms: now_ms, labels: std::collections::HashMap::new(), - resource_version: 0, }), sandbox_id: req.sandbox_id.clone(), token: token.clone(), @@ -1273,17 +1265,9 @@ pub(super) async fn handle_create_ssh_session( // Ensure metadata is valid (defense in depth - should always be true for server-constructed metadata) super::validation::validate_object_metadata(session.metadata.as_ref(), "ssh_session")?; - // Use MustCreate to atomically ensure the session token is unique state .store - .put_if( - SshSession::object_type(), - &token, - session.object_name(), - &session.encode_to_vec(), - None, - WriteCondition::MustCreate, - ) + .put_message(&session) .await .map_err(|e| Status::internal(format!("persist ssh session failed: {e}")))?; @@ -1324,26 +1308,12 @@ pub(super) async fn handle_revoke_ssh_session( return Ok(Response::new(RevokeSshSessionResponse { revoked: false })); }; - let resource_version = session - .metadata - .as_ref() - .map_or(0, |metadata| metadata.resource_version); - session.revoked = true; - - // Use CAS to prevent lost updates from concurrent revocations state .store - .put_if( - SshSession::object_type(), - 
session.object_id(), - session.object_name(), - &session.encode_to_vec(), - None, - WriteCondition::MatchResourceVersion(resource_version), - ) + .put_message(&session) .await - .map_err(|e| super::persistence_error_to_status(e, "revoke ssh session"))?; + .map_err(|e| Status::internal(format!("persist ssh session failed: {e}")))?; Ok(Response::new(RevokeSshSessionResponse { revoked: true })) } @@ -2062,27 +2032,16 @@ mod tests { } fn test_provider(name: &str, provider_type: &str) -> Provider { - test_provider_with_credential_key(name, provider_type, "TOKEN") - } - - fn test_provider_with_credential_key( - name: &str, - provider_type: &str, - credential_key: &str, - ) -> Provider { Provider { metadata: Some(ObjectMeta { id: format!("provider-{name}"), name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), - credentials: std::iter::once((credential_key.to_string(), "secret".to_string())) - .collect(), + credentials: std::iter::once(("TOKEN".to_string(), "secret".to_string())).collect(), config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), } } @@ -2093,7 +2052,6 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: std::iter::once(("team".to_string(), "agents".to_string())).collect(), - resource_version: 0, }), spec: Some(openshell_core::proto::SandboxSpec { log_level: "debug".to_string(), @@ -2126,7 +2084,6 @@ mod tests { Request::new(AttachSandboxProviderRequest { sandbox_name: "work".to_string(), provider_name: "work-github".to_string(), - expected_resource_version: 0, }), ) .await @@ -2169,7 +2126,6 @@ mod tests { Request::new(AttachSandboxProviderRequest { sandbox_name: "work".to_string(), provider_name: "work-github".to_string(), - expected_resource_version: 0, }), ) .await @@ -2210,7 +2166,6 @@ mod tests { Request::new(DetachSandboxProviderRequest { sandbox_name: "work".to_string(), provider_name: "work-github".to_string(), - 
expected_resource_version: 0, }), ) .await @@ -2234,7 +2189,6 @@ mod tests { Request::new(DetachSandboxProviderRequest { sandbox_name: "work".to_string(), provider_name: "work-github".to_string(), - expected_resource_version: 0, }), ) .await @@ -2289,7 +2243,6 @@ mod tests { Request::new(AttachSandboxProviderRequest { sandbox_name: "work".to_string(), provider_name: "missing".to_string(), - expected_resource_version: 0, }), ) .await @@ -2439,690 +2392,4 @@ mod tests { Some(SandboxPhase::Ready) ); } - - #[tokio::test] - async fn create_sandbox_rejects_provider_credential_key_collisions() { - let state = test_server_state().await; - state - .store - .put_message(&test_provider("provider-a", "outlook")) - .await - .unwrap(); - state - .store - .put_message(&test_provider("provider-b", "google-drive")) - .await - .unwrap(); - - let err = handle_create_sandbox( - &state, - Request::new(CreateSandboxRequest { - name: "collision".to_string(), - spec: Some(openshell_core::proto::SandboxSpec { - providers: vec!["provider-a".to_string(), "provider-b".to_string()], - ..Default::default() - }), - labels: HashMap::new(), - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::FailedPrecondition); - assert!(err.message().contains("TOKEN")); - assert!(err.message().contains("provider-a")); - assert!(err.message().contains("provider-b")); - } - - #[tokio::test] - async fn attach_sandbox_provider_rejects_credential_key_collisions() { - let state = test_server_state().await; - state - .store - .put_message(&test_provider("provider-a", "outlook")) - .await - .unwrap(); - state - .store - .put_message(&test_provider("provider-b", "google-drive")) - .await - .unwrap(); - state - .store - .put_message(&test_sandbox("work", vec!["provider-a".to_string()])) - .await - .unwrap(); - - let err = handle_attach_sandbox_provider( - &state, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "provider-b".to_string(), - 
expected_resource_version: 0, - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::FailedPrecondition); - assert!(err.message().contains("TOKEN")); - assert!(err.message().contains("provider-a")); - assert!(err.message().contains("provider-b")); - } - - #[tokio::test] - async fn attach_sandbox_provider_accepts_at_max_providers_limit() { - let state = test_server_state().await; - - // Create MAX_PROVIDERS (32) providers - for i in 0..MAX_PROVIDERS { - state - .store - .put_message(&test_provider_with_credential_key( - &format!("provider-{i}"), - "generic", - &format!("TOKEN_{i}"), - )) - .await - .unwrap(); - } - - // Create sandbox with 31 providers already attached - let mut existing_providers = Vec::new(); - for i in 0..(MAX_PROVIDERS - 1) { - existing_providers.push(format!("provider-{i}")); - } - state - .store - .put_message(&test_sandbox("work", existing_providers)) - .await - .unwrap(); - - // Attaching the 32nd provider should succeed - let response = handle_attach_sandbox_provider( - &state, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "provider-31".to_string(), - expected_resource_version: 0, - }), - ) - .await - .unwrap() - .into_inner(); - - assert!(response.attached); - let providers = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap() - .spec - .unwrap() - .providers; - assert_eq!(providers.len(), MAX_PROVIDERS); - } - - #[tokio::test] - async fn attach_sandbox_provider_rejects_beyond_max_providers_limit() { - let state = test_server_state().await; - - // Create MAX_PROVIDERS + 1 providers - for i in 0..=MAX_PROVIDERS { - state - .store - .put_message(&test_provider_with_credential_key( - &format!("provider-{i}"), - "generic", - &format!("TOKEN_{i}"), - )) - .await - .unwrap(); - } - - // Create sandbox with MAX_PROVIDERS already attached - let mut existing_providers = Vec::new(); - for i in 0..MAX_PROVIDERS { - 
existing_providers.push(format!("provider-{i}")); - } - state - .store - .put_message(&test_sandbox("work", existing_providers)) - .await - .unwrap(); - - // Attempting to attach the 33rd provider should fail - let err = handle_attach_sandbox_provider( - &state, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "provider-32".to_string(), - expected_resource_version: 0, - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - assert!(err.message().contains("exceeds maximum")); - - // Verify sandbox was not modified - let providers = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap() - .spec - .unwrap() - .providers; - assert_eq!(providers.len(), MAX_PROVIDERS); - } - - #[tokio::test] - async fn attach_sandbox_provider_pre_validation_fails_fast() { - let state = test_server_state().await; - - // Provider name that exceeds validation limits - let long_name = "a".repeat(1000); - state - .store - .put_message(&test_provider(&long_name, "generic")) - .await - .unwrap(); - - state - .store - .put_message(&test_sandbox("work", Vec::new())) - .await - .unwrap(); - - // Should fail validation before attempting CAS - let err = handle_attach_sandbox_provider( - &state, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: long_name, - expected_resource_version: 0, - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn detach_sandbox_provider_pre_validation_rejects_invalid_names() { - let state = test_server_state().await; - state - .store - .put_message(&test_sandbox("work", vec!["valid".to_string()])) - .await - .unwrap(); - - // Provider name that exceeds validation limits - let long_name = "a".repeat(1000); - - let err = handle_detach_sandbox_provider( - &state, - Request::new(DetachSandboxProviderRequest { - sandbox_name: "work".to_string(), 
- provider_name: long_name, - expected_resource_version: 0, - }), - ) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn concurrent_create_ssh_session_prevents_duplicate_tokens() { - let state = test_server_state().await; - state - .store - .put_message(&test_sandbox("work", Vec::new())) - .await - .unwrap(); - - // Both requests try to create sessions for the same sandbox - // The token generation is random, so we can't force a collision, - // but we can verify that both succeed with different tokens - let state1 = state.clone(); - let handle1 = tokio::spawn(async move { - handle_create_ssh_session( - &state1, - Request::new(CreateSshSessionRequest { - sandbox_id: "sandbox-work".to_string(), - }), - ) - .await - }); - - let state2 = state.clone(); - let handle2 = tokio::spawn(async move { - handle_create_ssh_session( - &state2, - Request::new(CreateSshSessionRequest { - sandbox_id: "sandbox-work".to_string(), - }), - ) - .await - }); - - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // Both should succeed (tokens are random UUIDs, collision is astronomically unlikely) - assert!(result1.is_ok(), "first create should succeed"); - assert!(result2.is_ok(), "second create should succeed"); - - let token1 = result1.unwrap().into_inner().token; - let token2 = result2.unwrap().into_inner().token; - - // Tokens must be different - assert_ne!(token1, token2, "tokens should be unique"); - - // Both sessions should be in the database - let session1 = state - .store - .get_message::(&token1) - .await - .unwrap(); - let session2 = state - .store - .get_message::(&token2) - .await - .unwrap(); - assert!(session1.is_some()); - assert!(session2.is_some()); - } - - #[tokio::test] - async fn concurrent_revoke_ssh_session_handles_cas_properly() { - let state = test_server_state().await; - state - .store - .put_message(&test_sandbox("work", Vec::new())) - .await - .unwrap(); - - // 
Create a session first - let response = handle_create_ssh_session( - &state, - Request::new(CreateSshSessionRequest { - sandbox_id: "sandbox-work".to_string(), - }), - ) - .await - .unwrap(); - let token = response.into_inner().token; - - // Spawn two concurrent revocation attempts - let state1 = state.clone(); - let token1 = token.clone(); - let handle1 = tokio::spawn(async move { - handle_revoke_ssh_session( - &state1, - Request::new(RevokeSshSessionRequest { token: token1 }), - ) - .await - }); - - let state2 = state.clone(); - let token2 = token.clone(); - let handle2 = tokio::spawn(async move { - handle_revoke_ssh_session( - &state2, - Request::new(RevokeSshSessionRequest { token: token2 }), - ) - .await - }); - - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // One should succeed, one may fail with ABORTED due to CAS conflict - let successes = [&result1, &result2] - .iter() - .filter(|r| r.is_ok() && r.as_ref().unwrap().get_ref().revoked) - .count(); - - // At least one should succeed in revoking - assert!( - successes >= 1, - "at least one revocation should succeed, got: {result1:?}, {result2:?}" - ); - - // The session should be revoked in the database - let session = state.store.get_message::(&token).await.unwrap(); - assert!(session.is_some()); - assert!(session.unwrap().revoked, "session should be revoked"); - } - - // ---- CAS (Client-driven optimistic concurrency) tests ---- - - #[tokio::test] - async fn attach_sandbox_provider_client_driven_cas_succeeds_with_correct_version() { - let state = test_server_state().await; - state - .store - .put_message(&test_provider("github", "github")) - .await - .unwrap(); - state - .store - .put_message(&test_sandbox("work", Vec::new())) - .await - .unwrap(); - - // Fetch the sandbox to get its current resource_version - let sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - let current_version = 
sandbox.metadata.as_ref().unwrap().resource_version; - - // Attach with correct expected_resource_version - let response = handle_attach_sandbox_provider( - &state, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "github".to_string(), - expected_resource_version: current_version, - }), - ) - .await - .unwrap() - .into_inner(); - - assert!(response.attached); - - // Verify the resource_version incremented - let updated_sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_sandbox.metadata.as_ref().unwrap().resource_version, - current_version + 1 - ); - } - - #[tokio::test] - async fn attach_sandbox_provider_client_driven_cas_rejects_stale_version() { - let state = test_server_state().await; - state - .store - .put_message(&test_provider("github", "github")) - .await - .unwrap(); - state - .store - .put_message(&test_sandbox("work", Vec::new())) - .await - .unwrap(); - - // Get current version - let sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - let current_version = sandbox.metadata.as_ref().unwrap().resource_version; - - // Try to attach with a stale version (current_version - 1 would be 0, use 99 instead) - let err = handle_attach_sandbox_provider( - &state, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "github".to_string(), - expected_resource_version: 99, - }), - ) - .await - .unwrap_err(); - - // Should get ABORTED status for CAS conflict - assert_eq!(err.code(), tonic::Code::Aborted); - assert!( - err.message().contains("modified concurrently") - || err.message().contains("resource_version"), - "error message should mention concurrency conflict: {}", - err.message() - ); - - // Verify the sandbox was not modified - let unchanged_sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - assert_eq!( - 
unchanged_sandbox - .metadata - .as_ref() - .unwrap() - .resource_version, - current_version - ); - assert!(unchanged_sandbox.spec.unwrap().providers.is_empty()); - } - - #[tokio::test] - async fn detach_sandbox_provider_client_driven_cas_succeeds_with_correct_version() { - let state = test_server_state().await; - state - .store - .put_message(&test_provider("github", "github")) - .await - .unwrap(); - state - .store - .put_message(&test_sandbox("work", vec!["github".to_string()])) - .await - .unwrap(); - - // Fetch the sandbox to get its current resource_version - let sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - let current_version = sandbox.metadata.as_ref().unwrap().resource_version; - - // Detach with correct expected_resource_version - let response = handle_detach_sandbox_provider( - &state, - Request::new(DetachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "github".to_string(), - expected_resource_version: current_version, - }), - ) - .await - .unwrap() - .into_inner(); - - assert!(response.detached); - - // Verify the resource_version incremented - let updated_sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_sandbox.metadata.as_ref().unwrap().resource_version, - current_version + 1 - ); - } - - #[tokio::test] - async fn detach_sandbox_provider_client_driven_cas_rejects_stale_version() { - let state = test_server_state().await; - state - .store - .put_message(&test_provider("github", "github")) - .await - .unwrap(); - state - .store - .put_message(&test_sandbox("work", vec!["github".to_string()])) - .await - .unwrap(); - - // Get current version - let sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - let current_version = sandbox.metadata.as_ref().unwrap().resource_version; - - // Try to detach with a stale version - let err = handle_detach_sandbox_provider( - &state, - 
Request::new(DetachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: "github".to_string(), - expected_resource_version: 99, - }), - ) - .await - .unwrap_err(); - - // Should get ABORTED status for CAS conflict - assert_eq!(err.code(), tonic::Code::Aborted); - assert!( - err.message().contains("modified concurrently") - || err.message().contains("resource_version"), - "error message should mention concurrency conflict: {}", - err.message() - ); - - // Verify the sandbox was not modified - let unchanged_sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - assert_eq!( - unchanged_sandbox - .metadata - .as_ref() - .unwrap() - .resource_version, - current_version - ); - assert_eq!(unchanged_sandbox.spec.unwrap().providers, vec!["github"]); - } - - #[tokio::test] - async fn attach_sandbox_provider_concurrent_with_stale_versions() { - use std::sync::Arc; - - let state = Arc::new(test_server_state().await); - - // Create multiple providers - for i in 0..3 { - state - .store - .put_message(&test_provider_with_credential_key( - &format!("provider-{i}"), - "generic", - &format!("TOKEN_{i}"), - )) - .await - .unwrap(); - } - - state - .store - .put_message(&test_sandbox("work", Vec::new())) - .await - .unwrap(); - - // All three clients fetch the sandbox and see version 1 - let initial_version = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap() - .metadata - .as_ref() - .unwrap() - .resource_version; - - // Launch 3 concurrent attach operations, all using the same initial version - let mut handles = vec![]; - for i in 0..3 { - let state_clone = Arc::clone(&state); - let handle = tokio::spawn(async move { - handle_attach_sandbox_provider( - &state_clone, - Request::new(AttachSandboxProviderRequest { - sandbox_name: "work".to_string(), - provider_name: format!("provider-{i}"), - expected_resource_version: initial_version, - }), - ) - .await - }); - handles.push(handle); - } - - 
let results: Vec<_> = future::join_all(handles) - .await - .into_iter() - .map(|r| r.unwrap()) - .collect(); - - // Only one should succeed; others should get ABORTED - let successes = results.iter().filter(|r| r.is_ok()).count(); - let aborted_conflicts = results - .iter() - .filter(|r| { - r.as_ref() - .err() - .is_some_and(|e| e.code() == tonic::Code::Aborted) - }) - .count(); - - assert_eq!( - successes, 1, - "exactly one attach should succeed with client-driven CAS" - ); - assert_eq!( - aborted_conflicts, 2, - "two attaches should fail with ABORTED due to stale version" - ); - - // Final sandbox should have exactly 1 provider and resource_version = initial_version + 1 - let final_sandbox = state - .store - .get_message_by_name::("work") - .await - .unwrap() - .unwrap(); - assert_eq!(final_sandbox.spec.as_ref().unwrap().providers.len(), 1); - assert_eq!( - final_sandbox.metadata.as_ref().unwrap().resource_version, - initial_version + 1 - ); - } } diff --git a/crates/openshell-server/src/grpc/service.rs b/crates/openshell-server/src/grpc/service.rs index 101a094af..ec9405522 100644 --- a/crates/openshell-server/src/grpc/service.rs +++ b/crates/openshell-server/src/grpc/service.rs @@ -15,7 +15,7 @@ use tonic::{Request, Response, Status}; use uuid::Uuid; use crate::ServerState; -use crate::persistence::{ObjectType, WriteCondition}; +use crate::persistence::ObjectType; use crate::service_routing; const MAX_SERVICE_NAME_LEN: usize = 28; @@ -41,52 +41,29 @@ pub(super) async fn handle_expose_service( let now = super::current_time_ms(); let key = service_routing::endpoint_key(&req.sandbox, &req.service); - - // Fetch existing endpoint to determine create vs. 
update path - let existing = state + let (id, created_at_ms, created) = match state .store .get_message_by_name::(&key) .await - .map_err(|e| Status::internal(format!("fetch endpoint failed: {e}")))?; - - let (id, created_at_ms, condition, created) = if let Some(existing) = existing { - // Update path: preserve id and created_at, use CAS to prevent conflicts - let resource_version = existing - .metadata - .as_ref() - .map_or(0, |metadata| metadata.resource_version); - ( + { + Ok(Some(existing)) => ( existing.object_id().to_string(), existing .metadata .as_ref() .map_or(now, |metadata| metadata.created_at_ms), - WriteCondition::MatchResourceVersion(resource_version), false, - ) - } else { - // Create path: new id and created_at, use MustCreate to prevent races - ( - Uuid::new_v4().to_string(), - now, - WriteCondition::MustCreate, - true, - ) + ), + Ok(None) => (Uuid::new_v4().to_string(), now, true), + Err(e) => return Err(Status::internal(format!("fetch endpoint failed: {e}"))), }; - let labels_json = serde_json::to_string(&HashMap::from([( - "sandbox".to_string(), - req.sandbox.clone(), - )])) - .map_err(|e| Status::internal(format!("serialize labels failed: {e}")))?; - let endpoint = ServiceEndpoint { metadata: Some(ObjectMeta { - id: id.clone(), - name: key.clone(), + id, + name: key, created_at_ms, labels: HashMap::from([("sandbox".to_string(), req.sandbox.clone())]), - resource_version: 0, }), sandbox_id: sandbox.object_id().to_string(), sandbox_name: req.sandbox.clone(), @@ -95,24 +72,11 @@ pub(super) async fn handle_expose_service( domain: true, }; - // Single-attempt CAS write: fails with ABORTED on concurrent modification - let result = state + state .store - .put_if( - ServiceEndpoint::object_type(), - &id, - &key, - &endpoint.encode_to_vec(), - Some(&labels_json), - condition, - ) + .put_message(&endpoint) .await - .map_err(|e| super::persistence_error_to_status(e, "expose service"))?; - - let mut endpoint = endpoint; - if let Some(ref mut meta) = 
endpoint.metadata { - meta.resource_version = result.resource_version; - } + .map_err(|e| Status::internal(format!("persist endpoint failed: {e}")))?; let url = service_routing::endpoint_url(&state.config, &req.sandbox, &req.service) .unwrap_or_default(); @@ -149,20 +113,30 @@ pub(super) async fn handle_list_services( } let limit = super::clamp_limit(req.limit, 100, super::MAX_PAGE_SIZE); - let endpoints: Vec = if req.sandbox.is_empty() { - state.store.list_messages(limit, req.offset).await + let records = if req.sandbox.is_empty() { + state + .store + .list(ServiceEndpoint::object_type(), limit, req.offset) + .await } else { state .store - .list_messages_with_selector(&format!("sandbox={}", req.sandbox), limit, req.offset) + .list_with_selector( + ServiceEndpoint::object_type(), + &format!("sandbox={}", req.sandbox), + limit, + req.offset, + ) .await } .map_err(|e| Status::internal(format!("list endpoints failed: {e}")))?; - let services = endpoints - .into_iter() - .map(|ep| service_endpoint_response(state, ep)) - .collect(); + let mut services = Vec::with_capacity(records.len()); + for record in records { + let endpoint = ServiceEndpoint::decode(record.payload.as_slice()) + .map_err(|e| Status::internal(format!("decode endpoint failed: {e}")))?; + services.push(service_endpoint_response(state, endpoint)); + } Ok(Response::new(ListServicesResponse { services })) } @@ -305,7 +279,6 @@ mod tests { name: name.to_string(), created_at_ms: 1_000, labels: HashMap::new(), - resource_version: 0, }), spec: Some(openshell_core::proto::SandboxSpec::default()), phase: SandboxPhase::Ready as i32, @@ -424,142 +397,4 @@ mod tests { .into_inner(); assert!(listed.services.is_empty()); } - - #[tokio::test] - async fn concurrent_expose_service_handles_cas_properly() { - let state = test_server_state().await; - seed_sandbox(&state, "my-sandbox").await; - - // Spawn two concurrent expose_service calls for the same endpoint - let state1 = state.clone(); - let handle1 = 
tokio::spawn(async move { - handle_expose_service( - &state1, - Request::new(ExposeServiceRequest { - sandbox: "my-sandbox".to_string(), - service: "web".to_string(), - target_port: 8080, - domain: true, - }), - ) - .await - }); - - let state2 = state.clone(); - let handle2 = tokio::spawn(async move { - handle_expose_service( - &state2, - Request::new(ExposeServiceRequest { - sandbox: "my-sandbox".to_string(), - service: "web".to_string(), - target_port: 9090, - domain: true, - }), - ) - .await - }); - - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // One should succeed with MustCreate, the other may fail with ABORTED or succeed with update - let successes = [&result1, &result2].iter().filter(|r| r.is_ok()).count(); - - // At least one should succeed - assert!( - successes >= 1, - "at least one expose should succeed, got: {result1:?}, {result2:?}" - ); - - // Only one endpoint should exist - let listed = handle_list_services( - &state, - Request::new(ListServicesRequest { - sandbox: "my-sandbox".to_string(), - limit: 0, - offset: 0, - }), - ) - .await - .unwrap() - .into_inner(); - assert_eq!(listed.services.len(), 1); - } - - #[tokio::test] - async fn concurrent_expose_service_update_uses_cas() { - let state = test_server_state().await; - seed_sandbox(&state, "my-sandbox").await; - - // Create an initial endpoint - handle_expose_service( - &state, - Request::new(ExposeServiceRequest { - sandbox: "my-sandbox".to_string(), - service: "web".to_string(), - target_port: 7070, - domain: true, - }), - ) - .await - .unwrap(); - - // Spawn two concurrent updates - let state1 = state.clone(); - let handle1 = tokio::spawn(async move { - handle_expose_service( - &state1, - Request::new(ExposeServiceRequest { - sandbox: "my-sandbox".to_string(), - service: "web".to_string(), - target_port: 8080, - domain: true, - }), - ) - .await - }); - - let state2 = state.clone(); - let handle2 = tokio::spawn(async move { - handle_expose_service( - 
&state2, - Request::new(ExposeServiceRequest { - sandbox: "my-sandbox".to_string(), - service: "web".to_string(), - target_port: 9090, - domain: true, - }), - ) - .await - }); - - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // One should succeed, one may fail with ABORTED due to CAS conflict - let successes = [&result1, &result2].iter().filter(|r| r.is_ok()).count(); - - assert!( - successes >= 1, - "at least one update should succeed, got: {result1:?}, {result2:?}" - ); - - // The endpoint should have one of the new port values - let fetched = handle_get_service( - &state, - Request::new(GetServiceRequest { - sandbox: "my-sandbox".to_string(), - service: "web".to_string(), - }), - ) - .await - .unwrap() - .into_inner(); - let port = fetched.endpoint.as_ref().unwrap().target_port; - assert!( - port == 8080 || port == 9090, - "port should be one of the updated values, got {port}" - ); - assert_ne!(port, 7070, "port should not be the original value"); - } } diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 53f292053..160b7e031 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -267,25 +267,6 @@ pub(super) fn validate_provider_fields(provider: &Provider) -> Result<(), Status MAX_MAP_VALUE_LEN, "provider.config", )?; - if provider.credential_expires_at_ms.len() > MAX_PROVIDER_CREDENTIALS_ENTRIES { - return Err(Status::invalid_argument(format!( - "provider.credential_expires_at_ms exceeds maximum entries ({} > {MAX_PROVIDER_CREDENTIALS_ENTRIES})", - provider.credential_expires_at_ms.len() - ))); - } - for (key, value) in &provider.credential_expires_at_ms { - if key.len() > MAX_MAP_KEY_LEN { - return Err(Status::invalid_argument(format!( - "provider.credential_expires_at_ms key exceeds maximum length ({} > {MAX_MAP_KEY_LEN})", - key.len() - ))); - } - if *value < 0 { - return Err(Status::invalid_argument( 
- "provider.credential_expires_at_ms value must be greater than or equal to 0", - )); - } - } Ok(()) } @@ -893,12 +874,10 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials, config, - credential_expires_at_ms: HashMap::new(), } } diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 53d6265b7..50d1e8df3 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -3,7 +3,6 @@ #![allow(clippy::result_large_err)] // gRPC handlers return Result, Status> -use openshell_core::ObjectId; use openshell_core::proto::{ ClusterInferenceConfig, GetClusterInferenceRequest, GetClusterInferenceResponse, GetInferenceBundleRequest, GetInferenceBundleResponse, InferenceRoute, Provider, ResolvedRoute, @@ -12,14 +11,13 @@ use openshell_core::proto::{ }; use openshell_router::config::ResolvedRoute as RouterResolvedRoute; use openshell_router::{ValidationFailureKind, verify_backend_endpoint}; -use prost::Message as _; use std::sync::Arc; use std::time::Duration; use tonic::{Request, Response, Status}; use crate::{ ServerState, - persistence::{ObjectName, ObjectType, Store, WriteCondition, current_time_ms}, + persistence::{ObjectName, ObjectType, Store, current_time_ms}, }; #[derive(Debug)] @@ -171,7 +169,6 @@ async fn upsert_cluster_inference_route( let config = build_cluster_inference_config(&provider, model_id, timeout_secs); - // Fetch existing route to determine create vs. 
update path let existing = store .get_message_by_name::(route_name) .await @@ -179,49 +176,32 @@ async fn upsert_cluster_inference_route( let now_ms = current_time_ms(); - let (id, metadata, new_version, condition) = if let Some(existing) = existing { - // Update path: preserve metadata, increment version, use CAS - let resource_version = existing.metadata.as_ref().map_or(0, |m| m.resource_version); - ( - existing.object_id().to_string(), - existing.metadata.clone(), - existing.version.saturating_add(1), - WriteCondition::MatchResourceVersion(resource_version), - ) + let route = if let Some(existing) = existing { + InferenceRoute { + metadata: existing.metadata.clone(), + config: Some(config), + version: existing.version.saturating_add(1), + } } else { - // Create path: new metadata, version 1, use MustCreate - let new_id = uuid::Uuid::new_v4().to_string(); - let new_metadata = Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: new_id.clone(), - name: route_name.to_string(), - created_at_ms: now_ms, - labels: std::collections::HashMap::new(), - resource_version: 0, - }); - (new_id, new_metadata, 1, WriteCondition::MustCreate) - }; - - let route = InferenceRoute { - metadata, - config: Some(config), - version: new_version, + InferenceRoute { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: uuid::Uuid::new_v4().to_string(), + name: route_name.to_string(), + created_at_ms: now_ms, + labels: std::collections::HashMap::new(), + }), + config: Some(config), + version: 1, + } }; // Ensure metadata is valid (defense in depth - should always be true for server-constructed metadata) crate::grpc::validate_object_metadata(route.metadata.as_ref(), "inference_route")?; - // Single-attempt CAS write: fails with ABORTED on concurrent modification store - .put_if( - InferenceRoute::object_type(), - &id, - route_name, - &route.encode_to_vec(), - None, - condition, - ) + .put_message(&route) .await - .map_err(|e| 
crate::grpc::persistence_error_to_status(e, "upsert inference route"))?; + .map_err(|e| Status::internal(format!("persist route failed: {e}")))?; Ok(UpsertedInferenceRoute { route, validation }) } @@ -509,7 +489,6 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), config: Some(ClusterInferenceConfig { provider_name: provider_name.to_string(), @@ -527,12 +506,10 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), r#type: provider_type.to_string(), credentials: std::iter::once((key_name.to_string(), key_value.to_string())).collect(), config: std::collections::HashMap::new(), - credential_expires_at_ms: std::collections::HashMap::new(), } } @@ -688,7 +665,6 @@ mod tests { name: "openai-dev".to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), r#type: "openai".to_string(), credentials: std::iter::once(("OPENAI_API_KEY".to_string(), "sk-test".to_string())) @@ -698,7 +674,6 @@ mod tests { "https://station.example.com/v1".to_string(), )) .collect(), - credential_expires_at_ms: std::collections::HashMap::new(), }; store .put_message(&provider) @@ -711,7 +686,6 @@ mod tests { name: CLUSTER_INFERENCE_ROUTE_NAME.to_string(), created_at_ms: 1_000_000, labels: std::collections::HashMap::new(), - resource_version: 0, }), config: Some(ClusterInferenceConfig { provider_name: "openai-dev".to_string(), @@ -774,7 +748,6 @@ mod tests { credentials: std::iter::once(("OPENAI_API_KEY".to_string(), "sk-rotated".to_string())) .collect(), config: provider.config.clone(), - credential_expires_at_ms: provider.credential_expires_at_ms.clone(), }; store .put_message(&rotated_provider) @@ -1073,157 +1046,4 @@ mod tests { let err = effective_route_name("unknown-route").unwrap_err(); assert_eq!(err.code(), tonic::Code::InvalidArgument); } - - #[tokio::test] - async fn 
concurrent_upsert_route_create_uses_must_create() { - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .expect("store"); - - let provider = make_provider("openai-dev", "openai", "OPENAI_API_KEY", "sk-test"); - store.put_message(&provider).await.expect("persist"); - - // Spawn two concurrent upsert calls for the same route (create path) - let store1 = store.clone(); - let handle1 = tokio::spawn(async move { - upsert_cluster_inference_route( - &store1, - CLUSTER_INFERENCE_ROUTE_NAME, - "openai-dev", - "gpt-4o", - 0, - false, - ) - .await - }); - - let store2 = store.clone(); - let handle2 = tokio::spawn(async move { - upsert_cluster_inference_route( - &store2, - CLUSTER_INFERENCE_ROUTE_NAME, - "openai-dev", - "gpt-4.1", - 0, - false, - ) - .await - }); - - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // One should succeed with MustCreate, the other should fail - let successes = [&result1, &result2].iter().filter(|r| r.is_ok()).count(); - let failures = [&result1, &result2] - .iter() - .filter(|r| { - r.as_ref().is_err_and(|e| { - // Accept either ABORTED (from CAS) or Internal (from DB unique constraint) - e.code() == tonic::Code::Aborted - || (e.code() == tonic::Code::Internal - && e.message().contains("unique violation")) - }) - }) - .count(); - - assert_eq!( - successes, 1, - "exactly one create should succeed, got: {result1:?}, {result2:?}" - ); - assert_eq!( - failures, 1, - "exactly one create should fail, got: {result1:?}, {result2:?}" - ); - - // Only one route should exist - let route = store - .get_message_by_name::(CLUSTER_INFERENCE_ROUTE_NAME) - .await - .expect("fetch") - .expect("route should exist"); - assert_eq!(route.version, 1); - } - - #[tokio::test] - async fn concurrent_upsert_route_update_uses_cas() { - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .expect("store"); - - let provider = make_provider("openai-dev", "openai", "OPENAI_API_KEY", "sk-test"); - 
store.put_message(&provider).await.expect("persist"); - - // Create initial route - upsert_cluster_inference_route( - &store, - CLUSTER_INFERENCE_ROUTE_NAME, - "openai-dev", - "gpt-3.5", - 0, - false, - ) - .await - .expect("initial create should succeed"); - - // Spawn two concurrent updates - let store1 = store.clone(); - let handle1 = tokio::spawn(async move { - upsert_cluster_inference_route( - &store1, - CLUSTER_INFERENCE_ROUTE_NAME, - "openai-dev", - "gpt-4o", - 0, - false, - ) - .await - }); - - let store2 = store.clone(); - let handle2 = tokio::spawn(async move { - upsert_cluster_inference_route( - &store2, - CLUSTER_INFERENCE_ROUTE_NAME, - "openai-dev", - "gpt-4.1", - 0, - false, - ) - .await - }); - - let result1 = handle1.await.unwrap(); - let result2 = handle2.await.unwrap(); - - // One should succeed, one may fail with ABORTED due to CAS conflict - let successes = [&result1, &result2].iter().filter(|r| r.is_ok()).count(); - - assert!( - successes >= 1, - "at least one update should succeed, got: {result1:?}, {result2:?}" - ); - - // The route should have one of the new model values and version 2 - let route = store - .get_message_by_name::(CLUSTER_INFERENCE_ROUTE_NAME) - .await - .expect("fetch") - .expect("route should exist"); - let config = route.config.expect("config"); - assert!( - config.model_id == "gpt-4o" || config.model_id == "gpt-4.1", - "model should be one of the updated values, got {}", - config.model_id - ); - assert_ne!( - config.model_id, "gpt-3.5", - "model should not be the original value" - ); - assert!( - route.version >= 2 && route.version <= 3, - "version should be 2 (one update won, one conflicted) or 3 (both succeeded sequentially), got {}", - route.version - ); - } } diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 220e45026..61fb65506 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -24,19 +24,18 @@ pub mod certgen; pub mod cli; mod compute; 
pub mod config_file; -mod defaults; mod grpc; mod http; mod inference; mod multiplex; mod persistence; pub(crate) mod policy_store; -mod provider_refresh; mod sandbox_index; mod sandbox_watch; mod service_routing; mod ssh_sessions; pub mod supervisor_session; +mod telemetry; mod tls; pub mod tracing_bus; mod ws_tunnel; @@ -84,6 +83,9 @@ pub struct ServerState { /// In-memory bus for server process logs. pub tracing_log_bus: TracingLogBus, + /// In-memory anonymous telemetry accounting for active sandbox sessions. + pub(crate) telemetry: telemetry::TelemetryState, + /// Active SSH tunnel connection counts per session token. pub ssh_connections_by_token: Mutex>, @@ -144,6 +146,7 @@ impl ServerState { sandbox_index, sandbox_watch_bus, tracing_log_bus, + telemetry: telemetry::TelemetryState::new(), ssh_connections_by_token: Mutex::new(HashMap::new()), ssh_connections_by_sandbox: Mutex::new(HashMap::new()), settings_mutex: tokio::sync::Mutex::new(()), @@ -228,7 +231,6 @@ pub async fn run_server( state.compute.spawn_watchers(); ssh_sessions::spawn_session_reaper(store.clone(), Duration::from_secs(3600)); supervisor_session::spawn_relay_reaper(state.clone(), Duration::from_secs(30)); - provider_refresh::spawn_refresh_worker(state.clone(), Duration::from_secs(60)); // Create the multiplexed service let service = MultiplexService::new(state.clone()); @@ -628,7 +630,6 @@ async fn build_compute_runtime( ComputeDriverKind::Podman => { let mut podman = podman_config_from_file(file)?; podman.gateway_port = config.bind_address.port(); - apply_podman_local_tls_defaults(config, &mut podman)?; ComputeRuntime::new_podman( podman, @@ -681,29 +682,6 @@ fn podman_config_from_file( .map_err(|e| Error::config(format!("invalid [openshell.drivers.podman] table: {e}"))) } -fn apply_podman_local_tls_defaults( - config: &Config, - podman: &mut openshell_driver_podman::PodmanComputeConfig, -) -> Result<()> { - if config.tls.is_none() - || podman.guest_tls_ca.is_some() - || 
podman.guest_tls_cert.is_some() - || podman.guest_tls_key.is_some() - { - return Ok(()); - } - - let Some(paths) = defaults::complete_local_tls_paths() - .map_err(|e| Error::config(format!("failed to resolve local TLS defaults: {e}")))? - else { - return Ok(()); - }; - podman.guest_tls_ca = Some(paths.ca); - podman.guest_tls_cert = Some(paths.client_cert); - podman.guest_tls_key = Some(paths.client_key); - Ok(()) -} - fn configured_compute_driver(config: &Config) -> Result { match config.compute_drivers.as_slice() { [] => match openshell_core::config::detect_driver() { diff --git a/crates/openshell-server/src/persistence/mod.rs b/crates/openshell-server/src/persistence/mod.rs index 32875a9f9..87aa86581 100644 --- a/crates/openshell-server/src/persistence/mod.rs +++ b/crates/openshell-server/src/persistence/mod.rs @@ -45,10 +45,6 @@ pub enum PersistenceError { detail: Option, constraint_msg: String, }, - #[error("resource version conflict: expected version does not match current")] - Conflict { - current_resource_version: Option, - }, } impl PersistenceError { @@ -86,28 +82,6 @@ pub struct ObjectRecord { pub updated_at_ms: i64, /// JSON-serialized labels (key-value pairs). pub labels: Option, - /// Optimistic concurrency control version. - /// Incremented on each update for compare-and-swap operations. - pub resource_version: u64, -} - -/// Write condition for compare-and-swap operations. -#[derive(Debug, Clone, Copy)] -pub enum WriteCondition { - /// Object must not exist (insert only). - MustCreate, - /// Object must exist with the specified resource version (update only). - MatchResourceVersion(u64), - /// Unconditional write (insert or update). - Unconditional, -} - -/// Result of a successful write operation. -#[derive(Debug, Clone)] -pub struct WriteResult { - pub resource_version: u64, - pub created_at_ms: i64, - pub updated_at_ms: i64, } /// Persistence store implementations. 
@@ -124,9 +98,7 @@ pub trait ObjectType { // Import object metadata accessor traits from openshell-core // (implementations for all proto types are in openshell-core::metadata) -pub use openshell_core::{ - GetResourceVersion, ObjectId, ObjectLabels, ObjectName, SetResourceVersion, -}; +pub use openshell_core::{ObjectId, ObjectLabels, ObjectName}; /// Generate a random 6-character lowercase alphabetic name. pub fn generate_name() -> String { @@ -164,95 +136,18 @@ impl Store { } } - /// Insert or update a generic object with compare-and-swap support. - /// - /// # Arguments - /// * `object_type` - Type discriminator for the object - /// * `id` - Stable object identifier - /// * `name` - Human-readable object name - /// * `payload` - Serialized object data - /// * `labels` - Optional JSON-serialized labels - /// * `condition` - Write precondition (`MustCreate`, `MatchResourceVersion`, or `Unconditional`) - /// - /// # Returns - /// * `Ok(WriteResult)` - Write succeeded with new `resource_version` and timestamps - /// * `Err(Conflict)` - Resource version mismatch (for `MatchResourceVersion`) - /// * `Err(UniqueViolation)` - Object already exists (for `MustCreate`) or name conflict - pub async fn put_if( - &self, - object_type: &str, - id: &str, - name: &str, - payload: &[u8], - labels: Option<&str>, - condition: WriteCondition, - ) -> PersistenceResult { - match self { - Self::Postgres(store) => { - store - .put_if(object_type, id, name, payload, labels, condition) - .await - } - Self::Sqlite(store) => { - store - .put_if(object_type, id, name, payload, labels, condition) - .await - } - } - } - - /// Delete an object by id with compare-and-swap support. 
- /// - /// # Arguments - /// * `object_type` - Type discriminator for the object - /// * `id` - Stable object identifier - /// * `expected_resource_version` - Required resource version for the delete to proceed - /// - /// # Returns - /// * `Ok(true)` - Object was deleted - /// * `Ok(false)` - Object not found - /// * `Err(Conflict)` - Resource version mismatch - pub async fn delete_if( - &self, - object_type: &str, - id: &str, - expected_resource_version: u64, - ) -> PersistenceResult { - match self { - Self::Postgres(store) => { - store - .delete_if(object_type, id, expected_resource_version) - .await - } - Self::Sqlite(store) => { - store - .delete_if(object_type, id, expected_resource_version) - .await - } - } - } - - /// Insert or update a generic named object with an application-owned scope. - pub async fn put_scoped( + /// Insert or update a generic named object. + pub async fn put( &self, object_type: &str, id: &str, name: &str, - scope: &str, payload: &[u8], labels: Option<&str>, ) -> PersistenceResult<()> { match self { - Self::Postgres(store) => { - store - .put_scoped(object_type, id, name, scope, payload, labels) - .await - } - Self::Sqlite(store) => { - store - .put_scoped(object_type, id, name, scope, payload, labels) - .await - } + Self::Postgres(store) => store.put(object_type, id, name, payload, labels).await, + Self::Sqlite(store) => store.put(object_type, id, name, payload, labels).await, } } @@ -309,20 +204,6 @@ impl Store { } } - /// List objects by type and application-owned scope. - pub async fn list_by_scope( - &self, - object_type: &str, - scope: &str, - limit: u32, - offset: u32, - ) -> PersistenceResult> { - match self { - Self::Postgres(store) => store.list_by_scope(object_type, scope, limit, offset).await, - Self::Sqlite(store) => store.list_by_scope(object_type, scope, limit, offset).await, - } - } - /// List objects by type with label selector filtering. 
/// Label selector format: "key1=value1,key2=value2" (comma-separated equality matches). pub async fn list_with_selector( @@ -350,14 +231,12 @@ impl Store { // Generic protobuf message helpers // ----------------------------------------------------------------------- - /// Insert or update a protobuf message under an application-owned scope. - pub async fn put_scoped_message< - T: Message + ObjectType + ObjectId + ObjectName + ObjectLabels, - >( + /// Insert or update a protobuf message using its inferred object type, id, and name. + pub async fn put_message( &self, message: &T, - scope: &str, ) -> PersistenceResult<()> { + // Serialize labels to JSON let labels_map = message.object_labels(); let labels_json = if labels_map.as_ref().is_none_or(HashMap::is_empty) { None @@ -367,11 +246,10 @@ impl Store { })?) }; - self.put_scoped( + self.put( T::object_type(), message.object_id(), message.object_name(), - scope, &message.encode_to_vec(), labels_json.as_deref(), ) @@ -379,7 +257,7 @@ impl Store { } /// Fetch and decode a protobuf message by id. - pub async fn get_message( + pub async fn get_message( &self, id: &str, ) -> PersistenceResult> { @@ -388,17 +266,13 @@ impl Store { return Ok(None); }; - let mut message = T::decode(record.payload.as_slice()) - .map_err(|e| PersistenceError::Decode(format!("protobuf decode error: {e}")))?; - - // Hydrate resource_version from DB row (authoritative source) - message.set_resource_version(record.resource_version); - - Ok(Some(message)) + T::decode(record.payload.as_slice()) + .map(Some) + .map_err(|e| PersistenceError::Decode(format!("protobuf decode error: {e}"))) } /// Fetch and decode a protobuf message by name. 
- pub async fn get_message_by_name( + pub async fn get_message_by_name( &self, name: &str, ) -> PersistenceResult> { @@ -407,142 +281,9 @@ impl Store { return Ok(None); }; - let mut message = T::decode(record.payload.as_slice()) - .map_err(|e| PersistenceError::Decode(format!("protobuf decode error: {e}")))?; - - // Hydrate resource_version from DB row (authoritative source) - message.set_resource_version(record.resource_version); - - Ok(Some(message)) - } - - /// List and decode protobuf messages, hydrating `resource_version` from - /// the authoritative DB row (mirrors `get_message`). - pub async fn list_messages( - &self, - limit: u32, - offset: u32, - ) -> PersistenceResult> { - let records = self.list(T::object_type(), limit, offset).await?; - let mut messages = Vec::with_capacity(records.len()); - for record in records { - let mut message = T::decode(record.payload.as_slice()) - .map_err(|e| PersistenceError::Decode(format!("protobuf decode error: {e}")))?; - message.set_resource_version(record.resource_version); - messages.push(message); - } - Ok(messages) - } - - /// List and decode protobuf messages with label selector filtering, - /// hydrating `resource_version` from the authoritative DB row. - pub async fn list_messages_with_selector< - T: Message + Default + ObjectType + SetResourceVersion, - >( - &self, - label_selector: &str, - limit: u32, - offset: u32, - ) -> PersistenceResult> { - let records = self - .list_with_selector(T::object_type(), label_selector, limit, offset) - .await?; - let mut messages = Vec::with_capacity(records.len()); - for record in records { - let mut message = T::decode(record.payload.as_slice()) - .map_err(|e| PersistenceError::Decode(format!("protobuf decode error: {e}")))?; - message.set_resource_version(record.resource_version); - messages.push(message); - } - Ok(messages) - } - - /// Update a protobuf message using CAS (compare-and-swap). 
- /// - /// Fetches the current object, validates the expected version, applies the - /// mutation function, and attempts a single CAS write. Returns Conflict on - /// version mismatch for caller-driven retry. - /// - /// # Arguments - /// * `id` - Object ID to update - /// * `expected_version` - Required resource version for the update to proceed. - /// Pass 0 to use the current version (internal operations only). - /// For client-facing operations, pass the client-provided expected version. - /// * `mutate` - Function that modifies the object in place - /// - /// # Returns - /// * `Ok(T)` - Successfully updated object with new `resource_version` - /// * `Err(Conflict)` - Version mismatch; caller should retry - /// * `Err(Database)` - Object not found or other DB error - pub async fn update_message_cas( - &self, - id: &str, - expected_version: u64, - mut mutate: F, - ) -> PersistenceResult - where - T: Message - + Default - + ObjectType - + ObjectId - + ObjectName - + ObjectLabels - + SetResourceVersion - + GetResourceVersion - + Clone, - F: FnMut(&mut T), - { - // Fetch current object with authoritative resource_version - let current = self - .get_message::(id) - .await? 
- .ok_or_else(|| PersistenceError::Database(format!("object {id} not found")))?; - - let current_version = current.get_resource_version(); - - // Determine the version to use for CAS: - // - If expected_version is 0, use current version (internal operations) - // - Otherwise, validate that expected matches current (client-facing operations) - let cas_version = if expected_version == 0 { - current_version - } else { - if expected_version != current_version { - return Err(PersistenceError::Conflict { - current_resource_version: Some(current_version), - }); - } - expected_version - }; - - // Apply mutation - let mut updated = current.clone(); - mutate(&mut updated); - - // Serialize labels - let labels_map = updated.object_labels(); - let labels_json = if labels_map.as_ref().is_none_or(HashMap::is_empty) { - None - } else { - Some(serde_json::to_string(&labels_map).map_err(|e| { - PersistenceError::Encode(format!("failed to serialize labels: {e}")) - })?) - }; - - // Single-attempt CAS write - fails with Conflict on version mismatch - let result = self - .put_if( - T::object_type(), - updated.object_id(), - updated.object_name(), - &updated.encode_to_vec(), - labels_json.as_deref(), - WriteCondition::MatchResourceVersion(cas_version), - ) - .await?; - - // Success - hydrate the new resource_version and return - updated.set_resource_version(result.resource_version); - Ok(updated) + T::decode(record.payload.as_slice()) + .map(Some) + .map_err(|e| PersistenceError::Decode(format!("protobuf decode error: {e}"))) } } @@ -622,48 +363,5 @@ pub fn parse_label_selector(selector: &str) -> PersistenceResult, - ) -> PersistenceResult<()> { - match self { - Self::Postgres(store) => store.put(object_type, id, name, payload, labels).await, - Self::Sqlite(store) => store.put(object_type, id, name, payload, labels).await, - } - } - - pub async fn put_message( - &self, - message: &T, - ) -> PersistenceResult<()> { - let labels_map = message.object_labels(); - let labels_json = if 
labels_map.as_ref().is_none_or(HashMap::is_empty) { - None - } else { - Some(serde_json::to_string(&labels_map).map_err(|e| { - PersistenceError::Encode(format!("failed to serialize labels: {e}")) - })?) - }; - self.put( - T::object_type(), - message.object_id(), - message.object_name(), - &message.encode_to_vec(), - labels_json.as_deref(), - ) - .await - } -} - #[cfg(test)] mod tests; diff --git a/crates/openshell-server/src/persistence/postgres.rs b/crates/openshell-server/src/persistence/postgres.rs index 8399fd734..751d70073 100644 --- a/crates/openshell-server/src/persistence/postgres.rs +++ b/crates/openshell-server/src/persistence/postgres.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use super::{ - DraftChunkRecord, ObjectRecord, PersistenceError, PersistenceResult, PolicyRecord, - WriteCondition, WriteResult, current_time_ms, map_db_error, map_migrate_error, + DraftChunkRecord, ObjectRecord, PersistenceResult, PolicyRecord, current_time_ms, map_db_error, + map_migrate_error, }; use crate::policy_store::{ draft_chunk_payload_from_record, draft_chunk_record_from_parts, policy_payload_from_record, @@ -51,7 +51,7 @@ impl PostgresStore { let labels_jsonb: Option = labels .map(serde_json::from_str) .transpose() - .map_err(|e| PersistenceError::Encode(format!("invalid labels JSON: {e}")))?; + .map_err(|e| super::PersistenceError::Encode(format!("invalid labels JSON: {e}")))?; sqlx::query( r" @@ -75,197 +75,6 @@ ON CONFLICT (object_type, name) WHERE name IS NOT NULL DO UPDATE SET Ok(()) } - pub async fn put_if( - &self, - object_type: &str, - id: &str, - name: &str, - payload: &[u8], - labels: Option<&str>, - condition: WriteCondition, - ) -> PersistenceResult { - let now_ms = current_time_ms(); - let labels_jsonb: Option = labels - .map(serde_json::from_str) - .transpose() - .map_err(|e| PersistenceError::Encode(format!("invalid labels JSON: {e}")))?; - - match condition { - WriteCondition::MustCreate => { - // Insert only - fail if object exists - 
let row = sqlx::query( - r" -INSERT INTO objects (object_type, id, name, payload, created_at_ms, updated_at_ms, labels, resource_version) -VALUES ($1, $2, $3, $4, $5, $5, COALESCE($6, '{}'::jsonb), 1) -RETURNING resource_version, created_at_ms, updated_at_ms -", - ) - .bind(object_type) - .bind(id) - .bind(name) - .bind(payload) - .bind(now_ms) - .bind(labels_jsonb) - .fetch_one(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - let resource_version_i64: i64 = row.try_get("resource_version").unwrap_or(1); - Ok(WriteResult { - resource_version: resource_version_i64.max(1).cast_unsigned(), - created_at_ms: row.get("created_at_ms"), - updated_at_ms: row.get("updated_at_ms"), - }) - } - WriteCondition::MatchResourceVersion(expected_version) => { - // Update with version check using RETURNING - let row_result = sqlx::query( - r" -UPDATE objects -SET payload = $4, labels = COALESCE($5, '{}'::jsonb), updated_at_ms = $6, resource_version = resource_version + 1 -WHERE object_type = $1 AND id = $2 AND resource_version = $3 -RETURNING resource_version, created_at_ms, updated_at_ms -", - ) - .bind(object_type) - .bind(id) - .bind(i64::try_from(expected_version).unwrap_or(i64::MAX)) - .bind(payload) - .bind(labels_jsonb) - .bind(now_ms) - .fetch_optional(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - if let Some(row) = row_result { - let resource_version_i64: i64 = row.try_get("resource_version").unwrap_or(1); - Ok(WriteResult { - resource_version: resource_version_i64.max(1).cast_unsigned(), - created_at_ms: row.get("created_at_ms"), - updated_at_ms: row.get("updated_at_ms"), - }) - } else { - // Check if object exists to distinguish NotFound from Conflict - let existing = self.get(object_type, id).await?; - if let Some(record) = existing { - Err(PersistenceError::Conflict { - current_resource_version: Some(record.resource_version), - }) - } else { - Err(PersistenceError::Database(format!( - "object not found: {object_type}/{id}" - ))) - } - } - } - 
WriteCondition::Unconditional => { - // Unconditional upsert by name - let row = sqlx::query( - r" -INSERT INTO objects (object_type, id, name, payload, created_at_ms, updated_at_ms, labels, resource_version) -VALUES ($1, $2, $3, $4, $5, $5, COALESCE($6, '{}'::jsonb), 1) -ON CONFLICT (object_type, name) WHERE name IS NOT NULL DO UPDATE SET - payload = EXCLUDED.payload, - updated_at_ms = EXCLUDED.updated_at_ms, - labels = EXCLUDED.labels, - resource_version = objects.resource_version + 1 -RETURNING resource_version, created_at_ms, updated_at_ms -", - ) - .bind(object_type) - .bind(id) - .bind(name) - .bind(payload) - .bind(now_ms) - .bind(labels_jsonb) - .fetch_one(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - let resource_version_i64: i64 = row.try_get("resource_version").unwrap_or(1); - Ok(WriteResult { - resource_version: resource_version_i64.max(1).cast_unsigned(), - created_at_ms: row.get("created_at_ms"), - updated_at_ms: row.get("updated_at_ms"), - }) - } - } - } - - pub async fn delete_if( - &self, - object_type: &str, - id: &str, - expected_resource_version: u64, - ) -> PersistenceResult { - let result = sqlx::query( - r" -DELETE FROM objects -WHERE object_type = $1 AND id = $2 AND resource_version = $3 -", - ) - .bind(object_type) - .bind(id) - .bind(i64::try_from(expected_resource_version).unwrap_or(i64::MAX)) - .execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - if result.rows_affected() > 0 { - Ok(true) - } else { - // Check if object exists to distinguish NotFound from Conflict - let existing = self.get(object_type, id).await?; - if let Some(record) = existing { - Err(PersistenceError::Conflict { - current_resource_version: Some(record.resource_version), - }) - } else { - Ok(false) - } - } - } - - pub async fn put_scoped( - &self, - object_type: &str, - id: &str, - name: &str, - scope: &str, - payload: &[u8], - labels: Option<&str>, - ) -> PersistenceResult<()> { - let now_ms = current_time_ms(); - let labels_jsonb: 
Option = labels - .map(serde_json::from_str) - .transpose() - .map_err(|e| PersistenceError::Encode(format!("invalid labels JSON: {e}")))?; - - sqlx::query( - r" -INSERT INTO objects (object_type, id, name, scope, payload, created_at_ms, updated_at_ms, labels, resource_version) -VALUES ($1, $2, $3, $4, $5, $6, $6, COALESCE($7, '{}'::jsonb), 1) -ON CONFLICT (object_type, name) WHERE name IS NOT NULL DO UPDATE SET - scope = EXCLUDED.scope, - payload = EXCLUDED.payload, - updated_at_ms = EXCLUDED.updated_at_ms, - labels = EXCLUDED.labels, - resource_version = objects.resource_version + 1 -", - ) - .bind(object_type) - .bind(id) - .bind(name) - .bind(scope) - .bind(payload) - .bind(now_ms) - .bind(labels_jsonb) - .execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - Ok(()) - } - pub async fn get( &self, object_type: &str, @@ -273,7 +82,7 @@ ON CONFLICT (object_type, name) WHERE name IS NOT NULL DO UPDATE SET ) -> PersistenceResult> { let row = sqlx::query( r" -SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels, resource_version +SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels FROM objects WHERE object_type = $1 AND id = $2 ", @@ -294,7 +103,7 @@ WHERE object_type = $1 AND id = $2 ) -> PersistenceResult> { let row = sqlx::query( r" -SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels, resource_version +SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels FROM objects WHERE object_type = $1 AND name = $2 ", @@ -336,7 +145,7 @@ WHERE object_type = $1 AND name = $2 ) -> PersistenceResult> { let rows = sqlx::query( r" -SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels, resource_version +SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels FROM objects WHERE object_type = $1 ORDER BY created_at_ms ASC, name ASC @@ -353,33 +162,6 @@ LIMIT $2 OFFSET $3 Ok(rows.into_iter().map(row_to_object_record).collect()) } - pub 
async fn list_by_scope( - &self, - object_type: &str, - scope: &str, - limit: u32, - offset: u32, - ) -> PersistenceResult> { - let rows = sqlx::query( - r" -SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels -FROM objects -WHERE object_type = $1 AND scope = $2 -ORDER BY created_at_ms ASC, name ASC -LIMIT $3 OFFSET $4 -", - ) - .bind(object_type) - .bind(scope) - .bind(i64::from(limit)) - .bind(i64::from(offset)) - .fetch_all(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - Ok(rows.into_iter().map(row_to_object_record).collect()) - } - pub async fn list_with_selector( &self, object_type: &str, @@ -390,12 +172,13 @@ LIMIT $3 OFFSET $4 use super::parse_label_selector; let required_labels = parse_label_selector(label_selector)?; - let labels_jsonb = serde_json::to_value(&required_labels) - .map_err(|e| PersistenceError::Encode(format!("failed to serialize labels: {e}")))?; + let labels_jsonb = serde_json::to_value(&required_labels).map_err(|e| { + super::PersistenceError::Encode(format!("failed to serialize labels: {e}")) + })?; let rows = sqlx::query( r" -SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels, resource_version +SELECT object_type, id, name, payload, created_at_ms, updated_at_ms, labels FROM objects WHERE object_type = $1 AND labels @> $2 ORDER BY created_at_ms ASC, name ASC @@ -827,7 +610,6 @@ WHERE object_type = $1 AND scope = $2 fn row_to_object_record(row: sqlx::postgres::PgRow) -> ObjectRecord { let labels_jsonb: Option = row.get("labels"); - let resource_version_i64: i64 = row.try_get("resource_version").unwrap_or(1); ObjectRecord { object_type: row.get("object_type"), id: row.get("id"), @@ -836,7 +618,6 @@ fn row_to_object_record(row: sqlx::postgres::PgRow) -> ObjectRecord { created_at_ms: row.get("created_at_ms"), updated_at_ms: row.get("updated_at_ms"), labels: labels_jsonb.map(|value| value.to_string()), - resource_version: resource_version_i64.max(1).cast_unsigned(), } } diff --git 
a/crates/openshell-server/src/persistence/sqlite.rs b/crates/openshell-server/src/persistence/sqlite.rs index bdfadc8b0..04c4d8d8a 100644 --- a/crates/openshell-server/src/persistence/sqlite.rs +++ b/crates/openshell-server/src/persistence/sqlite.rs @@ -3,7 +3,7 @@ use super::{ DraftChunkRecord, ObjectRecord, PersistenceError, PersistenceResult, PolicyRecord, - WriteCondition, WriteResult, current_time_ms, map_db_error, map_migrate_error, + current_time_ms, map_db_error, map_migrate_error, }; use crate::policy_store::{ draft_chunk_payload_from_record, draft_chunk_record_from_parts, policy_payload_from_record, @@ -97,191 +97,6 @@ ON CONFLICT ("object_type", "name") WHERE "name" IS NOT NULL DO UPDATE SET Ok(()) } - pub async fn put_if( - &self, - object_type: &str, - id: &str, - name: &str, - payload: &[u8], - labels: Option<&str>, - condition: WriteCondition, - ) -> PersistenceResult { - let now_ms = current_time_ms(); - - match condition { - WriteCondition::MustCreate => { - // Insert only - fail if object exists - sqlx::query( - r#" -INSERT INTO "objects" ("object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels", "resource_version") -VALUES (?1, ?2, ?3, ?4, ?5, ?5, ?6, 1) -"#, - ) - .bind(object_type) - .bind(id) - .bind(name) - .bind(payload) - .bind(now_ms) - .bind(labels.unwrap_or("{}")) - .execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - Ok(WriteResult { - resource_version: 1, - created_at_ms: now_ms, - updated_at_ms: now_ms, - }) - } - WriteCondition::MatchResourceVersion(expected_version) => { - // Update with version check - let result = sqlx::query( - r#" -UPDATE "objects" -SET "payload" = ?4, "labels" = ?5, "updated_at_ms" = ?6, "resource_version" = "resource_version" + 1 -WHERE "object_type" = ?1 AND "id" = ?2 AND "resource_version" = ?3 -"#, - ) - .bind(object_type) - .bind(id) - .bind(i64::try_from(expected_version).unwrap_or(i64::MAX)) - .bind(payload) - .bind(labels.unwrap_or("{}")) - .bind(now_ms) - 
.execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - if result.rows_affected() == 0 { - // Check if object exists to distinguish NotFound from Conflict - let existing = self.get(object_type, id).await?; - if let Some(record) = existing { - return Err(PersistenceError::Conflict { - current_resource_version: Some(record.resource_version), - }); - } - return Err(PersistenceError::Database(format!( - "object not found: {object_type}/{id}" - ))); - } - - // Fetch the updated record to get the new resource_version - let updated = self.get(object_type, id).await?.ok_or_else(|| { - PersistenceError::Database("object disappeared after update".to_string()) - })?; - - Ok(WriteResult { - resource_version: updated.resource_version, - created_at_ms: updated.created_at_ms, - updated_at_ms: updated.updated_at_ms, - }) - } - WriteCondition::Unconditional => { - // Unconditional upsert by name - sqlx::query( - r#" -INSERT INTO "objects" ("object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels", "resource_version") -VALUES (?1, ?2, ?3, ?4, ?5, ?5, ?6, 1) -ON CONFLICT ("object_type", "name") WHERE "name" IS NOT NULL DO UPDATE SET - "payload" = excluded."payload", - "updated_at_ms" = excluded."updated_at_ms", - "labels" = excluded."labels", - "resource_version" = "objects"."resource_version" + 1 -"#, - ) - .bind(object_type) - .bind(id) - .bind(name) - .bind(payload) - .bind(now_ms) - .bind(labels.unwrap_or("{}")) - .execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - // Fetch the result to get the resource_version - let record = self.get_by_name(object_type, name).await?.ok_or_else(|| { - PersistenceError::Database("object disappeared after upsert".to_string()) - })?; - - Ok(WriteResult { - resource_version: record.resource_version, - created_at_ms: record.created_at_ms, - updated_at_ms: record.updated_at_ms, - }) - } - } - } - - pub async fn delete_if( - &self, - object_type: &str, - id: &str, - expected_resource_version: u64, 
- ) -> PersistenceResult { - let result = sqlx::query( - r#" -DELETE FROM "objects" -WHERE "object_type" = ?1 AND "id" = ?2 AND "resource_version" = ?3 -"#, - ) - .bind(object_type) - .bind(id) - .bind(i64::try_from(expected_resource_version).unwrap_or(i64::MAX)) - .execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - if result.rows_affected() > 0 { - Ok(true) - } else { - // Check if object exists to distinguish NotFound from Conflict - let existing = self.get(object_type, id).await?; - if let Some(record) = existing { - return Err(PersistenceError::Conflict { - current_resource_version: Some(record.resource_version), - }); - } - Ok(false) - } - } - - pub async fn put_scoped( - &self, - object_type: &str, - id: &str, - name: &str, - scope: &str, - payload: &[u8], - labels: Option<&str>, - ) -> PersistenceResult<()> { - let now_ms = current_time_ms(); - - sqlx::query( - r#" -INSERT INTO "objects" ("object_type", "id", "name", "scope", "payload", "created_at_ms", "updated_at_ms", "labels", "resource_version") -VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?6, ?7, 1) -ON CONFLICT ("object_type", "name") WHERE "name" IS NOT NULL DO UPDATE SET - "scope" = excluded."scope", - "payload" = excluded."payload", - "updated_at_ms" = excluded."updated_at_ms", - "labels" = excluded."labels", - "resource_version" = "objects"."resource_version" + 1 -"#, - ) - .bind(object_type) - .bind(id) - .bind(name) - .bind(scope) - .bind(payload) - .bind(now_ms) - .bind(labels.unwrap_or("{}")) - .execute(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - Ok(()) - } - pub async fn get( &self, object_type: &str, @@ -289,7 +104,7 @@ ON CONFLICT ("object_type", "name") WHERE "name" IS NOT NULL DO UPDATE SET ) -> PersistenceResult> { let row = sqlx::query( r#" -SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels", "resource_version" +SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels" FROM "objects" WHERE 
"object_type" = ?1 AND "id" = ?2 "#, @@ -310,7 +125,7 @@ WHERE "object_type" = ?1 AND "id" = ?2 ) -> PersistenceResult> { let row = sqlx::query( r#" -SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels", "resource_version" +SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels" FROM "objects" WHERE "object_type" = ?1 AND "name" = ?2 "#, @@ -362,7 +177,7 @@ WHERE "object_type" = ?1 AND "name" = ?2 ) -> PersistenceResult> { let rows = sqlx::query( r#" -SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels", "resource_version" +SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels" FROM "objects" WHERE "object_type" = ?1 ORDER BY "created_at_ms" ASC, "name" ASC @@ -378,33 +193,6 @@ LIMIT ?2 OFFSET ?3 Ok(rows.into_iter().map(row_to_object_record).collect()) } - - pub async fn list_by_scope( - &self, - object_type: &str, - scope: &str, - limit: u32, - offset: u32, - ) -> PersistenceResult> { - let rows = sqlx::query( - r#" -SELECT "object_type", "id", "name", "payload", "created_at_ms", "updated_at_ms", "labels" -FROM "objects" -WHERE "object_type" = ?1 AND "scope" = ?2 -ORDER BY "created_at_ms" ASC, "name" ASC -LIMIT ?3 OFFSET ?4 -"#, - ) - .bind(object_type) - .bind(scope) - .bind(i64::from(limit)) - .bind(i64::from(offset)) - .fetch_all(&self.pool) - .await - .map_err(|e| map_db_error(&e))?; - - Ok(rows.into_iter().map(row_to_object_record).collect()) - } pub async fn list_with_selector( &self, object_type: &str, @@ -880,7 +668,6 @@ pub(super) fn sqlite_sidecar_paths(path: &Path) -> [PathBuf; 2] { } fn row_to_object_record(row: sqlx::sqlite::SqliteRow) -> ObjectRecord { - let resource_version_i64: i64 = row.try_get("resource_version").unwrap_or(1); ObjectRecord { object_type: row.get("object_type"), id: row.get("id"), @@ -889,7 +676,6 @@ fn row_to_object_record(row: sqlx::sqlite::SqliteRow) -> ObjectRecord { created_at_ms: 
row.get("created_at_ms"), updated_at_ms: row.get("updated_at_ms"), labels: row.get("labels"), - resource_version: resource_version_i64.max(1).cast_unsigned(), } } diff --git a/crates/openshell-server/src/persistence/tests.rs b/crates/openshell-server/src/persistence/tests.rs index db85d2a0e..09549ad29 100644 --- a/crates/openshell-server/src/persistence/tests.rs +++ b/crates/openshell-server/src/persistence/tests.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use super::{ObjectType, PersistenceError, Store, generate_name}; +use super::{ObjectType, Store, generate_name}; use crate::policy_store::PolicyStoreExt; use openshell_core::proto::{ObjectForTest, SandboxPolicy}; use prost::Message; @@ -962,460 +962,3 @@ fn parse_label_selector_handles_whitespace() { assert_eq!(result.get("env"), Some(&"prod".to_string())); assert_eq!(result.get("tier"), Some(&"frontend".to_string())); } - -// --------------------------------------------------------------------------- -// CAS (compare-and-swap) tests -// --------------------------------------------------------------------------- - -#[tokio::test] -async fn cas_put_if_must_create_succeeds() { - use super::WriteCondition; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - let result = store - .put_if( - "sandbox", - "id-1", - "new-sandbox", - b"payload", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - assert_eq!(result.resource_version, 1); - - let record = store.get("sandbox", "id-1").await.unwrap().unwrap(); - assert_eq!(record.resource_version, 1); - assert_eq!(record.payload, b"payload"); -} - -#[tokio::test] -async fn cas_put_if_must_create_fails_on_duplicate() { - use super::{PersistenceError, WriteCondition}; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - // First insert succeeds - store - .put_if( - 
"sandbox", - "id-1", - "sandbox-1", - b"payload1", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - // Second insert with same ID fails - let result = store - .put_if( - "sandbox", - "id-1", - "sandbox-2", - b"payload2", - None, - WriteCondition::MustCreate, - ) - .await; - - assert!(matches!( - result, - Err(PersistenceError::UniqueViolation { .. }) - )); -} - -#[tokio::test] -async fn cas_put_if_match_version_succeeds() { - use super::WriteCondition; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - // Create initial object - store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v1", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - // Update with correct version - let result = store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v2", - None, - WriteCondition::MatchResourceVersion(1), - ) - .await - .unwrap(); - - assert_eq!(result.resource_version, 2); - - let record = store.get("sandbox", "id-1").await.unwrap().unwrap(); - assert_eq!(record.resource_version, 2); - assert_eq!(record.payload, b"v2"); -} - -#[tokio::test] -async fn cas_put_if_match_version_fails_on_mismatch() { - use super::{PersistenceError, WriteCondition}; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - // Create initial object - store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v1", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - // Update with wrong version - let result = store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v2", - None, - WriteCondition::MatchResourceVersion(99), - ) - .await; - - assert!(matches!( - result, - Err(PersistenceError::Conflict { - current_resource_version: Some(1) - }) - )); - - // Original payload unchanged - let record = store.get("sandbox", "id-1").await.unwrap().unwrap(); - assert_eq!(record.resource_version, 1); - assert_eq!(record.payload, b"v1"); -} - -#[tokio::test] -async fn 
cas_delete_if_succeeds_with_correct_version() { - use super::WriteCondition; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"payload", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - let deleted = store.delete_if("sandbox", "id-1", 1).await.unwrap(); - assert!(deleted); - - let record = store.get("sandbox", "id-1").await.unwrap(); - assert!(record.is_none()); -} - -#[tokio::test] -async fn cas_delete_if_fails_with_wrong_version() { - use super::{PersistenceError, WriteCondition}; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"payload", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - let result = store.delete_if("sandbox", "id-1", 99).await; - assert!(matches!( - result, - Err(PersistenceError::Conflict { - current_resource_version: Some(1) - }) - )); - - // Object still exists - let record = store.get("sandbox", "id-1").await.unwrap().unwrap(); - assert_eq!(record.resource_version, 1); -} - -#[tokio::test] -async fn cas_resource_version_increments() { - use super::WriteCondition; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - // Create - let r1 = store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v1", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - assert_eq!(r1.resource_version, 1); - - // Update 1 - let r2 = store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v2", - None, - WriteCondition::MatchResourceVersion(1), - ) - .await - .unwrap(); - assert_eq!(r2.resource_version, 2); - - // Update 2 - let r3 = store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"v3", - None, - WriteCondition::MatchResourceVersion(2), - ) - .await - .unwrap(); - assert_eq!(r3.resource_version, 3); - - let record = store.get("sandbox", "id-1").await.unwrap().unwrap(); - 
assert_eq!(record.resource_version, 3); -} - -#[tokio::test] -async fn cas_concurrent_updates_one_succeeds() { - use super::WriteCondition; - use std::sync::Arc; - - let store = Arc::new( - Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(), - ); - - // Create initial object - store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - b"initial", - None, - WriteCondition::MustCreate, - ) - .await - .unwrap(); - - // Spawn 10 concurrent updates trying to update from version 1 - let mut handles = vec![]; - for i in 0..10 { - let store = Arc::clone(&store); - let handle = tokio::spawn(async move { - store - .put_if( - "sandbox", - "id-1", - "sandbox-1", - format!("update-{i}").as_bytes(), - None, - WriteCondition::MatchResourceVersion(1), - ) - .await - }); - handles.push(handle); - } - - let results: Vec<_> = futures::future::join_all(handles) - .await - .into_iter() - .map(|r| r.unwrap()) - .collect(); - - // Exactly one should succeed, rest should conflict - let successes = results.iter().filter(|r| r.is_ok()).count(); - let conflicts = results.iter().filter(|r| r.is_err()).count(); - - assert_eq!(successes, 1); - assert_eq!(conflicts, 9); - - // Final version should be 2 - let record = store.get("sandbox", "id-1").await.unwrap().unwrap(); - assert_eq!(record.resource_version, 2); -} - -#[tokio::test] -async fn cas_update_message_cas_succeeds() { - use openshell_core::proto::Sandbox; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - - // Create a sandbox - let sandbox = Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "test-id".to_string(), - name: "test-sandbox".to_string(), - created_at_ms: 1000, - labels: std::collections::HashMap::new(), - resource_version: 0, - }), - spec: None, - status: None, - phase: 0, - current_policy_version: 0, - }; - - store.put_message(&sandbox).await.unwrap(); - - // Update using CAS with expected_version = 0 (use current version) - let updated = 
store - .update_message_cas::("test-id", 0, |s| { - s.phase = 2; // Set to Ready - s.current_policy_version = 42; - }) - .await - .unwrap(); - - assert_eq!(updated.phase, 2); - assert_eq!(updated.current_policy_version, 42); - assert_eq!( - updated.metadata.as_ref().map_or(0, |m| m.resource_version), - 2 - ); -} - -#[tokio::test] -async fn cas_update_message_cas_conflicts_on_concurrent_updates() { - use openshell_core::proto::Sandbox; - use std::sync::Arc; - use std::sync::atomic::{AtomicU32, Ordering}; - - let store = Arc::new( - Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(), - ); - - // Create a sandbox - let sandbox = Sandbox { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: "test-id".to_string(), - name: "test-sandbox".to_string(), - created_at_ms: 1000, - labels: std::collections::HashMap::new(), - resource_version: 0, - }), - spec: None, - status: None, - phase: 0, - current_policy_version: 0, - }; - - store.put_message(&sandbox).await.unwrap(); - - // Track how many updates succeed - let success_count = Arc::new(AtomicU32::new(0)); - - // Spawn 5 concurrent CAS updates (using expected_version = 0 to use current) - let mut handles = vec![]; - for i in 0..5 { - let store = Arc::clone(&store); - let success_count = Arc::clone(&success_count); - let handle = tokio::spawn(async move { - let result = store - .update_message_cas::("test-id", 0, |s| { - s.current_policy_version = i; - }) - .await; - if result.is_ok() { - success_count.fetch_add(1, Ordering::SeqCst); - } - result - }); - handles.push(handle); - } - - let results: Vec<_> = futures::future::join_all(handles) - .await - .into_iter() - .map(|r| r.unwrap()) - .collect(); - - // Only one should succeed; others fail with Conflict due to single-attempt CAS - let successes = results.iter().filter(|r| r.is_ok()).count(); - let conflicts = results - .iter() - .filter(|r| matches!(r, Err(PersistenceError::Conflict { .. 
}))) - .count(); - assert_eq!(successes, 1, "exactly one concurrent update should succeed"); - assert_eq!(conflicts, 4, "four updates should fail with Conflict"); - assert_eq!(success_count.load(Ordering::SeqCst), 1); - - // Final version should be 2 (initial 1 + 1 successful update) - let final_sandbox = store - .get_message::("test-id") - .await - .unwrap() - .unwrap(); - assert_eq!( - final_sandbox - .metadata - .as_ref() - .map_or(0, |m| m.resource_version), - 2, - "resource_version should be 2 (initial 1 + 1 successful update)" - ); -} diff --git a/crates/openshell-server/src/provider_refresh.rs b/crates/openshell-server/src/provider_refresh.rs deleted file mode 100644 index e12fafacf..000000000 --- a/crates/openshell-server/src/provider_refresh.rs +++ /dev/null @@ -1,1221 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Provider credential refresh state. - -#![allow(clippy::result_large_err)] - -use crate::persistence::{ObjectType, Store, current_time_ms}; -use openshell_core::proto::{ - Provider, ProviderCredentialRefreshStatus, ProviderCredentialRefreshStrategy, - StoredProviderCredentialRefreshState, -}; -use prost::Message; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::time::Duration; -use tonic::Status; -use tracing::{info, warn}; - -const DEFAULT_REFRESH_BEFORE_SECONDS: i64 = 300; -const DEFAULT_MAX_LIFETIME_SECONDS: i64 = 3600; -const REFRESH_ERROR_RETRY_SECONDS: i64 = 60; -const REFRESH_WORKER_PAGE_SIZE: u32 = 1000; - -impl ObjectType for StoredProviderCredentialRefreshState { - fn object_type() -> &'static str { - "provider_credential_refresh_state" - } -} - -pub fn refresh_state_name(provider_id: &str, credential_key: &str) -> String { - let mut key = String::with_capacity(credential_key.len() * 2); - for byte in credential_key.as_bytes() { - use std::fmt::Write as _; - write!(&mut key, 
"{byte:02x}").expect("writing to String cannot fail"); - } - format!("provider-refresh-{provider_id}-{key}") -} - -pub async fn put_refresh_state( - store: &Store, - state: &StoredProviderCredentialRefreshState, -) -> Result<(), Status> { - store - .put_scoped_message(state, &state.provider_id) - .await - .map_err(|e| Status::internal(format!("persist provider refresh state failed: {e}"))) -} - -pub async fn list_refresh_states_for_provider( - store: &Store, - provider_id: &str, -) -> Result, Status> { - let records = store - .list_by_scope( - StoredProviderCredentialRefreshState::object_type(), - provider_id, - 1000, - 0, - ) - .await - .map_err(|e| Status::internal(format!("list provider refresh states failed: {e}")))?; - - let mut states = Vec::with_capacity(records.len()); - for record in records { - states.push( - StoredProviderCredentialRefreshState::decode(record.payload.as_slice()).map_err( - |e| Status::internal(format!("decode provider refresh state failed: {e}")), - )?, - ); - } - Ok(states) -} - -pub async fn list_all_refresh_states( - store: &Store, -) -> Result, Status> { - let mut states = Vec::new(); - let mut offset = 0; - loop { - let records = store - .list( - StoredProviderCredentialRefreshState::object_type(), - REFRESH_WORKER_PAGE_SIZE, - offset, - ) - .await - .map_err(|e| Status::internal(format!("list provider refresh states failed: {e}")))?; - if records.is_empty() { - break; - } - offset = offset - .checked_add( - u32::try_from(records.len()) - .map_err(|_| Status::internal("provider refresh page size exceeded u32"))?, - ) - .ok_or_else(|| Status::internal("provider refresh pagination offset overflow"))?; - for record in records { - states.push( - StoredProviderCredentialRefreshState::decode(record.payload.as_slice()).map_err( - |e| Status::internal(format!("decode provider refresh state failed: {e}")), - )?, - ); - } - } - Ok(states) -} - -pub async fn get_refresh_state( - store: &Store, - provider_id: &str, - credential_key: &str, -) -> 
Result, Status> { - let name = refresh_state_name(provider_id, credential_key); - store - .get_message_by_name::(&name) - .await - .map_err(|e| Status::internal(format!("fetch provider refresh state failed: {e}"))) -} - -pub async fn delete_refresh_state( - store: &Store, - provider_id: &str, - credential_key: &str, -) -> Result { - let name = refresh_state_name(provider_id, credential_key); - store - .delete_by_name(StoredProviderCredentialRefreshState::object_type(), &name) - .await - .map_err(|e| Status::internal(format!("delete provider refresh state failed: {e}"))) -} - -pub async fn delete_refresh_states_for_provider( - store: &Store, - provider_id: &str, -) -> Result { - let states = list_refresh_states_for_provider(store, provider_id).await?; - let mut deleted = 0; - for state in states { - if store - .delete_by_name( - StoredProviderCredentialRefreshState::object_type(), - state.object_name(), - ) - .await - .map_err(|e| Status::internal(format!("delete provider refresh state failed: {e}")))? 
- { - deleted += 1; - } - } - Ok(deleted) -} - -pub fn refresh_status_from_state( - state: &StoredProviderCredentialRefreshState, -) -> ProviderCredentialRefreshStatus { - ProviderCredentialRefreshStatus { - provider_name: state.provider_name.clone(), - provider_id: state.provider_id.clone(), - credential_key: state.credential_key.clone(), - strategy: state.strategy, - status: state.status.clone(), - expires_at_ms: state.expires_at_ms, - next_refresh_at_ms: state.next_refresh_at_ms, - last_refresh_at_ms: state.last_refresh_at_ms, - last_error: state.last_error.clone(), - } -} - -pub struct NewRefreshStateConfig { - pub strategy: ProviderCredentialRefreshStrategy, - pub material: HashMap, - pub secret_material_keys: Vec, - pub expires_at_ms: i64, - pub token_url: String, - pub scopes: Vec, - pub refresh_before_seconds: i64, - pub max_lifetime_seconds: i64, -} - -#[allow(clippy::unnecessary_wraps)] -pub fn new_refresh_state( - provider: &Provider, - credential_key: &str, - config: NewRefreshStateConfig, -) -> Result { - let provider_id = provider.object_id().to_string(); - let provider_name = provider.object_name().to_string(); - let now_ms = current_time_ms(); - let next_refresh_at_ms = next_refresh_at_ms( - config.expires_at_ms, - config.refresh_before_seconds, - config.max_lifetime_seconds, - now_ms, - ); - Ok(StoredProviderCredentialRefreshState { - metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { - id: uuid::Uuid::new_v4().to_string(), - name: refresh_state_name(&provider_id, credential_key), - created_at_ms: now_ms, - labels: HashMap::new(), - resource_version: 0, - }), - provider_id, - provider_name, - credential_key: credential_key.to_string(), - strategy: config.strategy as i32, - material: config.material, - secret_material_keys: config.secret_material_keys, - expires_at_ms: config.expires_at_ms, - next_refresh_at_ms, - last_refresh_at_ms: 0, - status: "configured".to_string(), - last_error: String::new(), - token_url: config.token_url, - 
scopes: config.scopes, - refresh_before_seconds: config.refresh_before_seconds, - max_lifetime_seconds: config.max_lifetime_seconds, - }) -} - -use openshell_core::{ObjectId, ObjectName}; - -#[derive(Debug)] -struct MintedCredential { - access_token: String, - expires_at_ms: i64, - refresh_token: Option, -} - -#[derive(Debug, Deserialize)] -struct TokenResponse { - access_token: String, - expires_in: Option, - refresh_token: Option, -} - -#[derive(Debug, Serialize)] -struct GoogleServiceAccountClaims<'a> { - iss: &'a str, - scope: String, - aud: &'a str, - iat: i64, - exp: i64, - #[serde(skip_serializing_if = "Option::is_none")] - sub: Option<&'a str>, -} - -pub fn next_refresh_at_ms( - expires_at_ms: i64, - refresh_before_seconds: i64, - _max_lifetime_seconds: i64, - _now_ms: i64, -) -> i64 { - let refresh_before_seconds = if refresh_before_seconds > 0 { - refresh_before_seconds - } else { - DEFAULT_REFRESH_BEFORE_SECONDS - }; - if expires_at_ms > 0 { - return expires_at_ms.saturating_sub(refresh_before_seconds.saturating_mul(1000)); - } - 0 -} - -fn seconds_until_ms(now_ms: i64, target_ms: i64) -> i64 { - if target_ms <= 0 { - return 0; - } - target_ms.saturating_sub(now_ms).max(0) / 1000 -} - -pub fn refresh_strategy_name(strategy: i32) -> &'static str { - match ProviderCredentialRefreshStrategy::try_from(strategy) - .unwrap_or(ProviderCredentialRefreshStrategy::Unspecified) - { - ProviderCredentialRefreshStrategy::Static => "static", - ProviderCredentialRefreshStrategy::External => "external", - ProviderCredentialRefreshStrategy::Oauth2RefreshToken => "oauth2_refresh_token", - ProviderCredentialRefreshStrategy::Oauth2ClientCredentials => "oauth2_client_credentials", - ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt => "google_service_account_jwt", - ProviderCredentialRefreshStrategy::Unspecified => "unspecified", - } -} - -pub fn is_gateway_mintable_strategy(strategy: ProviderCredentialRefreshStrategy) -> bool { - matches!( - strategy, - 
ProviderCredentialRefreshStrategy::Oauth2RefreshToken - | ProviderCredentialRefreshStrategy::Oauth2ClientCredentials - | ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt - ) -} - -pub async fn refresh_provider_credential( - store: &Store, - provider_name: &str, - credential_key: &str, -) -> Result { - let provider = store - .get_message_by_name::(provider_name) - .await - .map_err(|e| Status::internal(format!("fetch provider failed: {e}")))? - .ok_or_else(|| Status::not_found("provider not found"))?; - let Some(mut state) = get_refresh_state(store, provider.object_id(), credential_key).await? - else { - return Err(Status::not_found("provider refresh state not found")); - }; - - info!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - expires_at_ms = state.expires_at_ms, - next_refresh_at_ms = state.next_refresh_at_ms, - "provider credential refresh started" - ); - - match mint_credential(&state).await { - Ok(minted) => { - let now_ms = current_time_ms(); - if let Err(err) = - apply_minted_credential(store, &provider, credential_key, &minted).await - { - state.status = "error".to_string(); - state.last_error = err.message().to_string(); - state.next_refresh_at_ms = - now_ms.saturating_add(REFRESH_ERROR_RETRY_SECONDS.saturating_mul(1000)); - put_refresh_state(store, &state).await?; - warn!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - next_refresh_at_ms = state.next_refresh_at_ms, - seconds_until_refresh = seconds_until_ms(now_ms, state.next_refresh_at_ms), - error = %err, - "provider credential refresh errored" - ); - return Err(err); - } - if let Some(refresh_token) = minted.refresh_token { - state - .material - .insert("refresh_token".to_string(), refresh_token); - if !state - .secret_material_keys - .iter() - .any(|key| key == 
"refresh_token") - { - state.secret_material_keys.push("refresh_token".to_string()); - } - } - state.expires_at_ms = minted.expires_at_ms; - state.next_refresh_at_ms = next_refresh_at_ms( - minted.expires_at_ms, - state.refresh_before_seconds, - state.max_lifetime_seconds, - now_ms, - ); - state.last_refresh_at_ms = now_ms; - state.status = "refreshed".to_string(); - state.last_error.clear(); - put_refresh_state(store, &state).await?; - info!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - expires_at_ms = state.expires_at_ms, - next_refresh_at_ms = state.next_refresh_at_ms, - seconds_until_refresh = seconds_until_ms(now_ms, state.next_refresh_at_ms), - "provider credential refresh completed" - ); - Ok(state) - } - Err(err) => { - let now_ms = current_time_ms(); - state.status = "error".to_string(); - state.last_error = err.message().to_string(); - state.next_refresh_at_ms = - now_ms.saturating_add(REFRESH_ERROR_RETRY_SECONDS.saturating_mul(1000)); - put_refresh_state(store, &state).await?; - warn!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - next_refresh_at_ms = state.next_refresh_at_ms, - seconds_until_refresh = seconds_until_ms(now_ms, state.next_refresh_at_ms), - error = %err, - "provider credential refresh errored" - ); - Err(err) - } - } -} - -async fn apply_minted_credential( - store: &Store, - provider: &Provider, - credential_key: &str, - minted: &MintedCredential, -) -> Result<(), Status> { - let mut updated = provider.clone(); - updated - .credentials - .insert(credential_key.to_string(), minted.access_token.clone()); - if minted.expires_at_ms > 0 { - updated - .credential_expires_at_ms - .insert(credential_key.to_string(), minted.expires_at_ms); - } else { - updated.credential_expires_at_ms.remove(credential_key); - } - 
crate::grpc::provider::validate_provider_update_against_attached_sandboxes(store, &updated) - .await?; - store - .update_message_cas::(provider.object_id(), 0, |current| { - current - .credentials - .insert(credential_key.to_string(), minted.access_token.clone()); - if minted.expires_at_ms > 0 { - current - .credential_expires_at_ms - .insert(credential_key.to_string(), minted.expires_at_ms); - } else { - current.credential_expires_at_ms.remove(credential_key); - } - }) - .await - .map(|_| ()) - .map_err(|e| Status::internal(format!("persist refreshed provider credential failed: {e}"))) -} - -async fn mint_credential( - state: &StoredProviderCredentialRefreshState, -) -> Result { - let strategy = ProviderCredentialRefreshStrategy::try_from(state.strategy) - .unwrap_or(ProviderCredentialRefreshStrategy::Unspecified); - match strategy { - ProviderCredentialRefreshStrategy::Oauth2RefreshToken => { - mint_oauth2_refresh_token(state).await - } - ProviderCredentialRefreshStrategy::Oauth2ClientCredentials => { - mint_oauth2_client_credentials(state).await - } - ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt => { - mint_google_service_account_jwt(state).await - } - ProviderCredentialRefreshStrategy::External - | ProviderCredentialRefreshStrategy::Static - | ProviderCredentialRefreshStrategy::Unspecified => Err(Status::failed_precondition( - format!("refresh strategy '{strategy:?}' cannot be minted by the gateway"), - )), - } -} - -async fn mint_oauth2_refresh_token( - state: &StoredProviderCredentialRefreshState, -) -> Result { - let token_url = oauth2_token_url(state)?; - let client_id = required_material(&state.material, "client_id")?; - let refresh_token = required_material(&state.material, "refresh_token")?; - let mut form = vec![ - ("grant_type".to_string(), "refresh_token".to_string()), - ("client_id".to_string(), client_id), - ("refresh_token".to_string(), refresh_token), - ]; - if let Some(client_secret) = material_value(&state.material, 
&["client_secret"]) { - form.push(("client_secret".to_string(), client_secret)); - } - let scope = refresh_scopes(state).join(" "); - if !scope.is_empty() { - form.push(("scope".to_string(), scope)); - } - - request_token(&token_url, &form, state.max_lifetime_seconds).await -} - -async fn mint_oauth2_client_credentials( - state: &StoredProviderCredentialRefreshState, -) -> Result { - let token_url = oauth2_token_url(state)?; - let client_id = required_material(&state.material, "client_id")?; - let client_secret = required_material(&state.material, "client_secret")?; - let mut form = vec![ - ("grant_type".to_string(), "client_credentials".to_string()), - ("client_id".to_string(), client_id), - ("client_secret".to_string(), client_secret), - ]; - let scope = refresh_scopes(state).join(" "); - if !scope.is_empty() { - form.push(("scope".to_string(), scope)); - } - - request_token(&token_url, &form, state.max_lifetime_seconds).await -} - -async fn mint_google_service_account_jwt( - state: &StoredProviderCredentialRefreshState, -) -> Result { - let token_url = google_token_url(state); - let client_email = required_material(&state.material, "client_email")?; - let private_key = required_material(&state.material, "private_key")?; - let scopes = refresh_scopes(state); - if scopes.is_empty() { - return Err(Status::invalid_argument( - "google_service_account_jwt requires at least one scope", - )); - } - let now_ms = current_time_ms(); - let now_secs = now_ms / 1000; - let lifetime_secs = if state.max_lifetime_seconds > 0 { - state.max_lifetime_seconds.min(DEFAULT_MAX_LIFETIME_SECONDS) - } else { - DEFAULT_MAX_LIFETIME_SECONDS - }; - let subject = material_value(&state.material, &["subject", "sub"]); - let claims = GoogleServiceAccountClaims { - iss: &client_email, - scope: scopes.join(" "), - aud: &token_url, - iat: now_secs, - exp: now_secs.saturating_add(lifetime_secs), - sub: subject.as_deref(), - }; - let assertion = jsonwebtoken::encode( - 
&jsonwebtoken::Header::new(jsonwebtoken::Algorithm::RS256), - &claims, - &jsonwebtoken::EncodingKey::from_rsa_pem(private_key.as_bytes()).map_err(|_| { - Status::invalid_argument("google_service_account_jwt private_key must be RSA PEM") - })?, - ) - .map_err(|_| Status::internal("sign google service account jwt failed"))?; - let form = vec![ - ( - "grant_type".to_string(), - "urn:ietf:params:oauth:grant-type:jwt-bearer".to_string(), - ), - ("assertion".to_string(), assertion), - ]; - request_token(&token_url, &form, lifetime_secs).await -} - -async fn request_token( - token_url: &str, - form: &[(String, String)], - max_lifetime_seconds: i64, -) -> Result { - let parsed = reqwest::Url::parse(token_url) - .map_err(|_| Status::invalid_argument("token_url must be an absolute URL"))?; - match parsed.scheme() { - "https" => {} - "http" if parsed.host_str().is_some_and(is_loopback_host) => {} - _ => { - return Err(Status::invalid_argument( - "token_url must use https, except loopback http for local tests", - )); - } - } - - let client = reqwest::Client::builder() - .timeout(Duration::from_secs(30)) - .build() - .map_err(|e| Status::internal(format!("build refresh HTTP client failed: {e}")))?; - let response = client - .post(parsed) - .form(form) - .send() - .await - .map_err(|e| Status::unavailable(format!("token endpoint request failed: {e}")))?; - let status = response.status(); - if !status.is_success() { - return Err(Status::failed_precondition(format!( - "token endpoint returned HTTP {status}" - ))); - } - let token = response - .json::() - .await - .map_err(|_| Status::failed_precondition("token endpoint returned invalid JSON"))?; - if token.access_token.trim().is_empty() { - return Err(Status::failed_precondition( - "token endpoint returned empty access_token", - )); - } - let now_ms = current_time_ms(); - let lifetime_cap_seconds = if max_lifetime_seconds > 0 { - max_lifetime_seconds - } else { - DEFAULT_MAX_LIFETIME_SECONDS - }; - let lifetime_seconds = token - 
.expires_in - .filter(|value| *value > 0) - .unwrap_or(lifetime_cap_seconds); - let lifetime_seconds = lifetime_seconds.min(lifetime_cap_seconds); - Ok(MintedCredential { - access_token: token.access_token, - expires_at_ms: now_ms.saturating_add(lifetime_seconds.saturating_mul(1000)), - refresh_token: token - .refresh_token - .filter(|refresh_token| !refresh_token.trim().is_empty()), - }) -} - -pub fn refresh_scopes(state: &StoredProviderCredentialRefreshState) -> Vec { - if !state.scopes.is_empty() { - return state.scopes.clone(); - } - material_scopes(&state.material) -} - -pub fn material_scopes(material: &HashMap) -> Vec { - material_value(material, &["scope", "scopes"]) - .map(|raw| { - raw.split(|ch: char| ch == ',' || ch.is_ascii_whitespace()) - .map(str::trim) - .filter(|scope| !scope.is_empty()) - .map(ToString::to_string) - .collect() - }) - .unwrap_or_default() -} - -pub fn parse_material_i64( - material: &HashMap, - key: &str, -) -> Result, Status> { - let Some(value) = material_value(material, &[key]) else { - return Ok(None); - }; - value - .parse::() - .map(Some) - .map_err(|_| Status::invalid_argument(format!("{key} material must be a signed integer"))) -} - -fn oauth2_token_url(state: &StoredProviderCredentialRefreshState) -> Result { - if let Some(tenant_id) = material_value(&state.material, &["tenant_id"]) { - return Ok(format!( - "https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token" - )); - } - if !state.token_url.trim().is_empty() { - return Ok(state.token_url.clone()); - } - Err(Status::invalid_argument( - "oauth2_client_credentials requires token_url or tenant_id material", - )) -} - -fn google_token_url(state: &StoredProviderCredentialRefreshState) -> String { - if state.token_url.trim().is_empty() { - "https://oauth2.googleapis.com/token".to_string() - } else { - state.token_url.clone() - } -} - -fn required_material(material: &HashMap, key: &str) -> Result { - material_value(material, &[key]) - .ok_or_else(|| 
Status::invalid_argument(format!("{key} material is required"))) -} - -fn material_value(material: &HashMap, keys: &[&str]) -> Option { - for key in keys { - if let Some(value) = material.get(*key).map(|value| value.trim()) - && !value.is_empty() - { - return Some(value.to_string()); - } - } - None -} - -fn is_loopback_host(host: &str) -> bool { - matches!(host, "localhost" | "127.0.0.1" | "::1") -} - -pub fn spawn_refresh_worker(state: std::sync::Arc, interval: Duration) { - info!( - interval_seconds = interval.as_secs(), - "provider credential refresh worker started" - ); - tokio::spawn(async move { - let mut ticker = tokio::time::interval(interval); - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); - loop { - ticker.tick().await; - if let Err(err) = run_refresh_worker_tick(state.store.as_ref()).await { - warn!(error = %err, "provider credential refresh worker tick failed"); - } - } - }); -} - -async fn run_refresh_worker_tick(store: &Store) -> Result<(), Status> { - let now_ms = current_time_ms(); - let states = list_all_refresh_states(store).await?; - let watched_count = states.len(); - let due_count = states - .iter() - .filter(|state| state.next_refresh_at_ms <= 0 || state.next_refresh_at_ms <= now_ms) - .count(); - let rotation_requested_count = states - .iter() - .filter(|state| state.status == "rotation_requested") - .count(); - info!( - watched_count, - due_count, rotation_requested_count, "provider credential refresh worker sweep" - ); - for state in states { - let strategy = ProviderCredentialRefreshStrategy::try_from(state.strategy) - .unwrap_or(ProviderCredentialRefreshStrategy::Unspecified); - let due = state.next_refresh_at_ms <= 0 || state.next_refresh_at_ms <= now_ms; - let rotation_requested = state.status == "rotation_requested"; - info!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - expires_at_ms = 
state.expires_at_ms, - seconds_until_expiry = seconds_until_ms(now_ms, state.expires_at_ms), - next_refresh_at_ms = state.next_refresh_at_ms, - last_refresh_at_ms = state.last_refresh_at_ms, - seconds_until_refresh = seconds_until_ms(now_ms, state.next_refresh_at_ms), - due, - rotation_requested, - "provider credential refresh watch" - ); - if !due && !rotation_requested { - continue; - } - if !is_gateway_mintable_strategy(strategy) { - warn!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - "skipping non-gateway-mintable provider credential refresh state" - ); - continue; - } - info!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - "refreshing provider credential" - ); - if let Err(err) = - refresh_provider_credential(store, &state.provider_name, &state.credential_key).await - { - warn!( - provider = %state.provider_name, - credential_key = %state.credential_key, - strategy = %refresh_strategy_name(state.strategy), - status = %state.status, - next_refresh_at_ms = state.next_refresh_at_ms, - error = %err, - "provider credential refresh failed" - ); - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::{ - NewRefreshStateConfig, get_refresh_state, new_refresh_state, put_refresh_state, - refresh_provider_credential, refresh_state_name, refresh_strategy_name, - run_refresh_worker_tick, seconds_until_ms, - }; - use crate::persistence::Store; - use openshell_core::ObjectId; - use openshell_core::proto::datamodel::v1::ObjectMeta; - use openshell_core::proto::{ - Provider, ProviderCredentialRefreshStrategy, Sandbox, SandboxSpec, - }; - use std::collections::HashMap; - use wiremock::matchers::{body_string_contains, method, path}; - use wiremock::{Mock, MockServer, ResponseTemplate}; - - #[test] - fn 
refresh_state_name_preserves_distinct_credential_keys() { - let provider_id = "provider-id"; - - assert_ne!( - refresh_state_name(provider_id, "API_KEY"), - refresh_state_name(provider_id, "api_key") - ); - assert_ne!( - refresh_state_name(provider_id, " alex-api "), - refresh_state_name(provider_id, " alex_api") - ); - assert_ne!( - refresh_state_name(provider_id, "Alex-API"), - refresh_state_name(provider_id, "alex-api") - ); - } - - #[test] - fn refresh_log_helpers_format_safe_operational_fields() { - assert_eq!(seconds_until_ms(1_000, 61_000), 60); - assert_eq!(seconds_until_ms(61_000, 1_000), 0); - assert_eq!(seconds_until_ms(1_000, 0), 0); - assert_eq!( - refresh_strategy_name(ProviderCredentialRefreshStrategy::Oauth2RefreshToken as i32), - "oauth2_refresh_token" - ); - assert_eq!( - refresh_strategy_name( - ProviderCredentialRefreshStrategy::Oauth2ClientCredentials as i32 - ), - "oauth2_client_credentials" - ); - assert_eq!( - refresh_strategy_name( - ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt as i32 - ), - "google_service_account_jwt" - ); - assert_eq!(refresh_strategy_name(i32::MAX), "unspecified"); - } - - #[tokio::test] - async fn oauth2_client_credentials_refresh_mints_and_persists_access_token() { - let mock_server = MockServer::start().await; - Mock::given(method("POST")) - .and(path("/token")) - .and(body_string_contains("grant_type=client_credentials")) - .and(body_string_contains("client_id=client-id")) - .and(body_string_contains( - "scope=https%3A%2F%2Fgraph.microsoft.com%2F.default", - )) - .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ - "access_token": "minted-graph-token", - "expires_in": 3600, - "token_type": "Bearer" - }))) - .mount(&mock_server) - .await; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - let provider = provider("my-graph", "outlook"); - store.put_message(&provider).await.unwrap(); - let before_refresh_ms = 
crate::persistence::current_time_ms(); - let state = new_refresh_state( - &provider, - "MS_GRAPH_ACCESS_TOKEN", - NewRefreshStateConfig { - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials, - material: HashMap::from([ - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: 0, - token_url: format!("{}/token", mock_server.uri()), - scopes: vec!["https://graph.microsoft.com/.default".to_string()], - refresh_before_seconds: 30, - max_lifetime_seconds: 60, - }, - ) - .unwrap(); - put_refresh_state(&store, &state).await.unwrap(); - - let refreshed = refresh_provider_credential(&store, "my-graph", "MS_GRAPH_ACCESS_TOKEN") - .await - .unwrap(); - assert_eq!(refreshed.status, "refreshed"); - assert!(refreshed.expires_at_ms > 0); - assert!(refreshed.next_refresh_at_ms > 0); - assert!(refreshed.expires_at_ms <= before_refresh_ms + 120_000); - assert!(refreshed.last_error.is_empty()); - - let stored = store - .get_message_by_name::("my-graph") - .await - .unwrap() - .unwrap(); - assert_eq!( - stored.credentials.get("MS_GRAPH_ACCESS_TOKEN"), - Some(&"minted-graph-token".to_string()) - ); - assert_eq!( - stored.credential_expires_at_ms.get("MS_GRAPH_ACCESS_TOKEN"), - Some(&refreshed.expires_at_ms) - ); - } - - #[tokio::test] - async fn refresh_rejects_minted_credential_key_collision_for_attached_sandbox() { - let mock_server = MockServer::start().await; - Mock::given(method("POST")) - .and(path("/token")) - .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ - "access_token": "minted-graph-token", - "expires_in": 3600, - "token_type": "Bearer" - }))) - .mount(&mock_server) - .await; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - let mut provider_a = provider("existing-graph", "outlook"); - provider_a.credentials.insert( - "MS_GRAPH_ACCESS_TOKEN".to_string(), 
- "existing-token".to_string(), - ); - store.put_message(&provider_a).await.unwrap(); - let provider_b = provider("refreshing-graph", "outlook"); - store.put_message(&provider_b).await.unwrap(); - store - .put_message(&Sandbox { - metadata: Some(ObjectMeta { - id: "sandbox-collision".to_string(), - name: "collision".to_string(), - created_at_ms: 1, - labels: HashMap::new(), - resource_version: 0, - }), - spec: Some(SandboxSpec { - providers: vec!["existing-graph".to_string(), "refreshing-graph".to_string()], - ..SandboxSpec::default() - }), - ..Default::default() - }) - .await - .unwrap(); - let state = new_refresh_state( - &provider_b, - "MS_GRAPH_ACCESS_TOKEN", - NewRefreshStateConfig { - strategy: ProviderCredentialRefreshStrategy::Oauth2ClientCredentials, - material: HashMap::from([ - ("client_id".to_string(), "client-id".to_string()), - ("client_secret".to_string(), "client-secret".to_string()), - ]), - secret_material_keys: vec!["client_secret".to_string()], - expires_at_ms: 0, - token_url: format!("{}/token", mock_server.uri()), - scopes: Vec::new(), - refresh_before_seconds: 30, - max_lifetime_seconds: 60, - }, - ) - .unwrap(); - put_refresh_state(&store, &state).await.unwrap(); - - let err = refresh_provider_credential(&store, "refreshing-graph", "MS_GRAPH_ACCESS_TOKEN") - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::FailedPrecondition); - assert!(err.message().contains("MS_GRAPH_ACCESS_TOKEN")); - let stored_state = - get_refresh_state(&store, provider_b.object_id(), "MS_GRAPH_ACCESS_TOKEN") - .await - .unwrap() - .unwrap(); - assert_eq!(stored_state.status, "error"); - assert!(stored_state.last_error.contains("MS_GRAPH_ACCESS_TOKEN")); - let stored_provider = store - .get_message_by_name::("refreshing-graph") - .await - .unwrap() - .unwrap(); - assert!( - !stored_provider - .credentials - .contains_key("MS_GRAPH_ACCESS_TOKEN") - ); - } - - #[tokio::test] - async fn 
oauth2_refresh_token_refresh_mints_access_token_and_persists_rotated_refresh_token() { - let mock_server = MockServer::start().await; - Mock::given(method("POST")) - .and(path("/token")) - .and(body_string_contains("grant_type=refresh_token")) - .and(body_string_contains("client_id=client-id")) - .and(body_string_contains("refresh_token=old-refresh-token")) - .and(body_string_contains( - "scope=https%3A%2F%2Fgraph.microsoft.com%2F.default", - )) - .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ - "access_token": "delegated-graph-token", - "refresh_token": "rotated-refresh-token", - "expires_in": 3600, - "token_type": "Bearer" - }))) - .mount(&mock_server) - .await; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - let provider = provider("my-delegated-graph", "outlook"); - store.put_message(&provider).await.unwrap(); - let state = new_refresh_state( - &provider, - "MS_GRAPH_ACCESS_TOKEN", - NewRefreshStateConfig { - strategy: ProviderCredentialRefreshStrategy::Oauth2RefreshToken, - material: HashMap::from([ - ("client_id".to_string(), "client-id".to_string()), - ("refresh_token".to_string(), "old-refresh-token".to_string()), - ]), - secret_material_keys: vec!["refresh_token".to_string()], - expires_at_ms: 0, - token_url: format!("{}/token", mock_server.uri()), - scopes: vec!["https://graph.microsoft.com/.default".to_string()], - refresh_before_seconds: 30, - max_lifetime_seconds: 60, - }, - ) - .unwrap(); - put_refresh_state(&store, &state).await.unwrap(); - - let refreshed = - refresh_provider_credential(&store, "my-delegated-graph", "MS_GRAPH_ACCESS_TOKEN") - .await - .unwrap(); - assert_eq!(refreshed.status, "refreshed"); - assert!(refreshed.expires_at_ms > 0); - - let stored_provider = store - .get_message_by_name::("my-delegated-graph") - .await - .unwrap() - .unwrap(); - assert_eq!( - stored_provider.credentials.get("MS_GRAPH_ACCESS_TOKEN"), - Some(&"delegated-graph-token".to_string()) - ); - 
assert_eq!( - stored_provider - .credential_expires_at_ms - .get("MS_GRAPH_ACCESS_TOKEN"), - Some(&refreshed.expires_at_ms) - ); - - let stored_state = get_refresh_state(&store, provider.object_id(), "MS_GRAPH_ACCESS_TOKEN") - .await - .unwrap() - .unwrap(); - assert_eq!( - stored_state.material.get("refresh_token"), - Some(&"rotated-refresh-token".to_string()) - ); - assert!( - stored_state - .secret_material_keys - .iter() - .any(|key| key == "refresh_token") - ); - } - - #[tokio::test] - async fn google_service_account_refresh_mints_and_persists_access_token() { - let mock_server = MockServer::start().await; - Mock::given(method("POST")) - .and(path("/token")) - .and(body_string_contains( - "grant_type=urn%3Aietf%3Aparams%3Aoauth%3Agrant-type%3Ajwt-bearer", - )) - .and(body_string_contains("assertion=")) - .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ - "access_token": "minted-drive-token", - "expires_in": 1800, - "token_type": "Bearer" - }))) - .mount(&mock_server) - .await; - - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - let provider = provider("my-drive", "google-drive"); - store.put_message(&provider).await.unwrap(); - let state = new_refresh_state( - &provider, - "GOOGLE_DRIVE_ACCESS_TOKEN", - NewRefreshStateConfig { - strategy: ProviderCredentialRefreshStrategy::GoogleServiceAccountJwt, - material: HashMap::from([ - ( - "client_email".to_string(), - "svc@example.iam.gserviceaccount.com".to_string(), - ), - ("private_key".to_string(), TEST_RSA_PRIVATE_KEY.to_string()), - ]), - secret_material_keys: vec!["private_key".to_string()], - expires_at_ms: 0, - token_url: format!("{}/token", mock_server.uri()), - scopes: vec!["https://www.googleapis.com/auth/drive.readonly".to_string()], - refresh_before_seconds: 300, - max_lifetime_seconds: 3600, - }, - ) - .unwrap(); - put_refresh_state(&store, &state).await.unwrap(); - - let refreshed = - refresh_provider_credential(&store, "my-drive", 
"GOOGLE_DRIVE_ACCESS_TOKEN") - .await - .unwrap(); - assert_eq!(refreshed.status, "refreshed"); - assert!(refreshed.expires_at_ms > 0); - - let stored = store - .get_message_by_name::("my-drive") - .await - .unwrap() - .unwrap(); - assert_eq!( - stored.credentials.get("GOOGLE_DRIVE_ACCESS_TOKEN"), - Some(&"minted-drive-token".to_string()) - ); - } - - #[tokio::test] - async fn refresh_worker_skips_non_gateway_mintable_strategies() { - let store = Store::connect("sqlite::memory:?cache=shared") - .await - .unwrap(); - let provider = provider("my-external", "outlook"); - store.put_message(&provider).await.unwrap(); - let state = new_refresh_state( - &provider, - "MS_GRAPH_ACCESS_TOKEN", - NewRefreshStateConfig { - strategy: ProviderCredentialRefreshStrategy::External, - material: HashMap::new(), - secret_material_keys: Vec::new(), - expires_at_ms: 0, - token_url: String::new(), - scopes: Vec::new(), - refresh_before_seconds: 0, - max_lifetime_seconds: 0, - }, - ) - .unwrap(); - put_refresh_state(&store, &state).await.unwrap(); - - run_refresh_worker_tick(&store).await.unwrap(); - - let stored_state = get_refresh_state(&store, provider.object_id(), "MS_GRAPH_ACCESS_TOKEN") - .await - .unwrap() - .unwrap(); - assert_ne!(stored_state.status, "error"); - assert!(stored_state.last_error.is_empty()); - - let stored_provider = store - .get_message_by_name::("my-external") - .await - .unwrap() - .unwrap(); - assert!( - !stored_provider - .credentials - .contains_key("MS_GRAPH_ACCESS_TOKEN") - ); - } - - fn provider(name: &str, provider_type: &str) -> Provider { - Provider { - metadata: Some(ObjectMeta { - id: format!("{name}-id"), - name: name.to_string(), - created_at_ms: 1, - labels: HashMap::new(), - resource_version: 0, - }), - r#type: provider_type.to_string(), - credentials: HashMap::new(), - config: HashMap::new(), - credential_expires_at_ms: HashMap::new(), - } - } - - const TEST_RSA_PRIVATE_KEY: &str = r"-----BEGIN PRIVATE KEY----- 
-MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCvCoZ0mVHpCHsF -zeeqw2caNIe/eb4BQUccFPhZfRnF7sCfyB84zTBmuwG2umRBdjFnVsfIIZRp2HcD -OESrRYYiE1RGfjBXImGVg2Wtza0HYhL1sLyX1eaEefylxoilmApAgWDh9p36h8J2 -s5YHwyXPTttx4DpdWDnxju1iNmwoIB8uVE/5amWgbNvlETMBOcB1RxDHtnVy+xJz -jjjrzK4Qz9WsUTHAvngdi4Yyxvci+yKpjYTg5+UWxmAN6iW522TpLe32MDb5Ug1d -trBvvepWmdQ6CBwPhBHCt/sMoSJAYSO4RKeBnBjeLQBXFTxaOv5iTGIsRTX3K471 -epHp3cT5AgMBAAECggEASQlRv/4nZN5SgsH/K8v7zb3kdHsmUly8AJYpaCGgauvr -uN/mUyueyga2uNl+MqhQBef6VWHZjO6y/gdw86v/Q2GgVQebQQhKAnpAp2w+Ceoc -siKMFqi8VkOWLU+xPbM6d97kH3TpRxt1g1T8wYFmWeF0BEiE4eUJzGaQW14M9BJ+ -G0QxmP/zjX9cNpVeApKTjBWKiH4CXG3DuI3pJ93VOMpUlOsrdLXvKGTze0e01itr -MX/MHHTE+VXB4FB+/zKSA4c36egi676OSXrGC/GDmM8ntJ4CUGeD5uZsMSADiAUn -iccv5iGRWVMIKxUS5Q4k0jy8uWuK+QVP4Y6cQWYArwKBgQDhuSNORBNpIGRfsKGN -iJo/h+qinz6pEIpa3D3oVl7rpkyvgIyaTwfXvC1vfdS9V5VIel2gV2Cx0OrI8yrr -nQu1JuNV/rLmtvqX321fgBLRdoiqF3pAy1gbmdUz1elerAIYL578gXQ6jg1bbdic -kJpn0MsoDUJGwvJnXcgLqG7q3wKBgQDGhRIa4oJsj1vqICc8zt8YsCAcot3vjWLH -588X7JdBGOWJdWxfdmGXQRn5Zw9UhMQnYa3uyTBPeVcXopThlPotYeuFhLSU856T -IJzfpzCJzC4zIQayoyvJFrKe7N70iUQ986dewYy9oxQhHvFKd/qe4ylbzZJXpthX -eWEuuBSjJwKBgGkqXt6qLPj/1IQYwUw15tfOtW0LEKCoSi3HCzjidNsJ4hSqqdeD -Fr5WuDyHvcRxt+XKzTBVRYHTOnBhiw+3XasK8UQxpJyFh/+WY1jpTNs2hLnqslTZ -6LUDWSgLc+1d6qPmHAa9Ma/OWz7L0O4xGR9hUiXY95YMYe/y668yzGq1AoGBAJyU -Gsqfu7U6gYmxoKEine6QBFPx1dD7GF2KJdq93jMXGvyHZFoLOkAdtgnz0rCcI0bY -kWKUxwj4MMxQjNM8OPMQl75xBCmz2XA8Od9htDQLmqjzNKAzePabc3lMZTJFDlE6 -29kuGf79IIRbLn/JECDAFT/2baW60Ep2T0OVJ5njAoGAfaCaQ4aVgjI027q7Y5qP -KfNSI8uuA8PLqmUY30I9KFWzN6VDLu00eKa90F4w3CeWRRQWXW1+007tTz3V1mNw -20A24Fi3HGQmXc7NyuLDODTJsWBICuOemCnRkvcxIlxb+ec7jp+XRmzDwKkzSnVN -pM2zFU8SeVkvHKlEuoHaP0s= ------END PRIVATE KEY-----"; -} diff --git a/crates/openshell-server/src/service_routing.rs b/crates/openshell-server/src/service_routing.rs index 4b99a8ef7..5615d0f15 100644 --- a/crates/openshell-server/src/service_routing.rs +++ b/crates/openshell-server/src/service_routing.rs @@ -803,7 +803,6 @@ mod tests { name: 
"my-sandbox--web".to_string(), created_at_ms: 1_700_000_000_000, labels: std::collections::HashMap::default(), - resource_version: 0, }), sandbox_id: "sandbox-id".to_string(), sandbox_name: "my-sandbox".to_string(), diff --git a/crates/openshell-server/src/ssh_sessions.rs b/crates/openshell-server/src/ssh_sessions.rs index e328a50ac..c3294b361 100644 --- a/crates/openshell-server/src/ssh_sessions.rs +++ b/crates/openshell-server/src/ssh_sessions.rs @@ -90,7 +90,6 @@ mod tests { name: format!("session-{id}"), created_at_ms: 1000, labels: HashMap::new(), - resource_version: 0, }), sandbox_id: sandbox_id.to_string(), token: id.to_string(), diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index 91f40c289..554c2449b 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -630,6 +630,8 @@ pub async fn handle_connect_supervisor( error = %err, "supervisor session: failed to mark sandbox ready" ); + } else { + state.telemetry.sandbox_session_connected(&sandbox_id); } // Step 4: Spawn the session loop that reads inbound messages. 
@@ -650,6 +652,9 @@ pub async fn handle_connect_supervisor( .remove_if_current(&sandbox_id_clone, &session_id); if still_ours { info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended"); + state_clone + .telemetry + .sandbox_session_disconnected(&sandbox_id_clone); if let Err(err) = state_clone .compute .supervisor_session_disconnected(&sandbox_id_clone) @@ -800,7 +805,6 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), - resource_version: 0, }), ..Default::default() } diff --git a/crates/openshell-server/src/telemetry.rs b/crates/openshell-server/src/telemetry.rs new file mode 100644 index 000000000..d3ec3bacf --- /dev/null +++ b/crates/openshell-server/src/telemetry.rs @@ -0,0 +1,246 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! In-memory anonymous telemetry accounting for sandbox sessions. + +use openshell_core::proto::NetworkActivitySummary; +use std::collections::HashMap; +use std::sync::Mutex; +use std::time::Instant; + +#[derive(Debug, Default)] +pub struct TelemetryState { + sessions: Mutex>, +} + +#[derive(Debug)] +struct SessionTelemetry { + started_at: Instant, + network_activity_count: u64, + denied_action_count: u64, + denials_by_group: HashMap, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct SandboxSessionSummary { + pub active_sandbox_seconds: u64, + pub network_activity_count: u64, + pub denied_action_count: u64, + pub denial_rate_pct: f64, + pub denials_by_group: Vec<(String, u64)>, +} + +impl TelemetryState { + pub fn new() -> Self { + Self::default() + } + + pub fn sandbox_session_connected(&self, sandbox_id: &str) { + if sandbox_id.is_empty() { + return; + } + let Ok(mut sessions) = self.sessions.lock() else { + return; + }; + sessions + .entry(sandbox_id.to_string()) + .or_insert_with(SessionTelemetry::new); + } + + pub fn sandbox_session_disconnected(&self, 
sandbox_id: &str) { + let _ = self.end_sandbox_session_summary(sandbox_id); + } + + pub fn end_sandbox_session(&self, sandbox_id: &str) { + self.sandbox_session_disconnected(sandbox_id); + } + + pub fn record_network_activity(&self, sandbox_id: &str, summary: &NetworkActivitySummary) { + if sandbox_id.is_empty() { + return; + } + #[cfg(not(test))] + emit_network_activity_summary(summary); + + let Ok(mut sessions) = self.sessions.lock() else { + return; + }; + let session = sessions + .entry(sandbox_id.to_string()) + .or_insert_with(SessionTelemetry::new); + + session.network_activity_count = session + .network_activity_count + .saturating_add(u64::from(summary.network_activity_count)); + session.denied_action_count = session + .denied_action_count + .saturating_add(u64::from(summary.denied_action_count)); + for group in &summary.denials_by_group { + let deny_group = sanitize_deny_group(&group.deny_group).to_string(); + let entry = session.denials_by_group.entry(deny_group).or_default(); + *entry = entry.saturating_add(u64::from(group.denied_count)); + } + } + + #[cfg(test)] + pub(crate) fn end_sandbox_session_summary( + &self, + sandbox_id: &str, + ) -> Option { + self.end_sandbox_session_summary_inner(sandbox_id) + } + + #[cfg(not(test))] + fn end_sandbox_session_summary(&self, sandbox_id: &str) -> Option { + self.end_sandbox_session_summary_inner(sandbox_id) + } + + fn end_sandbox_session_summary_inner(&self, sandbox_id: &str) -> Option { + let Ok(mut sessions) = self.sessions.lock() else { + return None; + }; + let session = sessions.remove(sandbox_id)?; + let active_sandbox_seconds = session.started_at.elapsed().as_secs(); + let denial_rate_pct = + calculate_denial_rate_pct(session.network_activity_count, session.denied_action_count); + let mut denials_by_group: Vec<(String, u64)> = + session.denials_by_group.into_iter().collect(); + denials_by_group.sort_by(|left, right| left.0.cmp(&right.0)); + Some(SandboxSessionSummary { + active_sandbox_seconds, + 
network_activity_count: session.network_activity_count, + denied_action_count: session.denied_action_count, + denial_rate_pct, + denials_by_group, + }) + } +} + +impl SessionTelemetry { + fn new() -> Self { + Self { + started_at: Instant::now(), + network_activity_count: 0, + denied_action_count: 0, + denials_by_group: HashMap::new(), + } + } +} + +#[allow(clippy::cast_precision_loss)] +fn calculate_denial_rate_pct(network_activity_count: u64, denied_action_count: u64) -> f64 { + if network_activity_count == 0 { + return 0.0; + } + ((denied_action_count as f64 / network_activity_count as f64) * 100.0).clamp(0.0, 100.0) +} + +fn sanitize_deny_group(raw: &str) -> &'static str { + match raw { + "connect_policy" | "connect" | "l4_deny" => "connect_policy", + "forward_policy" | "forward" => "forward_policy", + "l7_policy" | "l7" | "l7_deny" | "forward-l7-deny" => "l7_policy", + "l7_parse_rejection" | "parse_rejection" => "l7_parse_rejection", + "ssrf" => "ssrf", + "bypass" => "bypass", + "policy_stale" => "policy_stale", + _ => "unknown", + } +} + +#[cfg(not(test))] +fn emit_network_activity_summary(summary: &NetworkActivitySummary) { + let mut denials_by_group = HashMap::::new(); + for group in &summary.denials_by_group { + let deny_group = sanitize_deny_group(&group.deny_group).to_string(); + let entry = denials_by_group.entry(deny_group).or_default(); + *entry = entry.saturating_add(u64::from(group.denied_count)); + } + openshell_core::telemetry::emit_sandbox_activity_summary( + u64::from(summary.network_activity_count), + u64::from(summary.denied_action_count), + calculate_denial_rate_pct( + u64::from(summary.network_activity_count), + u64::from(summary.denied_action_count), + ), + denials_by_group, + ); +} + +#[cfg(test)] +mod tests { + use super::*; + use openshell_core::proto::DenialGroupCount; + + fn assert_float_eq(actual: f64, expected: f64) { + assert!((actual - expected).abs() <= f64::EPSILON); + } + + #[test] + fn denial_rate_handles_empty_and_clamps() { + 
assert_float_eq(calculate_denial_rate_pct(0, 1), 0.0); + assert_float_eq(calculate_denial_rate_pct(10, 2), 20.0); + assert_float_eq(calculate_denial_rate_pct(10, 15), 100.0); + } + + #[test] + fn deny_group_sanitization_drops_raw_values() { + assert_eq!(sanitize_deny_group("forward-l7-deny"), "l7_policy"); + assert_eq!(sanitize_deny_group("host=/secret.example"), "unknown"); + assert_eq!(sanitize_deny_group("acme.internal:443"), "unknown"); + assert_eq!( + sanitize_deny_group("binary=/usr/local/bin/private"), + "unknown" + ); + } + + #[test] + fn session_records_activity_until_disconnect() { + let telemetry = TelemetryState::new(); + telemetry.sandbox_session_connected("sb-1"); + telemetry.record_network_activity( + "sb-1", + &NetworkActivitySummary { + network_activity_count: 4, + denied_action_count: 1, + denials_by_group: vec![DenialGroupCount { + deny_group: "ssrf".to_string(), + denied_count: 1, + }], + }, + ); + let summary = telemetry + .end_sandbox_session_summary("sb-1") + .expect("session summary should exist"); + assert_eq!(summary.network_activity_count, 4); + assert_eq!(summary.denied_action_count, 1); + assert_float_eq(summary.denial_rate_pct, 25.0); + assert_eq!(summary.denials_by_group, vec![("ssrf".to_string(), 1)]); + assert!(telemetry.end_sandbox_session_summary("sb-1").is_none()); + } + + #[test] + fn activity_starts_missing_session_accounting() { + let telemetry = TelemetryState::new(); + telemetry.record_network_activity( + "sb-1", + &NetworkActivitySummary { + network_activity_count: 1, + denied_action_count: 1, + denials_by_group: vec![DenialGroupCount { + deny_group: "forward_policy".to_string(), + denied_count: 1, + }], + }, + ); + let summary = telemetry + .end_sandbox_session_summary("sb-1") + .expect("activity should create a telemetry session"); + assert_eq!(summary.network_activity_count, 1); + assert_eq!(summary.denied_action_count, 1); + assert_eq!( + summary.denials_by_group, + vec![("forward_policy".to_string(), 1)] + ); + } +} 
diff --git a/crates/openshell-server/tests/common/mod.rs b/crates/openshell-server/tests/common/mod.rs index 3a8ecb5b3..93126ac96 100644 --- a/crates/openshell-server/tests/common/mod.rs +++ b/crates/openshell-server/tests/common/mod.rs @@ -248,34 +248,6 @@ impl OpenShell for TestOpenShell { )) } - async fn get_provider_refresh_status( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn configure_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn rotate_provider_credential( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn delete_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - async fn delete_provider( &self, _request: tonic::Request, diff --git a/crates/openshell-server/tests/supervisor_relay_integration.rs b/crates/openshell-server/tests/supervisor_relay_integration.rs index aae6d8cf1..d82c9c261 100644 --- a/crates/openshell-server/tests/supervisor_relay_integration.rs +++ b/crates/openshell-server/tests/supervisor_relay_integration.rs @@ -270,33 +270,6 @@ impl OpenShell for RelayGateway { ) -> Result, Status> { Err(Status::unimplemented("unused")) } - async fn get_provider_refresh_status( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn configure_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn rotate_provider_credential( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } - - async fn delete_provider_refresh( - &self, - _: tonic::Request, - ) -> Result, Status> { - Err(Status::unimplemented("unused")) - } async fn delete_provider( &self, diff --git a/crates/openshell-tui/src/lib.rs 
b/crates/openshell-tui/src/lib.rs index 1969715ce..4f2048873 100644 --- a/crates/openshell-tui/src/lib.rs +++ b/crates/openshell-tui/src/lib.rs @@ -1627,12 +1627,10 @@ fn spawn_create_provider(app: &App, tx: mpsc::UnboundedSender) { name: provider_name.clone(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: ptype.clone(), credentials: credentials.clone(), config: HashMap::default(), - credential_expires_at_ms: HashMap::default(), }), }; @@ -1719,14 +1717,11 @@ fn spawn_update_provider(app: &App, tx: mpsc::UnboundedSender) { name: name.clone(), created_at_ms: 0, labels: HashMap::new(), - resource_version: 0, }), r#type: ptype, credentials, config: HashMap::default(), - credential_expires_at_ms: HashMap::default(), }), - credential_expires_at_ms: HashMap::default(), }; match tokio::time::timeout(Duration::from_secs(5), client.update_provider(req)).await { @@ -2065,7 +2060,6 @@ fn spawn_set_global_setting(app: &App, tx: mpsc::UnboundedSender) { delete_setting: false, global: true, merge_operations: vec![], - expected_resource_version: 0, }; let result = tokio::time::timeout(Duration::from_secs(5), client.update_config(req)).await; @@ -2101,7 +2095,6 @@ fn spawn_delete_global_setting(app: &App, tx: mpsc::UnboundedSender) { delete_setting: true, global: true, merge_operations: vec![], - expected_resource_version: 0, }; let result = tokio::time::timeout(Duration::from_secs(5), client.update_config(req)).await; @@ -2171,7 +2164,6 @@ fn spawn_set_sandbox_setting(app: &App, tx: mpsc::UnboundedSender) { delete_setting: false, global: false, merge_operations: vec![], - expected_resource_version: 0, }; let result = tokio::time::timeout(Duration::from_secs(5), client.update_config(req)).await; @@ -2211,7 +2203,6 @@ fn spawn_delete_sandbox_setting(app: &App, tx: mpsc::UnboundedSender) { delete_setting: true, global: false, merge_operations: vec![], - expected_resource_version: 0, }; let result = tokio::time::timeout(Duration::from_secs(5), 
client.update_config(req)).await; diff --git a/deploy/deb/init-gateway-config.sh b/deploy/deb/init-gateway-config.sh new file mode 100755 index 000000000..55b07f7e5 --- /dev/null +++ b/deploy/deb/init-gateway-config.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -eu + +CONFIG_FILE="${1:?Usage: init-gateway-config.sh }" +PKI_DIR="${2:?Usage: init-gateway-config.sh }" +DRIVER_DIR="${3:?Usage: init-gateway-config.sh }" +VM_STATE_DIR="${4:?Usage: init-gateway-config.sh }" + +if [ -f "$CONFIG_FILE" ]; then + exit 0 +fi + +mkdir -p "$(dirname "$CONFIG_FILE")" "$VM_STATE_DIR" + +port="${OPENSHELL_SERVER_PORT:-17670}" +scheme="https" +if [ "${OPENSHELL_DISABLE_TLS:-false}" = "true" ]; then + scheme="http" +fi + +tmp="${CONFIG_FILE}.tmp" +{ + cat < "$tmp" + +chmod 600 "$tmp" +mv "$tmp" "$CONFIG_FILE" diff --git a/deploy/deb/openshell-gateway.service b/deploy/deb/openshell-gateway.service index 1ed112b05..1b57f3e48 100644 --- a/deploy/deb/openshell-gateway.service +++ b/deploy/deb/openshell-gateway.service @@ -6,8 +6,17 @@ After=default.target [Service] Type=simple StateDirectory=openshell/gateway -EnvironmentFile=-%E/openshell/gateway.env +# %S resolves to $XDG_STATE_HOME for user services. 
+Environment=OPENSHELL_BIND_ADDRESS=127.0.0.1 +Environment=OPENSHELL_SERVER_PORT=17670 +Environment=OPENSHELL_TLS_CERT=%S/openshell/tls/server/tls.crt +Environment=OPENSHELL_TLS_KEY=%S/openshell/tls/server/tls.key +Environment=OPENSHELL_TLS_CLIENT_CA=%S/openshell/tls/ca.crt +Environment=OPENSHELL_DB_URL=sqlite:%S/openshell/gateway/openshell.db +Environment=OPENSHELL_GATEWAY_CONFIG=%S/openshell/gateway/config.toml +EnvironmentFile=-%h/.config/openshell/gateway.env ExecStartPre=/usr/bin/openshell-gateway generate-certs --output-dir %S/openshell/tls --server-san host.openshell.internal +ExecStartPre=/usr/libexec/openshell/init-gateway-config.sh %S/openshell/gateway/config.toml %S/openshell/tls /usr/libexec/openshell %S/openshell/vm-driver ExecStart=/usr/bin/openshell-gateway Restart=on-failure RestartSec=5s diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci index 77a8c94e2..6758e0f42 100644 --- a/deploy/docker/Dockerfile.ci +++ b/deploy/docker/Dockerfile.ci @@ -4,12 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 # CI runner image with all development tools pre-installed -# Rebuild triggered automatically when mise.toml, mise.lock, tasks, or this file changes +# Rebuild triggered automatically when mise.toml or this file changes FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 -ARG DOCKER_VERSION=29.5.1 -ARG BUILDX_VERSION=v0.34.0 +ARG DOCKER_VERSION=29.4.1 +ARG BUILDX_VERSION=v0.33.0 ARG NPM_VERSION=11.13.0 ARG TARGETARCH @@ -57,7 +57,7 @@ RUN case "$TARGETARCH" in \ && chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx # Install GitHub CLI used by install.sh and CI jobs -ARG GH_VERSION=2.92.0 +ARG GH_VERSION=2.91.0 RUN case "$TARGETARCH" in \ amd64) gh_arch=amd64 ;; \ arm64) gh_arch=arm64 ;; \ diff --git a/deploy/docker/Dockerfile.gateway b/deploy/docker/Dockerfile.gateway index 9dd7ed8b9..e63b0a725 100644 --- a/deploy/docker/Dockerfile.gateway +++ b/deploy/docker/Dockerfile.gateway @@ -14,12 +14,11 @@ # an artifact, which is downloaded into 
the same staging directory before the # image build job runs. # -# The runtime is distroless Debian 13, which provides glibc and the dynamic -# loader needed by the GNU-linked gateway binary while keeping the attack -# surface small. The default digest currently carries Debian glibc -# 2.41-12+deb13u3. +# The runtime is `nvcr.io/nvidia/distroless/cc:v4.0.4`, which provides glibc and +# the dynamic loader needed by the GNU-linked gateway binary while keeping the +# attack surface small. -ARG GATEWAY_BASE_IMAGE=gcr.io/distroless/cc-debian13:nonroot@sha256:e1fd250ce83d94603e9887ec991156a6c26905a6b0001039b7a43699018c0733 +ARG GATEWAY_BASE_IMAGE=nvcr.io/nvidia/distroless/cc:v4.0.4 FROM ${GATEWAY_BASE_IMAGE} AS gateway @@ -29,7 +28,7 @@ WORKDIR /app COPY deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-gateway /usr/local/bin/openshell-gateway -USER 1000:1000 +USER nvs:nvs EXPOSE 8080 ENTRYPOINT ["/usr/local/bin/openshell-gateway"] diff --git a/deploy/helm/openshell/.helmignore b/deploy/helm/openshell/.helmignore index 0aecc346a..a12325802 100644 --- a/deploy/helm/openshell/.helmignore +++ b/deploy/helm/openshell/.helmignore @@ -18,6 +18,5 @@ .vscode/ # Ignore development files -README.md.gotmpl skaffold.yaml ci/ diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index 390571062..ba0b51f95 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -1,11 +1,6 @@ # OpenShell Helm Chart - - -> **Experimental** - the Kubernetes deployment path is under active development. Expect rough edges and breaking changes. +> **Experimental** — the Kubernetes deployment path is under active development. Expect rough edges and breaking changes. This chart deploys the OpenShell gateway into a Kubernetes cluster. It is published as an OCI artifact to GHCR at `oci://ghcr.io/nvidia/openshell/helm-chart`. @@ -13,19 +8,19 @@ This chart deploys the OpenShell gateway into a Kubernetes cluster. 
It is publis The Kubernetes Agent Sandbox CRDs and controller must be installed on the cluster before deploying OpenShell. Install them with: -```shell +```bash kubectl apply -f https://github.com/kubernetes-sigs/agent-sandbox/releases/latest/download/manifest.yaml ``` ## Install on Kubernetes -```shell +```bash helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version <version> ``` ## Install on OpenShift -```shell +```bash # Precreate the openshell namespace so we can create the SCC cluster role oc create ns openshell @@ -52,11 +47,11 @@ The `dev` tags are intended for testing changes ahead of a release. Production d ## Configuration -See [`values.yaml`](values.yaml) for source defaults. Selected overlays: +See [`values.yaml`](values.yaml) for configurable values. Selected overlays: -- [`ci/values-gateway.yaml`](ci/values-gateway.yaml) - gateway-only configuration -- [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) - cert-manager integration -- [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) - Keycloak OIDC integration +- [`ci/values-gateway.yaml`](ci/values-gateway.yaml) — gateway-only configuration +- [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) — cert-manager integration +- [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) — Keycloak OIDC integration ## PKI bootstrap @@ -67,9 +62,10 @@ openssl/alpine sidecar). The Job is idempotent: -- Both target Secrets exist: log and exit 0. -- Exactly one exists: fail with `kubectl delete secret <name> -n <namespace>` recovery hint. -- Neither exists: generate a CA, server cert, and client cert; POST both `kubernetes.io/tls` Secrets (`tls.crt`, `tls.key`, `ca.crt`). +- Both target Secrets exist → log and exit 0. +- Exactly one exists → fail with `kubectl delete secret <name> -n <namespace>` recovery hint. +- Neither exists → generate a CA, server cert, and client cert; POST both `kubernetes.io/tls` + Secrets (`tls.crt`, `tls.key`, `ca.crt`). 
Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager, external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the diff --git a/deploy/helm/openshell/README.md.gotmpl b/deploy/helm/openshell/README.md.gotmpl deleted file mode 100644 index 5068d6848..000000000 --- a/deploy/helm/openshell/README.md.gotmpl +++ /dev/null @@ -1,79 +0,0 @@ -# OpenShell Helm Chart - - - -> **Experimental** - the Kubernetes deployment path is under active development. Expect rough edges and breaking changes. - -This chart deploys the OpenShell gateway into a Kubernetes cluster. It is published as an OCI artifact to GHCR at `oci://ghcr.io/nvidia/openshell/helm-chart`. - -## Prerequisites - -The Kubernetes Agent Sandbox CRDs and controller must be installed on the cluster before deploying OpenShell. Install them with: - -```shell -kubectl apply -f https://github.com/kubernetes-sigs/agent-sandbox/releases/latest/download/manifest.yaml -``` - -## Install on Kubernetes - -```shell -helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version -``` - -## Install on OpenShift - -```shell -# Precreate the openshell namespace so we can create the SCC cluster role -oc create ns openshell - -# Sandboxes are deployed into the openshell namespace and use the default service account for now -oc adm policy add-scc-to-user privileged -z default -n openshell - -# Deploy openshell with overrides to allow SCC assignment of fsGroup and runAsUser for the gateway -helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version -n openshell \ - --set pkiInitJob.enabled=false \ - --set server.disableTls=true \ - --set podSecurityContext.fsGroup=null \ - --set securityContext.runAsUser=null -``` - -## Available versions - -| Tag | Source | Notes | -| --- | --- | --- | -| `` (e.g. `0.6.0`) | Tagged GitHub release | Tracks the matching gateway and supervisor image versions. Recommended for production. 
| -| `0.0.0-dev` | Latest commit on `main` | Floating tag, overwritten on every push. `appVersion` is `dev`, so images resolve to the `:dev` tag. | -| `0.0.0-dev.` | A specific commit on `main` | Per-commit pin. Chart version and `appVersion` both use the full 40-character commit SHA, which matches the image tag pushed by CI. | - -The `dev` tags are intended for testing changes ahead of a release. Production deployments should pin to a tagged release. - -## Configuration - -See [`values.yaml`](values.yaml) for source defaults. Selected overlays: - -- [`ci/values-gateway.yaml`](ci/values-gateway.yaml) - gateway-only configuration -- [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) - cert-manager integration -- [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) - Keycloak OIDC integration - -## PKI bootstrap - -By default, a pre-install/pre-upgrade hook Job runs `openshell-gateway generate-certs` -to create the gateway's server and client mTLS Secrets. The Job uses the gateway image -itself, so air-gapped environments only need to mirror that one image (no separate -openssl/alpine sidecar). - -The Job is idempotent: - -- Both target Secrets exist: log and exit 0. -- Exactly one exists: fail with `kubectl delete secret -n ` recovery hint. -- Neither exists: generate a CA, server cert, and client cert; POST both `kubernetes.io/tls` Secrets (`tls.crt`, `tls.key`, `ca.crt`). - -Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager, -external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the -cert-manager alternative. - -{{ template "chart.valuesSection" . }} -{{ template "helm-docs.versionFooter" . }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 26ba1b5b5..a9c0b1435 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -3,139 +3,96 @@ # Default values for OpenShell -# -- Number of OpenShell gateway replicas. 
replicaCount: 1 image: - # -- Gateway image repository. repository: ghcr.io/nvidia/openshell/gateway - # -- Gateway image pull policy. pullPolicy: IfNotPresent - # -- Gateway image tag. Defaults to the chart appVersion when empty. tag: "" -# Supervisor image - provides the openshell-sandbox binary injected into sandbox +# Supervisor image — provides the openshell-sandbox binary injected into sandbox # pods. tag defaults to appVersion (same as the gateway image) so both stay in # sync when the chart is released. supervisor: image: - # -- Supervisor image repository. repository: ghcr.io/nvidia/openshell/supervisor - # -- Supervisor image pull policy. Defaults to the gateway image pull policy when empty. pullPolicy: "" - # -- Supervisor image tag. Defaults to the chart appVersion when empty. tag: "" - # -- How the supervisor binary is delivered into sandbox pods. + # How the supervisor binary is delivered into sandbox pods. # Empty (default) = auto-detect from cluster version: - # K8s >= v1.35 -> "image-volume" (ImageVolume enabled by default; GA in v1.36) - # K8s < v1.35 -> "init-container" (copies via init container + emptyDir) + # K8s >= v1.35 → "image-volume" (ImageVolume enabled by default; GA in v1.36) + # K8s < v1.35 → "init-container" (copies via init container + emptyDir) # On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, # set this to "image-volume" explicitly. sideloadMethod: "" -# -- Image pull secrets attached to gateway and helper pods. imagePullSecrets: [] -# -- Override the chart name used in generated resource names. nameOverride: "openshell" -# -- Override the full generated resource name. fullnameOverride: "" serviceAccount: - # -- Create a service account for the gateway. create: true - # -- Annotations to add to the generated service account. annotations: {} - # -- Existing service account name to use when serviceAccount.create is false. name: "" -# -- Extra annotations to add to the gateway pod. 
podAnnotations: {} -# -- Extra labels to add to the gateway pod. podLabels: {} podSecurityContext: - # -- fsGroup assigned to the gateway pod. fsGroup: 1000 securityContext: - # -- Require the gateway container to run as a non-root user. runAsNonRoot: true - # -- UID assigned to the gateway container. runAsUser: 1000 - # -- Whether the gateway container can gain additional privileges. allowPrivilegeEscalation: false capabilities: - # -- Linux capabilities dropped from the gateway container. drop: - ALL service: - # -- Kubernetes Service type for the gateway. type: ClusterIP - # -- Gateway gRPC/HTTP service port. port: 8080 - # -- Gateway health service port. healthPort: 8081 - # -- Gateway metrics service port. metricsPort: 9090 # Pod restart behavior and health probe tuning. podLifecycle: - # -- Grace period, in seconds, before Kubernetes terminates the gateway pod. terminationGracePeriodSeconds: 5 probes: startup: - # -- Startup probe period, in seconds. periodSeconds: 2 - # -- Startup probe timeout, in seconds. timeoutSeconds: 1 - # -- Startup probe failure threshold before the container is killed. failureThreshold: 30 liveness: - # -- Liveness probe initial delay, in seconds. initialDelaySeconds: 2 - # -- Liveness probe period, in seconds. periodSeconds: 5 - # -- Liveness probe timeout, in seconds. timeoutSeconds: 1 - # -- Liveness probe failure threshold before the container is restarted. failureThreshold: 3 readiness: - # -- Readiness probe initial delay, in seconds. initialDelaySeconds: 1 - # -- Readiness probe period, in seconds. periodSeconds: 2 - # -- Readiness probe timeout, in seconds. timeoutSeconds: 1 - # -- Readiness probe failure threshold before the pod is marked not ready. failureThreshold: 3 -# -- Gateway pod resource requests and limits. resources: {} -# -- Node selector for the gateway pod. nodeSelector: {} -# -- Tolerations for the gateway pod. tolerations: [] -# -- Affinity rules for the gateway pod. 
affinity: {} # Server configuration server: - # -- Gateway log level. logLevel: info - # -- Namespace where sandbox pods are created. Defaults to the Helm release + # Namespace where sandbox pods are created. Defaults to the Helm release # namespace (.Release.Namespace) when left empty. sandboxNamespace: "" - # -- Gateway database URL. dbUrl: "sqlite:/var/openshell/openshell.db" - # -- Default sandbox image used when requests do not specify one. sandboxImage: "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" - # -- Kubernetes imagePullPolicy for sandbox pods. Empty = Kubernetes default - # (Always for :latest, IfNotPresent otherwise). Set to "Always" for dev + # Kubernetes imagePullPolicy for sandbox pods. Empty = Kubernetes default + # (Always for :latest, IfNotPresent otherwise). Set to "Always" for dev # clusters so new images are picked up without manual eviction. sandboxImagePullPolicy: "" # -- Default storage size for the workspace PVC in sandbox pods. @@ -144,72 +101,71 @@ server: workspaceDefaultStorageSize: "" # -- gRPC endpoint sandboxes call back into the gateway. Leave empty to derive # it from the chart fullname, release namespace, service port, and - # disableTls flag, for example https://openshell.openshell.svc.cluster.local:8080. + # disableTls flag (i.e. <scheme>://<fullname>.<namespace>.svc.cluster.local:<port>). # Override only when sandboxes must reach the gateway via a different # hostname (e.g. an external ingress or a host alias). grpcEndpoint: "" # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. - # -- Host gateway IP for sandbox pod hostAliases. When set, sandbox pods get + # Host gateway IP for sandbox pod hostAliases. When set, sandbox pods get # hostAliases entries mapping host.docker.internal and host.openshell.internal # to this IP, allowing them to reach services running on the Docker host. # Auto-detected by the cluster entrypoint script. 
hostGatewayIP: "" - # -- Enable Kubernetes user namespace isolation (hostUsers: false) for sandbox + # Enable Kubernetes user namespace isolation (hostUsers: false) for sandbox # pods. Requires Kubernetes 1.33+ with user namespace support available # (beta through 1.35, GA in 1.36+), plus a supporting container runtime and # Linux 5.12+. When enabled, container UID 0 maps to an unprivileged host # UID and capabilities become namespaced. enableUserNamespaces: false - # -- Disable TLS entirely - the server listens on plaintext HTTP. + # Disable TLS entirely — the server listens on plaintext HTTP. # Set to true when a reverse proxy / tunnel terminates TLS at the edge. disableTls: false - # -- Enable plaintext HTTP routing for loopback sandbox service URLs on + # Enable plaintext HTTP routing for loopback sandbox service URLs on # TLS-enabled gateways. enableLoopbackServiceHttp: true tls: - # -- K8s secret (type kubernetes.io/tls) with tls.crt and tls.key for the server. + # K8s secret (type kubernetes.io/tls) with tls.crt and tls.key for the server certSecretName: openshell-server-tls - # -- K8s secret with ca.crt for client certificate verification (mTLS). + # K8s secret with ca.crt for client certificate verification (mTLS). # Set to "" to disable mTLS and run HTTPS-only (use OIDC for auth instead). clientCaSecretName: openshell-server-client-ca - # -- K8s secret mounted into sandbox pods for mTLS to the server. + # K8s secret mounted into sandbox pods for mTLS to the server clientTlsSecretName: openshell-client-tls # OIDC (OpenID Connect) configuration for JWT-based authentication. # When issuer is set, the server validates Bearer tokens on gRPC requests. oidc: - # -- OIDC issuer URL (e.g. https://keycloak.example.com/realms/openshell). + # OIDC issuer URL (e.g. https://keycloak.example.com/realms/openshell). issuer: "" - # -- Expected audience claim for the API resource server. + # Expected audience claim for the API resource server. 
# This should match the server's --oidc-audience, NOT the CLI client ID. audience: "openshell-cli" - # -- JWKS key cache TTL in seconds. + # JWKS key cache TTL in seconds. jwksTtl: 3600 - # -- Dot-separated path to the roles array in the JWT claims. + # Dot-separated path to the roles array in the JWT claims. # Keycloak: "realm_access.roles", Entra ID: "roles", Okta: "groups". rolesClaim: "" - # -- Role name for admin access. Leave empty (with userRole also empty) for + # Role name for admin access. Leave empty (with userRole also empty) for # authentication-only mode. Both must be set or both empty. adminRole: "" - # -- Role name for standard user access. + # Role name for standard user access. userRole: "" - # -- Dot-separated path to the scopes array in the JWT claims. + # Dot-separated path to the scopes array in the JWT claims. scopesClaim: "" - # -- Name of a ConfigMap containing a CA certificate bundle (key: ca.crt) + # Name of a ConfigMap containing a CA certificate bundle (key: ca.crt) # for verifying the OIDC issuer's TLS certificate. Required when the # issuer uses a non-public CA (e.g. OpenShift ingress, private PKI). caConfigMapName: "" # NetworkPolicy restricting SSH ingress on sandbox pods to the gateway only. networkPolicy: - # -- Create a NetworkPolicy restricting SSH ingress on sandbox pods to the gateway. enabled: true # PKI bootstrap via a pre-install/pre-upgrade hook Job. # Runs `openshell-gateway generate-certs` to create the server and client TLS # Secrets in-cluster. Key material is written directly to K8s Secrets and # never appears in Helm release history. Idempotent: existing secrets are -# left untouched on upgrade. Reuses the gateway image - no extra image to +# left untouched on upgrade. Reuses the gateway image — no extra image to # mirror in air-gapped environments. 
# # The server certificate already includes the built-in cluster SANs @@ -220,29 +176,24 @@ networkPolicy: # that domain, for example `*.apps.example.com` enables # `--.apps.example.com`. pkiInitJob: - # -- Run a pre-install/pre-upgrade Job that creates gateway and client mTLS Secrets. enabled: true - # -- Extra DNS SANs to append to the server certificate. + # Extra DNS SANs to append to the server certificate. serverDnsNames: [] - # -- Extra IP SANs to append to the server certificate. + # Extra IP SANs to append to the server certificate. serverIpAddresses: [] # cert-manager Certificate/Issuer resources (requires cert-manager CRDs in-cluster). # Uses namespaced Issuers only (no ClusterIssuer). Does not install cert-manager itself. certManager: - # -- Create cert-manager Issuer and Certificate resources instead of using the PKI bootstrap Job. enabled: false - # -- Secret created for the intermediate CA (Certificate with isCA: true). + # Secret created for the intermediate CA (Certificate with isCA: true). caSecretName: openshell-ca-tls - # -- Mount gateway client CA from the server TLS secret's ca.crt (populated by + # Mount gateway client CA from the server TLS secret's ca.crt (populated by # cert-manager for certs issued by a CA Issuer). Avoids a separate # openshell-server-client-ca Secret. clientCaFromServerTlsSecret: true - # -- Duration for cert-manager-issued certificates. certificateDuration: 8760h - # -- Renewal window for cert-manager-issued certificates. certificateRenewBefore: 720h - # -- DNS SANs on the cert-manager-issued server certificate. serverDnsNames: - openshell - openshell.openshell.svc @@ -251,36 +202,32 @@ certManager: - openshell.localhost - "*.openshell.localhost" - host.docker.internal - # -- IP SANs on the cert-manager-issued server certificate. serverIpAddresses: - 127.0.0.1 -# Kubernetes Gateway API - HTTPRoute and Gateway resources. +# Kubernetes Gateway API — HTTPRoute and Gateway resources. 
# Requires a Gateway API controller in the cluster. Install Envoy Gateway via # the skaffold.yaml releases or independently: # helm install eg oci://docker.io/envoyproxy/gateway-helm \ # --version v1.4.1 -n envoy-gateway-system --create-namespace grpcRoute: - # -- Create a Gateway API GRPCRoute for the gateway service. enabled: false - # -- Hostnames the GRPCRoute matches on. Leave empty to match all hosts. + # Hostnames the GRPCRoute matches on. Leave empty to match all hosts. hostnames: [] gateway: - # -- When true, a Gateway resource is created in the release namespace. + # When true, a Gateway resource is created in the release namespace. # Set to false and provide name/namespace to attach to a pre-existing Gateway. create: false - # -- GatewayClass to reference. Envoy Gateway installs one named "eg". + # GatewayClass to reference. Envoy Gateway installs one named "eg". className: "eg" - # -- Name of the Gateway resource. Defaults to the chart fullname. + # Name of the Gateway resource. Defaults to the chart fullname. name: "" - # -- Namespace of the Gateway referenced by the GRPCRoute parentRef. + # Namespace of the Gateway referenced by the GRPCRoute parentRef. # Defaults to the release namespace. namespace: "" # Listener settings (only used when gateway.create is true). listener: - # -- Listener port for the generated Gateway resource. port: 80 - # -- Listener protocol for the generated Gateway resource. protocol: HTTP - # -- "Same" restricts attached routes to the release namespace; "All" allows any namespace. + # "Same" restricts attached routes to the release namespace; "All" allows any namespace. 
allowedRoutes: Same diff --git a/deploy/man/openshell-gateway.8.md b/deploy/man/openshell-gateway.8.md index 2f22b29fa..5df741ffd 100644 --- a/deploy/man/openshell-gateway.8.md +++ b/deploy/man/openshell-gateway.8.md @@ -22,13 +22,12 @@ network and filesystem policies to sandboxes, routes inference requests, and provides the SSH tunnel endpoint for CLI-to-sandbox connections. -When installed via a Linux package, the gateway runs as a systemd user -service. The packaged service starts from built-in defaults and reads -the default gateway TOML path only when that file exists. +When installed via RPM, the gateway runs as a systemd user service +with the Podman compute driver. Sandboxes are rootless Podman +containers on the host. -The gateway exposes a single port with multiplexed gRPC and HTTP, -secured by mutual TLS (mTLS) by default unless the TOML config disables -TLS. +The gateway exposes a single port (default 8080) with multiplexed +gRPC and HTTP, secured by mutual TLS (mTLS) by default. # OPTIONS @@ -37,7 +36,7 @@ TLS. Environment: **OPENSHELL_BIND_ADDRESS**. **--port** *PORT* -: Port for the gRPC/HTTP API. Default: **17670**. +: Port for the gRPC/HTTP API. Default: **8080**. Environment: **OPENSHELL_SERVER_PORT**. **--health-port** *PORT* @@ -54,26 +53,22 @@ TLS. Environment: **OPENSHELL_LOG_LEVEL**. **--db-url** *URL* -: SQLite database URL for state persistence. When unset, the gateway - stores SQLite state under *~/.local/state/openshell/gateway/*. +: SQLite database URL for state persistence. Required. Environment: **OPENSHELL_DB_URL**. **--drivers** *DRIVER*\[,*DRIVER*\] : Compute driver. Accepts a comma-delimited list. The gateway currently requires exactly one driver. Options: **podman**, - **docker**, **kubernetes**, **vm**. When unset, the gateway - auto-detects Kubernetes, then Podman, then Docker. VM is opt-in. + **docker**, **kubernetes**. Default: **kubernetes**. Environment: **OPENSHELL_DRIVERS**. 
**--tls-cert** *PATH* -: Path to server TLS certificate file. Defaults to the local generated - TLS bundle when present. Required unless **--disable-tls** is set. - Environment: **OPENSHELL_TLS_CERT**. +: Path to server TLS certificate file. Required unless + **--disable-tls** is set. Environment: **OPENSHELL_TLS_CERT**. **--tls-key** *PATH* -: Path to server TLS private key file. Defaults to the local generated - TLS bundle when present. Required unless **--disable-tls** is set. - Environment: **OPENSHELL_TLS_KEY**. +: Path to server TLS private key file. Required unless + **--disable-tls** is set. Environment: **OPENSHELL_TLS_KEY**. **--tls-client-ca** *PATH* : Path to CA certificate for client certificate verification (mTLS). @@ -105,7 +100,7 @@ configured in the TOML file passed with **--config**. # SYSTEMD INTEGRATION -The package installs a systemd user unit at +The RPM installs a systemd user unit at */usr/lib/systemd/user/openshell-gateway.service*. Manage the gateway with standard systemd commands: @@ -119,12 +114,15 @@ View logs: journalctl --user -u openshell-gateway journalctl --user -u openshell-gateway -f -The unit runs **openshell-gateway generate-certs** as an **ExecStartPre** -step on first start. This generates a self-signed PKI bundle for mTLS -and skips generation when the bundle already exists. +The unit runs two **ExecStartPre** steps on first start: -The gateway then starts from built-in defaults and reads -*~/.config/openshell/gateway.toml* when that file exists. +1. **openshell-gateway generate-certs --output-dir** generates a + self-signed PKI bundle for mTLS. +2. **init-gateway-env.sh** generates the environment configuration + file. + +Both steps are idempotent and skip generation if their output files +already exist. 
To persist the service across logouts: @@ -132,16 +130,11 @@ To persist the service across logouts: # CONFIGURATION -The systemd user unit launches the gateway with: - - openshell-gateway - -Gateway listener, TLS, database, and compute driver settings have local -defaults. Create *~/.config/openshell/gateway.toml* when you need to -override them. The gateway rejects `database_url` in TOML; set -**OPENSHELL_DB_URL** when you need a different database. +The systemd user unit reads configuration from +*~/.config/openshell/gateway.env*. See **openshell-gateway.env**(5) +for the full variable reference. -To override individual settings without creating TOML: +To override individual settings without modifying gateway.env: systemctl --user edit openshell-gateway @@ -155,13 +148,16 @@ This creates a drop-in override that persists across package upgrades. */usr/lib/systemd/user/openshell-gateway.service* : Systemd user unit file. -*~/.config/openshell/gateway.toml* -: Optional gateway TOML configuration. +*/usr/libexec/openshell/init-gateway-env.sh* +: Gateway environment file generator. + +*~/.config/openshell/gateway.env* +: Gateway environment configuration (generated on first start). *~/.local/state/openshell/tls/* : Auto-generated TLS certificates. -*~/.local/state/openshell/gateway/openshell.db* +*~/.local/state/openshell/gateway.db* : SQLite database for gateway state. 
*~/.config/openshell/gateways/openshell/mtls/* @@ -175,17 +171,18 @@ Start the gateway as a systemd user service: Check gateway health from the CLI: - openshell gateway add --local https://127.0.0.1:17670 + openshell gateway add --local https://127.0.0.1:8080 openshell status -Override the API port in TOML: +Override the API port via a systemd drop-in: - $EDITOR ~/.config/openshell/gateway.toml - systemctl --user restart openshell-gateway + systemctl --user edit openshell-gateway + # Add: [Service] + # Add: Environment=OPENSHELL_SERVER_PORT=9090 # SEE ALSO -**openshell**(1), **systemctl**(1), **journalctl**(1), **loginctl**(1), -**podman**(1) +**openshell**(1), **openshell-gateway.env**(5), **systemctl**(1), +**journalctl**(1), **loginctl**(1), **podman**(1) Full documentation: *https://docs.nvidia.com/openshell/* diff --git a/deploy/man/openshell-gateway.env.5.md b/deploy/man/openshell-gateway.env.5.md new file mode 100644 index 000000000..ec3f466a1 --- /dev/null +++ b/deploy/man/openshell-gateway.env.5.md @@ -0,0 +1,127 @@ +--- +title: OPENSHELL-GATEWAY.ENV +section: 5 +header: OpenShell Manual +footer: openshell-gateway +date: 2025 +--- + +# NAME + +openshell-gateway.env - OpenShell gateway environment configuration + +# DESCRIPTION + +The **openshell-gateway.env** file contains environment variables that +configure the OpenShell gateway server when running as a systemd user +service. It is generated automatically on first start by +**init-gateway-env.sh** and is not overwritten on subsequent starts or +package upgrades. + +The file uses the standard systemd **EnvironmentFile** format: one +**KEY=VALUE** pair per line. Lines beginning with **#** are comments. +Shell variable expansion is not performed. 
+ +# LOCATION + +The file is located at: + + ~/.config/openshell/gateway.env + +The systemd user unit reads it via: + + EnvironmentFile=-~/.config/openshell/gateway.env + +The **-** prefix means the service starts normally if the file does not +exist (the unit has built-in defaults for all required settings). + +# VARIABLES + +## Gateway + +**OPENSHELL_BIND_ADDRESS** (default: 0.0.0.0) +: IP address to bind all listeners to. The RPM default of **0.0.0.0** + exposes the gateway on all network interfaces; mTLS must remain + enabled to prevent unauthenticated access. Set to **127.0.0.1** for + local-only access. + +**OPENSHELL_SERVER_PORT** (default: 8080) +: Port for the multiplexed gRPC/HTTP API. + +**OPENSHELL_HEALTH_PORT** (default: 0) +: Port for unauthenticated health endpoints (/healthz, /readyz). + Set to a non-zero value to enable a dedicated health listener. + +**OPENSHELL_METRICS_PORT** (default: 0) +: Port for Prometheus metrics endpoint (/metrics). Set to a + non-zero value to enable a dedicated metrics listener. + +**OPENSHELL_LOG_LEVEL** (default: info) +: Log verbosity: **trace**, **debug**, **info**, **warn**, **error**. + +**OPENSHELL_DRIVERS** (default: podman) +: Compute driver for sandbox management. Options: **podman**, + **docker**, **kubernetes**. The RPM unit defaults to **podman**. + +**OPENSHELL_DB_URL** (default: sqlite://$XDG_STATE_HOME/openshell/gateway.db) +: SQLite database URL for gateway state persistence. + +## TLS + +**OPENSHELL_TLS_CERT** (default: auto-generated path) +: Path to server TLS certificate. + +**OPENSHELL_TLS_KEY** (default: auto-generated path) +: Path to server TLS private key. + +**OPENSHELL_TLS_CLIENT_CA** (default: auto-generated path) +: Path to CA certificate for client certificate verification. When + set without **OPENSHELL_OIDC_ISSUER**, mTLS is required. When both + are set, callers may authenticate via Bearer token or client + certificate. 
+ +**OPENSHELL_DISABLE_TLS** (default: unset) +: Set to **true** to disable TLS entirely and listen on plaintext + HTTP. Not recommended for production. When the bind address is + **0.0.0.0** (the RPM default), disabling TLS exposes the API to the + entire network without authentication. Restrict + **OPENSHELL_BIND_ADDRESS** to **127.0.0.1** or place the gateway + behind a TLS-terminating reverse proxy. + +**OPENSHELL_SERVER_SAN** (default: unset) +: Comma-separated SANs configured on the gateway server certificate. + Wildcard DNS SANs also enable sandbox service URLs under that + domain. + +## Driver Configuration + +Compute driver settings are configured in the TOML file referenced by +**OPENSHELL_GATEWAY_CONFIG** or **--config**. This includes sandbox +images, image pull policy, callback endpoints, Podman socket path, +Docker network name, VM state directory, and guest TLS material. + +# EXAMPLES + +Change the API port to 9090: + + OPENSHELL_SERVER_PORT=9090 + +Enable debug logging: + + OPENSHELL_LOG_LEVEL=debug + +Use externally-managed TLS certificates: + + OPENSHELL_TLS_CERT=/etc/pki/tls/certs/openshell.crt + OPENSHELL_TLS_KEY=/etc/pki/tls/private/openshell.key + OPENSHELL_TLS_CLIENT_CA=/etc/pki/tls/certs/openshell-ca.crt + +Disable TLS (behind a reverse proxy): + + OPENSHELL_DISABLE_TLS=true + +# SEE ALSO + +**openshell-gateway**(8), **openshell**(1), **systemd.exec**(5) + +Full documentation: *https://docs.nvidia.com/openshell/* diff --git a/deploy/man/openshell.1.md b/deploy/man/openshell.1.md index 6ba6f4afb..98a683ec5 100644 --- a/deploy/man/openshell.1.md +++ b/deploy/man/openshell.1.md @@ -190,7 +190,7 @@ development task, or behind a cloud reverse proxy. 
Register the local RPM gateway and create a sandbox: - openshell gateway add --local https://127.0.0.1:17670 + openshell gateway add --local https://127.0.0.1:8080 openshell sandbox create -- claude List sandboxes and connect to one: @@ -208,7 +208,7 @@ Check gateway health: # SEE ALSO -**openshell-gateway**(8) +**openshell-gateway**(8), **openshell-gateway.env**(5) Full documentation: *https://docs.nvidia.com/openshell/* diff --git a/deploy/rpm/CONFIGURATION.md b/deploy/rpm/CONFIGURATION.md index 8a7edca8c..95a3b2c32 100644 --- a/deploy/rpm/CONFIGURATION.md +++ b/deploy/rpm/CONFIGURATION.md @@ -6,72 +6,18 @@ the RPM package on Fedora and RHEL systems. For first-time setup, see QUICKSTART.md. For troubleshooting, see TROUBLESHOOTING.md. -## Default configuration - -The RPM ships a default TOML configuration template at -`/usr/share/openshell-gateway/gateway.toml.default`. On first start of -`openshell-gateway.service`, the systemd unit copies this template to -`~/.config/openshell/gateway.toml` if no config file exists there yet. - -The defaults are tuned for rootless Podman use: - -```toml -[openshell] -version = 1 - -[openshell.gateway] -bind_address = "0.0.0.0:17670" -compute_drivers = ["podman"] -``` - -`bind_address = "0.0.0.0:17670"` is required because Podman sandbox -containers reach the gateway over the host network bridge and cannot -connect to `127.0.0.1` inside the gateway's network namespace. mTLS is -enabled by default and protects all connections. - -`compute_drivers = ["podman"]` pins the compute driver to Podman. Without -this, the gateway auto-detects in order: Kubernetes, Podman, Docker. Pinning -prevents unexpected driver selection if Docker is also installed on the host. - -### Customizing the configuration - -Edit `~/.config/openshell/gateway.toml` directly. The template at -`/usr/share/openshell-gateway/gateway.toml.default` is not read at runtime -and is not overwritten by RPM upgrades. 
- -To apply environment variable overrides that persist across upgrades without -editing the TOML file, add them to `~/.config/openshell/gateway.env`: - -```shell -# Example: restrict to loopback only -OPENSHELL_BIND_ADDRESS=127.0.0.1 -``` - -To override the path to the TOML config file entirely: - -```shell -# In ~/.config/openshell/gateway.env -OPENSHELL_GATEWAY_CONFIG=/path/to/custom/gateway.toml -``` - -For one-off service overrides that persist across package upgrades: - -```shell -systemctl --user edit openshell-gateway -``` - ## TLS (mTLS) The RPM enables mutual TLS by default. The gateway requires a valid -client certificate for all API connections and listens on -`0.0.0.0:17670` by default (see "Default configuration" above). +client certificate for all API connections, protecting the API even +though it listens on all interfaces (`0.0.0.0`). ### Auto-generated certificates -On first start, the systemd user service runs -`openshell-gateway generate-certs --output-dir ~/.local/state/openshell/tls --server-san host.openshell.internal` -to generate certificates with `rcgen` (the same routine the CLI uses for -local mTLS bundles): +On first start, the gateway's `ExecStartPre` runs +`openshell-gateway generate-certs --output-dir ~/.local/state/openshell/tls`, +which generates the certificates with `rcgen` (the same routine the CLI +uses for local mTLS bundles): | File | Purpose | Location | |------|---------|----------| @@ -105,7 +51,6 @@ Names: - `openshell.openshell.svc.cluster.local` - `host.containers.internal` - `host.docker.internal` -- `host.openshell.internal` - `127.0.0.1` To connect from a remote machine, you need externally-managed @@ -118,13 +63,13 @@ To use certificates from an external CA or cert-manager: 1. Place the server cert, key, and CA cert on the filesystem. -1. Edit `~/.config/openshell/gateway.toml`: +1.
Edit `~/.config/openshell/gateway.env` or use + `systemctl --user edit openshell-gateway` to override: - ```toml - [openshell.gateway.tls] - cert_path = "/path/to/server/tls.crt" - key_path = "/path/to/server/tls.key" - client_ca_path = "/path/to/ca.crt" + ```shell + OPENSHELL_TLS_CERT=/path/to/server/tls.crt + OPENSHELL_TLS_KEY=/path/to/server/tls.key + OPENSHELL_TLS_CLIENT_CA=/path/to/ca.crt ``` 1. Place the client cert where the CLI expects it: @@ -149,17 +94,21 @@ The gateway regenerates the PKI on next start. ### Disabling TLS -> **WARNING:** With TLS disabled, the gateway API has no authentication. -> Keep the bind address on `127.0.0.1`, or place the gateway behind a -> TLS-terminating reverse proxy that enforces its own authentication. +> **WARNING:** The RPM gateway binds to all interfaces (`0.0.0.0`) by +> default. With TLS disabled, the gateway API is exposed to the entire +> network with **no authentication**. Any host that can reach the +> gateway port has full access, including the ability to create +> sandboxes, execute arbitrary code, and access configured credentials. +> Only disable TLS when the gateway is behind a TLS-terminating reverse +> proxy that enforces its own authentication. When disabling TLS without +> a reverse proxy, restrict `OPENSHELL_BIND_ADDRESS` to `127.0.0.1`. To disable TLS (not recommended for production): -1. Edit `~/.config/openshell/gateway.toml`: +1. Edit `~/.config/openshell/gateway.env`: - ```toml - [openshell.gateway] - disable_tls = true + ```shell + OPENSHELL_DISABLE_TLS=true ``` 1. Remove or comment out the `guest_tls_*` entries in @@ -195,43 +144,51 @@ configuration is required. ## Configuration reference -Gateway and driver settings have local runtime defaults. The gateway reads -`~/.config/openshell/gateway.toml` when that file exists. Set -`OPENSHELL_GATEWAY_CONFIG` in the launch environment to use a different file. +Gateway process settings are controlled via environment variables. 
Driver +implementation settings live in `~/.config/openshell/gateway.toml`, which is +generated on first start and selected through `OPENSHELL_GATEWAY_CONFIG`. -Use `systemctl --user edit openshell-gateway` for service environment -overrides that persist across package upgrades. +Values in `gateway.env` override the unit defaults. Use +`systemctl --user edit openshell-gateway` to add overrides that persist +across package upgrades. Gateway CLI/env values override the gateway section +of the TOML file, while driver tables are read from TOML. ### Gateway settings -| TOML option | Default | Description | -|-------------|---------|-------------| -| `bind_address` | `0.0.0.0:17670` (RPM default) | Address for the gRPC/HTTP API. | -| `compute_drivers` | `["podman"]` (RPM default) | When unset, the gateway auto-detects Kubernetes, then Podman, then Docker. The RPM default pins to Podman. | -| `default_image` | `ghcr.io/nvidia/openshell-community/sandboxes/base:latest` | Default sandbox image. | -| `supervisor_image` | `ghcr.io/nvidia/openshell/supervisor:latest` | Supervisor image mounted into Podman sandboxes. | -| `guest_tls_ca`, `guest_tls_cert`, `guest_tls_key` | auto-generated paths | Client TLS material bind-mounted into sandbox containers. | -| `[openshell.gateway.tls]` paths | auto-generated paths | Server TLS certificate, key, and client CA. | -| `disable_tls` | unset | Set to `true` to disable TLS. | - -The database URL is not accepted in TOML. When `OPENSHELL_DB_URL` is unset, -the gateway uses `sqlite:$XDG_STATE_HOME/openshell/gateway/openshell.db`. +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENSHELL_BIND_ADDRESS` | `0.0.0.0` | IP address to bind all listeners to. The default exposes the gateway on all interfaces; mTLS must remain enabled to prevent unauthenticated access. Set to `127.0.0.1` for local-only access. 
| +| `OPENSHELL_SERVER_PORT` | `8080` | Port for the gRPC/HTTP API | +| `OPENSHELL_HEALTH_PORT` | `0` (disabled) | Port for unauthenticated health endpoints (`/healthz`, `/readyz`). Set to a non-zero value to enable. | +| `OPENSHELL_METRICS_PORT` | `0` (disabled) | Port for Prometheus metrics (`/metrics`). Set to a non-zero value to enable. | +| `OPENSHELL_LOG_LEVEL` | `info` | Log level: `trace`, `debug`, `info`, `warn`, `error` | +| `OPENSHELL_DRIVERS` | `podman` | Compute driver (`podman`, `docker`, `kubernetes`, `vm`) | +| `OPENSHELL_DB_URL` | `sqlite://$XDG_STATE_HOME/openshell/gateway.db` | SQLite database URL for state persistence | + +### TLS settings + +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENSHELL_TLS_CERT` | (auto-generated path) | Server TLS certificate | +| `OPENSHELL_TLS_KEY` | (auto-generated path) | Server TLS private key | +| `OPENSHELL_TLS_CLIENT_CA` | (auto-generated path) | CA for client certificate verification; requires mTLS unless OIDC is also configured | +| `OPENSHELL_DISABLE_TLS` | (unset) | Set to `true` to disable TLS | ### Driver TOML settings -Create `~/.config/openshell/gateway.toml` when you need to customize driver -settings: +The generated `gateway.toml` contains the RPM's Podman defaults: ```toml -[openshell] -version = 1 - [openshell.gateway] -bind_address = "0.0.0.0:17670" compute_drivers = ["podman"] default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/home/user/.local/state/openshell/tls/ca.crt" +guest_tls_cert = "/home/user/.local/state/openshell/tls/client/tls.crt" +guest_tls_key = "/home/user/.local/state/openshell/tls/client/tls.key" [openshell.drivers.podman] +socket_path = "/run/user/1000/podman/podman.sock" image_pull_policy = "missing" network_name = "openshell" stop_timeout_secs = 10 @@ -292,9 +249,10 @@ For air-gapped environments: | Gateway binary | 
`/usr/bin/openshell-gateway` | | CLI binary | `/usr/bin/openshell` | | Systemd user unit | `/usr/lib/systemd/user/openshell-gateway.service` | -| Default TOML config template (read-only) | `/usr/share/openshell-gateway/gateway.toml.default` | -| Active gateway TOML configuration | `~/.config/openshell/gateway.toml` | -| Optional environment variable overrides | `~/.config/openshell/gateway.env` | +| PKI bootstrap | `openshell-gateway generate-certs` (run from `ExecStartPre`) | +| Env/config generator script | `/usr/libexec/openshell/init-gateway-env.sh` | | TLS certificates | `~/.local/state/openshell/tls/` | | CLI client certs | `~/.config/openshell/gateways/openshell/mtls/` | -| Gateway database | `~/.local/state/openshell/gateway/openshell.db` | +| Gateway database | `~/.local/state/openshell/gateway.db` | +| Gateway environment | `~/.config/openshell/gateway.env` | +| Gateway TOML configuration | `~/.config/openshell/gateway.toml` | diff --git a/deploy/rpm/QUICKSTART.md b/deploy/rpm/QUICKSTART.md index c6634ced9..1f89bba00 100644 --- a/deploy/rpm/QUICKSTART.md +++ b/deploy/rpm/QUICKSTART.md @@ -52,8 +52,7 @@ creation. Ensure the host can reach ghcr.io over HTTPS (port 443). For air-gapped environments, pre-load images with `podman pull` and set `image_pull_policy = "never"` in -`~/.config/openshell/gateway.toml`. See CONFIGURATION.md for -details. +`~/.config/openshell/gateway.toml`. See CONFIGURATION.md for details. ## Start the gateway @@ -64,11 +63,13 @@ systemctl --user enable --now openshell-gateway On first start, the gateway automatically generates: - A self-signed PKI bundle (CA, server cert, client cert) for mTLS +- A commented configuration file at `~/.config/openshell/gateway.env` +- A gateway TOML file at `~/.config/openshell/gateway.toml` -> **Note:** The RPM default configuration binds to `0.0.0.0:17670` so -> Podman sandbox containers can reach the gateway over the host network -> bridge. 
Mutual TLS (mTLS) is enabled automatically on first start, -> requiring a valid client certificate for every connection. See +> **Note:** The gateway binds to all interfaces (`0.0.0.0`) by default. +> Mutual TLS (mTLS) is enabled automatically on first start, requiring a +> valid client certificate for every connection. Do not disable TLS +> without restricting the bind address to `127.0.0.1`. See > CONFIGURATION.md for details. Verify the service is running: @@ -82,7 +83,7 @@ systemctl --user status openshell-gateway The CLI needs to know where the gateway is. Register it: ```shell -openshell gateway add --local https://127.0.0.1:17670 +openshell gateway add --local https://127.0.0.1:8080 ``` This discovers the pre-provisioned mTLS certificates at diff --git a/deploy/rpm/TROUBLESHOOTING.md b/deploy/rpm/TROUBLESHOOTING.md index c46e0ba9f..1cc39cd8d 100644 --- a/deploy/rpm/TROUBLESHOOTING.md +++ b/deploy/rpm/TROUBLESHOOTING.md @@ -5,11 +5,10 @@ and upgrade procedures for the RPM deployment. ## CLI compatibility -The RPM installs the gateway as a systemd user service. On a standard RPM -install the gateway auto-detects Podman because the package depends on it. -The published online docs and some CLI commands assume a Docker/K3s -deployment model. This section clarifies which commands work, which do not, -and what to use instead. +The RPM installs the gateway as a systemd user service with the Podman +compute driver. The published online docs and some CLI commands assume +a Docker/K3s deployment model. This section clarifies which commands +work, which do not, and what to use instead. 
### Commands that work normally @@ -68,14 +67,14 @@ Forward the gateway port over SSH and connect via localhost: ```shell # On the remote CLI machine: -ssh -L 17670:127.0.0.1:17670 user@gateway-host +ssh -L 8080:127.0.0.1:8080 user@gateway-host # In another terminal on the same machine: # Copy the client certs from the gateway host first: scp -r user@gateway-host:~/.config/openshell/gateways/openshell/mtls/ \ ~/.config/openshell/gateways/openshell/mtls/ -openshell gateway add --local https://127.0.0.1:17670 +openshell gateway add --local https://127.0.0.1:8080 openshell status ``` @@ -83,9 +82,6 @@ openshell status Generate certificates that include the server's hostname or IP in the SANs. See "Using externally-managed certificates" in CONFIGURATION.md. -Then change `bind_address` in -`~/.config/openshell/gateway.toml` to the interface the remote CLI -can reach, for example `0.0.0.0:17670`, and restart the gateway. After placing the server and client certs, register from the remote CLI: @@ -95,7 +91,7 @@ CLI: mkdir -p ~/.config/openshell/gateways/openshell/mtls/ cp ca.crt tls.crt tls.key ~/.config/openshell/gateways/openshell/mtls/ -openshell gateway add --local https://:17670 +openshell gateway add --local https://:8080 ``` ### Firewall @@ -103,7 +99,7 @@ openshell gateway add --local https://:17670 For remote access, open the gateway port in firewalld: ```shell -sudo firewall-cmd --add-port=17670/tcp --permanent +sudo firewall-cmd --add-port=8080/tcp --permanent sudo firewall-cmd --reload ``` @@ -121,7 +117,7 @@ The CLI cannot find a registered gateway. This happens when the gateway is running but has not been registered with the CLI. ```shell -openshell gateway add --local https://127.0.0.1:17670 +openshell gateway add --local https://127.0.0.1:8080 ``` ### Gateway fails to start @@ -220,9 +216,10 @@ systemctl --user restart openshell-gateway The SQLite database schema is auto-migrated on startup. Running sandboxes are stopped during the restart. 
-Package upgrades do not overwrite `~/.config/openshell/gateway.toml` when you -create one. New gateway process options can be added manually by referencing -CONFIGURATION.md or running `openshell-gateway --help`. +The `gateway.env` and `gateway.toml` files are not overwritten during +upgrades. The `init-gateway-env.sh` script is idempotent and only generates +missing files on first start. New gateway process options can be added +manually by referencing CONFIGURATION.md or running `openshell-gateway --help`. To pick up new container images after an upgrade: diff --git a/deploy/rpm/gateway.toml.default b/deploy/rpm/gateway.toml.default deleted file mode 100644 index d85379964..000000000 --- a/deploy/rpm/gateway.toml.default +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Default gateway configuration for RPM installs. -# -# This file is seeded to ~/.config/openshell/gateway.toml on first start -# of the openshell-gateway.service systemd user unit. Edit that copy to -# customize. This file is not read directly at runtime. -# -# Configuration precedence (highest to lowest): -# CLI flag > OPENSHELL_* env var > TOML file > built-in default -# -# To override settings without editing this file, set OPENSHELL_* variables -# in ~/.config/openshell/gateway.env or run: -# systemctl --user edit openshell-gateway - -[openshell] -version = 1 - -[openshell.gateway] -# Podman sandbox containers reach the gateway over the host network bridge, -# which requires binding to all interfaces. Override to 127.0.0.1:17670 if -# you don't use Podman or want loopback-only access (e.g. behind a reverse -# proxy). mTLS is enabled by default and protects all connections. -bind_address = "0.0.0.0:17670" - -# Pin to the Podman compute driver. Without this, the gateway auto-detects -# in order: Kubernetes, Podman, Docker. 
Pinning prevents unexpected driver -# selection if Docker is also installed on the host. -compute_drivers = ["podman"] diff --git a/deploy/rpm/init-gateway-env.sh b/deploy/rpm/init-gateway-env.sh new file mode 100644 index 000000000..61a6517bd --- /dev/null +++ b/deploy/rpm/init-gateway-env.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Generate the gateway environment and TOML configuration files on first start. +# +# Called from the systemd ExecStartPre directive to bootstrap the +# gateway configuration. Idempotent: existing files are kept; only +# missing files or a missing OPENSHELL_GATEWAY_CONFIG entry are added. +# +# Usage: +# init-gateway-env.sh <env-file> +# +# The generated env file contains commented defaults for gateway +# environment variables. + +set -euo pipefail + +ENV_FILE="${1:?Usage: init-gateway-env.sh <env-file>}" +CONFIG_DIR="$(dirname "${ENV_FILE}")" +CONFIG_FILE="${CONFIG_DIR}/gateway.toml" +STATE_HOME="${XDG_STATE_HOME:-${HOME}/.local/state}" +RUNTIME_HOME="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}" + +write_gateway_config() { + if [ -f "${CONFIG_FILE}" ]; then + return + fi + + mkdir -p "${CONFIG_DIR}" "${STATE_HOME}/openshell/vm-driver" + cat > "${CONFIG_FILE}" << EOF +[openshell] +version = 1 + +[openshell.gateway] +compute_drivers = ["podman"] +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "${STATE_HOME}/openshell/tls/ca.crt" +guest_tls_cert = "${STATE_HOME}/openshell/tls/client/tls.crt" +guest_tls_key = "${STATE_HOME}/openshell/tls/client/tls.key" + +[openshell.drivers.podman] +socket_path = "${RUNTIME_HOME}/podman/podman.sock" +image_pull_policy = "missing" +network_name = "openshell" +stop_timeout_secs = 10 + +[openshell.drivers.vm] +state_dir = "${STATE_HOME}/openshell/vm-driver" +driver_dir = "/usr/libexec/openshell" +grpc_endpoint = "https://127.0.0.1:8080" +EOF +
chmod 600 "${CONFIG_FILE}" +} + +ensure_env_points_at_config() { + if grep -q '^OPENSHELL_GATEWAY_CONFIG=' "${ENV_FILE}"; then + return + fi + + cat >> "${ENV_FILE}" << EOF + +# Gateway TOML configuration. Driver implementation settings live here. +OPENSHELL_GATEWAY_CONFIG=${CONFIG_FILE} +EOF +} + +# ── Idempotent: skip if env file already exists ───────────────────── +if [ -f "${ENV_FILE}" ]; then + write_gateway_config + ensure_env_points_at_config + exit 0 +fi + +# ── Create parent directory ───────────────────────────────────────── +mkdir -p "${CONFIG_DIR}" +write_gateway_config + +# ── Write environment file ────────────────────────────────────────── +cat > "${ENV_FILE}" << EOF +# OpenShell Gateway Environment Configuration +# Generated on first start. Edit freely; this file is not overwritten. +# +# Run 'openshell-gateway --help' for the full list of options. +# See /usr/share/doc/openshell-gateway/ for guides. + +OPENSHELL_GATEWAY_CONFIG=${CONFIG_FILE} + +# ---- Optional (uncomment to override defaults) ---- + +# Database URL for gateway state persistence. +# Default for the user unit: sqlite://\$XDG_STATE_HOME/openshell/gateway.db +#OPENSHELL_DB_URL=sqlite:///path/to/gateway.db + +# Compute driver: podman (default for RPM), docker, kubernetes, vm. +#OPENSHELL_DRIVERS=podman + +# Bind address. 0.0.0.0 listens on all interfaces; mTLS prevents +# unauthenticated access. +#OPENSHELL_BIND_ADDRESS=0.0.0.0 + +# API port (default: 8080). +#OPENSHELL_SERVER_PORT=8080 + +# Log level: trace, debug, info, warn, error. +#OPENSHELL_LOG_LEVEL=info + +# Driver implementation settings, including images, pull policy, Podman +# socket, TLS mounts, and VM paths, live in: +# ${CONFIG_FILE} + +# ---- TLS (mTLS enabled by default) ---- +# PKI is auto-generated by 'openshell-gateway generate-certs' from the +# unit's ExecStartPre on first start. Client certs are placed in +# ~/.config/openshell/gateways/openshell/mtls/ so the CLI discovers them +# automatically. 
+# +# To use externally-managed certs, uncomment and edit the paths below. +# To rotate certs, delete ~/.local/state/openshell/tls/ and restart. +# WARNING: Disabling TLS with the default bind address (0.0.0.0) exposes +# the gateway API to the entire network with NO authentication. Only +# disable TLS when behind a TLS-terminating reverse proxy, or restrict +# OPENSHELL_BIND_ADDRESS to 127.0.0.1. +#OPENSHELL_DISABLE_TLS=true + +# Server TLS (gateway listens with these certs). +#OPENSHELL_TLS_CERT=\$XDG_STATE_HOME/openshell/tls/server/tls.crt +#OPENSHELL_TLS_KEY=\$XDG_STATE_HOME/openshell/tls/server/tls.key +#OPENSHELL_TLS_CLIENT_CA=\$XDG_STATE_HOME/openshell/tls/ca.crt + +# Comma-separated DNS SANs configured on the gateway server certificate. +# Wildcard DNS SANs also enable sandbox service URLs under that domain. +# Example: OPENSHELL_SERVER_SAN=*.apps.example.com +#OPENSHELL_SERVER_SAN= + +EOF + +chmod 600 "${ENV_FILE}" +echo "Gateway environment generated: ${ENV_FILE}" diff --git a/deploy/snap/README.md b/deploy/snap/README.md index 419aacaa4..ece73f680 100644 --- a/deploy/snap/README.md +++ b/deploy/snap/README.md @@ -88,7 +88,7 @@ The snap exposes the CLI: - `openshell` -It also defines a system service with packaged Docker driver settings. +It also defines a system service running the gateway with the Docker driver. - `openshell.gateway` @@ -97,9 +97,10 @@ it while sandboxes are active. Restart the service manually when you are ready to move the gateway to the refreshed snap revision. `openshell-sandbox` is staged next to `openshell-gateway` as the Docker -supervisor binary. The gateway app starts through a small wrapper that sets -Snap-specific defaults and reads `$SNAP_COMMON/gateway.toml` when that file -exists. The service stores its gateway database under `$SNAP_COMMON`. +supervisor binary. 
The gateway app starts through a small wrapper that writes +`$SNAP_COMMON/gateway.toml` on first start and points the in-process Docker +driver at `$SNAP/bin/openshell-sandbox`. The service stores its gateway +database under `$SNAP_COMMON`. ## Interfaces @@ -139,18 +140,21 @@ override is required. The OpenShell snap still requires the Docker snap because it relies on the `docker:docker-daemon` slot; it does not work with Docker installed from a Debian package or Docker's upstream packages. -The service runs the gateway with Snap-specific environment defaults: +The service runs the gateway with the Docker driver enabled: ```shell -OPENSHELL_DISABLE_TLS=true \ -OPENSHELL_DB_URL="sqlite:$SNAP_COMMON/gateway.db?mode=rwc" \ -openshell.gateway +openshell.gateway \ + --drivers docker \ + --disable-tls \ + --port 17670 \ + --db-url "sqlite:$SNAP_COMMON/gateway.db?mode=rwc" \ + --config "$SNAP_COMMON/gateway.toml" ``` This stores the gateway SQLite database at -`/var/snap/openshell/common/gateway.db`. Create -`/var/snap/openshell/common/gateway.toml` when you need to override gateway or -Docker driver settings. +`/var/snap/openshell/common/gateway.db`. The generated TOML stores Docker +driver settings such as the supervisor binary path, network name, sandbox +namespace, sandbox image, pull policy, and callback endpoint. 
## Connect with the OpenShell CLI diff --git a/deploy/snap/bin/openshell-gateway-wrapper b/deploy/snap/bin/openshell-gateway-wrapper index cfba8db36..19e24b52b 100755 --- a/deploy/snap/bin/openshell-gateway-wrapper +++ b/deploy/snap/bin/openshell-gateway-wrapper @@ -4,12 +4,24 @@ set -eu -CANONICAL_CONFIG_FILE="${SNAP_COMMON}/gateway.toml" -export OPENSHELL_DB_URL="${OPENSHELL_DB_URL:-sqlite:${SNAP_COMMON}/gateway.db?mode=rwc}" -export OPENSHELL_DISABLE_TLS="${OPENSHELL_DISABLE_TLS:-true}" +CONFIG_FILE="${OPENSHELL_GATEWAY_CONFIG:-${SNAP_COMMON}/gateway.toml}" -if [ -z "${OPENSHELL_GATEWAY_CONFIG:-}" ] && [ -f "$CANONICAL_CONFIG_FILE" ]; then - exec "${SNAP}/bin/openshell-gateway" --config "$CANONICAL_CONFIG_FILE" "$@" +if [ ! -f "$CONFIG_FILE" ]; then + mkdir -p "$(dirname "$CONFIG_FILE")" + cat > "$CONFIG_FILE" << EOF +[openshell] +version = 1 + +[openshell.drivers.docker] +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +image_pull_policy = "IfNotPresent" +sandbox_namespace = "docker-snap" +grpc_endpoint = "http://host.openshell.internal:17670" +supervisor_bin = "${SNAP}/bin/openshell-sandbox" +network_name = "openshell-snap" +EOF + chmod 600 "$CONFIG_FILE" fi +export OPENSHELL_GATEWAY_CONFIG="$CONFIG_FILE" exec "${SNAP}/bin/openshell-gateway" "$@" diff --git a/deploy/snap/meta/snap.yaml.in b/deploy/snap/meta/snap.yaml.in index 920dd9141..4175da0ac 100644 --- a/deploy/snap/meta/snap.yaml.in +++ b/deploy/snap/meta/snap.yaml.in @@ -35,6 +35,12 @@ apps: daemon: simple refresh-mode: endure environment: + OPENSHELL_BIND_ADDRESS: 127.0.0.1 + OPENSHELL_SERVER_PORT: 17670 + OPENSHELL_DB_URL: "sqlite:$SNAP_COMMON/gateway.db?mode=rwc" + OPENSHELL_DISABLE_TLS: true + OPENSHELL_DRIVERS: docker + OPENSHELL_GATEWAY_CONFIG: "$SNAP_COMMON/gateway.toml" XDG_DATA_HOME: "$SNAP_COMMON" # Used for creating and locating certain sockets. 
XDG_RUNTIME_DIR: "$SNAP_COMMON" diff --git a/docs/about/installation.mdx b/docs/about/installation.mdx index 1675e6d2f..e1d449efc 100644 --- a/docs/about/installation.mdx +++ b/docs/about/installation.mdx @@ -24,7 +24,7 @@ Use `openshell status` to confirm the CLI can reach the gateway. ## Supported Compute Drivers -OpenShell supports several local compute drivers. Package-managed gateways leave the driver unset by default so the gateway can auto-detect an available driver. Set `compute_drivers` in the gateway TOML when you need to pin a specific driver. +OpenShell supports several local compute drivers. The installer chooses a default driver for your platform, and the gateway reads the driver choice from its startup configuration. Sandbox commands use the same CLI workflow after the gateway is running. | Compute Driver | How It Is Configured | System Requirements | |---|---|---| @@ -38,9 +38,7 @@ For detailed driver behavior, refer to [Sandbox Compute Drivers](/reference/sand On macOS, the install script uses Homebrew. The Homebrew package installs the `openshell` CLI, the gateway binary, and a Homebrew-managed gateway service. -The Homebrew service listens on `https://127.0.0.1:17670` and generates a local mTLS bundle on install. The gateway starts from built-in defaults and reads `~/.config/openshell/gateway.toml` when that file exists. If that file is absent, the Homebrew service also falls back to a Homebrew prefix config when present, such as `/opt/homebrew/var/openshell/gateway.toml`. - -The CLI reads the client bundle from `~/.config/openshell/gateways/openshell/mtls/`. +The Homebrew service listens on `https://127.0.0.1:17670` and generates a local mTLS bundle on install. The CLI reads the client bundle from `~/.config/openshell/gateways/openshell/mtls/`. The installer starts the service for you. 
Use Homebrew service commands when you need to inspect, restart, or stop the gateway service: @@ -55,9 +53,7 @@ On Fedora and RHEL, the install script uses RPM packages. The RPM installs the ` On Debian and Ubuntu, the install script uses a Debian package. The Debian package installs the `openshell` CLI, the `openshell-gateway` daemon, VM sandbox support, and a systemd user service. -The Linux user service listens on `https://127.0.0.1:17670`, starts from built-in defaults, and generates a local mTLS bundle before the gateway starts. Create `~/.config/openshell/gateway.toml` only when you need to override those defaults. - -The CLI reads the client bundle from `~/.config/openshell/gateways/openshell/mtls/`. +The Debian user service listens on `https://127.0.0.1:17670` and generates a local mTLS bundle before the gateway starts. The CLI reads the client bundle from `~/.config/openshell/gateways/openshell/mtls/`. The installer starts the service for you. Use systemd user commands when you need to inspect, restart, or stop the gateway service: diff --git a/docs/reference/gateway-auth.mdx b/docs/reference/gateway-auth.mdx index 03920f4b6..127130108 100644 --- a/docs/reference/gateway-auth.mdx +++ b/docs/reference/gateway-auth.mdx @@ -38,7 +38,7 @@ Set these environment variables before starting the gateway: For local access, the server certificate must be valid for the endpoint the CLI uses. Include `localhost` and `127.0.0.1` in the certificate SANs when users connect to a local gateway through loopback. -Package-managed local gateways on Homebrew, Debian, and RPM generate this bundle automatically for the `openshell` gateway name and use `https://127.0.0.1:17670` by default. +Package-managed local gateways on Homebrew and Debian generate this bundle automatically for the `openshell` gateway name and use `https://127.0.0.1:17670` by default. 
When you register a package-managed local gateway with `openshell gateway add https://127.0.0.1:17670 --local --name openshell`, the CLI refreshes its mTLS bundle from the package-managed TLS directory. On Homebrew, the gateway service also mirrors the Docker sandbox client bundle into `$HOME/.local/state/openshell/homebrew/tls` before startup so Docker Desktop can bind-mount the files into sandbox containers. @@ -162,7 +162,7 @@ When a gateway is deployed with `server.disableTls=true`, TLS is disabled entire Register a plaintext gateway with an explicit `http://` endpoint: ```shell -openshell gateway add http://127.0.0.1:17670 --local +openshell gateway add http://127.0.0.1:8080 --local ``` This stores the gateway with `auth_mode = plaintext`, skips mTLS client certificate lookup, and does not open the browser login flow. diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index 218982405..ffb932f63 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -8,7 +8,7 @@ keywords: "Generative AI, Cybersecurity, AI Agents, Sandboxing, Gateway, Configu position: 5 --- -The OpenShell gateway reads its configuration from a TOML file when `--config` or `OPENSHELL_GATEWAY_CONFIG` is set. When neither is set, the gateway reads `$XDG_CONFIG_HOME/openshell/gateway.toml` if that file exists. If no config file exists, the gateway starts from built-in defaults. Gateway process flags and gateway `OPENSHELL_*` environment variables override the file. Compute driver settings live in the driver TOML tables. See [RFC 0003](https://github.com/NVIDIA/OpenShell/blob/main/rfc/0003-gateway-configuration/README.md) for the full schema. +The OpenShell gateway reads its configuration from a TOML file when `--config` or `OPENSHELL_GATEWAY_CONFIG` is set. Gateway process flags and gateway `OPENSHELL_*` environment variables override the file. Compute driver settings live in the driver TOML tables. 
See [RFC 0003](https://github.com/NVIDIA/OpenShell/blob/main/rfc/0003-gateway-configuration/README.md) for the full schema. ## Source Precedence @@ -16,18 +16,7 @@ The OpenShell gateway reads its configuration from a TOML file when `--config` o Gateway CLI flag > gateway OPENSHELL_* env var > TOML file > built-in default ``` -`database_url` is env-only. The loader rejects it when it appears in the file. When `OPENSHELL_DB_URL` is unset, the gateway stores its SQLite database under `$XDG_STATE_HOME/openshell/gateway/openshell.db`. - -## Package-Managed Locations - -Package-managed gateways do not require a TOML file. Create one at the package's optional config location when you need to override built-in defaults. Set `OPENSHELL_GATEWAY_CONFIG` in the launch environment to use a different file. - -| Package | Optional Gateway TOML location | -|---|---| -| Homebrew | `$XDG_CONFIG_HOME/openshell/gateway.toml` when it exists, otherwise an existing Homebrew prefix config such as `/opt/homebrew/var/openshell/gateway.toml`. | -| Debian/Ubuntu | `$XDG_CONFIG_HOME/openshell/gateway.toml`, usually `~/.config/openshell/gateway.toml` for the systemd user service. | -| Fedora/RHEL RPM | `$XDG_CONFIG_HOME/openshell/gateway.toml`, usually `~/.config/openshell/gateway.toml` for the systemd user service. | -| Snap | `$SNAP_COMMON/gateway.toml`, usually `/var/snap/openshell/common/gateway.toml`. | +`database_url` is env-only. The loader rejects it when it appears in the file. ## Layout @@ -149,7 +138,7 @@ Sandboxes run as containers on a local bridge network. 
The supervisor binary is version = 1 [openshell.gateway] -bind_address = "127.0.0.1:17670" +bind_address = "127.0.0.1:8080" log_level = "info" compute_drivers = ["docker"] @@ -162,7 +151,7 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" image_pull_policy = "IfNotPresent" sandbox_namespace = "docker-dev" -grpc_endpoint = "https://host.openshell.internal:17670" +grpc_endpoint = "https://host.openshell.internal:8080" network_name = "openshell-docker" # Skip the image-pull-and-extract step by pointing at a locally built binary. supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" @@ -177,7 +166,7 @@ Sandboxes run as Podman containers on a user-mode bridge network. The supervisor version = 1 [openshell.gateway] -bind_address = "127.0.0.1:17670" +bind_address = "127.0.0.1:8080" log_level = "info" compute_drivers = ["podman"] @@ -204,7 +193,7 @@ Each sandbox runs inside its own libkrun microVM managed by the standalone `open version = 1 [openshell.gateway] -bind_address = "127.0.0.1:17670" +bind_address = "127.0.0.1:8080" log_level = "info" # VM is never auto-detected; an explicit entry here is required. compute_drivers = ["vm"] @@ -215,7 +204,7 @@ guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" [openshell.drivers.vm] -grpc_endpoint = "https://host.containers.internal:17670" +grpc_endpoint = "https://host.containers.internal:8080" state_dir = "/var/lib/openshell/vm" # Where the gateway looks for the openshell-driver-vm subprocess binary. driver_dir = "/usr/local/libexec/openshell" diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 33168986f..9055799c3 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -14,22 +14,21 @@ Every compute driver runs the OpenShell supervisor inside the sandbox workload. 
## Configure a Compute Driver -Configure the compute driver on the gateway. Current releases accept one driver per gateway. Set `compute_drivers` in the gateway TOML file: +Configure the compute driver on the gateway. Current releases accept one driver per gateway: -```toml -[openshell.gateway] -compute_drivers = ["docker"] +```shell +openshell-gateway --drivers docker ``` -Supported values are `docker`, `podman`, `kubernetes`, and `vm`. +You can also set the driver with `OPENSHELL_DRIVERS`. Supported values are `docker`, `podman`, `kubernetes`, and `vm`. -When `compute_drivers` is unset, the gateway auto-detects Kubernetes, then Podman, then Docker by CLI availability or a local Unix socket. The VM driver is never auto-detected; configure it explicitly with `compute_drivers = ["vm"]` or set `OPENSHELL_DRIVERS=vm` in the launch environment. +When `--drivers` and `OPENSHELL_DRIVERS` are unset, the gateway auto-detects Kubernetes, then Podman, then Docker by CLI availability or a local Unix socket. The VM driver is never auto-detected; configure it explicitly with `--drivers vm`. Common gateway options: -| Gateway TOML option | Description | -|---|---| -| `compute_drivers = ["<driver>"]` | Select the compute driver. Supported values are `docker`, `podman`, `kubernetes`, and `vm`. | +| Option | Environment variable | Description | +|---|---|---| +| `--drivers <driver>` | `OPENSHELL_DRIVERS` | Select the compute driver. Supported values are `docker`, `podman`, `kubernetes`, and `vm`. | Set driver-specific values such as sandbox images, callback endpoints, network names, TLS material, and VM sizing in the gateway TOML file. See the [Gateway Configuration File](./gateway-config) reference for the full `[openshell.drivers.<name>]` schema. @@ -46,7 +45,7 @@ The gateway talks to the Docker daemon to create sandbox containers.
Docker is a For maintainer-level implementation details, refer to the [Docker driver README](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-docker/README.md). -Select Docker with `compute_drivers = ["docker"]` in `[openshell.gateway]`. Configure Docker driver values such as `grpc_endpoint`, `network_name`, `supervisor_bin`, `supervisor_image`, `image_pull_policy`, and `guest_tls_*` in `[openshell.drivers.docker]`. +Select Docker with `--drivers docker` or `OPENSHELL_DRIVERS=docker`. Configure Docker driver values such as `grpc_endpoint`, `network_name`, `supervisor_bin`, `supervisor_image`, `image_pull_policy`, and `guest_tls_*` in `[openshell.drivers.docker]`. For GPU-backed Docker sandboxes, configure Docker CDI before starting the gateway so OpenShell can detect the daemon capability. @@ -58,7 +57,7 @@ The gateway talks to the Podman API socket. The Podman driver requires Podman 5. For maintainer-level implementation details, refer to the [Podman driver README](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-podman/README.md) and [Podman networking notes](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-podman/NETWORKING.md). -Select Podman with `compute_drivers = ["podman"]` in `[openshell.gateway]`. Configure Podman driver values such as `socket_path`, `network_name`, `supervisor_image`, `stop_timeout_secs`, `image_pull_policy`, `grpc_endpoint`, and `guest_tls_*` in `[openshell.drivers.podman]`. +Select Podman with `--drivers podman` or `OPENSHELL_DRIVERS=podman`. Configure Podman driver values such as `socket_path`, `network_name`, `supervisor_image`, `stop_timeout_secs`, `image_pull_policy`, `grpc_endpoint`, and `guest_tls_*` in `[openshell.drivers.podman]`. ## MicroVM Driver @@ -78,16 +77,15 @@ For maintainer-level implementation details, refer to the [VM driver README](htt The VM driver is opt-in. 
Release packages can install `openshell-driver-vm`, but the gateway does not select it unless you configure the driver explicitly. -Enable VM by setting `compute_drivers = ["vm"]` in the gateway TOML file: +For a one-off gateway process, pass `--drivers vm`: -```toml -[openshell.gateway] -compute_drivers = ["vm"] +```shell +openshell-gateway --drivers vm ``` -For a launch-time override, set `OPENSHELL_DRIVERS=vm` in the gateway environment and restart the service. +For a service, set `OPENSHELL_DRIVERS=vm` in the service environment file and restart the service. Homebrew creates `$(brew --prefix)/var/openshell/gateway.env` with a commented `OPENSHELL_DRIVERS=vm` entry. Debian and RPM user services read `~/.config/openshell/gateway.env`. -Configure VM driver values such as `grpc_endpoint`, `driver_dir`, `state_dir`, `default_image`, `bootstrap_image`, `vcpus`, `mem_mib`, `overlay_disk_mib`, `krun_log_level`, and `guest_tls_*` in `[openshell.drivers.vm]`. The VM `state_dir` stores overlay disks, console logs, runtime state, image-rootfs cache, and the private `run/compute-driver.sock` socket. +Select VM with `--drivers vm` or `OPENSHELL_DRIVERS=vm`. Configure VM driver values such as `grpc_endpoint`, `driver_dir`, `state_dir`, `default_image`, `bootstrap_image`, `vcpus`, `mem_mib`, `overlay_disk_mib`, `krun_log_level`, and `guest_tls_*` in `[openshell.drivers.vm]`. The VM `state_dir` stores overlay disks, console logs, runtime state, image-rootfs cache, and the private `run/compute-driver.sock` socket. The gateway starts `openshell-driver-vm` over a private Unix socket and passes its process ID so the driver can reject unexpected local clients. The driver's standalone TCP listener is disabled unless `--allow-unauthenticated-tcp` is set for local development. 
@@ -99,12 +97,6 @@ The VM driver resolves sandbox images from a local container engine before falli systemctl --user start podman.socket ``` -### Host Firewall - -The VM driver creates nftables rules on the host for each sandbox VM's TAP network interface. These rules provide NAT for VM connectivity and defense-in-depth isolation: unsolicited inbound connections to the VM are dropped, and the VM can only reach the gateway port on the host. Primary security enforcement (proxy-only egress and bypass detection) is handled by the sandbox supervisor inside the VM guest. - -On hosts with restrictive firewalls (e.g. firewalld), the host firewall may additionally block VM traffic that the driver's rules accept. If VM sandboxes cannot reach the network, verify that the host firewall allows forwarding and input for `vmtap-*` interfaces. See the [VM driver README](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-vm/README.md#host-side-nftables-rules) for details. - ## Kubernetes Driver Kubernetes-backed sandboxes run as pods in the configured sandbox namespace. Use Kubernetes for shared clusters, remote compute, GPU scheduling, and operator-managed environments. @@ -115,7 +107,7 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | Gateway configuration | Helm value | Description | |---|---|---| -| `compute_drivers = ["kubernetes"]` | Not applicable | Select the Kubernetes compute driver. | +| `compute_drivers = ["kubernetes"]` or `--drivers kubernetes` | Not applicable | Select the Kubernetes compute driver. | | `[openshell.drivers.kubernetes].namespace` | `server.sandboxNamespace` | Set the namespace for sandbox resources. The Helm chart defaults to the release namespace when left empty. | | `default_image` | `server.sandboxImage` | Set the default sandbox image. | | `image_pull_policy` | `server.sandboxImagePullPolicy` | Set the Kubernetes image pull policy for sandbox pods. 
| diff --git a/docs/sandboxes/manage-providers.mdx b/docs/sandboxes/manage-providers.mdx index e784ae467..6f50e1725 100644 --- a/docs/sandboxes/manage-providers.mdx +++ b/docs/sandboxes/manage-providers.mdx @@ -82,63 +82,9 @@ openshell provider get my-claude Update a provider's credentials: ```shell -openshell provider update my-claude --from-existing +openshell provider update my-claude --type claude --from-existing ``` -Set or clear a credential expiry timestamp: - -```shell -openshell provider update my-graph \ - --credential MS_GRAPH_ACCESS_TOKEN="$MS_GRAPH_ACCESS_TOKEN" \ - --credential-expires-at MS_GRAPH_ACCESS_TOKEN=1767225600000 -``` - -Use `0` as the timestamp to clear expiry for a credential key. - -## Credential Refresh - -Provider refresh stores non-injectable refresh material separately from the -provider's current credential values. The gateway can mint OAuth2 client -credentials tokens and Google service account JWT tokens, then write the current -access token back to the provider record for sandbox injection. - -Configure refresh metadata for one injectable credential key: - -```shell -openshell provider refresh configure my-graph \ - --credential-key MS_GRAPH_ACCESS_TOKEN \ - --strategy oauth2-client-credentials \ - --material tenant_id="$TENANT_ID" \ - --material client_id="$CLIENT_ID" \ - --material client_secret="$CLIENT_SECRET" \ - --secret-material-key client_secret \ - --credential-expires-at 1767225600000 -``` - -Check refresh status: - -```shell -openshell provider refresh status my-graph -``` - -Delete refresh metadata for a credential: - -```shell -openshell provider refresh delete my-graph \ - --credential-key MS_GRAPH_ACCESS_TOKEN -``` - -Force a gateway-managed refresh for one credential: - -```shell -openshell provider refresh rotate my-graph --credential-key MS_GRAPH_ACCESS_TOKEN -``` - -External refresh systems should continue to push new current credentials through -`openshell provider update`. 
The `--credential-expires-at` option works for -static credentials, externally refreshed credentials, and gateway-managed -refresh strategies. - Delete a provider: ```shell diff --git a/docs/security/best-practices.mdx b/docs/security/best-practices.mdx index 12ad00520..0c86069e1 100644 --- a/docs/security/best-practices.mdx +++ b/docs/security/best-practices.mdx @@ -120,7 +120,7 @@ This enables credential injection and L7 inspection without explicit configurati | Aspect | Detail | |---|---| -| Default | Auto-detect and terminate. OpenShell generates the sandbox CA at startup and injects it into the process trust stores (`NODE_EXTRA_CA_CERTS`, `DENO_CERT`, `SSL_CERT_FILE`, `REQUESTS_CA_BUNDLE`, `CURL_CA_BUNDLE`, `GIT_SSL_CAINFO`). | +| Default | Auto-detect and terminate. OpenShell generates the sandbox CA at startup and injects it into the process trust stores (`NODE_EXTRA_CA_CERTS`, `SSL_CERT_FILE`, `REQUESTS_CA_BUNDLE`, `CURL_CA_BUNDLE`, `GIT_SSL_CAINFO`). | | What you can change | Set `tls: skip` on an endpoint to disable TLS detection and termination for that endpoint. Use this for client-certificate mTLS to upstream or non-standard binary protocols. | | Risk if relaxed | `tls: skip` disables credential injection and L7 inspection for that endpoint. The proxy relays encrypted traffic without seeing the contents. | | Recommendation | Use auto-detect (the default) for most endpoints. Use `tls: skip` only when the upstream requires the client's own TLS certificate (mTLS) or uses a non-HTTP protocol. | @@ -225,7 +225,7 @@ OpenShell applies seccomp in two phases. A narrow supervisor-startup prelude run The sandbox supervisor applies enforcement in a specific order during process startup. This ordering is intentional: named network-namespace setup still relies on privileged helpers, and privilege dropping still needs `/etc/group` and `/etc/passwd`, which Landlock subsequently restricts. -1. 
Privileged supervisor bootstrap helpers, including network-namespace setup and optional `nft` probes. +1. Privileged supervisor bootstrap helpers, including network-namespace setup and optional `iptables` probes. 2. Supervisor startup prelude seccomp (`PR_SET_NO_NEW_PRIVS` plus the early syscall denylist) synchronized across runtime threads. 3. Network namespace entry (`setns`) in child `pre_exec`. 4. Privilege drop (`initgroups` + `setgid` + `setuid`). diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 7d7f1411c..03ad930b4 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -46,11 +46,6 @@ name = "gateway_resume" path = "tests/gateway_resume.rs" required-features = ["e2e-docker"] -[[test]] -name = "podman_gateway_resume" -path = "tests/podman_gateway_resume.rs" -required-features = ["e2e-podman"] - [[test]] name = "vm_gateway_resume" path = "tests/vm_gateway_resume.rs" @@ -59,7 +54,7 @@ required-features = ["e2e-vm"] [[test]] name = "websocket_conformance" path = "tests/websocket_conformance.rs" -required-features = ["e2e-host-gateway"] +required-features = ["e2e-docker"] [[test]] name = "user_namespaces" diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 5f325d0d2..c82891338 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -4,13 +4,13 @@ # Run the Rust e2e suite against a standalone gateway running the bundled Podman # compute driver. Set OPENSHELL_GATEWAY_ENDPOINT=http://host:port to reuse an -# existing gateway instead of starting an ephemeral one. +# existing plaintext gateway instead of starting an ephemeral one. set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}" -E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e-podman}" +E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}" cargo build -p openshell-cli --features openshell-core/dev-settings diff --git a/e2e/rust/src/harness/cli.rs b/e2e/rust/src/harness/cli.rs deleted file mode 100644 index 53392d752..000000000 --- a/e2e/rust/src/harness/cli.rs +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Shared CLI helpers for e2e tests that need to invoke `openshell` commands -//! and poll for readiness. - -use std::process::Stdio; -use std::time::{Duration, Instant}; - -use tokio::time::sleep; - -use super::binary::openshell_cmd; -use super::output::strip_ansi; - -pub async fn run_cli(args: &[&str]) -> (String, i32) { - let mut cmd = openshell_cmd(); - cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); - - let output = cmd.output().await.expect("spawn openshell"); - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); - let combined = format!("{stdout}{stderr}"); - let code = output.status.code().unwrap_or(-1); - (combined, code) -} - -pub async fn wait_for_healthy(timeout: Duration) -> Result<(), String> { - let start = Instant::now(); - let mut last_output: String; - - loop { - let (output, code) = run_cli(&["status"]).await; - let clean = strip_ansi(&output); - let lower = clean.to_lowercase(); - if code == 0 - && (lower.contains("healthy") - || lower.contains("running") - || lower.contains("connected")) - { - return Ok(()); - } - last_output = clean; - - if start.elapsed() > timeout { - return Err(format!( - "gateway did not become healthy within {}s. 
Last output:\n{last_output}", - timeout.as_secs() - )); - } - sleep(Duration::from_secs(2)).await; - } -} - -pub async fn sandbox_names() -> Result<Vec<String>, String> { - let (output, code) = run_cli(&["sandbox", "list", "--names"]).await; - let clean = strip_ansi(&output); - if code != 0 { - return Err(format!("sandbox list failed (exit {code}):\n{clean}")); - } - - Ok(clean - .lines() - .map(str::trim) - .filter(|line| !line.is_empty()) - .map(ToOwned::to_owned) - .collect()) -} - -pub async fn wait_for_sandbox_exec_contains( - sandbox_name: &str, - command: &[&str], - expected: &str, - timeout: Duration, -) -> Result<(), String> { - let start = Instant::now(); - let mut last_output: String; - - loop { - let mut cmd = openshell_cmd(); - cmd.args(["sandbox", "exec", "--name", sandbox_name, "--no-tty", "--"]) - .args(command) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); - - match cmd.output().await { - Ok(output) => { - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); - last_output = strip_ansi(&format!("{stdout}{stderr}")); - if output.status.success() && last_output.contains(expected) { - return Ok(()); - } - } - Err(err) => { - last_output = format!("failed to spawn openshell sandbox exec: {err}"); - } - } - - if start.elapsed() > timeout { - return Err(format!( - "sandbox '{sandbox_name}' exec did not produce '{expected}' within {}s. Last output:\n{last_output}", - timeout.as_secs() - )); - } - sleep(Duration::from_secs(2)).await; - } -} diff --git a/e2e/rust/src/harness/mod.rs b/e2e/rust/src/harness/mod.rs index f2dfd5ec9..5feb21c70 100644 --- a/e2e/rust/src/harness/mod.rs +++ b/e2e/rust/src/harness/mod.rs @@ -4,7 +4,6 @@ //! Shared test harness modules for CLI e2e tests.
pub mod binary; -pub mod cli; pub mod container; pub mod gateway; pub mod output; diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index 8f850e485..e3a2e6664 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -10,12 +10,11 @@ //! gateway process, so they skip this restart-only coverage. use std::process::{Command, Stdio}; -use std::time::Duration; +use std::time::{Duration, Instant}; -use openshell_e2e::harness::cli::{ - sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains, -}; +use openshell_e2e::harness::binary::openshell_cmd; use openshell_e2e::harness::gateway::ManagedGateway; +use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; use tokio::time::sleep; @@ -25,6 +24,100 @@ const RESUME_FILE: &str = "/sandbox/gateway-resume-state"; const SANDBOX_NAMESPACE_LABEL: &str = "openshell.ai/sandbox-namespace"; const SANDBOX_NAME_LABEL: &str = "openshell.ai/sandbox-name"; +async fn run_cli(args: &[&str]) -> (String, i32) { + let mut cmd = openshell_cmd(); + cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code) +} + +async fn wait_for_healthy(timeout: Duration) -> Result<(), String> { + let start = Instant::now(); + let mut last_output: String; + + loop { + let (output, code) = run_cli(&["status"]).await; + let clean = strip_ansi(&output); + let lower = clean.to_lowercase(); + if code == 0 + && (lower.contains("healthy") + || lower.contains("running") + || lower.contains("connected")) + { + return Ok(()); + } + last_output = clean; + + if start.elapsed() > timeout { + return Err(format!( + "gateway did not become healthy within {}s. 
Last output:\n{last_output}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(2)).await; + } +} + +async fn sandbox_names() -> Result, String> { + let (output, code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean = strip_ansi(&output); + if code != 0 { + return Err(format!("sandbox list failed (exit {code}):\n{clean}")); + } + + Ok(clean + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .map(ToOwned::to_owned) + .collect()) +} + +async fn wait_for_sandbox_exec_contains( + sandbox_name: &str, + command: &[&str], + expected: &str, + timeout: Duration, +) -> Result<(), String> { + let start = Instant::now(); + let mut last_output: String; + + loop { + let mut cmd = openshell_cmd(); + cmd.args(["sandbox", "exec", "--name", sandbox_name, "--no-tty", "--"]) + .args(command) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + match cmd.output().await { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + last_output = strip_ansi(&format!("{stdout}{stderr}")); + if output.status.success() && last_output.contains(expected) { + return Ok(()); + } + } + Err(err) => { + last_output = format!("failed to spawn openshell sandbox exec: {err}"); + } + } + + if start.elapsed() > timeout { + return Err(format!( + "sandbox '{sandbox_name}' exec did not produce '{expected}' within {}s. 
Last output:\n{last_output}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(2)).await; + } +} + fn sandbox_container_id(namespace: &str, sandbox_name: &str) -> Result { let namespace_filter = format!("label={SANDBOX_NAMESPACE_LABEL}={namespace}"); let sandbox_name_filter = format!("label={SANDBOX_NAME_LABEL}={sandbox_name}"); @@ -96,7 +189,7 @@ async fn wait_for_container_running( expected: bool, timeout: Duration, ) -> Result<(), String> { - let start = std::time::Instant::now(); + let start = Instant::now(); let mut last_state: String; loop { @@ -138,9 +231,12 @@ async fn docker_gateway_restart_resumes_running_sandbox() { let script = format!( "echo before-restart > {RESUME_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" ); - let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) - .await - .expect("create long-running sandbox"); + let mut sandbox = SandboxGuard::create_keep( + &["sh", "-lc", &script], + READY_MARKER, + ) + .await + .expect("create long-running sandbox"); let before_restart = sandbox .exec(&["cat", RESUME_FILE]) diff --git a/e2e/rust/tests/podman_gateway_resume.rs b/e2e/rust/tests/podman_gateway_resume.rs deleted file mode 100644 index fea2fab3e..000000000 --- a/e2e/rust/tests/podman_gateway_resume.rs +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#![cfg(feature = "e2e-podman")] - -//! Podman-specific E2E coverage for resuming sandboxes after a standalone -//! gateway restart. -//! -//! Unlike the Docker driver, Podman does not stop sandbox containers when the -//! gateway process exits — the containers keep running and the restarted -//! gateway re-adopts them. This test follows the `vm_gateway_resume.rs` -//! pattern: verify sandbox survival at the application level without asserting -//! intermediate container-state transitions. 
- -use std::time::Duration; - -use openshell_e2e::harness::cli::{sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains}; -use openshell_e2e::harness::gateway::ManagedGateway; -use openshell_e2e::harness::sandbox::SandboxGuard; - -const READY_MARKER: &str = "podman-gateway-resume-ready"; -const RESUME_FILE: &str = "/sandbox/podman-gateway-resume-state"; - -#[tokio::test] -async fn podman_gateway_restart_resumes_running_sandbox() { - if std::env::var("OPENSHELL_E2E_DRIVER").as_deref() != Ok("podman") { - eprintln!("Skipping Podman gateway resume test: e2e driver is not podman"); - return; - } - let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata") - else { - eprintln!( - "Skipping Podman gateway resume test: e2e gateway is not managed by this test run" - ); - return; - }; - - wait_for_healthy(Duration::from_secs(30)) - .await - .expect("gateway should start healthy"); - - let script = format!( - "echo before-restart > {RESUME_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" - ); - let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) - .await - .expect("create long-running Podman sandbox"); - - let before_restart = sandbox - .exec(&["cat", RESUME_FILE]) - .await - .expect("read Podman sandbox state before restart"); - assert!( - before_restart.contains("before-restart"), - "sandbox state was not written before restart:\n{before_restart}" - ); - - gateway.stop().expect("stop e2e gateway"); - gateway.start().expect("restart e2e gateway"); - wait_for_healthy(Duration::from_secs(120)) - .await - .expect("gateway should become healthy after restart"); - - let names = sandbox_names().await.expect("list sandboxes after restart"); - assert!( - names.contains(&sandbox.name), - "sandbox '{}' should still be listed after gateway restart. 
Names: {names:?}", - sandbox.name - ); - - wait_for_sandbox_exec_contains( - &sandbox.name, - &["cat", RESUME_FILE], - "before-restart", - Duration::from_secs(240), - ) - .await - .expect("Podman sandbox should become ready again with its state preserved"); - - sandbox.cleanup().await; -} diff --git a/e2e/rust/tests/provider_auto_create.rs b/e2e/rust/tests/provider_auto_create.rs index 45729776e..c678c46c4 100644 --- a/e2e/rust/tests/provider_auto_create.rs +++ b/e2e/rust/tests/provider_auto_create.rs @@ -5,7 +5,7 @@ //! E2E test: `--provider ` auto-creates a provider from local credentials. //! -//! When `--provider claude-code` is passed and no provider named "claude-code" exists, +//! When `--provider claude` is passed and no provider named "claude" exists, //! the CLI should discover `ANTHROPIC_API_KEY` from the local environment, //! auto-create a provider, and inject a supervisor-managed placeholder into the //! sandbox child process environment. @@ -68,21 +68,21 @@ async fn delete_sandbox(name: &str) { let _ = cmd.status().await; } -/// `--provider claude-code --auto-providers` with `ANTHROPIC_API_KEY` set should -/// auto-create a "claude-code" provider and inject a placeholder into the sandbox. +/// `--provider claude --auto-providers` with `ANTHROPIC_API_KEY` set should +/// auto-create a "claude" provider and inject a placeholder into the sandbox. #[tokio::test] async fn auto_created_provider_credential_available_in_sandbox() { let _provider_lock = CLAUDE_PROVIDER_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); - if provider_exists("claude-code").await { - eprintln!("Skipping test: existing provider 'claude-code' would make shared state unsafe"); + if provider_exists("claude").await { + eprintln!("Skipping test: existing provider 'claude' would make shared state unsafe"); return; } // Clean up any leftover from a previous run. 
- delete_provider("claude-code").await; + delete_provider("claude").await; // Create a sandbox that prints the ANTHROPIC_API_KEY env var. // --auto-providers skips the interactive prompt. @@ -90,7 +90,7 @@ async fn auto_created_provider_credential_available_in_sandbox() { cmd.arg("sandbox") .arg("create") .arg("--provider") - .arg("claude-code") + .arg("claude") .arg("--auto-providers") .arg("--") .arg("printenv") @@ -116,7 +116,7 @@ async fn auto_created_provider_credential_available_in_sandbox() { if let Some(ref name) = sandbox_name { delete_sandbox(name).await; } - delete_provider("claude-code").await; + delete_provider("claude").await; // Now assert. assert!( @@ -126,7 +126,7 @@ async fn auto_created_provider_credential_available_in_sandbox() { ); assert!( - clean.contains("Created provider claude-code"), + clean.contains("Created provider claude"), "output should confirm provider auto-creation:\n{clean}" ); diff --git a/e2e/rust/tests/vm_gateway_resume.rs b/e2e/rust/tests/vm_gateway_resume.rs index 3bff91df7..488be681a 100644 --- a/e2e/rust/tests/vm_gateway_resume.rs +++ b/e2e/rust/tests/vm_gateway_resume.rs @@ -9,15 +9,112 @@ //! This test is gated behind the `e2e-vm` feature because it requires the VM //! driver runtime prepared by `e2e/rust/e2e-vm.sh`. 
-use std::time::Duration; +use std::process::Stdio; +use std::time::{Duration, Instant}; -use openshell_e2e::harness::cli::{sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains}; +use openshell_e2e::harness::binary::openshell_cmd; use openshell_e2e::harness::gateway::ManagedGateway; +use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; +use tokio::time::sleep; const READY_MARKER: &str = "vm-gateway-resume-ready"; const RESUME_FILE: &str = "/sandbox/vm-gateway-resume-state"; +async fn run_cli(args: &[&str]) -> (String, i32) { + let mut cmd = openshell_cmd(); + cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code) +} + +async fn wait_for_healthy(timeout: Duration) -> Result<(), String> { + let start = Instant::now(); + let mut last_output: String; + + loop { + let (output, code) = run_cli(&["status"]).await; + let clean = strip_ansi(&output); + let lower = clean.to_lowercase(); + if code == 0 + && (lower.contains("healthy") + || lower.contains("running") + || lower.contains("connected")) + { + return Ok(()); + } + last_output = clean; + + if start.elapsed() > timeout { + return Err(format!( + "gateway did not become healthy within {}s. 
Last output:\n{last_output}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(2)).await; + } +} + +async fn sandbox_names() -> Result, String> { + let (output, code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean = strip_ansi(&output); + if code != 0 { + return Err(format!("sandbox list failed (exit {code}):\n{clean}")); + } + + Ok(clean + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .map(ToOwned::to_owned) + .collect()) +} + +async fn wait_for_sandbox_exec_contains( + sandbox_name: &str, + command: &[&str], + expected: &str, + timeout: Duration, +) -> Result<(), String> { + let start = Instant::now(); + let mut last_output: String; + + loop { + let mut cmd = openshell_cmd(); + cmd.args(["sandbox", "exec", "--name", sandbox_name, "--no-tty", "--"]) + .args(command) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + match cmd.output().await { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + last_output = strip_ansi(&format!("{stdout}{stderr}")); + if output.status.success() && last_output.contains(expected) { + return Ok(()); + } + } + Err(err) => { + last_output = format!("failed to spawn openshell sandbox exec: {err}"); + } + } + + if start.elapsed() > timeout { + return Err(format!( + "sandbox '{sandbox_name}' exec did not produce '{expected}' within {}s. 
Last output:\n{last_output}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(2)).await; + } +} + #[tokio::test] async fn vm_gateway_restart_resumes_running_sandbox() { if std::env::var("OPENSHELL_E2E_DRIVER").as_deref() != Ok("vm") { @@ -37,9 +134,12 @@ async fn vm_gateway_restart_resumes_running_sandbox() { let script = format!( "echo before-restart > {RESUME_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" ); - let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) - .await - .expect("create long-running VM sandbox"); + let mut sandbox = SandboxGuard::create_keep( + &["sh", "-lc", &script], + READY_MARKER, + ) + .await + .expect("create long-running VM sandbox"); let before_restart = sandbox .exec(&["cat", RESUME_FILE]) diff --git a/e2e/rust/tests/websocket_conformance.rs b/e2e/rust/tests/websocket_conformance.rs index 65ba19aa1..d87c9b9dd 100644 --- a/e2e/rust/tests/websocket_conformance.rs +++ b/e2e/rust/tests/websocket_conformance.rs @@ -3,8 +3,8 @@ #![cfg(feature = "e2e")] -//! E2E regression: WebSocket credential placeholders are resolved on the -//! sandbox path after an RFC 6455 upgrade. +//! E2E regression: WebSocket credential placeholders are resolved on the real +//! Docker-backed sandbox path after an RFC 6455 upgrade. //! //! The sandbox process sends its provider-managed placeholder in a masked text //! frame. 
The local upstream only reports whether it saw the real secret and @@ -425,7 +425,7 @@ with connect_with_retry(HOST, PORT) as sock: } #[tokio::test] -async fn websocket_text_placeholder_is_rewritten_in_sandbox() { +async fn websocket_text_placeholder_is_rewritten_in_docker_sandbox() { let _provider_lock = PROVIDER_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index 2f8a2c141..d8acbd191 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -34,22 +34,6 @@ e2e_pick_port() { python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()' } -e2e_generate_pki() { - local gateway_bin=$1 - local pki_dir=$2 - shift 2 - # Remaining args are extra --server-san values (e.g. host.containers.internal). - # host.docker.internal and localhost are already in the default SAN list. - - local san_args=() - san_args+=(--server-san host.openshell.internal) - for san in "$@"; do - san_args+=(--server-san "${san}") - done - - "${gateway_bin}" generate-certs --output-dir "${pki_dir}" "${san_args[@]}" -} - e2e_register_plaintext_gateway() { local config_home=$1 local name=$2 @@ -79,9 +63,9 @@ e2e_register_mtls_gateway() { local gateway_config_dir="${config_home}/openshell/gateways/${name}" mkdir -p "${gateway_config_dir}/mtls" - cp "${pki_dir}/ca.crt" "${gateway_config_dir}/mtls/ca.crt" - cp "${pki_dir}/client/tls.crt" "${gateway_config_dir}/mtls/tls.crt" - cp "${pki_dir}/client/tls.key" "${gateway_config_dir}/mtls/tls.key" + cp "${pki_dir}/ca.crt" "${gateway_config_dir}/mtls/ca.crt" + cp "${pki_dir}/client.crt" "${gateway_config_dir}/mtls/tls.crt" + cp "${pki_dir}/client.key" "${gateway_config_dir}/mtls/tls.key" cat >"${gateway_config_dir}/metadata.json" </dev/null 2>&1; then echo "ERROR: docker daemon is not reachable (docker info failed)" >&2 exit 2 fi +if ! 
command -v openssl >/dev/null 2>&1; then + echo "ERROR: openssl is required to generate ephemeral PKI" >&2 + exit 2 +fi if [ "${GPU_MODE}" = "1" ]; then DOCKER_CDI_SPEC_DIRS="$(docker info --format '{{json .CDISpecDirs}}' 2>/dev/null || true)" if [ -z "${DOCKER_CDI_SPEC_DIRS}" ] \ @@ -386,7 +390,41 @@ if ! docker image inspect "${SANDBOX_IMAGE}" >/dev/null 2>&1; then fi PKI_DIR="${WORKDIR}/pki" -e2e_generate_pki "${GATEWAY_BIN}" "${PKI_DIR}" +mkdir -p "${PKI_DIR}" +cd "${PKI_DIR}" + +cat > openssl.cnf <<'EOF' +[req] +distinguished_name = dn +prompt = no +[dn] +CN = openshell-server +[san_server] +subjectAltName = @alt_server +[alt_server] +DNS.1 = localhost +DNS.2 = host.openshell.internal +DNS.3 = host.docker.internal +IP.1 = 127.0.0.1 +IP.2 = ::1 +[san_client] +subjectAltName = DNS:openshell-client +EOF + +openssl req -x509 -newkey rsa:2048 -nodes -days 30 \ + -keyout ca.key -out ca.crt -subj "/CN=openshell-e2e-ca" >/dev/null 2>&1 + +openssl req -newkey rsa:2048 -nodes -keyout server.key -out server.csr \ + -config openssl.cnf >/dev/null 2>&1 +openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out server.crt -days 30 -extfile openssl.cnf -extensions san_server >/dev/null 2>&1 + +openssl req -newkey rsa:2048 -nodes -keyout client.key -out client.csr \ + -subj "/CN=openshell-client" >/dev/null 2>&1 +openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out client.crt -days 30 -extfile openssl.cnf -extensions san_client >/dev/null 2>&1 + +cd "${ROOT}" HOST_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" @@ -435,8 +473,8 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" printf 'default_image = %s\n' "$(toml_string "${SANDBOX_IMAGE}")" printf 'image_pull_policy = "IfNotPresent"\n' printf 'guest_tls_ca = %s\n' "$(toml_string "${PKI_DIR}/ca.crt")" - printf 'guest_tls_cert = %s\n' "$(toml_string "${PKI_DIR}/client/tls.crt")" - printf 'guest_tls_key = %s\n' "$(toml_string "${PKI_DIR}/client/tls.key")" + printf 
'guest_tls_cert = %s\n' "$(toml_string "${PKI_DIR}/client.crt")" + printf 'guest_tls_key = %s\n' "$(toml_string "${PKI_DIR}/client.key")" # DOCKER_SUPERVISOR_ARGS holds either ("--docker-supervisor-bin" "") # or ("--docker-supervisor-image" ""); both map to TOML keys on # the docker driver config. @@ -460,8 +498,8 @@ GATEWAY_ARGS=( --bind-address 0.0.0.0 --port "${HOST_PORT}" --drivers docker - --tls-cert "${PKI_DIR}/server/tls.crt" - --tls-key "${PKI_DIR}/server/tls.key" + --tls-cert "${PKI_DIR}/server.crt" + --tls-key "${PKI_DIR}/server.key" --tls-client-ca "${PKI_DIR}/ca.crt" --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" ) diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index 34a081516..1041ec2ac 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -129,14 +129,7 @@ if [ -n "${OPENSHELL_E2E_KUBE_CONTEXT:-}" ]; then exit 2 fi else - if ! command -v k3d >/dev/null 2>&1; then - if [ "$(uname -s)" = "Linux" ]; then - echo "ERROR: k3d is not installed by mise on Linux in this repo." >&2 - echo "Set OPENSHELL_E2E_KUBE_CONTEXT to a kind/existing cluster, or install k3d explicitly." >&2 - exit 2 - fi - require_cmd k3d - fi + require_cmd k3d CLUSTER_NAME="oshe2e-$$-$(date +%s | tail -c 8)" echo "Creating ephemeral k3d cluster ${CLUSTER_NAME}..." HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index 875ebee4b..727737d25 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -11,8 +11,8 @@ # - OPENSHELL_GATEWAY_ENDPOINT=http://host:port: # Use the existing plaintext gateway endpoint and run the command. # -# HTTPS endpoint-only mode is intentionally unsupported here. Use a named -# gateway config when mTLS materials are needed. +# Podman e2e currently uses plaintext gateway traffic. The Podman driver does +# not yet inject gateway mTLS client materials into sandbox containers. 
set -euo pipefail @@ -277,12 +277,12 @@ if [ -n "${OPENSHELL_GATEWAY_ENDPOINT:-}" ]; then case "${OPENSHELL_GATEWAY_ENDPOINT}" in http://*) ;; https://*) - echo "ERROR: OPENSHELL_GATEWAY_ENDPOINT endpoint mode is HTTP-only for e2e." >&2 - echo " Register a named gateway with mTLS config instead of using a raw HTTPS endpoint." >&2 + echo "ERROR: OPENSHELL_GATEWAY_ENDPOINT endpoint mode is HTTP-only for Podman e2e." >&2 + echo " Podman e2e does not yet support sandbox mTLS client material injection." >&2 exit 2 ;; *) - echo "ERROR: OPENSHELL_GATEWAY_ENDPOINT must start with http:// for e2e endpoint mode." >&2 + echo "ERROR: OPENSHELL_GATEWAY_ENDPOINT must start with http:// for Podman e2e endpoint mode." >&2 exit 2 ;; esac @@ -328,9 +328,6 @@ if ! podman_cmd image exists "${SANDBOX_IMAGE}" 2>/dev/null; then podman_cmd pull "${SANDBOX_IMAGE}" fi -PKI_DIR="${WORKDIR}/pki" -e2e_generate_pki "${GATEWAY_BIN}" "${PKI_DIR}" "host.containers.internal" - HOST_PORT=$(e2e_pick_port) HEALTH_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" @@ -359,27 +356,16 @@ toml_string() { } GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" - -# Start from the RPM default template so this e2e test exercises the same -# TOML config path that RPM users get on first start. The template sets -# bind_address = "0.0.0.0:17670" and compute_drivers = ["podman"]; those -# values must be correct for Podman e2e to pass, which means a regression -# to the template (wrong bind address, wrong driver) will surface here. -# -# We append the driver-specific table and override the port via CLI flag -# (CLI > TOML in the merge precedence) so the test can use an ephemeral port. -cp "${ROOT}/deploy/rpm/gateway.toml.default" "${GATEWAY_CONFIG}" { - printf '\n[openshell.drivers.podman]\n' + printf '[openshell]\nversion = 1\n\n' + printf '[openshell.gateway]\nlog_level = "info"\n\n' + printf '[openshell.drivers.podman]\n' # The Podman driver scopes isolation by network rather than namespace. 
printf 'network_name = %s\n' "$(toml_string "${PODMAN_NETWORK_NAME}")" printf 'gateway_port = %s\n' "${HOST_PORT}" printf 'default_image = %s\n' "$(toml_string "${SANDBOX_IMAGE}")" printf 'image_pull_policy = "missing"\n' printf 'supervisor_image = %s\n' "$(toml_string "${SUPERVISOR_IMAGE}")" - printf 'guest_tls_ca = %s\n' "$(toml_string "${PKI_DIR}/ca.crt")" - printf 'guest_tls_cert = %s\n' "$(toml_string "${PKI_DIR}/client/tls.crt")" - printf 'guest_tls_key = %s\n' "$(toml_string "${PKI_DIR}/client/tls.key")" # The in-process Podman driver reads `socket_path` from TOML only — the # OPENSHELL_PODMAN_SOCKET env var is honoured by the standalone driver # binary, not the in-process driver used here. Pin the socket to the one @@ -388,17 +374,15 @@ cp "${ROOT}/deploy/rpm/gateway.toml.default" "${GATEWAY_CONFIG}" if [ -n "${OPENSHELL_PODMAN_SOCKET:-}" ]; then printf 'socket_path = %s\n' "$(toml_string "${OPENSHELL_PODMAN_SOCKET}")" fi -} >> "${GATEWAY_CONFIG}" +} > "${GATEWAY_CONFIG}" GATEWAY_ARGS=( --config "${GATEWAY_CONFIG}" - # bind_address and compute_drivers come from the RPM template; no CLI flags - # needed. Port is overridden via CLI (CLI > TOML) for ephemeral port selection. + --bind-address 0.0.0.0 --port "${HOST_PORT}" --health-port "${HEALTH_PORT}" - --tls-cert "${PKI_DIR}/server/tls.crt" - --tls-key "${PKI_DIR}/server/tls.key" - --tls-client-ca "${PKI_DIR}/ca.crt" + --drivers podman + --disable-tls --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" --log-level info ) @@ -417,13 +401,12 @@ GATEWAY_PID=$! 
printf '%s\n' "${GATEWAY_PID}" >"${GATEWAY_PID_FILE}" GATEWAY_NAME="openshell-e2e-podman-${HOST_PORT}" -CLI_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" -e2e_register_mtls_gateway \ +CLI_GATEWAY_ENDPOINT="http://127.0.0.1:${HOST_PORT}" +e2e_register_plaintext_gateway \ "${XDG_CONFIG_HOME}" \ "${GATEWAY_NAME}" \ "${CLI_GATEWAY_ENDPOINT}" \ - "${HOST_PORT}" \ - "${PKI_DIR}" + "${HOST_PORT}" export OPENSHELL_GATEWAY="${GATEWAY_NAME}" export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-300}" diff --git a/examples/bring-your-own-container/Dockerfile b/examples/bring-your-own-container/Dockerfile index 61f283970..17f8083df 100644 --- a/examples/bring-your-own-container/Dockerfile +++ b/examples/bring-your-own-container/Dockerfile @@ -9,9 +9,9 @@ FROM python:3.13-slim # System tools useful for sandbox networking and debugging. # iproute2: required for network namespace management (ip netns, veth pairs) -# nftables: optional, enables bypass detection (log + reject for direct connections) +# iptables: optional, enables bypass detection (LOG + REJECT for direct connections) RUN apt-get update && apt-get install -y --no-install-recommends \ - curl iproute2 nftables \ + curl iproute2 iptables \ && rm -rf /var/lib/apt/lists/* # Create the sandbox user for non-root execution. 
diff --git a/mise.lock b/mise.lock index 40050e701..f5d959069 100644 --- a/mise.lock +++ b/mise.lock @@ -27,24 +27,43 @@ url = "https://github.com/EmbarkStudios/cargo-about/releases/download/0.8.4/carg url_api = "https://api.github.com/repos/EmbarkStudios/cargo-about/releases/assets/324268695" [[tools."github:anchore/syft"]] -version = "1.44.0" +version = "1.43.0" backend = "github:anchore/syft" [tools."github:anchore/syft"."platforms.linux-arm64"] -checksum = "sha256:6f6cdcdc695721d91ce756e3b5bc3e3416599c464101f5e32e9c3f33054ee6d9" -url = "https://github.com/anchore/syft/releases/download/v1.44.0/syft_1.44.0_linux_arm64.tar.gz" -url_api = "https://api.github.com/repos/anchore/syft/releases/assets/410001182" +checksum = "sha256:afe92510c467f952a009b994f2d998ff8f9dd266dc26eca55d14a0dd46fec7f2" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_linux_arm64.tar.gz" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658323" + +[tools."github:anchore/syft"."platforms.linux-arm64-musl"] +checksum = "sha256:afe92510c467f952a009b994f2d998ff8f9dd266dc26eca55d14a0dd46fec7f2" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_linux_arm64.tar.gz" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658323" [tools."github:anchore/syft"."platforms.linux-x64"] -checksum = "sha256:0e91737aee2b5baf1d255b959630194a302335d848ff97bb07921eb6205b5f5a" -url = "https://github.com/anchore/syft/releases/download/v1.44.0/syft_1.44.0_linux_amd64.tar.gz" -url_api = "https://api.github.com/repos/anchore/syft/releases/assets/410001183" +checksum = "sha256:7b98251d2d08926bb5d4639b56b1f0996a58ef6667c5830e3fe3cd3ad5f4214a" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_linux_amd64.tar.gz" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658325" + +[tools."github:anchore/syft"."platforms.linux-x64-musl"] +checksum = 
"sha256:7b98251d2d08926bb5d4639b56b1f0996a58ef6667c5830e3fe3cd3ad5f4214a" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_linux_amd64.tar.gz" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658325" [tools."github:anchore/syft"."platforms.macos-arm64"] -checksum = "sha256:24e4d34078ae81da7c82539616f0ccac3e226cf4f74a38ce6fb3463619e50a55" -url = "https://github.com/anchore/syft/releases/download/v1.44.0/syft_1.44.0_darwin_arm64.tar.gz" -url_api = "https://api.github.com/repos/anchore/syft/releases/assets/410001187" -provenance = "github-attestations" +checksum = "sha256:3640e2181c8be7a56377f3c96e520d5380c924dbafd115ee3c8d45fcbc89cac2" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_darwin_arm64.tar.gz" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658324" + +[tools."github:anchore/syft"."platforms.macos-x64"] +checksum = "sha256:08fd18f55037f999f50b2c2256a9285f0146978a0b16cdc58662ecdc85d0e3c0" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_darwin_amd64.tar.gz" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658329" + +[tools."github:anchore/syft"."platforms.windows-x64"] +checksum = "sha256:c51695d171c61460369dabdd5c71b8f350ef8618466818356a30808d7105c710" +url = "https://github.com/anchore/syft/releases/download/v1.43.0/syft_1.43.0_windows_amd64.zip" +url_api = "https://api.github.com/repos/anchore/syft/releases/assets/402658321" [[tools."github:mozilla/sccache"]] version = "0.14.0" @@ -86,36 +105,20 @@ url = "https://github.com/mozilla/sccache/releases/download/v0.14.0/sccache-v0.1 url_api = "https://api.github.com/repos/mozilla/sccache/releases/assets/353136140" [[tools.helm]] -version = "4.2.0" +version = "4.1.4" backend = "aqua:helm/helm" [tools.helm."platforms.linux-arm64"] -checksum = "sha256:1f8de130dfbd04de64978e7b852a7a547be1404956a366608276d2520b678670" -url = 
"https://get.helm.sh/helm-v4.2.0-linux-arm64.tar.gz" +checksum = "sha256:13d03672be289045d2ff00e4e345d61de1c6f21c1257a45955a30e8ae036d8f1" +url = "https://get.helm.sh/helm-v4.1.4-linux-arm64.tar.gz" [tools.helm."platforms.linux-x64"] -checksum = "sha256:97dbeb971be4ac4b27e3839976d9564c0fb35c6f3b1da89dd1e292d236af4096" -url = "https://get.helm.sh/helm-v4.2.0-linux-amd64.tar.gz" +checksum = "sha256:70b2c30a19da4db264dfd68c8a3664e05093a361cefd89572ffb36f8abfa3d09" +url = "https://get.helm.sh/helm-v4.1.4-linux-amd64.tar.gz" [tools.helm."platforms.macos-arm64"] -checksum = "sha256:f13f959015447b6bc309f9fd506509926543988a39035c088b52522ec95e2acb" -url = "https://get.helm.sh/helm-v4.2.0-darwin-arm64.tar.gz" - -[[tools.helm-docs]] -version = "1.14.2" -backend = "aqua:norwoodj/helm-docs" - -[tools.helm-docs."platforms.linux-arm64"] -checksum = "sha256:c3787212332386dcd122debef7848feb165aa701467ae3e3442df7638f3ac4e4" -url = "https://github.com/norwoodj/helm-docs/releases/download/v1.14.2/helm-docs_1.14.2_Linux_arm64.tar.gz" - -[tools.helm-docs."platforms.linux-x64"] -checksum = "sha256:a8cf72ada34fad93285ba2a452b38bdc5bd52cc9a571236244ec31022928d6cc" -url = "https://github.com/norwoodj/helm-docs/releases/download/v1.14.2/helm-docs_1.14.2_Linux_x86_64.tar.gz" - -[tools.helm-docs."platforms.macos-arm64"] -checksum = "sha256:2d8399db5b33d240d5f8985241bcf5483563150b968e3229823822979f3e4b8b" -url = "https://github.com/norwoodj/helm-docs/releases/download/v1.14.2/helm-docs_1.14.2_Darwin_arm64.tar.gz" +checksum = "sha256:7c2eca678e8001fa863cdf8cbf6ac1b3799f9404a89eb55c08260ef5732e658d" +url = "https://get.helm.sh/helm-v4.1.4-darwin-arm64.tar.gz" [[tools.k3d]] version = "5.8.3" @@ -150,20 +153,36 @@ checksum = "sha256:655d2aadcb1f0a0dd196c5cbc564687ba945a9547c8e82c9fc532051fb260 url = "https://github.com/k3d-io/k3d/releases/download/v5.8.3/k3d-windows-amd64.exe" [[tools.kubectl]] -version = "1.36.1" +version = "1.35.4" backend = "aqua:kubernetes/kubernetes/kubectl" 
[tools.kubectl."platforms.linux-arm64"] -checksum = "sha256:59f7ee8e477fae658447607dc3c8790ac17a1b016c01c622c12070e969e2d4e7" -url = "https://dl.k8s.io/v1.36.1/bin/linux/arm64/kubectl" +checksum = "sha256:6a5a4cc4e396d7626a7a693a3044b51c75520f81db30fe6816c2554e53be336f" +url = "https://dl.k8s.io/v1.35.4/bin/linux/arm64/kubectl" + +[tools.kubectl."platforms.linux-arm64-musl"] +checksum = "sha256:6a5a4cc4e396d7626a7a693a3044b51c75520f81db30fe6816c2554e53be336f" +url = "https://dl.k8s.io/v1.35.4/bin/linux/arm64/kubectl" [tools.kubectl."platforms.linux-x64"] -checksum = "sha256:629d3f410e09bf49b64ae7079f7f0bda1191efed311f7d37fdbab0ad5b0ec2b7" -url = "https://dl.k8s.io/v1.36.1/bin/linux/amd64/kubectl" +checksum = "sha256:b529430df69a688fd61b64ad2299edb5fd71cb58be2a4779dba624c7d3510efd" +url = "https://dl.k8s.io/v1.35.4/bin/linux/amd64/kubectl" + +[tools.kubectl."platforms.linux-x64-musl"] +checksum = "sha256:b529430df69a688fd61b64ad2299edb5fd71cb58be2a4779dba624c7d3510efd" +url = "https://dl.k8s.io/v1.35.4/bin/linux/amd64/kubectl" [tools.kubectl."platforms.macos-arm64"] -checksum = "sha256:9092778abaef3079449da4cd70ded0e4be112480c93efcdeace3155968d1d133" -url = "https://dl.k8s.io/v1.36.1/bin/darwin/arm64/kubectl" +checksum = "sha256:ec644a2473b64b486987f695dfb1867963ce6d42d267b86e944585a546f92b5d" +url = "https://dl.k8s.io/v1.35.4/bin/darwin/arm64/kubectl" + +[tools.kubectl."platforms.macos-x64"] +checksum = "sha256:dddb01bddb96f78e48e33105ccfa2feedff585a8b2e3b812f5d0f64c7403710a" +url = "https://dl.k8s.io/v1.35.4/bin/darwin/amd64/kubectl" + +[tools.kubectl."platforms.windows-x64"] +checksum = "sha256:d77d03309bd80de56dafe8ca59ff6f2076e2ed4ee61c6a94657a4b6e945210e6" +url = "https://dl.k8s.io/v1.35.4/bin/windows/amd64/kubectl.exe" [[tools.node]] version = "24.15.0" @@ -218,25 +237,45 @@ checksum = "sha256:b9576b5fa1a1ef3fe13a8c91d9d8204b46545759bea5ae155cd6ba2ea4cda url = "https://github.com/protocolbuffers/protobuf/releases/download/v29.6/protoc-29.6-osx-aarch_64.zip" 
[[tools.python]] -version = "3.14.5" +version = "3.13.13" backend = "core:python" [tools.python.options] precompiled_flavor = "install_only_stripped" [tools.python."platforms.linux-arm64"] -checksum = "sha256:bea1aa66159eaf97ade1225e40b7060d709154da961aa37792bb8066d8f6af49" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260510/cpython-3.14.5+20260510-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:67c837838c56a7d16187d1be9fad326a617e0b1ee2687e1a0dda0c85053dac33" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260510/cpython-3.13.13+20260510-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" +provenance = "github-attestations" + +[tools.python."platforms.linux-arm64-musl"] +checksum = "sha256:0556f1260a9a1fc83210dcecf9d4cbacf17eb4a684541c84798ffc8b4d618c35" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.13.13+20260414-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.linux-x64"] -checksum = "sha256:dc10977b0db3bef1ee2275107fde6fe9c148135b556fa352e83c6baa67d17ed6" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260510/cpython-3.14.5+20260510-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:bbe27549e475fe5f22d42a8e0d553dc79d80d8a00e05712599637857d287360e" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260510/cpython-3.13.13+20260510-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" +provenance = "github-attestations" + +[tools.python."platforms.linux-x64-musl"] +checksum = "sha256:13d3b6d15f4c3c1dd1955a3c81e06bdc5aef4cb5cb65076878374948be3b0412" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.13.13+20260414-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" 
[tools.python."platforms.macos-arm64"] -checksum = "sha256:1bb0b3d45448dfe7e916dc62144cfd7d7a611dc6ccf05b8bb71662cc5c2a1ad2" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260510/cpython-3.14.5+20260510-aarch64-apple-darwin-install_only_stripped.tar.gz" +checksum = "sha256:16d2332d950178968534e65fe09f01f876d13af1147176fd0c77a74c9e4d1a4b" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260510/cpython-3.13.13+20260510-aarch64-apple-darwin-install_only_stripped.tar.gz" +provenance = "github-attestations" + +[tools.python."platforms.macos-x64"] +checksum = "sha256:d34198cd856fa80ebf3aa821fe329a25fab66eeda44f72ac9576591282e31bb7" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.13.13+20260414-x86_64-apple-darwin-install_only_stripped.tar.gz" +provenance = "github-attestations" + +[tools.python."platforms.windows-x64"] +checksum = "sha256:b84dce293464cfd366ee792a3d5b42abe5174fc9cce733ba895b3ef467cb3161" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.13.13+20260414-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" provenance = "github-attestations" [[tools.rust]] @@ -244,18 +283,29 @@ version = "1.95.0" backend = "core:rust" [[tools.skaffold]] -version = "2.20.0" +version = "2.19.0" backend = "aqua:GoogleContainerTools/skaffold" [tools.skaffold."platforms.linux-arm64"] -url = "https://storage.googleapis.com/skaffold/releases/v2.20.0/skaffold-linux-arm64" +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-linux-arm64" + +[tools.skaffold."platforms.linux-arm64-musl"] +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-linux-arm64" [tools.skaffold."platforms.linux-x64"] -url = "https://storage.googleapis.com/skaffold/releases/v2.20.0/skaffold-linux-amd64" +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-linux-amd64" + 
+[tools.skaffold."platforms.linux-x64-musl"] +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-linux-amd64" [tools.skaffold."platforms.macos-arm64"] -checksum = "blake3:cb665940f9b861c1160ddc9dd6601c047587396cb3ca029e2aa1f397ad9e6849" -url = "https://storage.googleapis.com/skaffold/releases/v2.20.0/skaffold-darwin-arm64" +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-darwin-arm64" + +[tools.skaffold."platforms.macos-x64"] +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-darwin-amd64" + +[tools.skaffold."platforms.windows-x64"] +url = "https://storage.googleapis.com/skaffold/releases/v2.19.0/skaffold-windows-amd64.exe" [[tools.uv]] version = "0.10.12" diff --git a/mise.toml b/mise.toml index f3aa2cb9d..c994d2c8f 100644 --- a/mise.toml +++ b/mise.toml @@ -19,20 +19,16 @@ lockfile = true lockfile_platforms = ["linux-x64", "linux-arm64", "macos-arm64"] [tools] -python = "3.14.5" +python = "3.13.13" rust = "1.95.0" node = "24.15.0" -kubectl = "1.36.1" +kubectl = "1.35.4" uv = "0.10.12" protoc = "29.6" -helm = "4.2.0" -helm-docs = "1.14.2" -skaffold = "2.20.0" -# Keep k3d out of Linux CI images until upstream ships a release rebuilt with -# patched Go/container dependencies. Linux Kubernetes E2E uses kind or an -# externally provided cluster context. 
-k3d = { version = "5.8.3", os = ["macos"] } -"github:anchore/syft" = { version = "1.44.0" } +helm = "4.1.4" +skaffold = "2.19.0" +k3d = "5.8.3" +"github:anchore/syft" = { version = "1.43.0" } "github:EmbarkStudios/cargo-about" = { version = "0.8.4", version_prefix = "" } zig = "0.14.1" "cargo:cargo-zigbuild" = { version = "0.22.3", os = ["macos"] } diff --git a/openshell.spec b/openshell.spec index e351f5a44..d3bd26d07 100644 --- a/openshell.spec +++ b/openshell.spec @@ -2,11 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 %global crate openshell -%global openshell_version 0.0.37 -%global openshell_cargo_version %{openshell_version} +%global openshell_cargo_version %{version} # Python dist-info metadata intentionally follows the RPM Version. Dev build # identity is represented by Release for RPM packages. -%global openshell_python_version %{openshell_version} +%global openshell_python_version %{version} # Cargo/Rust builds with vendored deps do not produce debugsource listings # in the format redhat-rpm-config expects (especially on EPEL). @@ -19,14 +18,14 @@ %global image_tag dev Name: openshell -Version: %{openshell_version} -Release: 1.20260518180028805757.podman.toml.gateway.listener.11.g8c0cb7c8%{?dist} +Version: 0.0.37 +Release: 1.20260506170246815148.rpm.dev.106.g99e94469%{?dist} Summary: Safe, sandboxed runtimes for autonomous AI agents License: Apache-2.0 URL: https://github.com/NVIDIA/OpenShell -Source0: openshell-%{openshell_version}.tar.gz -Source1: openshell-%{openshell_version}-vendor.tar.xz +Source0: openshell-0.0.37.tar.gz +Source1: openshell-0.0.37-vendor.tar.xz ExclusiveArch: x86_64 aarch64 @@ -54,7 +53,7 @@ BuildRequires: pandoc BuildRequires: python3-devel # Runtime: container runtime for package-managed gateway sandboxes. -# The gateway auto-detects Podman when the package-managed service starts. +# Podman is preferred; Docker is also supported via --container-runtime flag. 
Recommends: podman %description @@ -72,9 +71,9 @@ Requires: %{name} = %{version}-%{release} %description gateway OpenShell gateway server providing the control-plane API for sandbox -lifecycle management. This package installs Podman-oriented defaults in -gateway TOML while leaving compute driver selection to gateway auto-detection -or explicit operator configuration. +lifecycle management. This package configures the gateway to use the +Podman compute driver, pulling sandbox and supervisor images from +ghcr.io/nvidia/openshell. # --- Python SDK sub-package --- %package -n python3-%{name} @@ -120,6 +119,7 @@ cargo build --release --bin openshell --bin openshell-gateway # Build man pages from markdown pandoc -s -t man deploy/man/openshell.1.md -o openshell.1 pandoc -s -t man deploy/man/openshell-gateway.8.md -o openshell-gateway.8 +pandoc -s -t man deploy/man/openshell-gateway.env.5.md -o openshell-gateway.env.5 %install # --- CLI binary --- @@ -128,41 +128,50 @@ install -Dpm 0755 target/release/%{name} %{buildroot}%{_bindir}/%{name} # --- Gateway binary --- install -Dpm 0755 target/release/%{name}-gateway %{buildroot}%{_bindir}/%{name}-gateway -# --- Default gateway TOML config template --- -# Shipped as a read-only reference in %{_datadir}. The systemd unit seeds a -# user-level copy at ~/.config/openshell/gateway.toml on first start. -install -Dpm 0644 deploy/rpm/gateway.toml.default %{buildroot}%{_datadir}/%{name}-gateway/gateway.toml.default - -# --- Gateway systemd user unit --- +# --- Gateway systemd user unit (rootless Podman) --- # Installed to the systemd user unit directory so any user can run: # systemctl --user enable --now openshell-gateway.service +# Podman socket activation provides the container API. 
install -d %{buildroot}%{_userunitdir} cat > %{buildroot}%{_userunitdir}/%{name}-gateway.service << 'EOF' [Unit] Description=OpenShell Gateway (user) Documentation=https://github.com/NVIDIA/OpenShell After=podman.socket -Wants=podman.socket +Requires=podman.socket [Service] Type=exec -# On first start the unit seeds a default TOML config and generates PKI. -# Client certs are placed in ~/.config/openshell/gateways/openshell/mtls/ so -# the CLI discovers them automatically. +# Self-contained defaults for rootless operation with mTLS. +# +# PKI and gateway.env are auto-generated on first start. Client certs +# are placed in ~/.config/openshell/gateways/openshell/mtls/ so the +# CLI discovers them automatically. # See /usr/share/doc/openshell-gateway/ for details. -# Seed a default TOML config on first start if the user has not created one. -# The template ships at /usr/share/openshell-gateway/gateway.toml.default. -# Edit ~/.config/openshell/gateway.toml to customize. -# %%E expands to $XDG_CONFIG_HOME (~/.config) in user units. -ExecStartPre=/bin/sh -c 'test -f %%E/openshell/gateway.toml || install -Dm644 /usr/share/openshell-gateway/gateway.toml.default %%E/openshell/gateway.toml' - -# Auto-generate PKI on first start if not present. -# %%S expands to $XDG_STATE_HOME (~/.local/state) in user units. -ExecStartPre=/usr/bin/openshell-gateway generate-certs --output-dir %%S/openshell/tls --server-san host.openshell.internal +# Auto-generate PKI on first start. Idempotent: skips when all six PEMs are +# already in place. %%S expands to $XDG_STATE_HOME (~/.local/state) in user +# units. +ExecStartPre=/usr/bin/openshell-gateway generate-certs --output-dir %%S/openshell/tls -# Optional OPENSHELL_* overrides. +# Auto-generate gateway.env (commented config reference) on first +# start if not present. +# %%E expands to $XDG_CONFIG_HOME (~/.config) in user units. 
+ExecStartPre=%{_libexecdir}/openshell/init-gateway-env.sh %%E/openshell/gateway.env EnvironmentFile=-%%E/openshell/gateway.env +Environment=OPENSHELL_BIND_ADDRESS=0.0.0.0 +Environment=OPENSHELL_DRIVERS=podman +Environment=OPENSHELL_DB_URL=sqlite://%%S/openshell/gateway.db +Environment=OPENSHELL_SUPERVISOR_IMAGE=ghcr.io/nvidia/openshell/supervisor:%{image_tag} +Environment=OPENSHELL_SANDBOX_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest +# mTLS: auto-generated certs in the state directory. +Environment=OPENSHELL_TLS_CERT=%%S/openshell/tls/server/tls.crt +Environment=OPENSHELL_TLS_KEY=%%S/openshell/tls/server/tls.key +Environment=OPENSHELL_TLS_CLIENT_CA=%%S/openshell/tls/ca.crt +# Podman driver: client certs bind-mounted into sandbox containers. +Environment=OPENSHELL_PODMAN_TLS_CA=%%S/openshell/tls/ca.crt +Environment=OPENSHELL_PODMAN_TLS_CERT=%%S/openshell/tls/client/tls.crt +Environment=OPENSHELL_PODMAN_TLS_KEY=%%S/openshell/tls/client/tls.key ExecStart=/usr/bin/openshell-gateway StateDirectory=openshell Restart=on-failure @@ -178,6 +187,14 @@ RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX WantedBy=default.target EOF +# --- Gateway env generator --- +install -d %{buildroot}%{_libexecdir}/%{name} +install -pm 0755 deploy/rpm/init-gateway-env.sh %{buildroot}%{_libexecdir}/%{name}/init-gateway-env.sh +# Patch commented image defaults to match the build type (dev or latest). +# The source file uses :latest as a generic reference; the installed copy +# reflects what this RPM actually expects from the registry. 
+sed -i 's|supervisor:latest|supervisor:%{image_tag}|' %{buildroot}%{_libexecdir}/%{name}/init-gateway-env.sh + # --- Gateway documentation --- install -d %{buildroot}%{_docdir}/%{name}-gateway install -pm 0644 deploy/rpm/QUICKSTART.md %{buildroot}%{_docdir}/%{name}-gateway/QUICKSTART.md @@ -187,6 +204,7 @@ install -pm 0644 deploy/rpm/TROUBLESHOOTING.md %{buildroot}%{_docdir}/%{name}-ga # --- Man pages --- install -Dpm 0644 openshell.1 %{buildroot}%{_mandir}/man1/openshell.1 install -Dpm 0644 openshell-gateway.8 %{buildroot}%{_mandir}/man8/openshell-gateway.8 +install -Dpm 0644 openshell-gateway.env.5 %{buildroot}%{_mandir}/man5/openshell-gateway.env.5 # --- Python SDK --- # Install Python SDK modules (test files are intentionally excluded) @@ -231,15 +249,6 @@ touch %{buildroot}%{python3_sitelib}/%{name}-%{openshell_python_version}.dist-in # build environment. PYTHONPATH=%{buildroot}%{python3_sitelib} %{python3} -c "from importlib.metadata import version; v = version('openshell'); print(v); assert v == '%{openshell_python_version}', f'expected %{openshell_python_version}, got {v}'" -# Verify the RPM default TOML config template was installed. -# A missing template means first-start seeding silently falls back to the -# binary default of 127.0.0.1, which breaks Podman sandbox connectivity. -test -f %{buildroot}%{_datadir}/%{name}-gateway/gateway.toml.default - -# Verify the systemd unit references the template in its ExecStartPre seed step. -# If this grep fails, the first-start seeding logic was removed from the unit. 
-grep -q 'gateway.toml.default' %{buildroot}%{_userunitdir}/%{name}-gateway.service - %post gateway %systemd_user_post %{name}-gateway.service @@ -266,8 +275,9 @@ grep -q 'gateway.toml.default' %{buildroot}%{_userunitdir}/%{name}-gateway.servi %doc %{_docdir}/%{name}-gateway/TROUBLESHOOTING.md %{_bindir}/%{name}-gateway %{_userunitdir}/%{name}-gateway.service -%{_datadir}/%{name}-gateway/gateway.toml.default +%{_libexecdir}/%{name}/init-gateway-env.sh %{_mandir}/man8/openshell-gateway.8* +%{_mandir}/man5/openshell-gateway.env.5* %files -n python3-%{name} %license LICENSE diff --git a/openshell_telemetry_schema.json b/openshell_telemetry_schema.json new file mode 100644 index 000000000..782fae37c --- /dev/null +++ b/openshell_telemetry_schema.json @@ -0,0 +1,287 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "Schemas for anonymous OpenShell telemetry events.", + "schemaMeta": { + "clientName": "OpenShell_Anonymous_Telemetry_Client", + "definitionVersion": "1.0", + "schemaVersion": "2.0", + "clientId": "415437562476676", + "personalization": "" + }, + "definitions": { + "types": { + "NvidiaSourceEnum": { + "type": "string", + "description": "The product that created the event.", + "enum": ["openshell"] + }, + "ResourceEnum": { + "type": "string", + "description": "The anonymous OpenShell resource category involved in a lifecycle event.", + "enum": ["sandbox", "sandbox_policy"] + }, + "OperationEnum": { + "type": "string", + "description": "The anonymous lifecycle operation.", + "enum": ["create", "delete", "update"] + }, + "OutcomeEnum": { + "type": "string", + "description": "Whether the operation succeeded or failed.", + "enum": ["success", "failure"] + }, + "DenyGroupEnum": { + "type": "string", + "description": "Sanitized denial category. 
Never contains hostnames, paths, binaries, or user content.", + "enum": [ + "connect_policy", + "forward_policy", + "l7_policy", + "l7_parse_rejection", + "ssrf", + "bypass", + "policy_stale", + "unknown" + ] + }, + "SandboxTemplateSourceEnum": { + "type": "string", + "description": "Anonymous sandbox template source category. Does not include image names or paths.", + "enum": ["default", "image", "undefined"] + }, + "PolicyDecisionOperationEnum": { + "type": "string", + "description": "Anonymous policy draft decision operation.", + "enum": ["approve", "reject", "undo", "approve_all"] + }, + "ProviderProfileEnum": { + "type": "string", + "description": "Sanitized built-in provider profile id, or custom for generic and non-standard provider types.", + "enum": [ + "anthropic", + "claude", + "codex", + "copilot", + "github", + "gitlab", + "nvidia", + "openai", + "opencode", + "outlook", + "custom" + ] + }, + "RestrictedTelemetryInteger": { + "type": "integer", + "description": "Numeric count constrained to the signed 64-bit integer range.", + "maximum": 9223372036854775807, + "minimum": 0 + } + }, + "events": { + "openshell_lifecycle_event": { + "description": "Anonymous lifecycle count event for gateway-observed sandbox deletes and sandbox policy updates.", + "type": "object", + "eventMeta": { + "service": "telemetry", + "gdpr": { + "category": "functional", + "description": "Anonymous technical metadata about gateway-observed OpenShell resource lifecycle operations. 
No IDs, names, hosts, paths, model names, provider names, or user content are collected.", + "personalization": "" + } + }, + "properties": { + "nvidiaSource": { + "$ref": "#/definitions/types/NvidiaSourceEnum" + }, + "resource": { + "$ref": "#/definitions/types/ResourceEnum" + }, + "operation": { + "$ref": "#/definitions/types/OperationEnum" + }, + "outcome": { + "$ref": "#/definitions/types/OutcomeEnum" + } + }, + "additionalProperties": false, + "required": ["nvidiaSource", "resource", "operation", "outcome"] + }, + "openshell_sandbox_activity_summary_event": { + "description": "Anonymous summary of observed OpenShell sandbox network activity, emitted only when a non-empty activity batch is flushed.", + "type": "object", + "eventMeta": { + "service": "telemetry", + "gdpr": { + "category": "functional", + "description": "Anonymous aggregate usage counters for sandbox network activity. No sandbox IDs, names, hosts, paths, binaries, raw deny reasons, or user content are collected.", + "personalization": "" + } + }, + "properties": { + "nvidiaSource": { + "$ref": "#/definitions/types/NvidiaSourceEnum" + }, + "networkActivityCount": { + "$ref": "#/definitions/types/RestrictedTelemetryInteger" + }, + "deniedActionCount": { + "$ref": "#/definitions/types/RestrictedTelemetryInteger" + }, + "denialRatePct": { + "type": "number", + "minimum": 0, + "maximum": 100, + "description": "Percent of observed sandbox network activity that was denied." 
+ }, + "denialsByGroup": { + "type": "array", + "description": "Denied action counts grouped only by sanitized deny category.", + "items": { + "type": "object", + "properties": { + "denyGroup": { + "$ref": "#/definitions/types/DenyGroupEnum" + }, + "deniedCount": { + "$ref": "#/definitions/types/RestrictedTelemetryInteger" + } + }, + "additionalProperties": false, + "required": ["denyGroup", "deniedCount"] + } + } + }, + "additionalProperties": false, + "required": [ + "nvidiaSource", + "networkActivityCount", + "deniedActionCount", + "denialRatePct", + "denialsByGroup" + ] + }, + "openshell_sandbox_create_event": { + "description": "Anonymous sandbox creation event with aggregate configuration shape.", + "type": "object", + "eventMeta": { + "service": "telemetry", + "gdpr": { + "category": "functional", + "description": "Anonymous sandbox creation metadata. No sandbox IDs, names, image names, paths, provider names, or user content are collected.", + "personalization": "" + } + }, + "properties": { + "nvidiaSource": { + "$ref": "#/definitions/types/NvidiaSourceEnum" + }, + "outcome": { + "$ref": "#/definitions/types/OutcomeEnum" + }, + "requestedGpu": { + "type": "boolean", + "description": "Whether GPU resources were requested for the sandbox." + }, + "providerCount": { + "$ref": "#/definitions/types/RestrictedTelemetryInteger" + }, + "hasCustomPolicy": { + "type": "boolean", + "description": "Whether a sandbox policy was provided at creation time." + }, + "templateSource": { + "$ref": "#/definitions/types/SandboxTemplateSourceEnum" + } + }, + "additionalProperties": false, + "required": [ + "nvidiaSource", + "outcome", + "requestedGpu", + "providerCount", + "hasCustomPolicy", + "templateSource" + ] + }, + "openshell_policy_decision_event": { + "description": "Anonymous sandbox policy draft decision event.", + "type": "object", + "eventMeta": { + "service": "telemetry", + "gdpr": { + "category": "functional", + "description": "Anonymous policy decision metadata. 
No rule names, hosts, paths, binaries, raw deny reasons, or user content are collected.", + "personalization": "" + } + }, + "properties": { + "nvidiaSource": { + "$ref": "#/definitions/types/NvidiaSourceEnum" + }, + "operation": { + "$ref": "#/definitions/types/PolicyDecisionOperationEnum" + }, + "outcome": { + "$ref": "#/definitions/types/OutcomeEnum" + }, + "ruleCount": { + "$ref": "#/definitions/types/RestrictedTelemetryInteger" + } + }, + "additionalProperties": false, + "required": ["nvidiaSource", "operation", "outcome", "ruleCount"] + }, + "openshell_provider_lifecycle_event": { + "description": "Anonymous provider lifecycle event grouped by sanitized provider profile.", + "type": "object", + "eventMeta": { + "service": "telemetry", + "gdpr": { + "category": "functional", + "description": "Anonymous provider operation metadata grouped by profile id. Provider names, credentials, hosts, and user content are not collected.", + "personalization": "" + } + }, + "properties": { + "nvidiaSource": { + "$ref": "#/definitions/types/NvidiaSourceEnum" + }, + "operation": { + "$ref": "#/definitions/types/OperationEnum" + }, + "outcome": { + "$ref": "#/definitions/types/OutcomeEnum" + }, + "providerProfile": { + "$ref": "#/definitions/types/ProviderProfileEnum" + } + }, + "additionalProperties": false, + "required": [ + "nvidiaSource", + "operation", + "outcome", + "providerProfile" + ] + } + } + }, + "oneOf": [ + { + "$ref": "#/definitions/events/openshell_lifecycle_event" + }, + { + "$ref": "#/definitions/events/openshell_sandbox_activity_summary_event" + }, + { + "$ref": "#/definitions/events/openshell_sandbox_create_event" + }, + { + "$ref": "#/definitions/events/openshell_policy_decision_event" + }, + { + "$ref": "#/definitions/events/openshell_provider_lifecycle_event" + } + ] +} diff --git a/proto/datamodel.proto b/proto/datamodel.proto index f92d7b7a3..534b043ae 100644 --- a/proto/datamodel.proto +++ b/proto/datamodel.proto @@ -8,7 +8,7 @@ package 
openshell.datamodel.v1; // Kubernetes-style metadata shared by all top-level OpenShell domain objects. // // This structure provides consistent metadata (identity, labels, timestamps, -// resource versioning) across Sandbox, Provider, SshSession, and other resources. +// versioning) across Sandbox, Provider, SshSession, and other resources. message ObjectMeta { // Stable object ID generated by the gateway. string id = 1; @@ -22,10 +22,6 @@ message ObjectMeta { // Key-value labels for filtering and organization. // Labels must follow Kubernetes conventions: alphanumeric + `-._/`, max 63 chars per segment. map labels = 4; - - // Optimistic concurrency control version. - // Incremented by the gateway on each update. Clients can use this for compare-and-swap operations. - uint64 resource_version = 5; } // Provider model stored by OpenShell. @@ -38,7 +34,4 @@ message Provider { map credentials = 3; // Non-secret provider configuration. map config = 4; - // Expiration timestamps for credential values, keyed by credential/env var - // name. A zero or missing value means the credential does not expire. - map credential_expires_at_ms = 5; } diff --git a/proto/openshell.proto b/proto/openshell.proto index ca62646e3..5ba96b625 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -102,22 +102,6 @@ service OpenShell { // Update an existing provider by name. rpc UpdateProvider(UpdateProviderRequest) returns (ProviderResponse); - // Fetch refresh status for one provider or provider credential. - rpc GetProviderRefreshStatus(GetProviderRefreshStatusRequest) - returns (GetProviderRefreshStatusResponse); - - // Configure gateway-owned refresh material for one provider credential. - rpc ConfigureProviderRefresh(ConfigureProviderRefreshRequest) - returns (ConfigureProviderRefreshResponse); - - // Record a gateway-owned refresh request for one provider credential. 
- rpc RotateProviderCredential(RotateProviderCredentialRequest) - returns (RotateProviderCredentialResponse); - - // Delete gateway-owned refresh configuration for one provider credential. - rpc DeleteProviderRefresh(DeleteProviderRefreshRequest) - returns (DeleteProviderRefreshResponse); - // Delete a provider by name. rpc DeleteProvider(DeleteProviderRequest) returns (DeleteProviderResponse); @@ -400,11 +384,6 @@ message AttachSandboxProviderRequest { string sandbox_name = 1; // Provider name to attach. string provider_name = 2; - // Expected resource version for optimistic concurrency control. - // If 0, the server uses the current version (backward compatibility). - // If non-zero, the server validates that the sandbox's current resource_version - // matches this value before applying the mutation, returning ABORTED on mismatch. - uint64 expected_resource_version = 3; } // Detach provider from sandbox request. @@ -413,11 +392,6 @@ message DetachSandboxProviderRequest { string sandbox_name = 1; // Provider name to detach. string provider_name = 2; - // Expected resource version for optimistic concurrency control. - // If 0, the server uses the current version (backward compatibility). - // If non-zero, the server validates that the sandbox's current resource_version - // matches this value before applying the mutation, returning ABORTED on mismatch. - uint64 expected_resource_version = 3; } // Delete sandbox request. @@ -786,9 +760,6 @@ message ListProvidersRequest { // Update provider request. message UpdateProviderRequest { openshell.datamodel.v1.Provider provider = 1; - // Optional per-credential expiry timestamps to merge into the provider. - // A zero value removes the expiry for that credential. - map credential_expires_at_ms = 2; } // Delete provider request. 
@@ -841,103 +812,6 @@ message ProviderProfileCredential { string auth_style = 5; string header_name = 6; string query_param = 7; - ProviderCredentialRefresh refresh = 8; -} - -enum ProviderCredentialRefreshStrategy { - PROVIDER_CREDENTIAL_REFRESH_STRATEGY_UNSPECIFIED = 0; - PROVIDER_CREDENTIAL_REFRESH_STRATEGY_STATIC = 1; - PROVIDER_CREDENTIAL_REFRESH_STRATEGY_EXTERNAL = 2; - PROVIDER_CREDENTIAL_REFRESH_STRATEGY_OAUTH2_REFRESH_TOKEN = 3; - PROVIDER_CREDENTIAL_REFRESH_STRATEGY_OAUTH2_CLIENT_CREDENTIALS = 4; - PROVIDER_CREDENTIAL_REFRESH_STRATEGY_GOOGLE_SERVICE_ACCOUNT_JWT = 5; -} - -message ProviderCredentialRefreshMaterial { - string name = 1; - string description = 2; - bool required = 3; - bool secret = 4; -} - -message ProviderCredentialRefresh { - ProviderCredentialRefreshStrategy strategy = 1; - string token_url = 2; - repeated string scopes = 3; - int64 refresh_before_seconds = 4; - int64 max_lifetime_seconds = 5; - repeated ProviderCredentialRefreshMaterial material = 6; -} - -message ProviderCredentialRefreshStatus { - string provider_name = 1; - string provider_id = 2; - string credential_key = 3; - ProviderCredentialRefreshStrategy strategy = 4; - string status = 5; - int64 expires_at_ms = 6; - int64 next_refresh_at_ms = 7; - int64 last_refresh_at_ms = 8; - string last_error = 9; -} - -message StoredProviderCredentialRefreshState { - openshell.datamodel.v1.ObjectMeta metadata = 1; - string provider_id = 2; - string provider_name = 3; - string credential_key = 4; - ProviderCredentialRefreshStrategy strategy = 5; - map material = 6; - repeated string secret_material_keys = 7; - int64 expires_at_ms = 8; - int64 next_refresh_at_ms = 9; - int64 last_refresh_at_ms = 10; - string status = 11; - string last_error = 12; - string token_url = 13; - repeated string scopes = 14; - int64 refresh_before_seconds = 15; - int64 max_lifetime_seconds = 16; -} - -message GetProviderRefreshStatusRequest { - string provider = 1; - string credential_key = 2; -} - -message 
GetProviderRefreshStatusResponse { - repeated ProviderCredentialRefreshStatus credentials = 1; -} - -message ConfigureProviderRefreshRequest { - string provider = 1; - string credential_key = 2; - ProviderCredentialRefreshStrategy strategy = 3; - map material = 4; - repeated string secret_material_keys = 5; - optional int64 expires_at_ms = 6; -} - -message ConfigureProviderRefreshResponse { - ProviderCredentialRefreshStatus status = 1; -} - -message RotateProviderCredentialRequest { - string provider = 1; - string credential_key = 2; -} - -message RotateProviderCredentialResponse { - ProviderCredentialRefreshStatus status = 1; -} - -message DeleteProviderRefreshRequest { - string provider = 1; - string credential_key = 2; -} - -message DeleteProviderRefreshResponse { - bool deleted = 1; } // Stable provider profile categories used by clients for grouping and filtering. @@ -1030,8 +904,6 @@ message GetSandboxProviderEnvironmentResponse { map environment = 1; // Fingerprint for the provider credential inputs that produced environment. uint64 provider_env_revision = 2; - // Expiration timestamps for returned environment variables. - map credential_expires_at_ms = 3; } // --------------------------------------------------------------------------- @@ -1063,12 +935,6 @@ message UpdateConfigRequest { bool global = 6; // Batched incremental policy merge operations. Sandbox-scoped only. repeated PolicyMergeOperation merge_operations = 7; - // Expected resource version for optimistic concurrency control (sandbox-scoped only). - // If 0, the server uses the current version (backward compatibility). - // If non-zero, the server validates that the sandbox's current resource_version - // matches this value before applying the mutation, returning ABORTED on mismatch. - // Ignored for global-scoped updates. 
- uint64 expected_resource_version = 8; } message PolicyMergeOperation { @@ -1426,6 +1292,25 @@ message DenialSummary { bool l7_inspection_active = 17; } +// Count of denied actions grouped only by sanitized telemetry category. +message DenialGroupCount { + // Sanitized denial category, e.g. "connect_policy", "l7_policy", "ssrf". + string deny_group = 1; + // Number of denied actions in this category. + uint32 denied_count = 2; +} + +// Anonymous sandbox network activity counters. This intentionally excludes +// hosts, paths, binaries, raw deny reasons, sandbox IDs, and user content. +message NetworkActivitySummary { + // Total observed network activities in the current window. + uint32 network_activity_count = 1; + // Total denied actions in the current window. + uint32 denied_action_count = 2; + // Denied action counts grouped by sanitized category. + repeated DenialGroupCount denials_by_group = 3; +} + // A proposed policy rule with rationale and approval status. message PolicyChunk { // Unique chunk identifier. @@ -1499,6 +1384,8 @@ message SubmitPolicyAnalysisRequest { string analysis_mode = 3; // Sandbox name. string name = 4; + // Anonymous network activity counters. + repeated NetworkActivitySummary network_activity_summaries = 5; } message SubmitPolicyAnalysisResponse { diff --git a/providers/anthropic.yaml b/providers/anthropic.yaml new file mode 100644 index 000000000..64aecac42 --- /dev/null +++ b/providers/anthropic.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +id: anthropic +display_name: Anthropic API +description: Anthropic API access for Claude models +category: inference +inference_capable: true +credentials: + - name: api_key + description: Anthropic API key + env_vars: [ANTHROPIC_API_KEY] + required: true + auth_style: header + header_name: x-api-key +endpoints: + - host: api.anthropic.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce +binaries: [/usr/bin/curl, /usr/local/bin/curl] diff --git a/providers/claude-code.yaml b/providers/claude.yaml similarity index 98% rename from providers/claude-code.yaml rename to providers/claude.yaml index b835f3d45..7b526008f 100644 --- a/providers/claude-code.yaml +++ b/providers/claude.yaml @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -id: claude-code +id: claude display_name: Claude Code description: Claude Code CLI category: agent diff --git a/providers/codex.yaml b/providers/codex.yaml new file mode 100644 index 000000000..c29d8878d --- /dev/null +++ b/providers/codex.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +id: codex +display_name: Codex +description: Codex CLI using OpenAI-compatible API credentials +category: agent +inference_capable: true +credentials: + - name: api_key + description: OpenAI API key + env_vars: [OPENAI_API_KEY] + required: true + auth_style: bearer + header_name: authorization +endpoints: + - host: api.openai.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce +binaries: [/usr/bin/codex, /usr/local/bin/codex] diff --git a/providers/copilot.yaml b/providers/copilot.yaml new file mode 100644 index 000000000..74f9a4cd8 --- /dev/null +++ b/providers/copilot.yaml @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +id: copilot +display_name: GitHub Copilot +description: GitHub Copilot tooling +category: agent +credentials: + - name: github_token + description: GitHub token used by Copilot tooling + env_vars: [COPILOT_GITHUB_TOKEN, GH_TOKEN, GITHUB_TOKEN] + required: true + auth_style: bearer + header_name: authorization +endpoints: + - host: api.github.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce + - host: github.com + port: 443 + protocol: rest + access: read-only + enforcement: enforce +binaries: [/usr/bin/copilot, /usr/local/bin/copilot] diff --git a/providers/gitlab.yaml b/providers/gitlab.yaml new file mode 100644 index 000000000..6d6535c75 --- /dev/null +++ b/providers/gitlab.yaml @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +id: gitlab +display_name: GitLab +description: GitLab API and Git operations +category: source_control +credentials: + - name: api_token + description: GitLab token + env_vars: [GITLAB_TOKEN, GLAB_TOKEN, CI_JOB_TOKEN] + required: true + auth_style: bearer + header_name: authorization +endpoints: + - host: gitlab.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce + - host: api.gitlab.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce +binaries: [/usr/bin/glab, /usr/local/bin/glab, /usr/bin/git, /usr/local/bin/git] diff --git a/providers/openai.yaml b/providers/openai.yaml new file mode 100644 index 000000000..632687f5e --- /dev/null +++ b/providers/openai.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +id: openai +display_name: OpenAI +description: OpenAI API access +category: inference +inference_capable: true +credentials: + - name: api_key + description: OpenAI API key + env_vars: [OPENAI_API_KEY] + required: true + auth_style: bearer + header_name: authorization +endpoints: + - host: api.openai.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce +binaries: [/usr/bin/curl, /usr/local/bin/curl] diff --git a/providers/opencode.yaml b/providers/opencode.yaml new file mode 100644 index 000000000..e8cf646dd --- /dev/null +++ b/providers/opencode.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +id: opencode +display_name: OpenCode +description: OpenCode-compatible inference provider +category: agent +inference_capable: true +credentials: + - name: api_key + description: OpenCode-compatible API key + env_vars: [OPENCODE_API_KEY, OPENROUTER_API_KEY, OPENAI_API_KEY] + required: true + auth_style: bearer + header_name: authorization +endpoints: + - host: api.openai.com + port: 443 + protocol: rest + access: read-write + enforcement: enforce +binaries: [/usr/bin/opencode, /usr/local/bin/opencode] diff --git a/providers/outlook.yaml b/providers/outlook.yaml new file mode 100644 index 000000000..6295bcc59 --- /dev/null +++ b/providers/outlook.yaml @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +id: outlook +display_name: Outlook +description: Outlook provider record without managed policy defaults +category: messaging diff --git a/pyproject.toml b/pyproject.toml index e4c7832a6..71ceed34f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", "Programming Language :: Rust", "License :: OSI Approved :: Apache Software License", "Topic :: Security", diff --git a/python/openshell/release_formula_test.py b/python/openshell/release_formula_test.py index 81bf89fab..f2f7bf787 100644 --- a/python/openshell/release_formula_test.py +++ b/python/openshell/release_formula_test.py @@ -51,93 +51,20 @@ def test_generate_homebrew_formula_uses_tagged_macos_driver_asset_without_defaul "v0.0.10/openshell-driver-vm-aarch64-apple-darwin.tar.gz" ) in formula assert 'sha256 "' + "b" * 64 + '"' in formula - assert "OPENSHELL_DRIVERS: " not in formula - assert 'OPENSHELL_GATEWAY_CONFIG: "#{var}/openshell/gateway.toml"' not in formula - 
assert "init-gateway-config.sh" not in formula - assert 'bind_address = "127.0.0.1:17670"' not in formula - assert '# compute_drivers = ["vm"]' not in formula + assert "OPENSHELL_DRIVERS:" not in formula + assert "#OPENSHELL_DRIVERS=vm" in formula + assert 'OPENSHELL_GATEWAY_CONFIG: "#{var}/openshell/gateway.toml"' in formula + assert 'driver_dir = "#{opt_libexec}"' in formula + assert 'supervisor_image = "ghcr.io/nvidia/openshell/supervisor:0.0.10"' in formula assert 'run opt_libexec/"openshell-gateway-homebrew-service"' in formula - assert 'xdg_config_home="${XDG_CONFIG_HOME:-${HOME}/.config}"' in formula - assert 'xdg_gateway_config="${xdg_config_home}/openshell/gateway.toml"' in formula - assert 'prefix_gateway_config="#{var}/openshell/gateway.toml"' in formula assert ( - 'if [ -z "${OPENSHELL_GATEWAY_CONFIG:-}" ] && [ ! -f "${xdg_gateway_config}" ] && [ -f "${prefix_gateway_config}" ]; then' + 'docker_tls_dir="${OPENSHELL_DOCKER_TLS_DIR:-${HOME}/.local/state/openshell/homebrew/tls}"' ) in formula - assert ( - 'exec "#{opt_bin}/openshell-gateway" --config "${prefix_gateway_config}"' - in formula - ) - assert 'exec "#{opt_bin}/openshell-gateway"' in formula - assert "--db-url" not in formula - assert 'docker_tls_dir="${HOME}/.local/state/openshell/homebrew/tls"' in formula - assert ( - 'export OPENSHELL_LOCAL_TLS_DIR="${OPENSHELL_LOCAL_TLS_DIR:-${docker_tls_dir}}"' - in formula - ) - assert '/usr/bin/install -m 0600 "#{var}/openshell/tls/server/tls.key"' in formula - assert "OPENSHELL_CONFIG_" not in formula - assert "OPENSHELL_DOCKER_TLS_DIR" not in formula - assert 'xdg_gateway_env="${xdg_config_home}/openshell/gateway.env"' in formula - assert 'prefix_gateway_env="#{var}/openshell/gateway.env"' in formula - assert '. "${xdg_gateway_env}"' in formula - assert '. 
"${prefix_gateway_env}"' in formula - assert 'gateway_env = var/"openshell/gateway.env"' not in formula - assert "#OPENSHELL_GATEWAY_CONFIG=#{var}/openshell/gateway.toml" not in formula - assert "environment_variables(" not in formula - assert " OPENSHELL_BIND_ADDRESS:" not in formula - assert " OPENSHELL_SERVER_PORT:" not in formula - assert " OPENSHELL_TLS_CERT:" not in formula + assert 'guest_tls_ca = "${docker_tls_dir}/ca.crt"' in formula + assert 'gateway_env="#{var}/openshell/gateway.env"' in formula + assert '. "${gateway_env}"' in formula assert "OPENSHELL_DRIVER_DIR:" not in formula assert "OPENSHELL_DOCKER_SUPERVISOR_IMAGE:" not in formula assert 'OPENSHELL_DOCKER_TLS_CA: "#{var}/openshell/tls/ca.crt"' not in formula assert "entitlements.atomic_write" in formula assert "brew services restart openshell" in formula - - -def test_snap_wrapper_uses_optional_gateway_config_without_generating_toml() -> None: - repo_root = Path(__file__).resolve().parents[2] - wrapper = (repo_root / "deploy/snap/bin/openshell-gateway-wrapper").read_text( - encoding="utf-8" - ) - - assert "init-gateway-config.sh" not in wrapper - assert ( - 'export OPENSHELL_DB_URL="${OPENSHELL_DB_URL:-sqlite:${SNAP_COMMON}/gateway.db?mode=rwc}"' - in wrapper - ) - assert 'export OPENSHELL_DISABLE_TLS="${OPENSHELL_DISABLE_TLS:-true}"' in wrapper - assert ( - 'exec "${SNAP}/bin/openshell-gateway" --config "$CANONICAL_CONFIG_FILE" "$@"' - in wrapper - ) - assert 'exec "${SNAP}/bin/openshell-gateway" "$@"' in wrapper - - -def test_rpm_spec_uses_gateway_defaults_without_config_helper() -> None: - repo_root = Path(__file__).resolve().parents[2] - spec = (repo_root / "openshell.spec").read_text(encoding="utf-8") - - assert "init-gateway-config.sh" not in spec - assert "init-pki.sh" not in spec - assert "openshell-gateway generate-certs --output-dir %%S/openshell/tls" in spec - assert "EnvironmentFile=-%%E/openshell/gateway.env" in spec - assert "Environment=OPENSHELL_DRIVERS" not in spec - assert 
"Environment=OPENSHELL_BIND_ADDRESS" not in spec - assert "Environment=OPENSHELL_PODMAN_TLS_CA" not in spec - assert "ExecStart=/usr/bin/openshell-gateway" in spec - assert "--config" not in spec - assert "--db-url" not in spec - - -def test_deb_user_service_uses_gateway_defaults_without_config_helper() -> None: - repo_root = Path(__file__).resolve().parents[2] - unit = (repo_root / "deploy/deb/openshell-gateway.service").read_text( - encoding="utf-8" - ) - - assert "EnvironmentFile=-%E/openshell/gateway.env" in unit - assert "openshell-gateway generate-certs --output-dir %S/openshell/tls" in unit - assert "init-gateway-config.sh" not in unit - assert "ExecStart=/usr/bin/openshell-gateway" in unit - assert "--config" not in unit - assert "--db-url" not in unit diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 83ac5d4a6..028536e9e 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -65,7 +65,7 @@ version = 1 # optional; reserved for future schema migratio # ────────────────────────────────────────────────────────────────────────────── [openshell.gateway] # Listener -bind_address = "127.0.0.1:17670" # default: 127.0.0.1:17670 (loopback) +bind_address = "127.0.0.1:8080" # default: 127.0.0.1:8080 (loopback) health_bind_address = "0.0.0.0:8081" # optional; omit to disable metrics_bind_address = "0.0.0.0:9090" # optional; omit to disable extra_bind_addresses = [] # additional listeners (driver callbacks, etc.) 
diff --git a/snapcraft.yaml b/snapcraft.yaml index 613437ae6..5f27ead1a 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -41,6 +41,12 @@ apps: daemon: simple refresh-mode: endure environment: + OPENSHELL_BIND_ADDRESS: 127.0.0.1 + OPENSHELL_SERVER_PORT: 17670 + OPENSHELL_DB_URL: "sqlite:$SNAP_COMMON/gateway.db?mode=rwc" + OPENSHELL_DISABLE_TLS: "true" + OPENSHELL_DRIVERS: docker + OPENSHELL_GATEWAY_CONFIG: "$SNAP_COMMON/gateway.toml" XDG_DATA_HOME: "$SNAP_COMMON" XDG_RUNTIME_DIR: "$SNAP_COMMON" plugs: diff --git a/tasks/ci.toml b/tasks/ci.toml index b9bfe0d27..791c4b3b0 100644 --- a/tasks/ci.toml +++ b/tasks/ci.toml @@ -51,7 +51,7 @@ hide = true [lint] description = "Run repository lint checks" -depends = ["license:check", "rust:format:check", "rust:lint", "python:format:check", "python:lint", "helm:lint", "helm:docs:check", "markdown:lint"] +depends = ["license:check", "rust:format:check", "rust:lint", "python:format:check", "python:lint", "helm:lint", "markdown:lint"] hide = true [ci] diff --git a/tasks/helm.toml b/tasks/helm.toml index 31788088a..af9c2b9e5 100644 --- a/tasks/helm.toml +++ b/tasks/helm.toml @@ -3,25 +3,6 @@ # Helm chart tasks -["helm:docs"] -description = "Generate the openshell Helm chart README from Chart.yaml, values.yaml, and README.md.gotmpl" -run = "helm-docs --chart-search-root deploy/helm/openshell" - -["helm:docs:check"] -description = "Verify the openshell Helm chart README is generated and up to date" -run = """ - set -e - tmp="$(mktemp)" - trap 'rm -f "$tmp"' EXIT - - helm-docs --chart-search-root deploy/helm/openshell --dry-run > "$tmp" - if ! diff -u deploy/helm/openshell/README.md "$tmp"; then - echo "Helm chart README is out of sync. 
Run: mise run helm:docs" >&2 - exit 1 - fi -""" -hide = true - ["helm:lint"] description = "Lint the openshell Helm chart (defaults + all CI configuration variants)" run = """ @@ -69,11 +50,10 @@ dir = "deploy/helm/openshell" run = "skaffold diagnose" hide = true -# Local k3s via k3d (Docker required). macOS gets k3d from mise; Linux users -# should install k3d explicitly or point workflows at an existing/kind cluster. +# Local k3s via k3d (Docker required). On macOS this is the supported path for a lightweight cluster to pair with helm:skaffold:* . ["helm:k3s:create"] -description = "Create a local k3s cluster with k3d and merge kubeconfig (macOS via mise; Linux requires explicit k3d install; use with helm:skaffold:dev)" +description = "Create a local k3s cluster with k3d and merge kubeconfig (macOS/Linux + Docker; use with helm:skaffold:dev)" run = "tasks/scripts/helm-k3s-local.sh create" ["helm:k3s:delete"] diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index fd73d38c8..d4f802c0f 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -3,8 +3,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Local k3s for Helm / Skaffold workflows using k3d. macOS gets k3d from mise; -# Linux users should install k3d explicitly or point tests at a kind/existing cluster. +# Local k3s for Helm / Skaffold workflows using k3d (macOS primary; Linux also supported). # Requires Docker running. Writes merged kubeconfig to HELM_K3S_KUBECONFIG or $KUBECONFIG or ./kubeconfig. # # Multi-worktree: the cluster name is derived from the last component of the current @@ -53,8 +52,7 @@ Environment: HELM_K3S_KUBECONFIG kubeconfig file to write/merge (default: repo kubeconfig or \$KUBECONFIG) HELM_K3S_LB_HOST_PORT Host port mapped to load balancer port 80 (default: 8080) -macOS uses k3d from mise (Docker required). 
Linux can use this flow only when -k3d is installed explicitly; otherwise use kind or an existing cluster context. +macOS uses k3d (Docker required). Linux uses the same k3d flow when Docker is available. Pair with: mise run helm:skaffold:dev EOF } @@ -82,12 +80,7 @@ require_docker() { require_k3d() { if ! command -v k3d >/dev/null 2>&1; then - if [[ "$(uname -s)" == "Linux" ]]; then - echo "error: k3d not found. This repo no longer installs k3d through mise on Linux." >&2 - echo "Install k3d explicitly, or use kind/an existing cluster and set OPENSHELL_E2E_KUBE_CONTEXT." >&2 - else - echo "error: k3d not found. Run: mise install" >&2 - fi + echo "error: k3d not found. Run: mise install" >&2 exit 1 fi } diff --git a/tasks/scripts/package-deb.sh b/tasks/scripts/package-deb.sh index 9d7e3d328..5705e3385 100755 --- a/tasks/scripts/package-deb.sh +++ b/tasks/scripts/package-deb.sh @@ -115,6 +115,8 @@ stage_binary "$OPENSHELL_DRIVER_VM_BINARY" "$pkgroot/usr/libexec/openshell/opens # Per-user systemd unit. Each user enables it via `systemctl --user`. 
install -D -m 0644 "$src_dir/openshell-gateway.service" \ "$pkgroot/usr/lib/systemd/user/openshell-gateway.service" +install -D -m 0755 "$src_dir/init-gateway-config.sh" \ + "$pkgroot/usr/libexec/openshell/init-gateway-config.sh" # --------------------------------------------------------------------------- # DEBIAN/ control directory diff --git a/tasks/scripts/release.py b/tasks/scripts/release.py index 79cb7ab73..df61e0907 100644 --- a/tasks/scripts/release.py +++ b/tasks/scripts/release.py @@ -224,6 +224,11 @@ def _asset_url(release_tag: str, filename: str) -> str: return f"{GITHUB_RELEASE_DOWNLOADS}/{release_tag}/{filename}" +def _homebrew_supervisor_image(release_tag: str) -> str: + image_tag = "dev" if release_tag == "dev" else release_tag.removeprefix("v") + return f"ghcr.io/nvidia/openshell/supervisor:{image_tag}" + + def render_homebrew_formula( *, release_tag: str, @@ -235,6 +240,7 @@ def render_homebrew_formula( raise ValueError(f"release tag contains unsupported characters: {release_tag}") version = release_tag.removeprefix("v") + docker_supervisor_image = _homebrew_supervisor_image(release_tag) return f"""# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -283,37 +289,50 @@ def install exit 1 fi - xdg_config_home="${{XDG_CONFIG_HOME:-${{HOME}}/.config}}" - xdg_gateway_env="${{xdg_config_home}}/openshell/gateway.env" - prefix_gateway_env="#{{var}}/openshell/gateway.env" - if [ -f "${{xdg_gateway_env}}" ]; then - set -a - . "${{xdg_gateway_env}}" - set +a - elif [ -f "${{prefix_gateway_env}}" ]; then + gateway_env="#{{var}}/openshell/gateway.env" + if [ -f "${{gateway_env}}" ]; then set -a - . "${{prefix_gateway_env}}" + . 
"${{gateway_env}}" set +a fi - docker_tls_dir="${{HOME}}/.local/state/openshell/homebrew/tls" - mkdir -p "${{docker_tls_dir}}/server" + docker_tls_dir="${{OPENSHELL_DOCKER_TLS_DIR:-${{HOME}}/.local/state/openshell/homebrew/tls}}" mkdir -p "${{docker_tls_dir}}/client" - chmod 700 "${{docker_tls_dir}}" "${{docker_tls_dir}}/server" "${{docker_tls_dir}}/client" + chmod 700 "${{docker_tls_dir}}" "${{docker_tls_dir}}/client" /usr/bin/install -m 0644 "#{{var}}/openshell/tls/ca.crt" "${{docker_tls_dir}}/ca.crt" - /usr/bin/install -m 0644 "#{{var}}/openshell/tls/server/tls.crt" "${{docker_tls_dir}}/server/tls.crt" - /usr/bin/install -m 0600 "#{{var}}/openshell/tls/server/tls.key" "${{docker_tls_dir}}/server/tls.key" /usr/bin/install -m 0644 "#{{var}}/openshell/tls/client/tls.crt" "${{docker_tls_dir}}/client/tls.crt" /usr/bin/install -m 0600 "#{{var}}/openshell/tls/client/tls.key" "${{docker_tls_dir}}/client/tls.key" - export OPENSHELL_LOCAL_TLS_DIR="${{OPENSHELL_LOCAL_TLS_DIR:-${{docker_tls_dir}}}}" - xdg_gateway_config="${{xdg_config_home}}/openshell/gateway.toml" - prefix_gateway_config="#{{var}}/openshell/gateway.toml" - - if [ -z "${{OPENSHELL_GATEWAY_CONFIG:-}}" ] && [ ! -f "${{xdg_gateway_config}}" ] && [ -f "${{prefix_gateway_config}}" ]; then - exec "#{{opt_bin}}/openshell-gateway" --config "${{prefix_gateway_config}}" + gateway_config="${{OPENSHELL_GATEWAY_CONFIG:-#{{var}}/openshell/gateway.toml}}" + if [ ! 
-f "${{gateway_config}}" ]; then + mkdir -p "$(dirname "${{gateway_config}}")" "#{{var}}/openshell/vm-driver" + cat > "${{gateway_config}}" < @@ -342,6 +377,15 @@ def post_install service do run opt_libexec/"openshell-gateway-homebrew-service" + environment_variables( + OPENSHELL_BIND_ADDRESS: "127.0.0.1", + OPENSHELL_SERVER_PORT: "{LOCAL_GATEWAY_PORT}", + OPENSHELL_TLS_CERT: "#{{var}}/openshell/tls/server/tls.crt", + OPENSHELL_TLS_KEY: "#{{var}}/openshell/tls/server/tls.key", + OPENSHELL_TLS_CLIENT_CA: "#{{var}}/openshell/tls/ca.crt", + OPENSHELL_DB_URL: "sqlite:#{{var}}/openshell/gateway/openshell.db", + OPENSHELL_GATEWAY_CONFIG: "#{{var}}/openshell/gateway.toml", + ) keep_alive successful_exit: false log_path var/"log/openshell/openshell-gateway.out.log" error_log_path var/"log/openshell/openshell-gateway.err.log" diff --git a/tasks/test.toml b/tasks/test.toml index c6ac82180..91d2c44f6 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -67,7 +67,7 @@ env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_ run = "e2e/rust/e2e-podman.sh" ["e2e:kubernetes"] -description = "Run Rust CLI e2e tests against an OpenShell gateway deployed on Kubernetes via Helm (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster; otherwise creates a local k3d cluster when k3d is installed; set OPENSHELL_E2E_KUBE_TEST= to scope to one test)" +description = "Run Rust CLI e2e tests against an OpenShell gateway deployed on Kubernetes via Helm (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster; set OPENSHELL_E2E_KUBE_TEST= to scope to one test)" run = "e2e/rust/e2e-kubernetes.sh" ["e2e:vm"] From ed93630e205dd0d1d7bfeaae4d5050406bd61635 Mon Sep 17 00:00:00 2001 From: Kirit93 Date: Tue, 19 May 2026 09:24:41 -0700 Subject: [PATCH 2/5] Updated telemetry implementation to be non blocking --- crates/openshell-core/src/telemetry.rs | 67 +++++++++++++++++-- .../src/activity_aggregator.rs | 57 ++++++++++++++-- 
.../openshell-sandbox/src/bypass_monitor.rs | 9 +-- crates/openshell-sandbox/src/l7/relay.rs | 7 +- crates/openshell-sandbox/src/lib.rs | 5 +- crates/openshell-sandbox/src/proxy.rs | 24 +++---- 6 files changed, 131 insertions(+), 38 deletions(-) diff --git a/crates/openshell-core/src/telemetry.rs b/crates/openshell-core/src/telemetry.rs index 0a1745d99..796865af1 100644 --- a/crates/openshell-core/src/telemetry.rs +++ b/crates/openshell-core/src/telemetry.rs @@ -6,9 +6,11 @@ use chrono::{SecondsFormat, Utc}; use reqwest::blocking::Client; use serde_json::{Value, json}; +use std::sync::{OnceLock, mpsc}; use std::thread; use std::time::Duration; +const TELEMETRY_EVENT_QUEUE_CAPACITY: usize = 1024; const CLIENT_ID: &str = "415437562476676"; const DEFAULT_ENDPOINT: &str = "https://events.telemetry.data-uat.nvidia.com/v1.1/events/json"; const EVENT_SCHEMA_VERSION: &str = "2.0"; @@ -16,6 +18,14 @@ const EVENT_PROTOCOL_VERSION: &str = "1.6"; const EVENT_SYSTEM_VERSION: &str = "openshell-telemetry/1.0"; const HTTP_TIMEOUT: Duration = Duration::from_secs(5); const SOURCE: &str = "openshell"; +static TELEMETRY_SENDER: OnceLock>> = OnceLock::new(); + +#[derive(Debug)] +struct TelemetryEvent { + endpoint: String, + name: &'static str, + event: Value, +} fn telemetry_enabled() -> bool { telemetry_enabled_from(std::env::var("OPENSHELL_TELEMETRY_ENABLED").ok().as_deref()) @@ -93,6 +103,26 @@ fn build_payload(name: &str, event: Value, ts: &str) -> Value { }) } +fn telemetry_sender() -> Option<&'static mpsc::SyncSender> { + TELEMETRY_SENDER + .get_or_init(|| { + let (tx, rx) = mpsc::sync_channel(TELEMETRY_EVENT_QUEUE_CAPACITY); + thread::Builder::new() + .name("openshell-telemetry".to_string()) + .spawn(move || telemetry_worker(rx)) + .ok() + .map(|_| tx) + }) + .as_ref() +} + +fn telemetry_worker(rx: mpsc::Receiver) { + for event in rx { + let payload = build_payload(event.name, event.event, ×tamp()); + let _ = publish_payload(&event.endpoint, payload); + } +} + fn 
publish_payload(endpoint: &str, payload: Value) -> Result<(), reqwest::Error> { Client::builder() .use_rustls_tls() @@ -106,6 +136,10 @@ fn publish_payload(endpoint: &str, payload: Value) -> Result<(), reqwest::Error> Ok(()) } +fn try_enqueue_event(sender: &mpsc::SyncSender, event: TelemetryEvent) -> bool { + sender.try_send(event).is_ok() +} + fn emit_event(name: &'static str, event: Value) { if !telemetry_enabled() { return; @@ -113,11 +147,18 @@ fn emit_event(name: &'static str, event: Value) { let Some(endpoint) = telemetry_endpoint() else { return; }; + let Some(sender) = telemetry_sender() else { + return; + }; - thread::spawn(move || { - let payload = build_payload(name, event, ×tamp()); - let _ = publish_payload(&endpoint, payload); - }); + let _ = try_enqueue_event( + sender, + TelemetryEvent { + endpoint, + name, + event, + }, + ); } pub fn emit_lifecycle(resource: &str, operation: &str, outcome: &str) { @@ -263,4 +304,22 @@ mod tests { assert_eq!(payload["events"][0]["parameters"]["nvidiaSource"], SOURCE); assert_eq!(payload["events"][0]["ts"], "2026-05-18T00:00:00.000Z"); } + + #[test] + fn telemetry_enqueue_drops_when_queue_is_full() { + let (tx, _rx) = mpsc::sync_channel(1); + let event = || TelemetryEvent { + endpoint: "https://example.test/events".to_string(), + name: "openshell_lifecycle_event", + event: json!({ + "nvidiaSource": SOURCE, + "resource": "sandbox", + "operation": "create", + "outcome": "success", + }), + }; + + assert!(try_enqueue_event(&tx, event())); + assert!(!try_enqueue_event(&tx, event())); + } } diff --git a/crates/openshell-sandbox/src/activity_aggregator.rs b/crates/openshell-sandbox/src/activity_aggregator.rs index 335de51a6..692c1d3b6 100644 --- a/crates/openshell-sandbox/src/activity_aggregator.rs +++ b/crates/openshell-sandbox/src/activity_aggregator.rs @@ -8,12 +8,17 @@ use std::future::Future; use tokio::sync::mpsc; use tracing::debug; +pub const ACTIVITY_EVENT_QUEUE_CAPACITY: usize = 1024; +const 
ACTIVITY_FLUSH_QUEUE_CAPACITY: usize = 1; + #[derive(Debug, Clone)] pub struct ActivityEvent { pub denied: bool, pub deny_group: &'static str, } +pub type ActivitySender = mpsc::Sender; + #[derive(Debug, Clone, PartialEq, Eq)] pub struct FlushableActivitySummary { pub network_activity_count: u32, @@ -22,7 +27,7 @@ pub struct FlushableActivitySummary { } pub struct ActivityAggregator { - rx: mpsc::UnboundedReceiver, + rx: mpsc::Receiver, network_activity_count: u32, denied_action_count: u32, denials_by_group: HashMap, @@ -30,7 +35,7 @@ pub struct ActivityAggregator { } impl ActivityAggregator { - pub fn new(rx: mpsc::UnboundedReceiver, flush_interval_secs: u64) -> Self { + pub fn new(rx: mpsc::Receiver, flush_interval_secs: u64) -> Self { Self { rx, network_activity_count: 0, @@ -42,9 +47,17 @@ impl ActivityAggregator { pub async fn run(mut self, flush_callback: F) where - F: Fn(FlushableActivitySummary) -> Fut, - Fut: Future, + F: Fn(FlushableActivitySummary) -> Fut + Send + 'static, + Fut: Future + Send + 'static, { + let (flush_tx, mut flush_rx) = + mpsc::channel::(ACTIVITY_FLUSH_QUEUE_CAPACITY); + tokio::spawn(async move { + while let Some(summary) = flush_rx.recv().await { + flush_callback(summary).await; + } + }); + let mut flush_interval = tokio::time::interval(std::time::Duration::from_secs(self.flush_interval_secs)); flush_interval.tick().await; @@ -56,7 +69,7 @@ impl ActivityAggregator { self.ingest(event); } else { if let Some(summary) = self.drain() { - flush_callback(summary).await; + queue_flush_summary(&flush_tx, summary); } debug!("ActivityAggregator: channel closed, exiting"); return; @@ -69,7 +82,7 @@ impl ActivityAggregator { denied = summary.denied_action_count, "ActivityAggregator: flushing anonymous activity summary" ); - flush_callback(summary).await; + queue_flush_summary(&flush_tx, summary); } } } @@ -103,6 +116,17 @@ impl ActivityAggregator { } } +pub fn try_record_activity(tx: &ActivitySender, denied: bool, deny_group: &'static str) -> 
bool { + tx.try_send(ActivityEvent { denied, deny_group }).is_ok() +} + +fn queue_flush_summary( + tx: &mpsc::Sender, + summary: FlushableActivitySummary, +) -> bool { + tx.try_send(summary).is_ok() +} + pub fn sanitize_deny_group(raw: &str) -> &'static str { match raw { "connect_policy" | "connect" | "l4_deny" => "connect_policy", @@ -150,4 +174,25 @@ mod tests { assert_float_eq(denial_rate_pct(4, 1), 25.0); assert_float_eq(denial_rate_pct(4, 10), 100.0); } + + #[test] + fn activity_send_drops_when_queue_is_full() { + let (tx, _rx) = mpsc::channel(1); + + assert!(try_record_activity(&tx, false, "unknown")); + assert!(!try_record_activity(&tx, true, "connect_policy")); + } + + #[test] + fn flush_summary_drops_when_queue_is_full() { + let (tx, _rx) = mpsc::channel(1); + let summary = FlushableActivitySummary { + network_activity_count: 1, + denied_action_count: 0, + denials_by_group: Vec::new(), + }; + + assert!(queue_flush_summary(&tx, summary.clone())); + assert!(!queue_flush_summary(&tx, summary)); + } } diff --git a/crates/openshell-sandbox/src/bypass_monitor.rs b/crates/openshell-sandbox/src/bypass_monitor.rs index bc945c01a..7f1c6c527 100644 --- a/crates/openshell-sandbox/src/bypass_monitor.rs +++ b/crates/openshell-sandbox/src/bypass_monitor.rs @@ -16,7 +16,7 @@ //! the monitor logs a one-time warning and returns. The iptables REJECT rules //! still provide fast-fail UX — the monitor only adds diagnostic visibility. 
-use crate::activity_aggregator::ActivityEvent; +use crate::activity_aggregator::{ActivitySender, try_record_activity}; use crate::denial_aggregator::DenialEvent; use openshell_ocsf::{ ActionId, ActivityId, ConfidenceId, DetectionFindingBuilder, DispositionId, Endpoint, @@ -119,7 +119,7 @@ pub fn spawn( namespace_name: String, entrypoint_pid: Arc, denial_tx: Option>, - activity_tx: Option>, + activity_tx: Option, ) -> Option> { use std::io::BufRead; use std::process::{Command, Stdio}; @@ -280,10 +280,7 @@ pub fn spawn( }); } if let Some(ref tx) = activity_tx { - let _ = tx.send(ActivityEvent { - denied: true, - deny_group: "bypass", - }); + let _ = try_record_activity(tx, true, "bypass"); } } diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 32283f538..9efa7ca9f 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -7,7 +7,7 @@ //! Parses each request within the tunnel, evaluates it against OPA policy, //! and either forwards or denies the request. -use crate::activity_aggregator::ActivityEvent; +use crate::activity_aggregator::{ActivitySender, try_record_activity}; use crate::l7::provider::{L7Provider, RelayOutcome}; use crate::l7::rest::WebSocketExtensionMode; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; @@ -20,7 +20,6 @@ use openshell_ocsf::{ }; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; -use tokio::sync::mpsc; use tracing::{debug, warn}; /// Context for L7 request policy evaluation. @@ -40,7 +39,7 @@ pub struct L7EvalContext { /// Supervisor-only placeholder resolver for outbound headers. pub(crate) secret_resolver: Option>, /// Anonymous activity counter channel. 
- pub(crate) activity_tx: Option>, + pub(crate) activity_tx: Option, } #[derive(Default)] @@ -458,7 +457,7 @@ fn emit_l7_request_log( fn emit_activity(ctx: &L7EvalContext, denied: bool, deny_group: &'static str) { if let Some(tx) = &ctx.activity_tx { - let _ = tx.send(ActivityEvent { denied, deny_group }); + let _ = try_record_activity(tx, denied, deny_group); } } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index fe21ec096..1a0ff0258 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -606,7 +606,8 @@ pub async fn run_sandbox( (None, None, None) }; let (activity_tx, activity_rx, bypass_activity_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let (tx, rx) = + tokio::sync::mpsc::channel(activity_aggregator::ACTIVITY_EVENT_QUEUE_CAPACITY); let bypass_tx = tx.clone(); (Some(tx), Some(rx), Some(bypass_tx)) } else { @@ -1016,7 +1017,7 @@ pub async fn run_sandbox( tokio::spawn(async move { aggregator - .run(|summary| { + .run(move |summary| { let endpoint = agg_endpoint.clone(); let sandbox_name = agg_name.clone(); async move { diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 1845218a0..7897282af 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -3,7 +3,7 @@ //! HTTP CONNECT proxy with OPA policy evaluation and process-identity binding. -use crate::activity_aggregator::ActivityEvent; +use crate::activity_aggregator::{ActivitySender, try_record_activity}; use crate::denial_aggregator::DenialEvent; use crate::identity::BinaryIdentityCache; use crate::l7::tls::ProxyTlsState; @@ -179,7 +179,7 @@ impl ProxyHandle { provider_credentials: Option, policy_local_ctx: Option>, denial_tx: Option>, - activity_tx: Option>, + activity_tx: Option, ) -> Result { // Use override bind_addr, fall back to policy http_addr, then default // to loopback:3128. 
The default allows the proxy to function when no @@ -294,23 +294,15 @@ impl Drop for ProxyHandle { } } -fn emit_activity( - tx: &Option>, - denied: bool, - deny_group: &'static str, -) { +fn emit_activity(tx: &Option, denied: bool, deny_group: &'static str) { if let Some(tx) = tx { - let _ = tx.send(ActivityEvent { denied, deny_group }); + let _ = try_record_activity(tx, denied, deny_group); } } -fn emit_activity_simple( - tx: Option<&mpsc::UnboundedSender>, - denied: bool, - deny_group: &'static str, -) { +fn emit_activity_simple(tx: Option<&ActivitySender>, denied: bool, deny_group: &'static str) { if let Some(tx) = tx { - let _ = tx.send(ActivityEvent { denied, deny_group }); + let _ = try_record_activity(tx, denied, deny_group); } } @@ -387,7 +379,7 @@ async fn handle_tcp_connection( trusted_host_gateway: Arc>, secret_resolver: Option>, denial_tx: Option>, - activity_tx: Option>, + activity_tx: Option, ) -> Result<()> { let mut buf = vec![0u8; MAX_HEADER_BYTES]; let mut used = 0usize; @@ -2725,7 +2717,7 @@ async fn handle_forward_proxy( trusted_host_gateway: Arc>, secret_resolver: Option>, denial_tx: Option<&mpsc::UnboundedSender>, - activity_tx: Option<&mpsc::UnboundedSender>, + activity_tx: Option<&ActivitySender>, ) -> Result<()> { // 1. Parse the absolute-form URI. 
`path` is marked `mut` so that, when an // L7 config applies, the canonicalized form produced below replaces it From 5df47de16a448ebaf7431eb0235b3c1300c5e4a0 Mon Sep 17 00:00:00 2001 From: Kirit93 Date: Tue, 19 May 2026 09:51:57 -0700 Subject: [PATCH 3/5] Fixed agent feedback --- crates/openshell-core/src/telemetry.rs | 209 ++++++++++++++++-- .../src/activity_aggregator.rs | 25 +++ crates/openshell-sandbox/src/lib.rs | 62 +++++- crates/openshell-sandbox/src/proxy.rs | 55 ++++- crates/openshell-server/src/grpc/policy.rs | 67 ++++++ crates/openshell-server/src/grpc/provider.rs | 84 +++++-- crates/openshell-server/src/telemetry.rs | 165 ++------------ 7 files changed, 456 insertions(+), 211 deletions(-) diff --git a/crates/openshell-core/src/telemetry.rs b/crates/openshell-core/src/telemetry.rs index 796865af1..96a5665cf 100644 --- a/crates/openshell-core/src/telemetry.rs +++ b/crates/openshell-core/src/telemetry.rs @@ -6,11 +6,13 @@ use chrono::{SecondsFormat, Utc}; use reqwest::blocking::Client; use serde_json::{Value, json}; +use std::collections::BTreeMap; use std::sync::{OnceLock, mpsc}; use std::thread; use std::time::Duration; const TELEMETRY_EVENT_QUEUE_CAPACITY: usize = 1024; +const MAX_TELEMETRY_INTEGER: u64 = 9_223_372_036_854_775_807; const CLIENT_ID: &str = "415437562476676"; const DEFAULT_ENDPOINT: &str = "https://events.telemetry.data-uat.nvidia.com/v1.1/events/json"; const EVENT_SCHEMA_VERSION: &str = "2.0"; @@ -24,10 +26,11 @@ static TELEMETRY_SENDER: OnceLock>> = On struct TelemetryEvent { endpoint: String, name: &'static str, + event_ts: String, event: Value, } -fn telemetry_enabled() -> bool { +pub fn enabled() -> bool { telemetry_enabled_from(std::env::var("OPENSHELL_TELEMETRY_ENABLED").ok().as_deref()) } @@ -61,7 +64,7 @@ fn timestamp() -> String { Utc::now().to_rfc3339_opts(SecondsFormat::Millis, true) } -fn build_payload(name: &str, event: Value, ts: &str) -> Value { +fn build_payload(name: &str, event: Value, event_ts: &str, sent_ts: &str) -> 
Value { json!({ "browserType": "undefined", "clientId": CLIENT_ID, @@ -90,12 +93,12 @@ fn build_payload(name: &str, event: Value, ts: &str) -> Value { "integrationId": "undefined", "productName": "undefined", "productVersion": "undefined", - "sentTs": ts, + "sentTs": sent_ts, "sessionId": "undefined", "userId": "undefined", "events": [ { - "ts": ts, + "ts": event_ts, "parameters": event, "name": name, } @@ -118,7 +121,7 @@ fn telemetry_sender() -> Option<&'static mpsc::SyncSender> { fn telemetry_worker(rx: mpsc::Receiver) { for event in rx { - let payload = build_payload(event.name, event.event, ×tamp()); + let payload = build_payload(event.name, event.event, &event.event_ts, ×tamp()); let _ = publish_payload(&event.endpoint, payload); } } @@ -141,7 +144,7 @@ fn try_enqueue_event(sender: &mpsc::SyncSender, event: Telemetry } fn emit_event(name: &'static str, event: Value) { - if !telemetry_enabled() { + if !enabled() { return; } let Some(endpoint) = telemetry_endpoint() else { @@ -156,12 +159,22 @@ fn emit_event(name: &'static str, event: Value) { TelemetryEvent { endpoint, name, + event_ts: timestamp(), event, }, ); } pub fn emit_lifecycle(resource: &str, operation: &str, outcome: &str) { + let Some(resource) = lifecycle_resource(resource) else { + return; + }; + let Some(operation) = lifecycle_operation(operation) else { + return; + }; + let Some(outcome) = telemetry_outcome(outcome) else { + return; + }; emit_event( "openshell_lifecycle_event", json!({ @@ -174,6 +187,13 @@ pub fn emit_lifecycle(resource: &str, operation: &str, outcome: &str) { } pub fn emit_provider_lifecycle(operation: &str, outcome: &str, provider_profile: &str) { + let Some(operation) = lifecycle_operation(operation) else { + return; + }; + let Some(outcome) = telemetry_outcome(outcome) else { + return; + }; + let provider_profile = provider_profile_bucket(provider_profile); emit_event( "openshell_provider_lifecycle_event", json!({ @@ -192,6 +212,13 @@ pub fn emit_sandbox_create( 
has_custom_policy: bool, template_source: &str, ) { + let Some(outcome) = telemetry_outcome(outcome) else { + return; + }; + if !valid_count(provider_count) { + return; + } + let template_source = sandbox_template_source_bucket(template_source); emit_event( "openshell_sandbox_create_event", json!({ @@ -206,6 +233,15 @@ pub fn emit_sandbox_create( } pub fn emit_policy_decision(operation: &str, outcome: &str, rule_count: u64) { + let Some(operation) = policy_decision_operation(operation) else { + return; + }; + let Some(outcome) = telemetry_outcome(outcome) else { + return; + }; + if !valid_count(rule_count) { + return; + } emit_event( "openshell_policy_decision_event", json!({ @@ -226,21 +262,20 @@ pub fn emit_sandbox_activity_summary( I: IntoIterator, S: Into, { - let mut rows: Vec = denials_by_group + if !valid_count(network_activity_count) + || !valid_count(denied_action_count) + || !denial_rate_pct.is_finite() + || !(0.0..=100.0).contains(&denial_rate_pct) + { + return; + } + let Some(denials_by_group) = sanitize_denials_by_group(denials_by_group) else { + return; + }; + let rows: Vec = denials_by_group .into_iter() - .map(|(group, count)| { - json!({ - "denyGroup": group.into(), - "deniedCount": count, - }) - }) + .map(|(group, count)| json!({ "denyGroup": group, "deniedCount": count })) .collect(); - rows.sort_by(|left, right| { - left["denyGroup"] - .as_str() - .unwrap_or_default() - .cmp(right["denyGroup"].as_str().unwrap_or_default()) - }); emit_event( "openshell_sandbox_activity_summary_event", json!({ @@ -253,6 +288,107 @@ pub fn emit_sandbox_activity_summary( ); } +fn valid_count(value: u64) -> bool { + value <= MAX_TELEMETRY_INTEGER +} + +fn telemetry_outcome(raw: &str) -> Option<&'static str> { + match raw { + "success" => Some("success"), + "failure" => Some("failure"), + _ => None, + } +} + +fn lifecycle_resource(raw: &str) -> Option<&'static str> { + match raw { + "sandbox" => Some("sandbox"), + "sandbox_policy" => Some("sandbox_policy"), + _ => 
None, + } +} + +fn lifecycle_operation(raw: &str) -> Option<&'static str> { + match raw { + "create" => Some("create"), + "delete" => Some("delete"), + "update" => Some("update"), + _ => None, + } +} + +fn policy_decision_operation(raw: &str) -> Option<&'static str> { + match raw { + "approve" => Some("approve"), + "reject" => Some("reject"), + "undo" => Some("undo"), + "approve_all" => Some("approve_all"), + _ => None, + } +} + +fn sandbox_template_source_bucket(raw: &str) -> &'static str { + match raw { + "default" => "default", + "image" => "image", + _ => "undefined", + } +} + +fn provider_profile_bucket(raw: &str) -> &'static str { + match raw.trim().to_ascii_lowercase().as_str() { + "anthropic" => "anthropic", + "claude" => "claude", + "codex" => "codex", + "copilot" => "copilot", + "github" => "github", + "gitlab" => "gitlab", + "nvidia" => "nvidia", + "openai" => "openai", + "opencode" => "opencode", + "outlook" => "outlook", + _ => "custom", + } +} + +fn deny_group_bucket(raw: &str) -> &'static str { + match raw { + "connect_policy" | "connect" | "l4_deny" => "connect_policy", + "forward_policy" | "forward" => "forward_policy", + "l7_policy" | "l7" | "l7_deny" | "forward-l7-deny" => "l7_policy", + "l7_parse_rejection" | "parse_rejection" => "l7_parse_rejection", + "ssrf" => "ssrf", + "bypass" => "bypass", + "policy_stale" => "policy_stale", + _ => "unknown", + } +} + +fn sanitize_denials_by_group(denials_by_group: I) -> Option> +where + I: IntoIterator, + S: Into, +{ + let mut sanitized = BTreeMap::<&'static str, u64>::new(); + for (group, count) in denials_by_group { + if !valid_count(count) { + return None; + } + let group = group.into(); + let bucket = deny_group_bucket(&group); + let next_count = sanitized + .get(bucket) + .copied() + .unwrap_or(0) + .checked_add(count)?; + if !valid_count(next_count) { + return None; + } + sanitized.insert(bucket, next_count); + } + Some(sanitized) +} + #[cfg(test)] mod tests { use super::*; @@ -291,6 +427,7 @@ mod 
tests { "templateSource": "default", }), "2026-05-18T00:00:00.000Z", + "2026-05-18T00:00:01.000Z", ); assert_eq!(payload["clientId"], CLIENT_ID); @@ -303,6 +440,7 @@ mod tests { ); assert_eq!(payload["events"][0]["parameters"]["nvidiaSource"], SOURCE); assert_eq!(payload["events"][0]["ts"], "2026-05-18T00:00:00.000Z"); + assert_eq!(payload["sentTs"], "2026-05-18T00:00:01.000Z"); } #[test] @@ -311,6 +449,7 @@ mod tests { let event = || TelemetryEvent { endpoint: "https://example.test/events".to_string(), name: "openshell_lifecycle_event", + event_ts: "2026-05-18T00:00:00.000Z".to_string(), event: json!({ "nvidiaSource": SOURCE, "resource": "sandbox", @@ -322,4 +461,36 @@ mod tests { assert!(try_enqueue_event(&tx, event())); assert!(!try_enqueue_event(&tx, event())); } + + #[test] + fn telemetry_validation_maps_privacy_sensitive_strings_to_safe_buckets() { + assert_eq!(provider_profile_bucket("corp-llm-prod"), "custom"); + assert_eq!( + sandbox_template_source_bucket("ghcr.io/acme/private:latest"), + "undefined" + ); + assert_eq!(deny_group_bucket("host=private.example"), "unknown"); + } + + #[test] + fn telemetry_validation_rejects_schema_invalid_values() { + assert_eq!(lifecycle_resource("gateway"), None); + assert_eq!(lifecycle_operation("restart"), None); + assert_eq!(policy_decision_operation("merge_internal_rule"), None); + assert_eq!(telemetry_outcome("partial"), None); + assert!(!valid_count(MAX_TELEMETRY_INTEGER + 1)); + } + + #[test] + fn activity_groups_are_sanitized_and_aggregated() { + let rows = sanitize_denials_by_group([ + ("connect".to_string(), 1), + ("connect_policy".to_string(), 2), + ("host=private.example".to_string(), 3), + ]) + .expect("rows should sanitize"); + + assert_eq!(rows.get("connect_policy"), Some(&3)); + assert_eq!(rows.get("unknown"), Some(&3)); + } } diff --git a/crates/openshell-sandbox/src/activity_aggregator.rs b/crates/openshell-sandbox/src/activity_aggregator.rs index 692c1d3b6..a653fb7b2 100644 --- 
a/crates/openshell-sandbox/src/activity_aggregator.rs +++ b/crates/openshell-sandbox/src/activity_aggregator.rs @@ -10,6 +10,7 @@ use tracing::debug; pub const ACTIVITY_EVENT_QUEUE_CAPACITY: usize = 1024; const ACTIVITY_FLUSH_QUEUE_CAPACITY: usize = 1; +pub const DEFAULT_ACTIVITY_FLUSH_INTERVAL_SECS: u64 = 10; #[derive(Debug, Clone)] pub struct ActivityEvent { @@ -120,6 +121,13 @@ pub fn try_record_activity(tx: &ActivitySender, denied: bool, deny_group: &'stat tx.try_send(ActivityEvent { denied, deny_group }).is_ok() } +pub fn activity_flush_interval_secs_from_env(value: Option<&str>) -> u64 { + value + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + .unwrap_or(DEFAULT_ACTIVITY_FLUSH_INTERVAL_SECS) +} + fn queue_flush_summary( tx: &mpsc::Sender, summary: FlushableActivitySummary, @@ -195,4 +203,21 @@ mod tests { assert!(queue_flush_summary(&tx, summary.clone())); assert!(!queue_flush_summary(&tx, summary)); } + + #[test] + fn activity_flush_interval_uses_positive_values_only() { + assert_eq!( + activity_flush_interval_secs_from_env(None), + DEFAULT_ACTIVITY_FLUSH_INTERVAL_SECS + ); + assert_eq!( + activity_flush_interval_secs_from_env(Some("not-a-number")), + DEFAULT_ACTIVITY_FLUSH_INTERVAL_SECS + ); + assert_eq!( + activity_flush_interval_secs_from_env(Some("0")), + DEFAULT_ACTIVITY_FLUSH_INTERVAL_SECS + ); + assert_eq!(activity_flush_interval_secs_from_env(Some("5")), 5); + } } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 1a0ff0258..f0bace602 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -238,6 +238,23 @@ fn route_refresh_interval_secs() -> u64 { } } +type ActivityCollectionChannels = ( + Option, + Option>, + Option, +); + +fn activity_collection_channels(sandbox_id: Option<&str>) -> ActivityCollectionChannels { + if sandbox_id.is_some() && openshell_core::telemetry::enabled() { + let (tx, rx) = + 
tokio::sync::mpsc::channel(activity_aggregator::ACTIVITY_EVENT_QUEUE_CAPACITY); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) + } else { + (None, None, None) + } +} + #[cfg(target_os = "linux")] static MANAGED_CHILDREN: LazyLock>> = LazyLock::new(|| Mutex::new(HashSet::new())); @@ -605,14 +622,8 @@ pub async fn run_sandbox( } else { (None, None, None) }; - let (activity_tx, activity_rx, bypass_activity_tx) = if sandbox_id.is_some() { - let (tx, rx) = - tokio::sync::mpsc::channel(activity_aggregator::ACTIVITY_EVENT_QUEUE_CAPACITY); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; + let (activity_tx, activity_rx, bypass_activity_tx) = + activity_collection_channels(sandbox_id.as_deref()); let proxy_handle = ProxyHandle::start_with_bind_addr( proxy_policy, @@ -1009,10 +1020,11 @@ pub async fn run_sandbox( if let Some(rx) = activity_rx { let agg_name = sandbox_name_for_agg.clone().unwrap_or_else(|| id.clone()); let agg_endpoint = endpoint.clone(); - let flush_interval_secs: u64 = std::env::var("OPENSHELL_ACTIVITY_FLUSH_INTERVAL_SECS") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(10); + let flush_interval_secs = activity_aggregator::activity_flush_interval_secs_from_env( + std::env::var("OPENSHELL_ACTIVITY_FLUSH_INTERVAL_SECS") + .ok() + .as_deref(), + ); let aggregator = activity_aggregator::ActivityAggregator::new(rx, flush_interval_secs); tokio::spawn(async move { @@ -3095,6 +3107,32 @@ filesystem_policy: ); } + #[test] + fn telemetry_opt_out_disables_activity_collection_and_flush_channel() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars([("OPENSHELL_TELEMETRY_ENABLED", Some("false"))], || { + let (activity_tx, activity_rx, bypass_activity_tx) = + activity_collection_channels(Some("sb-1")); + + assert!(activity_tx.is_none()); + assert!(activity_rx.is_none()); + assert!(bypass_activity_tx.is_none()); + }); + } + + #[test] + fn 
telemetry_enabled_creates_activity_collection_and_flush_channel() { + let _guard = ENV_LOCK.lock().unwrap(); + with_vars([("OPENSHELL_TELEMETRY_ENABLED", Some("true"))], || { + let (activity_tx, activity_rx, bypass_activity_tx) = + activity_collection_channels(Some("sb-1")); + + assert!(activity_tx.is_some()); + assert!(activity_rx.is_some()); + assert!(bypass_activity_tx.is_some()); + }); + } + #[tokio::test] async fn route_cache_preserves_content_when_not_written() { use std::sync::Arc; diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 7897282af..1d61f9b15 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -300,6 +300,19 @@ fn emit_activity(tx: &Option, denied: bool, deny_group: &'static } } +fn l7_inspection_active(l7_route: Option<&L7RouteSnapshot>) -> bool { + l7_route.is_some_and(|route| !route.configs.is_empty()) +} + +fn emit_connect_activity_if_l4_only( + tx: &Option, + l7_route: Option<&L7RouteSnapshot>, +) { + if !l7_inspection_active(l7_route) { + emit_activity(tx, false, "unknown"); + } +} + fn emit_activity_simple(tx: Option<&ActivitySender>, denied: bool, deny_group: &'static str) { if let Some(tx) = tx { let _ = try_record_activity(tx, denied, deny_group); @@ -805,13 +818,11 @@ async fn handle_tcp_connection( // Check if endpoint has L7 config for protocol-aware inspection, and // retain the generation for HTTP passthrough keep-alive tunnels. let l7_route = query_l7_route_snapshot(&opa_engine, &decision, &host_lc, port); + let should_inspect_l7 = l7_inspection_active(l7_route.as_ref()); // Log the allowed CONNECT — use CONNECT_L7 when L7 inspection follows, // so log consumers can distinguish L4-only decisions from tunnel lifecycle events. 
- let connect_msg = if l7_route - .as_ref() - .is_some_and(|route| !route.configs.is_empty()) - { + let connect_msg = if should_inspect_l7 { "CONNECT_L7" } else { "CONNECT" @@ -834,7 +845,7 @@ async fn handle_tcp_connection( .build(); ocsf_emit!(event); } - emit_activity(&activity_tx, false, "unknown"); + emit_connect_activity_if_l4_only(&activity_tx, l7_route.as_ref()); // Determine effective TLS mode. Check the raw endpoint config for // `tls: skip` independently of L7 config (which requires `protocol`). @@ -3737,6 +3748,40 @@ mod tests { } } + #[test] + fn connect_activity_is_skipped_when_l7_will_count_the_request() { + let (tx, mut rx) = mpsc::channel(4); + let activity_tx = Some(tx); + let l7_route = L7RouteSnapshot { + configs: vec![L7ConfigSnapshot { + config: websocket_l7_config(crate::l7::L7Protocol::Rest, false), + }], + generation: 1, + }; + let l4_route = L7RouteSnapshot { + configs: Vec::new(), + generation: 1, + }; + + emit_connect_activity_if_l4_only(&activity_tx, Some(&l7_route)); + assert!( + rx.try_recv().is_err(), + "L7-inspected CONNECT should not emit an extra L4 activity event" + ); + + emit_connect_activity_if_l4_only(&activity_tx, Some(&l4_route)); + let event = rx.try_recv().expect("L4-only CONNECT should emit activity"); + assert!(!event.denied); + assert_eq!(event.deny_group, "unknown"); + + emit_connect_activity_if_l4_only(&activity_tx, None); + let event = rx + .try_recv() + .expect("CONNECT without an L7 route should emit activity"); + assert!(!event.denied); + assert_eq!(event.deny_group, "unknown"); + } + fn forward_test_guard() -> PolicyGenerationGuard { let policy = include_str!("../data/sandbox-policy.rego"); let policy_data = "network_policies: {}\n"; diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index c5ce4d435..94fbd708f 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -80,6 +80,10 @@ fn 
emit_sandbox_policy_update_success() { openshell_core::telemetry::emit_lifecycle("sandbox_policy", "update", "success"); } +fn emit_sandbox_policy_update_failure() { + openshell_core::telemetry::emit_lifecycle("sandbox_policy", "update", "failure"); +} + fn should_emit_config_update_policy_telemetry(sandbox_caller: bool) -> bool { !sandbox_caller } @@ -104,6 +108,10 @@ fn emit_policy_decision_success(operation: &str, rule_count: u64) { openshell_core::telemetry::emit_policy_decision(operation, "success", rule_count); } +fn emit_policy_decision_failure(operation: &str, rule_count: u64) { + openshell_core::telemetry::emit_policy_decision(operation, "failure", rule_count); +} + fn emit_gateway_policy_audit_log( sandbox_id: &str, sandbox_name: &str, @@ -678,6 +686,21 @@ pub(super) async fn handle_get_sandbox_provider_environment( pub(super) async fn handle_update_config( state: &Arc, request: Request, +) -> Result, Status> { + let sandbox_caller = is_sandbox_caller(&request); + let update = request.get_ref(); + let should_emit_policy_failure = should_emit_config_update_policy_telemetry(sandbox_caller) + && (update.policy.is_some() || !update.merge_operations.is_empty()); + let result = handle_update_config_inner(state, request).await; + if result.is_err() && should_emit_policy_failure { + emit_sandbox_policy_update_failure(); + } + result +} + +async fn handle_update_config_inner( + state: &Arc, + request: Request, ) -> Result, Status> { let sandbox_caller = is_sandbox_caller(&request); let req = request.into_inner(); @@ -1565,6 +1588,17 @@ pub(super) async fn handle_get_draft_policy( pub(super) async fn handle_approve_draft_chunk( state: &Arc, request: Request, +) -> Result, Status> { + let result = handle_approve_draft_chunk_inner(state, request).await; + if result.is_err() { + emit_policy_decision_failure("approve", 1); + } + result +} + +async fn handle_approve_draft_chunk_inner( + state: &Arc, + request: Request, ) -> Result, Status> { let req = 
request.into_inner(); if req.name.is_empty() { @@ -1654,6 +1688,17 @@ pub(super) async fn handle_approve_draft_chunk( pub(super) async fn handle_reject_draft_chunk( state: &Arc, request: Request, +) -> Result, Status> { + let result = handle_reject_draft_chunk_inner(state, request).await; + if result.is_err() { + emit_policy_decision_failure("reject", 1); + } + result +} + +async fn handle_reject_draft_chunk_inner( + state: &Arc, + request: Request, ) -> Result, Status> { let req = request.into_inner(); if req.name.is_empty() { @@ -1740,6 +1785,17 @@ pub(super) async fn handle_reject_draft_chunk( pub(super) async fn handle_approve_all_draft_chunks( state: &Arc, request: Request, +) -> Result, Status> { + let result = handle_approve_all_draft_chunks_inner(state, request).await; + if result.is_err() { + emit_policy_decision_failure("approve_all", 0); + } + result +} + +async fn handle_approve_all_draft_chunks_inner( + state: &Arc, + request: Request, ) -> Result, Status> { let req = request.into_inner(); if req.name.is_empty() { @@ -1911,6 +1967,17 @@ pub(super) async fn handle_edit_draft_chunk( pub(super) async fn handle_undo_draft_chunk( state: &Arc, request: Request, +) -> Result, Status> { + let result = handle_undo_draft_chunk_inner(state, request).await; + if result.is_err() { + emit_policy_decision_failure("undo", 1); + } + result +} + +async fn handle_undo_draft_chunk_inner( + state: &Arc, + request: Request, ) -> Result, Status> { let req = request.into_inner(); if req.name.is_empty() { diff --git a/crates/openshell-server/src/grpc/provider.rs b/crates/openshell-server/src/grpc/provider.rs index a13fa3fe0..49419f607 100644 --- a/crates/openshell-server/src/grpc/provider.rs +++ b/crates/openshell-server/src/grpc/provider.rs @@ -337,15 +337,24 @@ pub(super) async fn handle_create_provider( request: Request, ) -> Result, Status> { let req = request.into_inner(); - let provider = req - .provider - .ok_or_else(|| Status::invalid_argument("provider is 
required"))?; - let provider = create_provider_record(state.store.as_ref(), provider).await?; - emit_provider_lifecycle(&provider.r#type, "create", "success"); - - Ok(Response::new(ProviderResponse { - provider: Some(provider), - })) + let Some(provider) = req.provider else { + emit_provider_lifecycle("custom", "create", "failure"); + return Err(Status::invalid_argument("provider is required")); + }; + let provider_type = provider.r#type.clone(); + let result = create_provider_record(state.store.as_ref(), provider).await; + match result { + Ok(provider) => { + emit_provider_lifecycle(&provider.r#type, "create", "success"); + Ok(Response::new(ProviderResponse { + provider: Some(provider), + })) + } + Err(err) => { + emit_provider_lifecycle(&provider_type, "create", "failure"); + Err(err) + } + } } pub(super) async fn handle_get_provider( @@ -726,15 +735,24 @@ pub(super) async fn handle_update_provider( request: Request, ) -> Result, Status> { let req = request.into_inner(); - let provider = req - .provider - .ok_or_else(|| Status::invalid_argument("provider is required"))?; - let provider = update_provider_record(state.store.as_ref(), provider).await?; - emit_provider_lifecycle(&provider.r#type, "update", "success"); - - Ok(Response::new(ProviderResponse { - provider: Some(provider), - })) + let Some(provider) = req.provider else { + emit_provider_lifecycle("custom", "update", "failure"); + return Err(Status::invalid_argument("provider is required")); + }; + let provider_type = provider.r#type.clone(); + let result = update_provider_record(state.store.as_ref(), provider).await; + match result { + Ok(provider) => { + emit_provider_lifecycle(&provider.r#type, "update", "success"); + Ok(Response::new(ProviderResponse { + provider: Some(provider), + })) + } + Err(err) => { + emit_provider_lifecycle(&provider_type, "update", "failure"); + Err(err) + } + } } pub(super) async fn handle_delete_provider( @@ -743,17 +761,35 @@ pub(super) async fn handle_delete_provider( ) -> 
Result, Status> { let name = request.into_inner().name; let provider_profile = provider_profile_for_name(state.store.as_ref(), &name).await; - let deleted = delete_provider_record(state.store.as_ref(), &name).await?; - if deleted && let Some(provider_profile) = provider_profile { - openshell_core::telemetry::emit_provider_lifecycle("delete", "success", &provider_profile); + let result = delete_provider_record(state.store.as_ref(), &name).await; + match result { + Ok(deleted) => { + let outcome = if deleted { "success" } else { "failure" }; + emit_provider_profile_lifecycle( + provider_profile.as_deref().unwrap_or("custom"), + "delete", + outcome, + ); + Ok(Response::new(DeleteProviderResponse { deleted })) + } + Err(err) => { + emit_provider_profile_lifecycle( + provider_profile.as_deref().unwrap_or("custom"), + "delete", + "failure", + ); + Err(err) + } } - - Ok(Response::new(DeleteProviderResponse { deleted })) } fn emit_provider_lifecycle(provider_type: &str, operation: &str, outcome: &str) { let provider_profile = telemetry_provider_profile(provider_type); - openshell_core::telemetry::emit_provider_lifecycle(operation, outcome, &provider_profile); + emit_provider_profile_lifecycle(&provider_profile, operation, outcome); +} + +fn emit_provider_profile_lifecycle(provider_profile: &str, operation: &str, outcome: &str) { + openshell_core::telemetry::emit_provider_lifecycle(operation, outcome, provider_profile); } async fn provider_profile_for_name(store: &Store, name: &str) -> Option { diff --git a/crates/openshell-server/src/telemetry.rs b/crates/openshell-server/src/telemetry.rs index d3ec3bacf..af96421af 100644 --- a/crates/openshell-server/src/telemetry.rs +++ b/crates/openshell-server/src/telemetry.rs @@ -1,129 +1,35 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! In-memory anonymous telemetry accounting for sandbox sessions. +//! 
Anonymous sandbox activity telemetry forwarding. use openshell_core::proto::NetworkActivitySummary; +#[cfg(not(test))] use std::collections::HashMap; -use std::sync::Mutex; -use std::time::Instant; #[derive(Debug, Default)] -pub struct TelemetryState { - sessions: Mutex>, -} - -#[derive(Debug)] -struct SessionTelemetry { - started_at: Instant, - network_activity_count: u64, - denied_action_count: u64, - denials_by_group: HashMap, -} - -#[derive(Debug, Clone, PartialEq)] -pub struct SandboxSessionSummary { - pub active_sandbox_seconds: u64, - pub network_activity_count: u64, - pub denied_action_count: u64, - pub denial_rate_pct: f64, - pub denials_by_group: Vec<(String, u64)>, -} +pub struct TelemetryState; +#[allow(clippy::unused_self)] impl TelemetryState { pub fn new() -> Self { - Self::default() + Self } - pub fn sandbox_session_connected(&self, sandbox_id: &str) { - if sandbox_id.is_empty() { - return; - } - let Ok(mut sessions) = self.sessions.lock() else { - return; - }; - sessions - .entry(sandbox_id.to_string()) - .or_insert_with(SessionTelemetry::new); - } + pub fn sandbox_session_connected(&self, _sandbox_id: &str) {} - pub fn sandbox_session_disconnected(&self, sandbox_id: &str) { - let _ = self.end_sandbox_session_summary(sandbox_id); - } + pub fn sandbox_session_disconnected(&self, _sandbox_id: &str) {} - pub fn end_sandbox_session(&self, sandbox_id: &str) { - self.sandbox_session_disconnected(sandbox_id); - } + pub fn end_sandbox_session(&self, _sandbox_id: &str) {} pub fn record_network_activity(&self, sandbox_id: &str, summary: &NetworkActivitySummary) { - if sandbox_id.is_empty() { + if sandbox_id.is_empty() || !openshell_core::telemetry::enabled() { return; } #[cfg(not(test))] emit_network_activity_summary(summary); - - let Ok(mut sessions) = self.sessions.lock() else { - return; - }; - let session = sessions - .entry(sandbox_id.to_string()) - .or_insert_with(SessionTelemetry::new); - - session.network_activity_count = session - 
.network_activity_count - .saturating_add(u64::from(summary.network_activity_count)); - session.denied_action_count = session - .denied_action_count - .saturating_add(u64::from(summary.denied_action_count)); - for group in &summary.denials_by_group { - let deny_group = sanitize_deny_group(&group.deny_group).to_string(); - let entry = session.denials_by_group.entry(deny_group).or_default(); - *entry = entry.saturating_add(u64::from(group.denied_count)); - } - } - - #[cfg(test)] - pub(crate) fn end_sandbox_session_summary( - &self, - sandbox_id: &str, - ) -> Option { - self.end_sandbox_session_summary_inner(sandbox_id) - } - - #[cfg(not(test))] - fn end_sandbox_session_summary(&self, sandbox_id: &str) -> Option { - self.end_sandbox_session_summary_inner(sandbox_id) - } - - fn end_sandbox_session_summary_inner(&self, sandbox_id: &str) -> Option { - let Ok(mut sessions) = self.sessions.lock() else { - return None; - }; - let session = sessions.remove(sandbox_id)?; - let active_sandbox_seconds = session.started_at.elapsed().as_secs(); - let denial_rate_pct = - calculate_denial_rate_pct(session.network_activity_count, session.denied_action_count); - let mut denials_by_group: Vec<(String, u64)> = - session.denials_by_group.into_iter().collect(); - denials_by_group.sort_by(|left, right| left.0.cmp(&right.0)); - Some(SandboxSessionSummary { - active_sandbox_seconds, - network_activity_count: session.network_activity_count, - denied_action_count: session.denied_action_count, - denial_rate_pct, - denials_by_group, - }) - } -} - -impl SessionTelemetry { - fn new() -> Self { - Self { - started_at: Instant::now(), - network_activity_count: 0, - denied_action_count: 0, - denials_by_group: HashMap::new(), - } + #[cfg(test)] + let _ = summary; } } @@ -170,7 +76,6 @@ fn emit_network_activity_summary(summary: &NetworkActivitySummary) { #[cfg(test)] mod tests { use super::*; - use openshell_core::proto::DenialGroupCount; fn assert_float_eq(actual: f64, expected: f64) { assert!((actual 
- expected).abs() <= f64::EPSILON); @@ -195,52 +100,10 @@ mod tests { } #[test] - fn session_records_activity_until_disconnect() { + fn session_lifecycle_hooks_are_noops() { let telemetry = TelemetryState::new(); telemetry.sandbox_session_connected("sb-1"); - telemetry.record_network_activity( - "sb-1", - &NetworkActivitySummary { - network_activity_count: 4, - denied_action_count: 1, - denials_by_group: vec![DenialGroupCount { - deny_group: "ssrf".to_string(), - denied_count: 1, - }], - }, - ); - let summary = telemetry - .end_sandbox_session_summary("sb-1") - .expect("session summary should exist"); - assert_eq!(summary.network_activity_count, 4); - assert_eq!(summary.denied_action_count, 1); - assert_float_eq(summary.denial_rate_pct, 25.0); - assert_eq!(summary.denials_by_group, vec![("ssrf".to_string(), 1)]); - assert!(telemetry.end_sandbox_session_summary("sb-1").is_none()); - } - - #[test] - fn activity_starts_missing_session_accounting() { - let telemetry = TelemetryState::new(); - telemetry.record_network_activity( - "sb-1", - &NetworkActivitySummary { - network_activity_count: 1, - denied_action_count: 1, - denials_by_group: vec![DenialGroupCount { - deny_group: "forward_policy".to_string(), - denied_count: 1, - }], - }, - ); - let summary = telemetry - .end_sandbox_session_summary("sb-1") - .expect("activity should create a telemetry session"); - assert_eq!(summary.network_activity_count, 1); - assert_eq!(summary.denied_action_count, 1); - assert_eq!( - summary.denials_by_group, - vec![("forward_policy".to_string(), 1)] - ); + telemetry.sandbox_session_disconnected("sb-1"); + telemetry.end_sandbox_session("sb-1"); } } From 4451a6a7c1fb42093b446462a3f149fd33791d9e Mon Sep 17 00:00:00 2001 From: Kirit93 Date: Tue, 19 May 2026 10:32:28 -0700 Subject: [PATCH 4/5] Updated env var to disable telemetry --- Cargo.lock | 3 ++ README.md | 2 +- crates/openshell-core/src/sandbox_env.rs | 3 ++ crates/openshell-core/src/telemetry.rs | 20 +++++++ 
crates/openshell-driver-docker/Cargo.toml | 3 ++ crates/openshell-driver-docker/src/lib.rs | 4 ++ crates/openshell-driver-docker/src/tests.rs | 37 +++++++++++++ crates/openshell-driver-kubernetes/Cargo.toml | 3 ++ .../openshell-driver-kubernetes/src/driver.rs | 39 ++++++++++++++ .../openshell-driver-podman/src/container.rs | 42 +++++++++++++++ crates/openshell-driver-vm/Cargo.toml | 3 ++ crates/openshell-driver-vm/src/driver.rs | 53 +++++++++++++++++++ 12 files changed, 211 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 450c8dba8..f36cd739d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3481,6 +3481,7 @@ dependencies = [ "openshell-core", "serde", "tar", + "temp-env", "tempfile", "tokio", "tokio-stream", @@ -3504,6 +3505,7 @@ dependencies = [ "prost-types", "serde", "serde_json", + "temp-env", "thiserror 2.0.18", "tokio", "tokio-stream", @@ -3557,6 +3559,7 @@ dependencies = [ "serde_json", "sha2 0.10.9", "tar", + "temp-env", "tokio", "tokio-stream", "tonic", diff --git a/README.md b/README.md index f2111e7bc..1a36424a0 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,7 @@ OpenShell is built agent-first — your agent is your first collaborator. Before OpenShell collects anonymous telemetry to help improve the project for developers. This data is not used to track individual user behavior. It helps us understand aggregate usage of sandbox, provider, and policy workflows so we can prioritize product improvements and share usage trends with the community. -Disable telemetry with `OPENSHELL_TELEMETRY_ENABLED=false`. See the [telemetry schema](openshell_telemetry_schema.json) for details. +Disable telemetry by setting `OPENSHELL_TELEMETRY_ENABLED=false` on the gateway deployment. OpenShell propagates this deployment setting into sandbox supervisor environments so sandbox-side telemetry collection is disabled as well. See the [telemetry schema](openshell_telemetry_schema.json) for details. 
Telemetry events are limited to anonymous operational categories and counts, such as sandbox lifecycle outcomes, provider profile buckets, policy decision counts, and aggregate network activity denial categories. OpenShell telemetry does not collect sandbox names or IDs, hostnames, file paths, binary paths, prompts, credentials, provider names, model names, or user content. diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index d345762ca..4287a4617 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -26,6 +26,9 @@ pub const LOG_LEVEL: &str = "OPENSHELL_LOG_LEVEL"; /// Shell command to run inside the sandbox. pub const SANDBOX_COMMAND: &str = "OPENSHELL_SANDBOX_COMMAND"; +/// Deployment-controlled telemetry toggle propagated to the sandbox supervisor. +pub const TELEMETRY_ENABLED: &str = "OPENSHELL_TELEMETRY_ENABLED"; + /// Path to the CA certificate for mTLS communication with the gateway. 
pub const TLS_CA: &str = "OPENSHELL_TLS_CA"; diff --git a/crates/openshell-core/src/telemetry.rs b/crates/openshell-core/src/telemetry.rs index 96a5665cf..e04add3b5 100644 --- a/crates/openshell-core/src/telemetry.rs +++ b/crates/openshell-core/src/telemetry.rs @@ -34,6 +34,18 @@ pub fn enabled() -> bool { telemetry_enabled_from(std::env::var("OPENSHELL_TELEMETRY_ENABLED").ok().as_deref()) } +pub fn enabled_env_value() -> &'static str { + enabled_env_value_from(std::env::var("OPENSHELL_TELEMETRY_ENABLED").ok().as_deref()) +} + +fn enabled_env_value_from(value: Option<&str>) -> &'static str { + if telemetry_enabled_from(value) { + "true" + } else { + "false" + } +} + fn telemetry_enabled_from(value: Option<&str>) -> bool { let value = value.unwrap_or("true"); !matches!( @@ -405,6 +417,14 @@ mod tests { assert!(telemetry_enabled_from(Some("yes"))); } + #[test] + fn telemetry_enabled_env_value_is_normalized() { + assert_eq!(enabled_env_value_from(Some("false")), "false"); + assert_eq!(enabled_env_value_from(Some("0")), "false"); + assert_eq!(enabled_env_value_from(None), "true"); + assert_eq!(enabled_env_value_from(Some("yes")), "true"); + } + #[test] fn telemetry_endpoint_empty_disables_publish() { assert_eq!(telemetry_endpoint_from(Some(" ")), None); diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml index e2c97532a..0cdc205ed 100644 --- a/crates/openshell-driver-docker/Cargo.toml +++ b/crates/openshell-driver-docker/Cargo.toml @@ -25,5 +25,8 @@ tar = "0.4" tempfile = "3" url = { workspace = true } +[dev-dependencies] +temp-env = "0.3" + [lints] workspace = true diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 30507422b..578bf7dd6 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -970,6 +970,10 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig 
openshell_core::sandbox_env::SANDBOX_COMMAND.to_string(), SANDBOX_COMMAND.to_string(), ); + environment.insert( + openshell_core::sandbox_env::TELEMETRY_ENABLED.to_string(), + openshell_core::telemetry::enabled_env_value().to_string(), + ); // The root supervisor executes namespace helpers during bootstrap; keep // their search path driver-owned even when the template/spec set PATH. environment.insert("PATH".to_string(), SUPERVISOR_PATH.to_string()); diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 62a6b89e4..86d6d5286 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -8,9 +8,11 @@ use openshell_core::proto::compute::v1::{ }; use std::fs; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::sync::{LazyLock, Mutex}; use tempfile::TempDir; const TLS_MOUNT_DIR: &str = "/etc/openshell/tls/client"; +static ENV_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); fn test_sandbox() -> DriverSandbox { // Mirrors the gateway-supplied request: the public `Sandbox` API no @@ -420,6 +422,41 @@ fn build_environment_keeps_path_driver_controlled() { assert_eq!(path_entries[0], &expected_path); } +#[test] +fn build_environment_keeps_telemetry_toggle_driver_controlled() { + let _guard = ENV_LOCK.lock().unwrap(); + temp_env::with_vars( + [( + openshell_core::sandbox_env::TELEMETRY_ENABLED, + Some("false"), + )], + || { + let mut sandbox = test_sandbox(); + sandbox.spec.as_mut().unwrap().environment.insert( + openshell_core::sandbox_env::TELEMETRY_ENABLED.to_string(), + "true".to_string(), + ); + + let env = build_environment(&sandbox, &runtime_config()); + let telemetry_entries = env + .iter() + .filter(|entry| { + entry.starts_with(&format!( + "{}=", + openshell_core::sandbox_env::TELEMETRY_ENABLED + )) + }) + .collect::>(); + + assert_eq!(telemetry_entries.len(), 1); + assert_eq!( + telemetry_entries[0], + &format!("{}=false", 
openshell_core::sandbox_env::TELEMETRY_ENABLED) + ); + }, + ); +} + #[test] fn build_binds_uses_docker_tls_directory() { let binds = build_binds(&runtime_config()); diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index c222c9c31..885c64944 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -34,5 +34,8 @@ tracing-subscriber = { workspace = true } thiserror = { workspace = true } miette = { workspace = true } +[dev-dependencies] +temp-env = "0.3" + [lints] workspace = true diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index a624f787e..f8877e74c 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -1424,6 +1424,11 @@ fn apply_required_env( openshell_core::sandbox_env::SANDBOX_COMMAND, "sleep infinity", ); + upsert_env( + env, + openshell_core::sandbox_env::TELEMETRY_ENABLED, + openshell_core::telemetry::enabled_env_value(), + ); if !ssh_socket_path.is_empty() { upsert_env( env, @@ -1592,6 +1597,9 @@ mod tests { }; use prost_types::{Struct, Value, value::Kind}; + static ENV_LOCK: std::sync::LazyLock> = + std::sync::LazyLock::new(|| std::sync::Mutex::new(())); + #[test] fn kube_pulling_event_adds_image_progress_metadata() { let mut metadata = std::collections::HashMap::new(); @@ -2475,6 +2483,37 @@ mod tests { assert!(cr["spec"].get("logLevel").is_none()); } + #[test] + fn telemetry_toggle_propagates_from_driver_env_to_sandbox_pod() { + let _guard = ENV_LOCK.lock().unwrap(); + temp_env::with_vars( + [( + openshell_core::sandbox_env::TELEMETRY_ENABLED, + Some("false"), + )], + || { + let spec = SandboxSpec { + environment: std::collections::HashMap::from([( + openshell_core::sandbox_env::TELEMETRY_ENABLED.to_string(), + "true".to_string(), + )]), + ..SandboxSpec::default() + }; + let cr = 
sandbox_to_k8s_spec(Some(&spec), &SandboxPodParams::default()); + let env = cr["spec"]["podTemplate"]["spec"]["containers"][0]["env"] + .as_array() + .unwrap(); + let telemetry_entries = env + .iter() + .filter(|entry| entry["name"] == openshell_core::sandbox_env::TELEMETRY_ENABLED) + .collect::>(); + + assert_eq!(telemetry_entries.len(), 1); + assert_eq!(telemetry_entries[0]["value"], serde_json::json!("false")); + }, + ); + } + #[test] fn node_selector_from_platform_config() { let template = SandboxTemplate { diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 1cb58e338..d4c8c2f13 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -280,6 +280,10 @@ fn build_env( openshell_core::sandbox_env::SANDBOX_COMMAND.into(), "sleep infinity".into(), ); + env.insert( + openshell_core::sandbox_env::TELEMETRY_ENABLED.into(), + openshell_core::telemetry::enabled_env_value().into(), + ); // 3. TLS client cert paths (when mTLS is enabled). 
These point to // the container-side mount paths where the cert files are @@ -636,6 +640,9 @@ fn parse_memory_to_bytes(quantity: &str) -> Option { mod tests { use super::*; + static ENV_LOCK: std::sync::LazyLock> = + std::sync::LazyLock::new(|| std::sync::Mutex::new(())); + #[test] fn parse_cpu_millicore() { assert_eq!(parse_cpu_to_microseconds("500m"), Some(50_000)); @@ -908,6 +915,41 @@ mod tests { ); } + #[test] + fn container_spec_telemetry_toggle_comes_from_driver_env() { + use openshell_core::proto::compute::v1::{DriverSandboxSpec, DriverSandboxTemplate}; + + let _guard = ENV_LOCK.lock().unwrap(); + temp_env::with_vars( + [( + openshell_core::sandbox_env::TELEMETRY_ENABLED, + Some("false"), + )], + || { + let mut sandbox = test_sandbox("test-id", "legit-name"); + sandbox.spec = Some(DriverSandboxSpec { + environment: std::collections::HashMap::from([( + openshell_core::sandbox_env::TELEMETRY_ENABLED.to_string(), + "true".to_string(), + )]), + template: Some(DriverSandboxTemplate::default()), + ..Default::default() + }); + + let spec = build_container_spec(&sandbox, &test_config()); + let env_map = spec["env"].as_object().expect("env should be an object"); + + assert_eq!( + env_map + .get(openshell_core::sandbox_env::TELEMETRY_ENABLED) + .and_then(|v| v.as_str()), + Some("false"), + "telemetry toggle must come from the deployment environment" + ); + }, + ); + } + #[test] fn container_spec_required_labels_cannot_be_overridden() { use openshell_core::proto::compute::v1::{DriverSandboxSpec, DriverSandboxTemplate}; diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml index fb1964415..fd583465a 100644 --- a/crates/openshell-driver-vm/Cargo.toml +++ b/crates/openshell-driver-vm/Cargo.toml @@ -45,6 +45,9 @@ flate2 = "1" sha2 = "0.10" zstd = "0.13" +[dev-dependencies] +temp-env = "0.3" + # smol-rs/polling drives the BSD/macOS parent-death detection in # procguard via kqueue's EVFILT_PROC / NOTE_EXIT filter. 
We could use # it on Linux too (via epoll + pidfd) but sticking with diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 52a9729f8..6c605cd2c 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -3494,6 +3494,10 @@ fn build_guest_environment( ])); } environment.extend(merged_environment(sandbox)); + environment.insert( + openshell_core::sandbox_env::TELEMETRY_ENABLED.to_string(), + openshell_core::telemetry::enabled_env_value().to_string(), + ); let mut pairs = environment.into_iter().collect::>(); pairs.sort_by(|left, right| left.0.cmp(&right.0)); @@ -4393,6 +4397,9 @@ mod tests { use std::time::{SystemTime, UNIX_EPOCH}; use tonic::Code; + static ENV_LOCK: std::sync::LazyLock> = + std::sync::LazyLock::new(|| std::sync::Mutex::new(())); + #[test] fn vm_pulling_layer_event_adds_progress_detail_metadata() { let mut event = platform_event( @@ -4963,6 +4970,52 @@ mod tests { ))); } + #[test] + fn build_guest_environment_uses_deployment_telemetry_toggle() { + let _guard = ENV_LOCK.lock().unwrap(); + temp_env::with_vars( + [( + openshell_core::sandbox_env::TELEMETRY_ENABLED, + Some("false"), + )], + || { + let config = VmDriverConfig { + openshell_endpoint: "http://127.0.0.1:8080".to_string(), + ..Default::default() + }; + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + name: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + environment: HashMap::from([( + openshell_core::sandbox_env::TELEMETRY_ENABLED.to_string(), + "true".to_string(), + )]), + ..Default::default() + }), + ..Default::default() + }; + + let env = build_guest_environment(&sandbox, &config, None); + let telemetry_entries = env + .iter() + .filter(|entry| { + entry.starts_with(&format!( + "{}=", + openshell_core::sandbox_env::TELEMETRY_ENABLED + )) + }) + .collect::>(); + + assert_eq!(telemetry_entries.len(), 1); + assert_eq!( + telemetry_entries[0], + &format!("{}=false", 
openshell_core::sandbox_env::TELEMETRY_ENABLED) + ); + }, + ); + } + #[test] fn build_guest_environment_uses_endpoint_override_for_tap() { let config = VmDriverConfig { From dbab5e838f791d4dd1b1fd15f5df3a0d66f37dc0 Mon Sep 17 00:00:00 2001 From: Kirit93 Date: Tue, 19 May 2026 11:23:01 -0700 Subject: [PATCH 5/5] Added client version --- crates/openshell-core/src/telemetry.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/openshell-core/src/telemetry.rs b/crates/openshell-core/src/telemetry.rs index e04add3b5..544c072f2 100644 --- a/crates/openshell-core/src/telemetry.rs +++ b/crates/openshell-core/src/telemetry.rs @@ -76,14 +76,17 @@ fn timestamp() -> String { Utc::now().to_rfc3339_opts(SecondsFormat::Millis, true) } +fn client_version() -> &'static str { + crate::VERSION +} + fn build_payload(name: &str, event: Value, event_ts: &str, sent_ts: &str) -> Value { json!({ "browserType": "undefined", "clientId": CLIENT_ID, "clientType": "Native", "clientVariant": "Release", - "clientVer": std::env::var("OPENSHELL_SOURCE_CLIENT_VERSION") - .unwrap_or_else(|_| "undefined".to_string()), + "clientVer": client_version(), "cpuArchitecture": std::env::consts::ARCH, "deviceGdprBehOptIn": "None", "deviceGdprFuncOptIn": "None", @@ -451,6 +454,7 @@ mod tests { ); assert_eq!(payload["clientId"], CLIENT_ID); + assert_eq!(payload["clientVer"], crate::VERSION); assert_eq!(payload["eventSchemaVer"], EVENT_SCHEMA_VERSION); assert_eq!(payload["deviceId"], "undefined"); assert_eq!(payload["userId"], "undefined");