From 571db627073987cd0d0b2d10e192baf17a7fd326 Mon Sep 17 00:00:00 2001 From: bilby91 Date: Mon, 18 May 2026 13:59:40 -0300 Subject: [PATCH] fix(compose): preserve container state across daemon restarts (#71) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shellout path: thread an `existingContainer` signal from `up.go` into `upComposeShellout` and append `--no-recreate` to `docker compose up` when a container is already known for this workspace. Matches the upstream devcontainers/cli gate (`container || expectExistingContainer`). Without the flag, compose's default recreate-on-drift behavior can destroy a perfectly-good container whenever it detects any change in the generated override files or in caller-injected env, taking the container's writable layer (e.g. `~/.claude/projects/...`) with it. Native orchestrator: when an existing container's config-hash and image-digest match but its state is not Running (e.g. dockerd was just restarted and brought stopped containers back from on-disk state), start it instead of stop+remove+recreating. Same root cause as the shellout flag gap — drop it now so it doesn't follow us to the native backend. Tests: - runtime/docker: `--no-recreate` is appended iff NoRecreate is set. - compose: a config-matched Exited container is reused via StartContainer with no RunContainer / RemoveContainer calls. Both tests fail against the prior code (verified by reverting each fix in isolation), so they would catch a future regression. Co-Authored-By: Claude Opus 4.7 (1M context) --- compose/orchestrator.go | 16 ++++++++-- compose/orchestrator_test.go | 53 ++++++++++++++++++++++++++++++++++ runtime/docker/compose.go | 3 ++ runtime/docker/compose_test.go | 38 ++++++++++++++++++++++++ runtime/runtime.go | 8 +++++ up.go | 14 +++++++-- 6 files changed, 126 insertions(+), 6 deletions(-) diff --git a/compose/orchestrator.go b/compose/orchestrator.go index 2c0fd8e..94c82ab 100644 --- a/compose/orchestrator.go +++ b/compose/orchestrator.go @@ -459,11 +459,21 @@ func (o *Orchestrator) ensureService( details, ierr := o.rt.InspectContainer(ctx, c.ID) if ierr == nil && details != nil && details.Labels[LabelConfigHash] == hash && - details.Labels[LabelImageDigest] == imageDigest && - c.State == runtime.StateRunning { + details.Labels[LabelImageDigest] == imageDigest { + // Config and image match — the container is reusable. + // If it's not currently Running (e.g. dockerd was just + // restarted and brought stopped containers back from + // on-disk state), start it instead of destroying it. + // Recreating would lose the writable layer (the user's + // $HOME inside the container, etc.) — see issue #71. + if c.State != runtime.StateRunning { + if err := o.rt.StartContainer(ctx, c.ID); err != nil { + return "", fmt.Errorf("StartContainer(%s): %w", svc.Name, err) + } + } return c.ID, nil } - // Different config or not running — recreate. + // Different config — recreate. _ = o.rt.StopContainer(ctx, c.ID, runtime.StopOptions{Timeout: 10 * time.Second}) if err := o.rt.RemoveContainer(ctx, c.ID, runtime.RemoveOptions{Force: true}); err != nil { return "", fmt.Errorf("RemoveContainer(%s): %w", c.ID, err) diff --git a/compose/orchestrator_test.go b/compose/orchestrator_test.go index dcbbb2c..1593c66 100644 --- a/compose/orchestrator_test.go +++ b/compose/orchestrator_test.go @@ -412,6 +412,59 @@ func TestUp_RecreateOnImageDigestChange(t *testing.T) { } } +// TestUp_StartsStoppedContainerOnConfigMatch covers the daemon-restart +// path: a previous Up created and started a container; the docker +// daemon then went away (e.g. host pod destroyed in a k8s deployment +// where /var/lib/docker lives on a PVC) and came back, leaving the +// container Exited. A subsequent Up against an unchanged project +// must reuse the existing container by Starting it, not by destroying +// and recreating it — recreating would lose the writable layer +// (issue #71). +func TestUp_StartsStoppedContainerOnConfigMatch(t *testing.T) { + rt := newMockRuntime() + orch := NewOrchestrator(rt, "docker") + proj := newProject(t, map[string][]string{"app": nil}) + plan := &Plan{Project: proj, ProjectName: "dc-x"} + + res, err := orch.Up(context.Background(), plan) + if err != nil { + t.Fatalf("first Up: %v", err) + } + firstID := res.ContainerIDs["app"] + if firstID == "" { + t.Fatalf("first Up did not produce a container id") + } + firstRun := rt.runCalls + firstRemove := rt.removeCalls + + // Simulate the daemon-restart effect: container is on disk but + // not running. Use the public StopContainer path so the mock + // records the state transition the same way a real Exited + // container would surface to ListContainers. + if err := rt.StopContainer(context.Background(), firstID, runtime.StopOptions{}); err != nil { + t.Fatalf("stop container: %v", err) + } + startCallsBeforeReuse := rt.startCalls + + res2, err := orch.Up(context.Background(), plan) + if err != nil { + t.Fatalf("second Up: %v", err) + } + + if got := res2.ContainerIDs["app"]; got != firstID { + t.Errorf("container id changed across Up: first=%s second=%s — reuse expected", firstID, got) + } + if rt.runCalls != firstRun { + t.Errorf("runCalls=%d, want %d (no RunContainer expected on reuse)", rt.runCalls, firstRun) + } + if rt.removeCalls != firstRemove { + t.Errorf("removeCalls=%d, want %d (no RemoveContainer expected on reuse)", rt.removeCalls, firstRemove) + } + if rt.startCalls != startCallsBeforeReuse+1 { + t.Errorf("startCalls=%d, want %d (one StartContainer expected to revive the stopped container)", rt.startCalls, startCallsBeforeReuse+1) + } +} + func TestUp_PartialFailureSurfacesPartialError(t *testing.T) { rt := newMockRuntime() rt.OnRunContainer = func(spec runtime.RunSpec) (*runtime.Container, error) { diff --git a/runtime/docker/compose.go b/runtime/docker/compose.go index 9b12ece..bd35cc8 100644 --- a/runtime/docker/compose.go +++ b/runtime/docker/compose.go @@ -75,6 +75,9 @@ func composeArgs(projectName string, files []string) []string { func buildUpArgs(spec runtime.ComposeUpSpec) []string { args := composeArgs(spec.ProjectName, spec.Files) args = append(args, "up", "-d") + if spec.NoRecreate { + args = append(args, "--no-recreate") + } args = append(args, spec.Services...) return args } diff --git a/runtime/docker/compose_test.go b/runtime/docker/compose_test.go index 11d258f..4b6ddde 100644 --- a/runtime/docker/compose_test.go +++ b/runtime/docker/compose_test.go @@ -81,6 +81,44 @@ func TestBuildUpArgs_NoServicesUpsAll(t *testing.T) { } } +// TestBuildUpArgs_NoRecreate exercises the resume path: when the +// caller knows a container already exists for this workspace, pass +// --no-recreate so compose doesn't destroy it on spurious drift. +// Matches upstream devcontainers/cli's gate +// (`container || expectExistingContainer`). +func TestBuildUpArgs_NoRecreate(t *testing.T) { + got := buildUpArgs(runtime.ComposeUpSpec{ + Files: []string{"compose.yml"}, + ProjectName: "dc-x", + Services: []string{"app"}, + NoRecreate: true, + }) + want := []string{ + "compose", "--project-name", "dc-x", + "-f", "compose.yml", + "up", "-d", "--no-recreate", + "app", + } + if !reflect.DeepEqual(got, want) { + t.Errorf("got %v\nwant %v", got, want) + } +} + +func TestBuildUpArgs_NoRecreateOmittedByDefault(t *testing.T) { + // Cold-start path: no existing container, so --no-recreate must + // not appear (it would block first creation under some compose + // behaviors / be misleading in flow logs). + got := buildUpArgs(runtime.ComposeUpSpec{ + Files: []string{"compose.yml"}, + ProjectName: "dc-x", + }) + for _, a := range got { + if a == "--no-recreate" { + t.Errorf("buildUpArgs should not pass --no-recreate when NoRecreate=false; got %v", got) + } + } +} + func TestBuildDownArgs_Plain(t *testing.T) { got := buildDownArgs(runtime.ComposeDownSpec{ Files: []string{"compose.yml"}, diff --git a/runtime/runtime.go b/runtime/runtime.go index 8f3784f..a8dac50 100644 --- a/runtime/runtime.go +++ b/runtime/runtime.go @@ -53,6 +53,14 @@ type ComposeUpSpec struct { // the base for compose's relative-path resolution. Engine sets // this to the workspace folder. WorkingDir string + + // NoRecreate, when true, appends `--no-recreate` to the compose + // invocation. Tells compose to keep an existing container even if + // it thinks the config drifted — used on the resume path so we + // don't destroy the container's writable layer (and anything in + // $HOME inside it) on a spurious drift detection. Matches the + // upstream devcontainers/cli gate (`container || expectExistingContainer`). + NoRecreate bool } // ComposeDownSpec configures ComposeDown. diff --git a/up.go b/up.go index e706868..54ccb06 100644 --- a/up.go +++ b/up.go @@ -203,7 +203,7 @@ func (e *Engine) Up(ctx context.Context, opts UpOptions) (*Workspace, error) { if existing != nil { composeOpts.RunSecretsCommand = false } - ws, err = e.createFreshCompose(ctx, cfg, composeOpts) + ws, err = e.createFreshCompose(ctx, cfg, composeOpts, existing != nil) case existing != nil: ws, err = e.attachExisting(ctx, existing, cfg, opts) default: @@ -466,7 +466,7 @@ func composeWorkingDir(cfg *config.ResolvedConfig, opts UpOptions) string { // // Returns a Workspace whose Container is the primary service's // container. -func (e *Engine) createFreshCompose(ctx context.Context, cfg *config.ResolvedConfig, opts UpOptions) (*Workspace, error) { +func (e *Engine) createFreshCompose(ctx context.Context, cfg *config.ResolvedConfig, opts UpOptions, existingContainer bool) (*Workspace, error) { // Shellout backend requires the runtime to satisfy // ComposeRuntime. Check before any project load / image work so a // missing capability fails fast with ErrNotImplemented instead of @@ -539,7 +539,7 @@ func (e *Engine) createFreshCompose(ctx context.Context, cfg *config.ResolvedCon finalImage, bindMounts, overrideEnv, overrideLabels) case ComposeBackendShellout: return e.upComposeShellout(ctx, cfg, opts, project, src, projectName, - workingDir, finalImage, bindMounts, overrideEnv, overrideLabels) + workingDir, finalImage, bindMounts, overrideEnv, overrideLabels, existingContainer) default: // Reject unknown values explicitly so a typo in // EngineOptions.ComposeBackend doesn't silently route to @@ -560,6 +560,7 @@ func (e *Engine) upComposeShellout( bindMounts []compose.BindMount, overrideEnv map[string]string, overrideLabels map[string]string, + existingContainer bool, ) (*Workspace, error) { cr, ok := e.runtime.(runtime.ComposeRuntime) if !ok { @@ -594,11 +595,18 @@ func (e *Engine) upComposeShellout( allFiles := append([]string{}, src.Files...) allFiles = append(allFiles, buildOverridePath, runOverridePath) + // Mirror upstream devcontainers/cli: when a container is already + // known to exist for this workspace id, pass --no-recreate so + // `docker compose up` keeps the existing container instead of + // destroying it on spurious config-hash drift (override file + // reordering, pod-scoped env injected by callers, etc.). Loses + // the container's writable layer otherwise — see issue #71. if err := cr.ComposeUp(ctx, runtime.ComposeUpSpec{ Files: allFiles, ProjectName: projectName, Services: src.RunServices, WorkingDir: workingDir, + NoRecreate: existingContainer, }, opts.bus.BuildChan(events.BuildSourceCompose)); err != nil { return nil, err }