From e34276628693f6f1df5eb9d731605fef7aea0e9b Mon Sep 17 00:00:00 2001 From: anmolnoor <0xdevnoor@gmail.com> Date: Wed, 27 May 2026 04:15:04 -0700 Subject: [PATCH] Stage E: fix deferred-write content generation + jitter repair retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two robustness fixes found by running the report flow against a live model. 1. Deferred-write body came out wrapped in a JSON plan. _generate_file_body replayed the planning conversation (prior plans as assistant turns), which primed the model to keep emitting an actions object instead of raw file bytes — so the written file was a JSON blob with the real markdown buried in a content field, and stats were vague/hallucinated. - Reframe the call: drop the assistant-plan turns; pass gathered data as a single plain reference block with a hard "you are a file-content writer, not a planner — output raw bytes only" instruction. - Add _unwrap_generated_file_body as a defensive net: if the model still returns an AssistantPlan-shaped blob, extract the write action's content; plain text and legitimate JSON files pass through untouched. 2. Plan repair retries were deterministic. At temperature 0 a model that emits malformed/empty JSON reproduces it identically on retry, wasting the attempt. ProviderPrompt gains an optional temperature; the planner sets 0.4 on repair retries (first attempt stays deterministic). Ollama honors it for both JSON and text calls. Verified end-to-end against qwen3.5: the report now writes as clean markdown using real GitHub values (followers, dates) instead of a JSON wrapper. Tests: unwrap extracts wrapped content and leaves plain/real-JSON untouched; orchestrator unwraps a plan-wrapped generated body; provider honors a prompt temperature; the repair retry carries temperature 0.4. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/foundation/models/orchestration.py | 1 + src/foundation/services/orchestrator.py | 70 ++++++++++++++++---- src/foundation/services/planner.py | 8 +++ src/foundation/services/provider.py | 7 +- tests/test_orchestrator.py | 85 +++++++++++++++++++++++++ tests/test_provider.py | 22 +++++++ 6 files changed, 179 insertions(+), 14 deletions(-) diff --git a/src/foundation/models/orchestration.py b/src/foundation/models/orchestration.py index f2295df..2c8f179 100644 --- a/src/foundation/models/orchestration.py +++ b/src/foundation/models/orchestration.py @@ -133,6 +133,7 @@ class ProviderPrompt(StrictModel): response_format: ProviderResponseFormat = ProviderResponseFormat.TEXT schema_name: str | None = None output_schema: dict[str, Any] | None = None + temperature: float | None = Field(default=None, ge=0.0, le=2.0) @model_validator(mode="after") def _validate_schema_requirements(self) -> ProviderPrompt: diff --git a/src/foundation/services/orchestrator.py b/src/foundation/services/orchestrator.py index 1e0bcc4..f15fd23 100644 --- a/src/foundation/services/orchestrator.py +++ b/src/foundation/services/orchestrator.py @@ -244,6 +244,38 @@ def _is_soft_failure_for_path( return path in cumulative_changed_paths +def _unwrap_generated_file_body(text: str) -> str: + """Salvage raw file content if the model wrapped it in an AssistantPlan blob. + + Some models, when asked to generate file content mid-loop, keep "planning" + and return a ``{assistant_message, actions: [...]}`` object with the real + content buried in a write action's ``content`` argument. Detect that shape + and extract the inner content; otherwise return the text untouched. + """ + stripped = text.strip() + if not stripped.startswith("{"): + return text + try: + payload = json.loads(stripped) + except (json.JSONDecodeError, ValueError): + return text + if not isinstance(payload, dict) or "actions" not in payload: + return text + actions = payload.get("actions") + if not isinstance(actions, list): + return text + for action in actions: + if not isinstance(action, dict): + continue + tool_call = action.get("tool_call") + if not isinstance(tool_call, dict): + continue + args = tool_call.get("arguments") + if isinstance(args, dict) and isinstance(args.get("content"), str): + return args["content"] + return text + + def _action_target_path(action: PlannedAction) -> str | None: if action.kind is not ActionKind.TOOL_CALL or action.tool_call is None: return None @@ -691,27 +723,39 @@ def _generate_file_body( request: UserRequest, observation_messages: list[ProviderMessage], ) -> str: - messages: list[ProviderMessage] = [ + # Fold the gathered context into a single plain-text reference block. + # We deliberately do NOT replay the planning conversation as assistant + # turns: those are JSON plans, and feeding them back primes the model to + # keep "planning" (emit another actions object) instead of writing the + # file's raw bytes. + reference = "\n\n".join(m.content for m in observation_messages).strip() + user_parts = [ + f"Write the complete, literal contents of the file `{path}`.", + f"\nWhat the file should contain:\n{brief}", + ] + if reference: + user_parts.append( + "\nReference data gathered so far (use the real values from it; " + f"do not invent facts):\n{reference}" + ) + user_parts.append(f"\nThe original user request was: {request.message}") + messages = [ ProviderMessage( role=ProviderMessageRole.DEVELOPER, content=( - f"You are generating the complete contents of the file `{path}`. " - "Output ONLY the raw file body — no markdown code fences, no " - "commentary, no surrounding quotes." + "You are a file-content writer, not a planner. Output ONLY the " + "raw, literal bytes to save to the file — nothing else. Do NOT " + "wrap the output in JSON, do NOT emit an actions/plan object, do " + "NOT fence the whole file, and do NOT add commentary before or " + "after the content." ), - ) + ), + ProviderMessage(role=ProviderMessageRole.USER, content="\n".join(user_parts)), ] - messages.extend(observation_messages) - messages.append( - ProviderMessage( - role=ProviderMessageRole.USER, - content=f"Original request: {request.message}\n\nFile to write: {brief}", - ) - ) response = self._provider.complete( ProviderPrompt(messages=messages, response_format=ProviderResponseFormat.TEXT) ) - return response.content + return _unwrap_generated_file_body(response.content) def _run_replan_loop( self, diff --git a/src/foundation/services/planner.py b/src/foundation/services/planner.py index 5045d61..b6cb4db 100644 --- a/src/foundation/services/planner.py +++ b/src/foundation/services/planner.py @@ -43,6 +43,9 @@ _MAX_PLAN_ACTIONS = 40 _MAX_TOTAL_ACTIONS = 200 +# Temperature used on a plan repair retry so the model doesn't deterministically +# reproduce the same invalid/empty response it produced at temperature 0. +_PLAN_REPAIR_TEMPERATURE = 0.4 _COMMIT_INTENT_WORD_RE = re.compile(r"\bcommit(?:ing|ted|s)?\b", re.IGNORECASE) _COMMIT_INTENT_PHRASES = ( "stop for approval", @@ -139,11 +142,16 @@ def request_plan( last_error: Exception | None = None for attempt in range(1, self._max_plan_attempts + 1): + # First attempt decodes deterministically (temperature 0). On a + # repair retry, nudge temperature up so the model doesn't + # deterministically reproduce the same malformed/empty response. + retry_temperature = _PLAN_REPAIR_TEMPERATURE if attempt > 1 else None prompt = ProviderPrompt( messages=[*base_messages, *supplemental_messages], response_format=ProviderResponseFormat.JSON_OBJECT, schema_name="assistant_plan", output_schema=AssistantPlan.model_json_schema(), + temperature=retry_temperature, ) try: response = self._provider.complete(prompt) diff --git a/src/foundation/services/provider.py b/src/foundation/services/provider.py index 41899c6..b8c95c1 100644 --- a/src/foundation/services/provider.py +++ b/src/foundation/services/provider.py @@ -618,9 +618,14 @@ def _build_payload(self, prompt: ProviderPrompt) -> dict[str, Any]: if prompt.response_format is ProviderResponseFormat.JSON_OBJECT: assert prompt.output_schema is not None payload["format"] = prompt.output_schema - options["temperature"] = 0 + # Default structured output to deterministic decoding, but allow a + # caller-supplied temperature (e.g. a repair retry nudging off 0 so + # it doesn't reproduce the same malformed JSON). + options["temperature"] = prompt.temperature if prompt.temperature is not None else 0 if self._needs_think_for_structured_output(self._model): payload["think"] = True + elif prompt.temperature is not None: + options["temperature"] = prompt.temperature if options: payload["options"] = options return payload diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 4f3f301..e81f594 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -329,6 +329,9 @@ def complete(self, prompt: ProviderPrompt) -> ProviderResponse: repair_text = "\n".join(m.content for m in provider.calls[1].messages) assert "truncated" in repair_text.lower() assert "content_brief" in repair_text + # First attempt is deterministic; the repair retry nudges temperature off 0. + assert provider.calls[0].temperature is None + assert provider.calls[1].temperature == 0.4 assert result.summary is not None @@ -2884,3 +2887,85 @@ def git(*args: str) -> None: and action.requires_approval for action in result.iterations[-1].plan.actions ) + + +def test_unwrap_generated_file_body_extracts_plan_wrapped_content() -> None: + from foundation.services.orchestrator import _unwrap_generated_file_body + + wrapped = json.dumps( + { + "assistant_message": "writing the file", + "actions": [ + { + "id": "w", + "kind": "tool_call", + "tool_call": { + "capability_id": "foundation.file.write", + "arguments": {"path": "r.md", "content": "# Real Title\n\nbody text"}, + }, + } + ], + } + ) + assert _unwrap_generated_file_body(wrapped) == "# Real Title\n\nbody text" + + +def test_unwrap_generated_file_body_passes_through_plain_and_real_json() -> None: + from foundation.services.orchestrator import _unwrap_generated_file_body + + assert _unwrap_generated_file_body("# Just markdown\n") == "# Just markdown\n" + # A legitimate JSON file (no actions array) must be left untouched. + config = '{"key": "value", "nested": {"a": 1}}' + assert _unwrap_generated_file_body(config) == config + + +def test_orchestrator_unwraps_plan_wrapped_generated_body( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + clean = "# Clean Report\n\nThe actual body of the report.\n" + wrapped_body = json.dumps( + { + "assistant_message": "writing", + "actions": [ + { + "id": "w", + "kind": "tool_call", + "tool_call": { + "capability_id": "foundation.file.write", + "arguments": {"path": "report.md", "content": clean}, + }, + } + ], + } + ) + provider = StubProvider( + [ + _provider_response( + { + "assistant_message": "Writing the report.", + "actions": [ + { + "id": "write_report", + "kind": "tool_call", + "summary": "Write the report", + "tool_call": { + "capability_id": "foundation.file.write", + "arguments": { + "path": "report.md", + "content_brief": "a report about the project", + }, + }, + } + ], + } + ), + _text_response(wrapped_body), + ] + ) + orchestrator, _, workspace_root = _orchestrator(tmp_path, monkeypatch, provider) + + orchestrator.orchestrate(UserRequest(message="write a report")) + + # The plan-wrapped generation is unwrapped to the clean file body. + assert (workspace_root / "report.md").read_text(encoding="utf-8") == clean diff --git a/tests/test_provider.py b/tests/test_provider.py index 2c6f613..ce83f63 100644 --- a/tests/test_provider.py +++ b/tests/test_provider.py @@ -535,3 +535,25 @@ def test_openai_adapter_raises_truncated_on_incomplete_response() -> None: adapter.complete(_structured_prompt()) assert exc_info.value.code is ProviderErrorCode.TRUNCATED + + +def test_ollama_adapter_honors_prompt_temperature() -> None: + transport = FakeTransport( + [{"message": {"role": "assistant", "content": '{"assistant_message":"ok","actions":[]}'}}] + ) + adapter = OllamaChatAdapter( + model="glm-5.1:cloud", + base_url="http://localhost:11434/api", + transport=transport, + ) + prompt = ProviderPrompt( + messages=[ProviderMessage(role=ProviderMessageRole.USER, content="Plan this.")], + response_format=ProviderResponseFormat.JSON_OBJECT, + schema_name="assistant_plan", + output_schema={"type": "object"}, + temperature=0.4, + ) + + adapter.complete(prompt) + + assert transport.calls[0]["payload"]["options"]["temperature"] == 0.4