From e38d267a4d6b4439c709f05181dfa60e76cc421e Mon Sep 17 00:00:00 2001 From: Kirill Kostarev Date: Mon, 15 Jun 2026 14:45:46 +0300 Subject: [PATCH] Add reviewed task-file flow for Codex sleep runs --- docs/sleep/CONTROLLABLE_DREAMING.md | 23 +- plugins/codex/README.md | 29 ++- skillopt_sleep/__main__.py | 139 ++++++++++-- skillopt_sleep/backend.py | 71 ++++-- skillopt_sleep/config.py | 10 + skillopt_sleep/cycle.py | 75 +++++-- skillopt_sleep/mine.py | 106 ++++++++- skillopt_sleep/tasks_file.py | 81 +++++++ skillopt_sleep/types.py | 1 + tests/test_sleep_engine.py | 320 +++++++++++++++++++++++++++- 10 files changed, 802 insertions(+), 53 deletions(-) create mode 100644 skillopt_sleep/tasks_file.py diff --git a/docs/sleep/CONTROLLABLE_DREAMING.md b/docs/sleep/CONTROLLABLE_DREAMING.md index 9ba3a799..9bd7efeb 100644 --- a/docs/sleep/CONTROLLABLE_DREAMING.md +++ b/docs/sleep/CONTROLLABLE_DREAMING.md @@ -12,7 +12,16 @@ optimize tasks: ```bash python -m skillopt_sleep dry-run --project "$(pwd)" --source claude --backend mock python -m skillopt_sleep dry-run --project "$(pwd)" --source codex --backend mock -python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex +python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex \ + --max-sessions 5 --max-tasks 3 --progress +python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex \ + --target-skill-path .agents/skills/example/SKILL.md \ + --max-sessions 5 --max-tasks 3 --progress +python -m skillopt_sleep harvest --project "$(pwd)" --source codex \ + --target-skill-path .agents/skills/example/SKILL.md \ + --max-sessions 5 --max-tasks 3 --output reviewed-tasks.json +python -m skillopt_sleep dry-run --project "$(pwd)" --backend codex \ + --tasks-file reviewed-tasks.json --progress --json ``` `--source claude` reads Claude Code transcripts from `~/.claude/projects`. 
@@ -20,6 +29,18 @@ python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex `~/.codex/archived_sessions`. `--source auto` tries Codex archives first, then falls back to Claude Code transcripts. Use `--codex-home /path/to/.codex` or `--claude-home /path/to/.claude` to point at non-default homes. +Use `--max-sessions` to cap archived sessions before LLM mining, `--max-tasks` +to cap mined tasks before replay/consolidation, and `--progress` to print +stage progress to stderr. Use `--target-skill-path` for repo-scoped Codex +skills such as `.agents/skills/<name>/SKILL.md`. Target-skill runs over-sample +candidate tasks before the `--max-tasks` cut and prefer tasks whose +intent/context overlaps the target skill's path, headings, and content. Use +`harvest --output reviewed-tasks.json` when you want to inspect or redact the +target-filtered task set before any real backend sees it, then pass +`--tasks-file reviewed-tasks.json` to `run` or `dry-run`; task-file runs skip +transcript harvest/mining and replay only the reviewed JSON tasks. Real +backends refuse task files still marked `"reviewed": false`; after inspection, +set `"reviewed": true`. ## The mental model diff --git a/plugins/codex/README.md b/plugins/codex/README.md index 3c9ceb7f..14dbfaf8 100644 --- a/plugins/codex/README.md +++ b/plugins/codex/README.md @@ -49,18 +49,43 @@ Or call the engine directly: ```bash python -m skillopt_sleep dry-run --project "$(pwd)" --source codex --backend mock -python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex +python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex \ + --max-sessions 5 --max-tasks 3 --progress +python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex \ + --target-skill-path .agents/skills/example/SKILL.md \ + --max-sessions 5 --max-tasks 3 --progress ``` `--source codex` reads Codex Desktop archived sessions from `~/.codex/archived_sessions`. 
Use `--codex-home /path/to/.codex` to point at a different Codex home, or `--source auto` to try Codex archives first and fall back to Claude Code transcripts. Default backend is `mock` (no API spend). -`--backend codex` uses your Codex budget for real improvement. All the +`--backend codex` uses your Codex budget for real improvement. Bound live runs +with `--max-sessions` and `--max-tasks`; add `--progress` because Codex-backed +mining, replay, and reflection can be slow and otherwise quiet. Use +`--target-skill-path` to stage/adopt into a repo-scoped Codex skill such as +`.agents/skills/<name>/SKILL.md`; target runs over-sample mined tasks and +prefer tasks that match the target skill's path, headings, and content. All the controllable knobs (`--gate on|off`, `--rollouts-k`, `--budget-tokens`, `--preferences`, optimizer/target split) work identically — see [`../../docs/sleep/CONTROLLABLE_DREAMING.md`](../../docs/sleep/CONTROLLABLE_DREAMING.md). +For privacy-sensitive projects, split the run into reviewable steps: + +```bash +python -m skillopt_sleep harvest --project "$(pwd)" --source codex \ + --target-skill-path .agents/skills/example/SKILL.md \ + --max-sessions 5 --max-tasks 3 \ + --output reviewed-tasks.json + +python -m skillopt_sleep dry-run --project "$(pwd)" --backend codex \ + --tasks-file reviewed-tasks.json --progress --json +``` + +Inspect/redact the JSON and set `"reviewed": true` before using a real backend. +`--tasks-file` skips archive harvest/mining and replays only the reviewed JSON +tasks; real backends refuse task files still marked `"reviewed": false`. + ## Notes / status - Codex's `exec` runs shell, so the real-tool-loop replay (e.g. 
the diff --git a/skillopt_sleep/__main__.py b/skillopt_sleep/__main__.py index 2666ee6f..6a8de760 100644 --- a/skillopt_sleep/__main__.py +++ b/skillopt_sleep/__main__.py @@ -9,6 +9,10 @@ Common flags: --project PATH project to evolve (default: cwd) --scope all|invoked harvest scope (default: invoked) + --max-sessions N cap transcript sessions per run + --max-tasks N cap mined tasks per run + --target-skill-path PATH explicit live SKILL.md to stage/adopt + --tasks-file PATH reviewed TaskRecord JSON file to replay instead of harvesting --backend mock|claude|codex --source claude|codex|auto --model NAME @@ -31,6 +35,35 @@ from skillopt_sleep.staging import adopt as adopt_staging from skillopt_sleep.staging import latest_staging from skillopt_sleep.state import SleepState +from skillopt_sleep.tasks_file import load_tasks_file, make_tasks_payload, write_tasks_file + + +def _read_text(path: str) -> str: + try: + with open(path, encoding="utf-8") as f: + return f.read() + except Exception: + return "" + + +def _report_payload(rep, outcome) -> Dict[str, Any]: + return { + "night": rep.night, + "accepted": rep.accepted, + "gate_action": rep.gate_action, + "no_edits_reason": getattr(rep, "no_edits_reason", ""), + "baseline": rep.baseline_score, + "candidate": rep.candidate_score, + "n_tasks": rep.n_tasks, + "n_sessions": rep.n_sessions, + "n_accepted_edits": len(rep.edits), + "n_rejected_edits": len(rep.rejected_edits), + "edits": [e.__dict__ for e in rep.edits], + "rejected_edits": [e.__dict__ for e in rep.rejected_edits], + "notes": rep.notes, + "staging_dir": outcome.staging_dir, + "adopted": outcome.adopted, + } def _add_common(p: argparse.ArgumentParser) -> None: @@ -45,11 +78,21 @@ def _add_common(p: argparse.ArgumentParser) -> None: help="session transcript source") p.add_argument("--lookback-hours", type=int, default=0) p.add_argument("--edit-budget", type=int, default=0) + p.add_argument("--max-sessions", type=int, default=0, + help="cap harvested sessions before 
mining; default derives from max tasks") + p.add_argument("--max-tasks", type=int, default=0, + help="cap mined tasks for this run") + p.add_argument("--target-skill-path", default="", + help="explicit live SKILL.md path to evolve/stage/adopt") + p.add_argument("--tasks-file", default="", + help="reviewed TaskRecord JSON file to replay instead of harvesting") + p.add_argument("--progress", action="store_true", + help="print phase progress to stderr") p.add_argument("--auto-adopt", action="store_true") p.add_argument("--json", action="store_true") -def _cfg_from_args(args) -> Any: +def _cfg_from_args(args, task_meta: Dict[str, Any] | None = None) -> Any: overrides: Dict[str, Any] = {} if args.project: overrides["invoked_project"] = os.path.abspath(args.project) @@ -72,30 +115,63 @@ def _cfg_from_args(args) -> Any: overrides["lookback_hours"] = args.lookback_hours if getattr(args, "edit_budget", 0): overrides["edit_budget"] = args.edit_budget + if getattr(args, "max_sessions", 0): + overrides["max_sessions_per_night"] = args.max_sessions + if getattr(args, "max_tasks", 0): + overrides["max_tasks_per_night"] = args.max_tasks + target_skill_path = getattr(args, "target_skill_path", "") + if not target_skill_path and task_meta: + target_skill_path = str(task_meta.get("target_skill_path") or "") + if target_skill_path: + path = os.path.expanduser(target_skill_path) + if args.project and not os.path.isabs(path): + path = os.path.join(os.path.abspath(args.project), path) + overrides["target_skill_path"] = os.path.abspath(path) + if getattr(args, "progress", False): + overrides["progress"] = True if getattr(args, "auto_adopt", False): overrides["auto_adopt"] = True return load_config(**overrides) def cmd_run(args, dry: bool = False) -> int: - cfg = _cfg_from_args(args) - outcome = run_sleep_cycle(cfg, dry_run=dry) + task_meta: Dict[str, Any] = {} + tasks = None + if getattr(args, "tasks_file", ""): + # Load once before config so target_skill_path can default from metadata. 
+ tasks, task_meta = load_tasks_file(args.tasks_file) + cfg = _cfg_from_args(args, task_meta=task_meta) + if getattr(args, "tasks_file", ""): + tasks, task_meta = load_tasks_file( + args.tasks_file, + holdout_fraction=cfg.get("holdout_fraction", 0.34), + seed=cfg.get("seed", 42), + ) + if cfg.get("backend", "mock") != "mock" and task_meta.get("reviewed") is not True: + print( + "[sleep] refusing real-backend replay from an unreviewed tasks file; " + "inspect/redact it and set \"reviewed\": true first", + file=sys.stderr, + ) + return 2 + outcome = run_sleep_cycle(cfg, seed_tasks=tasks, dry_run=dry) rep = outcome.report if args.json: - print(json.dumps({ - "night": rep.night, "accepted": rep.accepted, - "gate_action": rep.gate_action, - "baseline": rep.baseline_score, "candidate": rep.candidate_score, - "n_tasks": rep.n_tasks, "n_sessions": rep.n_sessions, - "edits": [e.__dict__ for e in rep.edits], - "staging_dir": outcome.staging_dir, "adopted": outcome.adopted, - }, ensure_ascii=False, indent=2)) + payload = _report_payload(rep, outcome) + if task_meta: + payload["tasks_file"] = task_meta.get("tasks_file", "") + payload["tasks_reviewed"] = task_meta.get("reviewed", False) + print(json.dumps(payload, ensure_ascii=False, indent=2)) else: print(f"[sleep] night {rep.night}: {rep.n_sessions} sessions -> {rep.n_tasks} tasks") print(f"[sleep] held-out {rep.baseline_score:.3f} -> {rep.candidate_score:.3f} " f"=> {rep.gate_action} (accepted={rep.accepted})") for e in rep.edits: print(f" + [{e.target}/{e.op}] {e.content}") + if rep.rejected_edits: + print("[sleep] rejected by gate:") + for e in rep.rejected_edits: + print(f" - [{e.target}/{e.op}] {e.content}") if outcome.staging_dir: print(f"[sleep] staged: {outcome.staging_dir}") if not outcome.adopted: @@ -152,16 +228,42 @@ def cmd_adopt(args) -> int: def cmd_harvest(args) -> int: cfg = _cfg_from_args(args) - digests = harvest_for_config(cfg, limit=cfg.get("max_tasks_per_night", 40) * 3) - tasks = mine(digests, 
max_tasks=cfg.get("max_tasks_per_night", 40), - holdout_fraction=cfg.get("holdout_fraction", 0.34), seed=cfg.get("seed", 42)) + session_limit = cfg.get("max_sessions_per_night", 0) or cfg.get("max_tasks_per_night", 40) * 3 + target_skill_path = cfg.managed_skill_path() if cfg.get("target_skill_path", "") else "" + target_skill_text = _read_text(target_skill_path) if target_skill_path else "" + max_tasks = cfg.get("max_tasks_per_night", 40) + candidate_limit = max_tasks + if cfg.get("target_task_filter", True) and target_skill_text: + candidate_limit = max(max_tasks, max_tasks * 3) + digests = harvest_for_config(cfg, limit=session_limit) + tasks = mine( + digests, + max_tasks=max_tasks, + candidate_limit=candidate_limit, + holdout_fraction=cfg.get("holdout_fraction", 0.34), + seed=cfg.get("seed", 42), + target_skill_text=target_skill_text, + target_skill_path=target_skill_path, + ) + payload = make_tasks_payload( + tasks, + project=cfg.get("invoked_project") or os.getcwd(), + transcript_source=cfg.get("transcript_source", ""), + n_sessions=len(digests), + target_skill_path=target_skill_path, + ) + output_path = "" + if getattr(args, "output", ""): + output_path = write_tasks_file(args.output, payload) if args.json: - print(json.dumps({ - "n_sessions": len(digests), - "tasks": [t.to_dict() for t in tasks], - }, ensure_ascii=False, indent=2)) + json_payload = dict(payload) + if output_path: + json_payload["output"] = output_path + print(json.dumps(json_payload, ensure_ascii=False, indent=2)) else: print(f"[sleep] {len(digests)} sessions -> {len(tasks)} tasks") + if output_path: + print(f"[sleep] wrote reviewed-task draft: {output_path}") for t in tasks: print(f" [{t.split}/{t.outcome}] {t.intent[:90]}") return 0 @@ -207,6 +309,7 @@ def main(argv=None) -> int: p_adopt.add_argument("--staging", default="", help="specific staging dir") p_harvest = sub.add_parser("harvest", help="debug: show mined tasks") _add_common(p_harvest) + p_harvest.add_argument("--output", 
default="", help="write mined tasks JSON for review") p_sched = sub.add_parser("schedule", help="install a nightly cron entry for this project") _add_common(p_sched) p_sched.add_argument("--hour", type=int, default=3) diff --git a/skillopt_sleep/backend.py b/skillopt_sleep/backend.py index 2ec5cdd6..4634e8fc 100644 --- a/skillopt_sleep/backend.py +++ b/skillopt_sleep/backend.py @@ -315,6 +315,8 @@ def __init__(self, model: str = "", timeout: int = 180) -> None: self.timeout = timeout self._tokens = 0 self._cache: Dict[str, str] = {} + self.last_call_error = "" + self.last_reflect_raw = "" # subclasses override -------------------------------------------------- def _call(self, prompt: str, *, max_tokens: int = 1024) -> str: @@ -691,15 +693,25 @@ class CodexCliBackend(CliBackend): name = "codex" - def __init__(self, model: str = "", codex_path: str = "", timeout: int = 240, - sandbox: str = "read-only") -> None: + def __init__( + self, + model: str = "", + codex_path: str = "", + timeout: int = 240, + sandbox: str = "read-only", + project_dir: str = "", + ) -> None: super().__init__(model=model or os.environ.get("SKILLOPT_SLEEP_CODEX_MODEL", ""), timeout=timeout) self.codex_path = resolve_codex_path(codex_path) self.sandbox = sandbox + self.project_dir = ( + os.path.abspath(os.path.expanduser(project_dir)) if project_dir else "" + ) def _call(self, prompt: str, *, max_tokens: int = 1024) -> str: import tempfile + self.last_call_error = "" out_path = tempfile.NamedTemporaryFile( prefix="codex_last_", suffix=".txt", delete=False ).name @@ -708,18 +720,39 @@ def _call(self, prompt: str, *, max_tokens: int = 1024) -> str: "--color", "never", "--sandbox", self.sandbox, "-o", out_path, ] + if self.project_dir: + cmd[3:3] = ["-C", self.project_dir] if self.model: cmd += ["-m", self.model] cmd += ["--", prompt] + proc = None try: - subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout) - except Exception: - return "" - try: - with open(out_path, 
encoding="utf-8") as f: - return f.read().strip() - except Exception: - return "" + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=self.timeout, + cwd=self.project_dir or None, + ) + except subprocess.TimeoutExpired: + self.last_call_error = f"codex exec timed out after {self.timeout}s" + return "" + except Exception as exc: + self.last_call_error = f"codex exec failed: {exc}" + return "" + try: + with open(out_path, encoding="utf-8") as f: + out = f.read().strip() + if out: + return out + except Exception as exc: + self.last_call_error = f"could not read codex output file: {exc}" + stdout = (proc.stdout or "").strip() if proc is not None else "" + stderr = (proc.stderr or "").strip() if proc is not None else "" + if proc is not None and proc.returncode != 0 and not self.last_call_error: + self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}" + return stdout or stderr finally: try: os.unlink(out_path) @@ -1025,12 +1058,13 @@ def get_backend( claude_path: str = "claude", codex_path: str = "", azure_endpoint: str = "", + project_dir: str = "", ) -> Backend: n = (name or "mock").strip().lower() if n in {"claude", "anthropic", "claude_cli", "claude_code"}: return ClaudeCliBackend(model=model, claude_path=claude_path) if n in {"codex", "codex_cli", "openai_codex"}: - return CodexCliBackend(model=model, codex_path=codex_path) + return CodexCliBackend(model=model, codex_path=codex_path, project_dir=project_dir) if n in {"azure", "azure_openai", "aoai"}: return AzureOpenAIBackend(deployment=model, endpoint=azure_endpoint) if n in {"azure-responses", "azure_responses", "aoai-responses", "responses"}: @@ -1050,6 +1084,7 @@ def build_backend( codex_path: str = "", azure_endpoint: str = "", preferences: str = "", + project_dir: str = "", ) -> Backend: """Build a single or dual backend. 
@@ -1060,13 +1095,21 @@ def build_backend( """ has_split = any([optimizer_backend, optimizer_model, target_backend, target_model]) if not has_split: - be = get_backend(backend, model=model, codex_path=codex_path, azure_endpoint=azure_endpoint) + be = get_backend( + backend, + model=model, + codex_path=codex_path, + azure_endpoint=azure_endpoint, + project_dir=project_dir, + ) be.preferences = preferences return be tgt = get_backend(target_backend or backend, model=target_model or model, - codex_path=codex_path, azure_endpoint=azure_endpoint) + codex_path=codex_path, azure_endpoint=azure_endpoint, + project_dir=project_dir) opt = get_backend(optimizer_backend or backend, model=optimizer_model or model, - codex_path=codex_path, azure_endpoint=azure_endpoint) + codex_path=codex_path, azure_endpoint=azure_endpoint, + project_dir=project_dir) opt.preferences = preferences # reflect runs on the optimizer dual = DualBackend(target=tgt, optimizer=opt) dual.preferences = preferences diff --git a/skillopt_sleep/config.py b/skillopt_sleep/config.py index 9ddeff7f..cade308c 100644 --- a/skillopt_sleep/config.py +++ b/skillopt_sleep/config.py @@ -47,6 +47,9 @@ "evolve_memory": True, # consolidate CLAUDE.md "evolve_skill": True, # consolidate the managed SKILL.md "llm_mine": True, # use the backend to mine checkable tasks (real backends) + "target_skill_path": "", # explicit SKILL.md target for repo-scoped agents + "target_task_filter": True, # prefer mined tasks matching target_skill_path/text + "progress": False, # print phase progress to stderr # ── adoption / safety ────────────────────────────────────────────────── "auto_adopt": False, # default: stage + require explicit `adopt` "managed_skill_name": "skillopt-sleep-learned", @@ -109,6 +112,13 @@ def skills_dir(self) -> str: return os.path.join(self.data["claude_home"], "skills") def managed_skill_path(self) -> str: + target = self.data.get("target_skill_path") or "" + if target: + target = os.path.expanduser(str(target)) + 
if not os.path.isabs(target): + base = self.data.get("invoked_project") or os.getcwd() + target = os.path.join(base, target) + return os.path.abspath(target) return os.path.join( self.skills_dir, self.data["managed_skill_name"], "SKILL.md" ) diff --git a/skillopt_sleep/cycle.py b/skillopt_sleep/cycle.py index c9f8a286..1435cb16 100644 --- a/skillopt_sleep/cycle.py +++ b/skillopt_sleep/cycle.py @@ -10,6 +10,7 @@ from __future__ import annotations import os +import sys from dataclasses import dataclass from typing import List, Optional @@ -49,6 +50,11 @@ def _read(path: str) -> str: return "" +def _progress(cfg: SleepConfig, message: str) -> None: + if cfg.get("progress", False): + print(f"[sleep] {message}", file=sys.stderr, flush=True) + + def _render_report_md(report: SleepReport, cfg: SleepConfig) -> str: lines = [ f"# SkillOpt-Sleep — night {report.night} report", @@ -108,6 +114,26 @@ def run_sleep_cycle( cfg.get("backend", "mock"), model=cfg.get("model", ""), codex_path=cfg.get("codex_path", ""), + project_dir=project, + ) + _progress(cfg, f"night {night}: project={project} backend={backend.name}") + + # ── live skill/memory docs ─────────────────────────────────────────── + live_memory_path = os.path.join(project, "CLAUDE.md") + live_skill_path = cfg.managed_skill_path() + _progress(cfg, f"live skill: {live_skill_path}") + raw_skill = _read(live_skill_path) + skill = raw_skill + memory = _read(live_memory_path) + if not skill: + skill = ensure_skill_scaffold( + "", name=cfg.get("managed_skill_name", "skillopt-sleep-learned"), + description="Preferences and procedures learned from past local agent sessions.", + ) + target_filter = bool( + cfg.get("target_task_filter", True) + and cfg.get("target_skill_path", "") + and raw_skill ) # ── 1+2. 
harvest + mine (unless seed_tasks injected) ───────────────── @@ -115,14 +141,25 @@ def run_sleep_cycle( if seed_tasks is not None: tasks = seed_tasks n_sessions = 0 + _progress(cfg, f"using {len(tasks)} seeded tasks") else: since = state.last_harvest_for(project) + max_tasks = cfg.get("max_tasks_per_night", 40) + max_sessions = cfg.get("max_sessions_per_night", 0) or max_tasks * 3 + candidate_limit = max_tasks + if target_filter: + candidate_limit = max(max_tasks, max_tasks * 3) + _progress( + cfg, + f"harvest start: source={cfg.get('transcript_source')} max_sessions={max_sessions}", + ) digests = harvest_for_config( cfg, since_iso=since, - limit=cfg.get("max_tasks_per_night", 40) * 3, + limit=max_sessions, ) n_sessions = len(digests) + _progress(cfg, f"harvest done: sessions={n_sessions}") # When a real backend is configured, use it to mine checkable tasks from # the transcripts (rubric/rule judges); otherwise fall back to the # heuristic miner (no API, no checkable reference). @@ -130,27 +167,29 @@ def run_sleep_cycle( if cfg.get("backend", "mock") != "mock" and cfg.get("llm_mine", True): try: from skillopt_sleep.llm_miner import make_llm_miner - llm_miner = make_llm_miner(backend, max_tasks=cfg.get("max_tasks_per_night", 40)) + llm_miner = make_llm_miner( + backend, + max_sessions=max_sessions, + max_tasks=candidate_limit, + ) except Exception: llm_miner = None + _progress( + cfg, + f"mine start: max_tasks={max_tasks} candidate_limit={candidate_limit} " + f"llm_mine={llm_miner is not None} target_filter={target_filter}", + ) tasks = mine( digests, - max_tasks=cfg.get("max_tasks_per_night", 40), + max_tasks=max_tasks, + candidate_limit=candidate_limit, holdout_fraction=cfg.get("holdout_fraction", 0.34), seed=cfg.get("seed", 42), llm_miner=llm_miner, + target_skill_text=raw_skill if target_filter else "", + target_skill_path=live_skill_path if target_filter else "", ) - - # ── live skill/memory docs ─────────────────────────────────────────── - live_memory_path = 
os.path.join(project, "CLAUDE.md") - live_skill_path = cfg.managed_skill_path() - skill = _read(live_skill_path) - memory = _read(live_memory_path) - if not skill: - skill = ensure_skill_scaffold( - "", name=cfg.get("managed_skill_name", "skillopt-sleep-learned"), - description="Preferences and procedures learned from past local agent sessions.", - ) + _progress(cfg, f"mine done: tasks={len(tasks)}") report = SleepReport( night=night, project=project, started_at=started, @@ -168,6 +207,7 @@ def run_sleep_cycle( return CycleOutcome(report, staging_dir, False, []) # ── 3+4. replay + consolidate (gate) ───────────────────────────────── + _progress(cfg, "consolidate start") result = consolidate( backend, tasks, skill, memory, edit_budget=cfg.get("edit_budget", 4), @@ -178,12 +218,18 @@ def run_sleep_cycle( evolve_memory=cfg.get("evolve_memory", True), night=night, ) + _progress( + cfg, + f"consolidate done: gate={result.gate_action} accepted={result.accepted} " + f"edits={len(result.applied_edits)} rejected={len(result.rejected_edits)}", + ) report.n_replayed = len(tasks) report.baseline_score = result.baseline_score report.candidate_score = result.candidate_score report.accepted = result.accepted report.gate_action = result.gate_action + report.no_edits_reason = getattr(result, "no_edits_reason", "") report.edits = result.applied_edits report.rejected_edits = result.rejected_edits report.tokens_used = backend.tokens_used() @@ -194,6 +240,7 @@ def run_sleep_cycle( adopted = False adopted_paths: List[str] = [] if not dry_run: + _progress(cfg, "staging start") report_md = _render_report_md(report, cfg) proposed_skill = result.new_skill if (cfg.get("evolve_skill") and result.accepted) else None proposed_memory = result.new_memory if (cfg.get("evolve_memory") and result.accepted) else None diff --git a/skillopt_sleep/mine.py b/skillopt_sleep/mine.py index 64d75461..44830574 100644 --- a/skillopt_sleep/mine.py +++ b/skillopt_sleep/mine.py @@ -15,8 +15,10 @@ from __future__ 
import annotations import hashlib +import os import re -from typing import Any, Callable, List, Optional +from collections import Counter +from typing import Any, Callable, List, Optional, Set, Tuple from skillopt_sleep.types import SessionDigest, TaskRecord @@ -39,6 +41,99 @@ def _looks_positive(signals: List[str]) -> bool: return any(s.startswith("pos:") for s in signals) +_TARGET_STOPWORDS = { + "about", "after", "again", "agent", "agents", "all", "also", "always", + "and", "any", "are", "before", "being", "but", "can", "codex", + "current", "default", "docs", "does", "done", "each", "file", "files", + "for", "from", "have", "into", "keep", "must", "not", "only", "path", + "paths", "project", "read", "repo", "request", "requests", "rule", + "rules", "same", "should", "skill", "skills", "source", "start", + "task", "tasks", "that", "the", "their", "then", "this", "unless", + "update", "user", "users", "when", "with", "work", "workflow", +} + + +def _target_tokens(text: str) -> List[str]: + tokens: List[str] = [] + for raw in re.findall(r"[\w][\w.-]*", (text or "").lower(), flags=re.UNICODE): + parts = [raw] + re.split(r"[\W_]+", raw, flags=re.UNICODE) + for part in parts: + if len(part) < 3 or part.isdigit() or part in _TARGET_STOPWORDS: + continue + tokens.append(part) + return tokens + + +def _expand_target_keywords(keywords: Set[str]) -> None: + if "mcp" in keywords: + keywords.update({ + "configure", "configuration", "connect", "connected", "enable", + "enabled", "install", "installed", "server", "servers", + "настрой", "настроить", "подключи", "подключить", + }) + if {"conflict", "conflicts"} & keywords: + keywords.update({ + "cherry", "conflict", "conflicts", "git", "merge", "rebase", + "unmerged", "конфликт", "конфликты", + }) + + +def target_task_keywords( + target_skill_text: str, + target_skill_path: str = "", + *, + limit: int = 180, +) -> Tuple[Set[str], Set[str]]: + """Return (strong, weak) keywords that describe a target skill.""" + path_text = 
(target_skill_path or "").replace(os.sep, " ") + headings = "\n".join(re.findall(r"(?m)^#+\s+(.+)$", target_skill_text or "")) + strong = set(_target_tokens(path_text + "\n" + headings)) + weak = set(strong) + counts = Counter(_target_tokens(target_skill_text or "")) + for token, _count in counts.most_common(limit): + weak.add(token) + _expand_target_keywords(strong) + _expand_target_keywords(weak) + return strong, weak + + +def _task_search_text(task: TaskRecord) -> str: + return "\n".join([ + task.intent or "", + task.context_excerpt or "", + " ".join(task.tags or []), + ]) + + +def filter_tasks_for_target( + tasks: List[TaskRecord], + target_skill_text: str, + target_skill_path: str = "", +) -> List[TaskRecord]: + """Prefer tasks whose language overlaps the explicit target skill. + + If nothing matches, return the original list. This keeps a target run useful + even when transcripts are too sparse or the skill is too generic. + """ + strong, weak = target_task_keywords(target_skill_text, target_skill_path) + if not tasks or not (strong or weak): + return tasks + + ranked = [] + for idx, task in enumerate(tasks): + tokens = set(_target_tokens(_task_search_text(task))) + strong_hits = tokens & strong + weak_hits = tokens & weak + if not strong_hits and len(weak_hits) < 2: + continue + score = len(strong_hits) * 3 + len(weak_hits) + ranked.append((score, idx, task)) + if not ranked: + return tasks + ranked.sort(key=lambda item: (-item[0], item[1])) + return [task for _score, _idx, task in ranked] + + def heuristic_mine( digests: List[SessionDigest], *, @@ -192,11 +287,15 @@ def mine( digests: List[SessionDigest], *, max_tasks: int = 40, + candidate_limit: int = 0, holdout_fraction: float = 0.34, seed: int = 42, llm_miner: Optional[Callable[[List[SessionDigest]], List[TaskRecord]]] = None, + target_skill_text: str = "", + target_skill_path: str = "", ) -> List[TaskRecord]: """Top-level miner. 
Uses ``llm_miner`` if provided, else heuristic.""" + candidate_limit = candidate_limit or max_tasks tasks: List[TaskRecord] = [] if llm_miner is not None: try: @@ -204,7 +303,10 @@ def mine( except Exception: tasks = [] if not tasks: - tasks = heuristic_mine(digests, max_tasks=max_tasks) + tasks = heuristic_mine(digests, max_tasks=candidate_limit) tasks = dedup_tasks(tasks) + if target_skill_text or target_skill_path: + tasks = filter_tasks_for_target(tasks, target_skill_text, target_skill_path) + tasks = tasks[:max_tasks] tasks = assign_splits(tasks, holdout_fraction=holdout_fraction, seed=seed) return tasks diff --git a/skillopt_sleep/tasks_file.py b/skillopt_sleep/tasks_file.py new file mode 100644 index 00000000..d89166b9 --- /dev/null +++ b/skillopt_sleep/tasks_file.py @@ -0,0 +1,81 @@ +"""Reviewed task-file helpers for privacy-safe SkillOpt-Sleep runs.""" +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Tuple + +from skillopt_sleep.mine import assign_splits, normalize_legacy_split +from skillopt_sleep.types import TaskRecord + + +def make_tasks_payload( + tasks: List[TaskRecord], + *, + project: str, + transcript_source: str = "", + n_sessions: int = 0, + target_skill_path: str = "", +) -> Dict[str, Any]: + return { + "format": "skillopt_sleep.tasks.v1", + "project": project, + "transcript_source": transcript_source, + "n_sessions": n_sessions, + "target_skill_path": target_skill_path, + "reviewed": False, + "tasks": [t.to_dict() for t in tasks], + } + + +def write_tasks_file(path: str, payload: Dict[str, Any]) -> str: + out = os.path.abspath(os.path.expanduser(path)) + parent = os.path.dirname(out) + if parent: + os.makedirs(parent, exist_ok=True) + with open(out, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2) + f.write("\n") + return out + + +def _normalize_tasks( + tasks: List[TaskRecord], + *, + holdout_fraction: float, + seed: int, +) -> List[TaskRecord]: + for task in 
tasks: + task.split = normalize_legacy_split(task.split or "train") + if len(tasks) >= 2 and not any(task.split in {"val", "test"} for task in tasks): + tasks = assign_splits(tasks, holdout_fraction=holdout_fraction, seed=seed) + return tasks + + +def load_tasks_file( + path: str, + *, + holdout_fraction: float = 0.34, + seed: int = 42, +) -> Tuple[List[TaskRecord], Dict[str, Any]]: + source = os.path.abspath(os.path.expanduser(path)) + with open(source, encoding="utf-8") as f: + payload = json.load(f) + if isinstance(payload, list): + meta: Dict[str, Any] = {"format": "skillopt_sleep.tasks.v1", "tasks_file": source} + raw_tasks = payload + elif isinstance(payload, dict): + meta = {k: v for k, v in payload.items() if k != "tasks"} + meta["tasks_file"] = source + raw_tasks = payload.get("tasks", []) + else: + raise ValueError("tasks file must contain a JSON object with tasks or a JSON task array") + if not isinstance(raw_tasks, list): + raise ValueError("tasks file field 'tasks' must be an array") + + tasks: List[TaskRecord] = [] + for item in raw_tasks: + if not isinstance(item, dict): + raise ValueError("each task entry must be a JSON object") + tasks.append(TaskRecord.from_dict(item)) + return _normalize_tasks(tasks, holdout_fraction=holdout_fraction, seed=seed), meta diff --git a/skillopt_sleep/types.py b/skillopt_sleep/types.py index 849c170a..6cfa6239 100644 --- a/skillopt_sleep/types.py +++ b/skillopt_sleep/types.py @@ -135,6 +135,7 @@ class SleepReport: candidate_score: float = 0.0 accepted: bool = False gate_action: str = "" + no_edits_reason: str = "" edits: List[EditRecord] = field(default_factory=list) rejected_edits: List[EditRecord] = field(default_factory=list) tokens_used: int = 0 diff --git a/tests/test_sleep_engine.py b/tests/test_sleep_engine.py index 1992dc74..5098d091 100644 --- a/tests/test_sleep_engine.py +++ b/tests/test_sleep_engine.py @@ -10,6 +10,7 @@ import os import tempfile import unittest +from unittest import mock from 
skillopt_sleep.backend import MockBackend, exact_score, keyword_soft_score from skillopt_sleep.config import load_config @@ -18,9 +19,9 @@ from skillopt_sleep.experiments.personas import programmer_persona, researcher_persona from skillopt_sleep.harvest import _detect_feedback, _is_meta_prompt, digest_transcript from skillopt_sleep.memory import apply_edits, current_learned_lines, extract_learned, set_learned -from skillopt_sleep.mine import assign_splits, heuristic_mine +from skillopt_sleep.mine import assign_splits, filter_tasks_for_target, heuristic_mine, mine from skillopt_sleep.staging import adopt -from skillopt_sleep.types import EditRecord, SessionDigest, TaskRecord +from skillopt_sleep.types import EditRecord, SessionDigest, SleepReport, TaskRecord class TestScoring(unittest.TestCase): @@ -180,6 +181,208 @@ def test_harvest_codex_filters_project_and_cli_source(self): self.assertEqual(digests[0].session_id, "rollout-yoshi") self.assertEqual(digests[0].user_prompts, ["fix Yoshi"]) + def test_cli_exposes_limits_progress_and_target_skill_path(self): + from skillopt_sleep.__main__ import _cfg_from_args + + with tempfile.TemporaryDirectory() as project: + Args = type("Args", (), { + "project": project, + "scope": "", + "backend": "codex", + "model": "", + "codex_path": "", + "claude_home": "", + "codex_home": "", + "source": "codex", + "lookback_hours": 0, + "edit_budget": 2, + "max_sessions": 5, + "max_tasks": 3, + "target_skill_path": ".agents/skills/taste-skill/SKILL.md", + "progress": True, + "auto_adopt": False, + }) + + cfg = _cfg_from_args(Args()) + + self.assertEqual(cfg.get("backend"), "codex") + self.assertEqual(cfg.get("max_sessions_per_night"), 5) + self.assertEqual(cfg.get("max_tasks_per_night"), 3) + self.assertTrue(cfg.get("progress")) + self.assertEqual( + cfg.managed_skill_path(), + os.path.join(project, ".agents/skills/taste-skill/SKILL.md"), + ) + + def test_cli_report_payload_includes_rejected_edits(self): + from skillopt_sleep.__main__ 
import _report_payload + + report = SleepReport( + night=1, + project="/p", + edits=[EditRecord("skill", "add", "accepted rule")], + rejected_edits=[EditRecord("skill", "add", "rejected rule")], + ) + outcome = type("Outcome", (), {"staging_dir": "", "adopted": False})() + + payload = _report_payload(report, outcome) + + self.assertEqual(payload["n_accepted_edits"], 1) + self.assertEqual(payload["n_rejected_edits"], 1) + self.assertEqual(payload["rejected_edits"][0]["content"], "rejected rule") + + def test_tasks_file_roundtrip_and_split_assignment(self): + from skillopt_sleep.tasks_file import load_tasks_file, make_tasks_payload, write_tasks_file + + with tempfile.TemporaryDirectory() as tmp: + path = os.path.join(tmp, "tasks.json") + payload = make_tasks_payload( + [ + TaskRecord(id="t1", project="/p", intent="configure MCP server"), + TaskRecord(id="t2", project="/p", intent="resolve Git conflict"), + ], + project="/p", + transcript_source="codex", + n_sessions=2, + target_skill_path="/p/.agents/skills/yoshi-monorepo/SKILL.md", + ) + + written = write_tasks_file(path, payload) + tasks, meta = load_tasks_file(written, holdout_fraction=0.5, seed=1) + + self.assertEqual(meta["target_skill_path"], "/p/.agents/skills/yoshi-monorepo/SKILL.md") + self.assertEqual([t.id for t in tasks], ["t1", "t2"]) + self.assertIn("val", {t.split for t in tasks}) + + def test_cfg_uses_tasks_file_target_skill_path_metadata(self): + from skillopt_sleep.__main__ import _cfg_from_args + + Args = type("Args", (), { + "project": "/repo/Yoshi", + "scope": "", + "backend": "", + "model": "", + "codex_path": "", + "claude_home": "", + "codex_home": "", + "source": "", + "lookback_hours": 0, + "edit_budget": 0, + "max_sessions": 0, + "max_tasks": 0, + "target_skill_path": "", + "progress": False, + "auto_adopt": False, + }) + + cfg = _cfg_from_args(Args(), task_meta={ + "target_skill_path": ".agents/skills/yoshi-monorepo/SKILL.md", + }) + + self.assertEqual( + cfg.managed_skill_path(), + 
"/repo/Yoshi/.agents/skills/yoshi-monorepo/SKILL.md", + ) + + def test_cmd_run_uses_tasks_file_without_harvest(self): + from contextlib import redirect_stdout + from io import StringIO + + from skillopt_sleep.__main__ import cmd_run + from skillopt_sleep.tasks_file import make_tasks_payload, write_tasks_file + + with tempfile.TemporaryDirectory() as project, tempfile.TemporaryDirectory() as home: + target = os.path.join(project, ".agents/skills/yoshi-monorepo/SKILL.md") + os.makedirs(os.path.dirname(target)) + with open(target, "w", encoding="utf-8") as f: + f.write("# Yoshi Monorepo\n") + tasks_path = os.path.join(home, "reviewed-tasks.json") + write_tasks_file( + tasks_path, + make_tasks_payload( + [ + TaskRecord(id="t1", project=project, intent="configure MCP server"), + TaskRecord(id="t2", project=project, intent="resolve Git conflict"), + ], + project=project, + n_sessions=2, + target_skill_path=target, + ), + ) + Args = type("Args", (), { + "project": project, + "scope": "", + "backend": "mock", + "model": "", + "codex_path": "", + "claude_home": os.path.join(home, ".claude"), + "codex_home": "", + "source": "", + "lookback_hours": 0, + "edit_budget": 2, + "max_sessions": 5, + "max_tasks": 3, + "target_skill_path": "", + "tasks_file": tasks_path, + "progress": False, + "auto_adopt": False, + "json": True, + }) + + out = StringIO() + with redirect_stdout(out): + rc = cmd_run(Args(), dry=True) + payload = json.loads(out.getvalue()) + + self.assertEqual(rc, 0) + self.assertEqual(payload["n_sessions"], 0) + self.assertEqual(payload["n_tasks"], 2) + self.assertEqual(payload["tasks_file"], tasks_path) + + def test_cmd_run_refuses_unreviewed_tasks_file_for_real_backend(self): + from contextlib import redirect_stderr + from io import StringIO + + from skillopt_sleep.__main__ import cmd_run + from skillopt_sleep.tasks_file import make_tasks_payload, write_tasks_file + + with tempfile.TemporaryDirectory() as project, tempfile.TemporaryDirectory() as home: + tasks_path 
= os.path.join(home, "reviewed-tasks.json") + write_tasks_file( + tasks_path, + make_tasks_payload( + [TaskRecord(id="t1", project=project, intent="configure MCP server")], + project=project, + target_skill_path=os.path.join(project, ".agents/skills/yoshi-monorepo/SKILL.md"), + ), + ) + Args = type("Args", (), { + "project": project, + "scope": "", + "backend": "codex", + "model": "", + "codex_path": "", + "claude_home": os.path.join(home, ".claude"), + "codex_home": "", + "source": "", + "lookback_hours": 0, + "edit_budget": 2, + "max_sessions": 0, + "max_tasks": 0, + "target_skill_path": "", + "tasks_file": tasks_path, + "progress": False, + "auto_adopt": False, + "json": True, + }) + + err = StringIO() + with redirect_stderr(err): + rc = cmd_run(Args(), dry=True) + + self.assertEqual(rc, 2) + self.assertIn("unreviewed tasks file", err.getvalue()) + class TestMine(unittest.TestCase): def _digest(self, prompts, feedback): @@ -220,6 +423,59 @@ def test_dream_never_in_val_or_test(self): # and val/test are disjoint (a task is in exactly one split) self.assertTrue(any(t.split == "val" for t in tasks)) + def test_target_filter_prefers_matching_skill_terms(self): + skill = """# Yoshi Monorepo + +## MCP Setup Requests +Configure Codex MCP servers from linked setup docs. + +## Local Git Conflicts +Resolve local Git conflicts during merge, rebase, or cherry-pick. 
+""" + tasks = [ + TaskRecord(id="ios", project="/p", intent="polish SwiftUI onboarding spacing"), + TaskRecord(id="mcp", project="/p", intent="configure an MCP server from docs"), + TaskRecord(id="git", project="/p", intent="resolve a local Git conflict"), + TaskRecord(id="api", project="/p", intent="deploy the Rails API with Kamal"), + ] + + filtered = filter_tasks_for_target( + tasks, + skill, + ".agents/skills/yoshi-monorepo/SKILL.md", + ) + + self.assertEqual({t.id for t in filtered}, {"mcp", "git"}) + + def test_mine_oversamples_before_target_filtering(self): + skill = """# Yoshi Monorepo + +## MCP Setup Requests +Configure Codex MCP servers. + +## Local Git Conflicts +Resolve local Git conflicts. +""" + digests = [ + self._digest(["polish SwiftUI onboarding spacing"], ["neg:missed"]), + self._digest(["configure an MCP server from docs"], ["neg:missed"]), + self._digest(["resolve a local Git conflict"], ["neg:missed"]), + ] + + tasks = mine( + digests, + max_tasks=2, + candidate_limit=3, + target_skill_text=skill, + target_skill_path=".agents/skills/yoshi-monorepo/SKILL.md", + seed=42, + ) + + self.assertEqual({t.intent for t in tasks}, { + "configure an MCP server from docs", + "resolve a local Git conflict", + }) + class TestConsolidateGate(unittest.TestCase): def test_accepts_helpful_rejects_harmful(self): @@ -366,6 +622,39 @@ def test_replay_records_cost(self): self.assertGreaterEqual(r.latency_ms, 0.0) +class TestCodexBackend(unittest.TestCase): + def test_codex_cli_backend_runs_exec_in_project_dir(self): + from skillopt_sleep.backend import CodexCliBackend + + calls = [] + + def fake_run(cmd, **kwargs): + calls.append((cmd, kwargs)) + out_path = cmd[cmd.index("-o") + 1] + with open(out_path, "w", encoding="utf-8") as f: + f.write("ok") + + class Proc: + returncode = 0 + stdout = "" + stderr = "" + + return Proc() + + with tempfile.TemporaryDirectory() as project: + expected_project = os.path.abspath(project) + backend = 
CodexCliBackend(codex_path="codex", project_dir=project) + + with mock.patch("skillopt_sleep.backend.subprocess.run", side_effect=fake_run): + self.assertEqual(backend._call("hello"), "ok") + + self.assertEqual(len(calls), 1) + cmd, kwargs = calls[0] + self.assertEqual(kwargs["cwd"], expected_project) + self.assertIn("-C", cmd) + self.assertEqual(cmd[cmd.index("-C") + 1], expected_project) + + class TestMultiRolloutAndBudget(unittest.TestCase): def test_rolloutset_stats(self): from skillopt_sleep.rollout import RolloutSet @@ -508,6 +797,33 @@ def test_cycle_stage_then_adopt_with_backup(self): with open(live_skill) as f: self.assertIn("answer", f.read().lower()) + def test_cycle_can_target_repo_scoped_skill_path(self): + with tempfile.TemporaryDirectory() as proj, tempfile.TemporaryDirectory() as home: + target = os.path.join(proj, ".agents/skills/taste-skill/SKILL.md") + cfg = load_config( + invoked_project=proj, + projects="invoked", + backend="mock", + claude_home=os.path.join(home, ".claude"), + target_skill_path=target, + auto_adopt=False, + ) + tasks = assign_splits(programmer_persona(), holdout_fraction=0.34, seed=42) + + outcome = run_sleep_cycle(cfg, seed_tasks=tasks) + + self.assertTrue(outcome.report.accepted) + manifest_path = os.path.join(outcome.staging_dir, "manifest.json") + with open(manifest_path, encoding="utf-8") as f: + manifest = json.load(f) + self.assertEqual(manifest["live_skill_path"], target) + self.assertFalse(os.path.exists(target)) + + updated = adopt(outcome.staging_dir) + + self.assertIn(target, updated) + self.assertTrue(os.path.exists(target)) + if __name__ == "__main__": unittest.main(verbosity=2)