diff --git a/.github/cursor-review/extract-findings.py b/.github/cursor-review/extract-findings.py index 8c774eb..42d0a08 100644 --- a/.github/cursor-review/extract-findings.py +++ b/.github/cursor-review/extract-findings.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 """Parse raw cursor-agent output into a normalized findings record. -Used by per-cell matrix steps. Each cell calls this to convert the model's -raw stdout into a JSON file the consolidate step can ingest. The output is -always structured — even on parse failures or empty output — so the -consolidate step has a uniform input. +Used by per-cell matrix steps AND by the judge/consolidate step. Each caller +converts the model's raw stdout into a JSON file the next step can ingest. The +output is always structured — even on parse failures or empty output — so the +downstream step has a uniform input. Output shape: { @@ -21,35 +21,110 @@ import re +def _try_load(snippet: str): + """json.loads `snippet`, returning the value only if it's a list or dict. + + A bare number/string/bool is never a findings payload, so reject it — this + keeps the brace-scan below from "succeeding" on a stray scalar. + """ + try: + value = json.loads(snippet) + except (json.JSONDecodeError, ValueError): + return None + return value if isinstance(value, (list, dict)) else None + + +def _iter_json_candidates(text: str): + """Yield each top-level balanced {...} / [...] region embedded in `text`. + + String- and escape-aware: braces or brackets inside JSON string literals + don't throw off the nesting count, so prose like `... the findings […] are` + surrounding a real array doesn't corrupt the match the way a naive + first-`[`/last-`]` slice does. Regions are yielded in document order; the + caller parses each and takes the first that loads. + """ + openers = {"{", "["} + closers = {"}", "]"} + i, n = 0, len(text) + while i < n: + if text[i] not in openers: + i += 1 + continue + depth = 0 + in_str = False + escape = False + j = i + while j < n: + c = text[j] + if in_str: + if escape: + escape = False + elif c == "\\": + escape = True + elif c == '"': + in_str = False + elif c == '"': + in_str = True + elif c in openers: + depth += 1 + elif c in closers: + depth -= 1 + if depth == 0: + yield text[i : j + 1] + break + j += 1 + # Resume scanning after this region (or after an unbalanced opener). + i = j + 1 + + def parse_json_findings(raw_text: str): - """Extract a JSON array from raw model output, tolerating surrounding prose. + """Extract a JSON value (array or object) from raw model output. - Returns the parsed value, or None if no JSON array could be located. + Tolerates surrounding prose and markdown fences. Returns the parsed value + (list or dict), or None if no JSON could be located. Layered most- to + least-strict so a clean response takes the fast path: + + 1. The whole output is JSON. + 2. A fenced ```json (or bare ```) block holds the JSON. + 3. A balanced {...}/[...] region is embedded in prose. """ text = raw_text.strip() - try: - return json.loads(text) - except json.JSONDecodeError: - pass - - fenced = re.search(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL) - if fenced: - try: - return json.loads(fenced.group(1).strip()) - except json.JSONDecodeError: - pass - - start = text.find("[") - end = text.rfind("]") - if start != -1 and end != -1 and end > start: - try: - return json.loads(text[start : end + 1]) - except json.JSONDecodeError: - pass + + parsed = _try_load(text) + if parsed is not None: + return parsed + + for match in re.finditer(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL): + parsed = _try_load(match.group(1).strip()) + if parsed is not None: + return parsed + + for candidate in _iter_json_candidates(text): + parsed = _try_load(candidate) + if parsed is not None: + return parsed return None +def coerce_findings_list(parsed): + """Reduce a parsed JSON value to the findings list, or None if it isn't one. + + The panel cells and judge are all asked for a bare JSON array, but a model + intermittently wraps it as `{"findings": [...]}` (or a near-synonym key). + Unwrap those so a well-formed-but-wrapped response parses instead of being + discarded as a parse_error. + """ + if isinstance(parsed, list): + return parsed + if isinstance(parsed, dict): + for key in ("findings", "results", "items", "reviews"): + value = parsed.get(key) + if isinstance(value, list): + return value + return None + + def main(): parser = argparse.ArgumentParser() parser.add_argument("--raw", required=True, help="Path to raw cursor-agent output") @@ -75,7 +150,7 @@ def main(): json.dump(record, f) return - findings = parse_json_findings(raw) + findings = coerce_findings_list(parse_json_findings(raw)) if findings is None: # Truncate raw so artifacts stay small even on chatty parse failures. @@ -84,12 +159,6 @@ def main(): error=f"Could not parse JSON findings from output. First 500 chars:\n{raw[:500]}", findings=[], ) - elif not isinstance(findings, list): - record.update( - status="parse_error", - error=f"Output parsed but is not an array (got {type(findings).__name__}).", - findings=[], - ) else: record.update(status="ok", findings=findings) diff --git a/.github/cursor-review/post-review.py b/.github/cursor-review/post-review.py index 118a953..9a83bfe 100644 --- a/.github/cursor-review/post-review.py +++ b/.github/cursor-review/post-review.py @@ -262,10 +262,19 @@ def main(): parser.add_argument("--commit-sha", required=True) parser.add_argument("--triggered-by", default=None) parser.add_argument("--error-message", default=None, help="If set, post an error review with this message") + parser.add_argument( + "--notice", + default=None, + help="Banner prepended to the review body (e.g. a judge-failed degradation note).", + ) args = parser.parse_args() attribution = f"\n\n_Triggered by @{args.triggered_by}._" if args.triggered_by else "" header = f"## 🔍 Cursor Review — Consolidated panel{attribution}" + if args.notice: + # Surface a degradation banner (judge failed → raw panel findings) right + # under the title so every rendered body carries it. + header += f"\n\n{neutralize_mentions(args.notice)}" if args.error_message: post_error_review(args.repo, args.pr_number, args.commit_sha, header, args.error_message) diff --git a/.github/cursor-review/prompt-judge.md b/.github/cursor-review/prompt-judge.md index a372fb8..7af5d28 100644 --- a/.github/cursor-review/prompt-judge.md +++ b/.github/cursor-review/prompt-judge.md @@ -1,3 +1,8 @@ +RESPOND WITH ONLY A JSON ARRAY. Your entire response must be a single JSON +array of finding objects (or `[]` if none) — no prose, no preamble, no +explanation, no markdown code fences before or after it. Do not narrate your +reasoning. The exact element schema is specified at the end of this prompt. + You are a senior software engineer adjudicating findings from a panel of AI code reviewers. The panel ran a 4-lab × 2-review-type matrix (8 cells total): - Labs: OpenAI, Anthropic, Google, Moonshot diff --git a/.github/cursor-review/tests/test_extract_findings.py b/.github/cursor-review/tests/test_extract_findings.py new file mode 100644 index 0000000..e735138 --- /dev/null +++ b/.github/cursor-review/tests/test_extract_findings.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +"""Regression tests for extract-findings.py JSON recovery. + +The judge/consolidate step intermittently returns valid findings JSON wrapped +in a sentence or two of prose. Discarding the whole run on that (the BE-1916 +parse_error class — hit on ComfyUI #487 and Alex's PR) is the bug these tests +guard against: prose-wrapped JSON must be recovered, not thrown away. + +Run: python3 .github/cursor-review/tests/test_extract_findings.py +""" + +import importlib.util +import json +import os +import tempfile +import unittest + +# extract-findings.py has a hyphen, so import it by path rather than `import`. +_MODULE_PATH = os.path.join(os.path.dirname(__file__), "..", "extract-findings.py") +_spec = importlib.util.spec_from_file_location("extract_findings", _MODULE_PATH) +ef = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(ef) + + +# A real-finding payload reused across cases. +FINDINGS = [ + { + "file": "internal/api/handler.go", + "line": 42, + "side": "RIGHT", + "severity": "high", + "body": "User-supplied filename reaches os.Open without traversal checks.", + }, + { + "file": "internal/worker/upload.go", + "line": 118, + "side": "RIGHT", + "severity": "medium", + "body": "Context is cancelled before the upload completes [see hunk], losing data.", + }, +] + + +def _findings(raw): + """Mirror main()'s parse → coerce pipeline.""" + return ef.coerce_findings_list(ef.parse_json_findings(raw)) + + +class ParseJsonFindingsTest(unittest.TestCase): + def test_bare_array(self): + self.assertEqual(_findings(json.dumps(FINDINGS)), FINDINGS) + + def test_empty_array(self): + self.assertEqual(_findings("[]"), []) + + def test_prose_wrapped_array_487(self): + # The #487 failure shape: a prose verdict, THEN the JSON array. The + # prose even contains a bracket pair, which is what broke the old naive + # first-`[`/last-`]` slice. + raw = ( + "Based on my review of the actual code, I can adjudicate the two " + "findings [both raised by multiple reviewers] as follows. Here is " + "the consolidated result:\n\n" + json.dumps(FINDINGS) + "\n\n" + "These are the only items that rise to the bar." + ) + self.assertEqual(_findings(raw), FINDINGS) + + def test_fenced_json_block(self): + raw = ( + "Sure — here are the findings:\n\n```json\n" + + json.dumps(FINDINGS) + + "\n```\nLet me know if you need more." + ) + self.assertEqual(_findings(raw), FINDINGS) + + def test_fenced_block_no_lang(self): + raw = "Result:\n\n```\n" + json.dumps(FINDINGS) + "\n```\n" + self.assertEqual(_findings(raw), FINDINGS) + + def test_object_wrapped_findings(self): + raw = "Here you go:\n" + json.dumps({"findings": FINDINGS}) + self.assertEqual(_findings(raw), FINDINGS) + + def test_brackets_inside_string_literals(self): + # Brackets inside a body string must not confuse the balanced scanner. + payload = [{"file": "a.py", "line": 1, "side": "RIGHT", "body": "uses arr[i] and m['k']"}] + raw = "Verdict below.\n" + json.dumps(payload) + self.assertEqual(_findings(raw), payload) + + def test_genuinely_malformed_returns_none(self): + self.assertIsNone(_findings("I reviewed the code and found nothing actionable.")) + + def test_truncated_json_returns_none(self): + # An unterminated array can't be recovered — must fail, not half-parse. + self.assertIsNone(_findings('[{"file": "a.py", "line": 1, "body": "oops"')) + + def test_scalar_is_not_findings(self): + # A bare number is valid JSON but never a findings payload. + self.assertIsNone(_findings("42")) + + def test_object_without_findings_key(self): + self.assertIsNone(_findings('{"summary": "looks good", "count": 0}')) + + +class MainEndToEndTest(unittest.TestCase): + """Drive main() the way the workflow does, asserting the status field.""" + + def _run(self, raw_text): + with tempfile.TemporaryDirectory() as d: + raw_path = os.path.join(d, "raw.txt") + out_path = os.path.join(d, "out.json") + with open(raw_path, "w", encoding="utf-8") as f: + f.write(raw_text) + import sys + + argv = sys.argv + sys.argv = [ + "extract-findings.py", + "--raw", raw_path, + "--out", out_path, + "--model", "judge-model", + "--review-type", "judge", + ] + try: + ef.main() + finally: + sys.argv = argv + with open(out_path, encoding="utf-8") as f: + return json.load(f) + + def test_prose_wrapped_parses_ok(self): + raw = "After review, my verdict is:\n\n" + json.dumps(FINDINGS) + record = self._run(raw) + self.assertEqual(record["status"], "ok") + self.assertEqual(record["findings"], FINDINGS) + + def test_malformed_is_parse_error(self): + record = self._run("I could not find anything worth flagging.") + self.assertEqual(record["status"], "parse_error") + self.assertEqual(record["findings"], []) + + def test_empty_is_empty(self): + record = self._run(" \n ") + self.assertEqual(record["status"], "empty") + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/workflows/cursor-review.yml b/.github/workflows/cursor-review.yml index 5531a15..1561c0f 100644 --- a/.github/workflows/cursor-review.yml +++ b/.github/workflows/cursor-review.yml @@ -508,12 +508,16 @@ jobs: env: CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }} run: | - cursor-agent \ - --print \ - --trust \ - --model "$JUDGE_MODEL" \ - < /tmp/judge-prompt.txt \ - > /tmp/judge-raw.txt 2>/tmp/judge-stderr.txt || true + run_judge() { + cursor-agent \ + --print \ + --trust \ + --model "$JUDGE_MODEL" \ + < "$1" \ + > /tmp/judge-raw.txt 2>/tmp/judge-stderr.txt || true + } + + run_judge /tmp/judge-prompt.txt echo "=== Judge raw output (first 1000 chars) ===" head -c 1000 /tmp/judge-raw.txt @@ -521,6 +525,35 @@ jobs: echo "=== Judge stderr ===" cat /tmp/judge-stderr.txt + # LLMs intermittently wrap the findings array in prose. extract-findings.py + # recovers most of that (fenced block / embedded brace-match), but if it + # still can't parse, retry the judge ONCE with a terse strict-JSON + # reminder appended — a single retry clears the large majority of these + # transient parse failures. The retry overwrites /tmp/judge-raw.txt, so + # the parse step below reads whichever attempt the judge last produced. + python3 "$CURSOR_REVIEW_ASSETS/extract-findings.py" \ + --raw /tmp/judge-raw.txt \ + --out /tmp/judge-probe.json \ + --model "$JUDGE_MODEL" \ + --review-type judge + PROBE_STATUS=$(python3 -c "import json; print(json.load(open('/tmp/judge-probe.json')).get('status','?'))") + + if [ "$PROBE_STATUS" != "ok" ]; then + echo "Judge output did not parse (status=$PROBE_STATUS) — retrying once with a strict-JSON reminder." + { + cat /tmp/judge-prompt.txt + echo "" + echo "=== REMINDER ===" + echo "Your previous response could not be parsed as JSON. Return ONLY the JSON array of findings — no prose, no explanation, no markdown code fences. Your entire response must be a single JSON array (return [] if there are no findings)." + } > /tmp/judge-prompt-retry.txt + run_judge /tmp/judge-prompt-retry.txt + echo "=== Judge retry raw output (first 1000 chars) ===" + head -c 1000 /tmp/judge-raw.txt + echo "" + echo "=== Judge retry stderr ===" + cat /tmp/judge-stderr.txt + fi + - name: Build consolidated findings file id: consolidated env: @@ -550,9 +583,13 @@ jobs: # Combine judge findings with panel metadata (model, review_type, # status only — drop full per-cell findings to keep the post payload - # small) for the post-review script. + # small) for the post-review script. If the judge FAILED (still no + # parseable verdict after the retry), fall back to the raw per-cell + # findings so the run surfaces what the panel found instead of a bare + # "Review failed". python3 - <<'PY' - import json + import json, os + with open("/tmp/panel.json", encoding="utf-8") as f: panel = json.load(f) panel_meta = [ @@ -561,11 +598,39 @@ jobs: ] with open("/tmp/judge-findings.json", encoding="utf-8") as f: judge = json.load(f) - findings = judge.get("findings") or [] + + degraded = judge.get("status") != "ok" + if not degraded: + findings = judge.get("findings") or [] + else: + # Judge unusable — union the per-cell findings, deduping identical + # (file, line, body) across cells to cut the obvious cross-cell + # repetition the judge would normally collapse. + seen, findings = set(), [] + for cell in panel: + if cell.get("status") != "ok": + continue + for fnd in cell.get("findings") or []: + if not isinstance(fnd, dict): + continue + key = (fnd.get("file"), fnd.get("line"), fnd.get("body")) + if key in seen: + continue + seen.add(key) + findings.append(fnd) + consolidated = {"findings": findings, "panel": panel_meta} with open("/tmp/consolidated.json", "w", encoding="utf-8") as f: json.dump(consolidated, f) - print(f"Consolidated: {len(findings)} judge finding(s), {len(panel_meta)} panel cells.") + + with open(os.environ["GITHUB_OUTPUT"], "a") as g: + g.write(f"consolidated_count={len(findings)}\n") + g.write(f"degraded={'true' if degraded else 'false'}\n") + print( + f"Consolidated: {len(findings)} finding(s) " + f"({'degraded panel fallback' if degraded else 'judge-adjudicated'}), " + f"{len(panel_meta)} panel cells." + ) PY - name: Resolve triggerer @@ -610,12 +675,21 @@ jobs: HEAD_SHA: ${{ github.event.pull_request.head.sha }} TRIGGERED_BY: ${{ steps.triggerer.outputs.triggered_by }} JUDGE_STATUS: ${{ steps.consolidated.outputs.judge_status }} + CONSOLIDATED_COUNT: ${{ steps.consolidated.outputs.consolidated_count }} run: | - # If the judge call itself failed but the panel had content, post - # an error review so we don't silently misreport. The "ok with - # zero findings" path stays normal — that's the legitimate - # "no high-signal findings" case. - if [ "$JUDGE_STATUS" != "ok" ]; then + # Three cases: + # 1. Judge ok → normal consolidated review. + # 2. Judge failed, panel has findings → post the raw per-cell findings + # with a notice banner (degraded, but NOT a bare "Review failed"). + # 3. Judge failed, no findings → honest error review. + if [ "$JUDGE_STATUS" = "ok" ]; then + python3 "$CURSOR_REVIEW_ASSETS/post-review.py" \ + --findings /tmp/consolidated.json \ + --pr-number "$PR_NUMBER" \ + --repo "$REPO" \ + --commit-sha "$HEAD_SHA" \ + --triggered-by "$TRIGGERED_BY" + elif [ "${CONSOLIDATED_COUNT:-0}" != "0" ]; then JUDGE_ERROR=$(python3 -c "import json; print(json.load(open('/tmp/judge-findings.json')).get('error','(no error message)'))") python3 "$CURSOR_REVIEW_ASSETS/post-review.py" \ --findings /tmp/consolidated.json \ @@ -623,14 +697,16 @@ jobs: --repo "$REPO" \ --commit-sha "$HEAD_SHA" \ --triggered-by "$TRIGGERED_BY" \ - --error-message "Judge call failed (status=${JUDGE_STATUS}): ${JUDGE_ERROR}" + --notice "⚠️ The judge step could not produce a parseable verdict after a retry (status=${JUDGE_STATUS}). The findings below are the raw, un-adjudicated panel output — they may contain duplicates or false positives the judge would normally filter." else + JUDGE_ERROR=$(python3 -c "import json; print(json.load(open('/tmp/judge-findings.json')).get('error','(no error message)'))") python3 "$CURSOR_REVIEW_ASSETS/post-review.py" \ --findings /tmp/consolidated.json \ --pr-number "$PR_NUMBER" \ --repo "$REPO" \ --commit-sha "$HEAD_SHA" \ - --triggered-by "$TRIGGERED_BY" + --triggered-by "$TRIGGERED_BY" \ + --error-message "Judge call failed (status=${JUDGE_STATUS}): ${JUDGE_ERROR}" fi notify-start: diff --git a/.github/workflows/test-cursor-review-scripts.yml b/.github/workflows/test-cursor-review-scripts.yml new file mode 100644 index 0000000..fde5e6e --- /dev/null +++ b/.github/workflows/test-cursor-review-scripts.yml @@ -0,0 +1,38 @@ +name: Test cursor-review scripts + +# Runs the Python unit tests for the cursor-review helper scripts (the +# extract-findings.py JSON parser in particular). These scripts drive the +# review panel + judge consolidation, so a parser regression silently breaks +# every consumer repo's review — cheap to guard with a unit run on change. + +on: + pull_request: + paths: + - '.github/cursor-review/**' + - '.github/workflows/test-cursor-review-scripts.yml' + push: + branches: [main] + paths: + - '.github/cursor-review/**' + - '.github/workflows/test-cursor-review-scripts.yml' + +permissions: + contents: read + +jobs: + test: + name: unittest + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + + - name: Run cursor-review script tests + run: python3 -m unittest discover -s .github/cursor-review/tests -p 'test_*.py' -v