Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 101 additions & 32 deletions .github/cursor-review/extract-findings.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
"""Parse raw cursor-agent output into a normalized findings record.

Used by per-cell matrix steps. Each cell calls this to convert the model's
raw stdout into a JSON file the consolidate step can ingest. The output is
always structured — even on parse failures or empty output — so the
consolidate step has a uniform input.
Used by per-cell matrix steps AND by the judge/consolidate step. Each caller
converts the model's raw stdout into a JSON file the next step can ingest. The
output is always structured — even on parse failures or empty output — so the
downstream step has a uniform input.

Output shape:
{
Expand All @@ -21,35 +21,110 @@
import re


def _try_load(snippet: str):
"""json.loads `snippet`, returning the value only if it's a list or dict.

A bare number/string/bool is never a findings payload, so reject it — this
keeps the brace-scan below from "succeeding" on a stray scalar.
"""
try:
value = json.loads(snippet)
except (json.JSONDecodeError, ValueError):
return None
return value if isinstance(value, (list, dict)) else None


def _iter_json_candidates(text: str):
"""Yield each top-level balanced {...} / [...] region embedded in `text`.

String- and escape-aware: braces or brackets inside JSON string literals
don't throw off the nesting count, so prose like `... the findings […] are`
surrounding a real array doesn't corrupt the match the way a naive
first-`[`/last-`]` slice does. Regions are yielded in document order; the
caller parses each and takes the first that loads.
"""
openers = {"{", "["}
closers = {"}", "]"}
i, n = 0, len(text)
while i < n:
if text[i] not in openers:
i += 1
continue
depth = 0
in_str = False
escape = False
j = i
while j < n:
c = text[j]
if in_str:
if escape:
escape = False
elif c == "\\":
escape = True
elif c == '"':
in_str = False
elif c == '"':
in_str = True
elif c in openers:
depth += 1
elif c in closers:
depth -= 1
if depth == 0:
yield text[i : j + 1]
break
j += 1
# Resume scanning after this region (or after an unbalanced opener).
i = j + 1
Comment on lines +76 to +77

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

Don’t stop scanning after an unterminated opener.

If the prose contains a stray [ or { that never balances, Line 77 jumps i to EOF and the parser never reaches a later valid JSON payload. One bad bracket, big racket: a response like note [draft...\n[{"file":...}] still ends up as parse_error.

Proposed fix
-        # Resume scanning after this region (or after an unbalanced opener).
-        i = j + 1
+        # Resume after a closed region; otherwise advance one char so later JSON
+        # candidates are still reachable.
+        if j < n and depth == 0:
+            i = j + 1
+        else:
+            i += 1
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Resume scanning after this region (or after an unbalanced opener).
i = j + 1
# Resume after a closed region; otherwise advance one char so later JSON
# candidates are still reachable.
if j < n and depth == 0:
i = j + 1
else:
i += 1
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In @.github/cursor-review/extract-findings.py around lines 76 - 77, The scanning
logic in the bracket parser is incorrectly advancing past the rest of the input
when it encounters an unterminated opener, which can hide a later valid JSON
payload. Update the loop in the extraction routine that uses the `i = j + 1`
resume step so an unbalanced `[` or `{` only skips the bad region and continues
scanning from the next viable position instead of jumping to EOF. Keep the fix
localized to the parser’s main scan path so later valid content can still be
detected.



def parse_json_findings(raw_text: str):
"""Extract a JSON array from raw model output, tolerating surrounding prose.
"""Extract a JSON value (array or object) from raw model output.

Returns the parsed value, or None if no JSON array could be located.
Tolerates surrounding prose and markdown fences. Returns the parsed value
(list or dict), or None if no JSON could be located. Layered most- to
least-strict so a clean response takes the fast path:

1. The whole output is JSON.
2. A fenced ```json (or bare ```) block holds the JSON.
3. A balanced {...}/[...] region is embedded in prose.
"""
text = raw_text.strip()
try:
return json.loads(text)
except json.JSONDecodeError:
pass

fenced = re.search(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL)
if fenced:
try:
return json.loads(fenced.group(1).strip())
except json.JSONDecodeError:
pass

start = text.find("[")
end = text.rfind("]")
if start != -1 and end != -1 and end > start:
try:
return json.loads(text[start : end + 1])
except json.JSONDecodeError:
pass

parsed = _try_load(text)
if parsed is not None:
return parsed

for match in re.finditer(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL):
parsed = _try_load(match.group(1).strip())
if parsed is not None:
return parsed

for candidate in _iter_json_candidates(text):
parsed = _try_load(candidate)
if parsed is not None:
return parsed

return None


def coerce_findings_list(parsed):
"""Reduce a parsed JSON value to the findings list, or None if it isn't one.

The panel cells and judge are all asked for a bare JSON array, but a model
intermittently wraps it as `{"findings": [...]}` (or a near-synonym key).
Unwrap those so a well-formed-but-wrapped response parses instead of being
discarded as a parse_error.
"""
if isinstance(parsed, list):
return parsed
if isinstance(parsed, dict):
for key in ("findings", "results", "items", "reviews"):
value = parsed.get(key)
if isinstance(value, list):
return value
Comment on lines +120 to +124

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗄️ Data Integrity & Integration | 🟠 Major | ⚡ Quick win

Validate wrapper contents before returning status="ok".

coerce_findings_list() unwraps any list under results, items, or reviews, so payloads like {"results":["oops"]} are now treated as successful findings. That breaks the downstream contract in main() and can push malformed data into the judge/post-review path instead of surfacing a parse/schema failure.

Proposed fix
+def _is_findings_list(value):
+    required = {"file", "line", "side", "severity", "body"}
+    return isinstance(value, list) and all(
+        isinstance(item, dict) and required.issubset(item) for item in value
+    )
+
 def coerce_findings_list(parsed):
@@
-    if isinstance(parsed, list):
+    if _is_findings_list(parsed):
         return parsed
     if isinstance(parsed, dict):
         for key in ("findings", "results", "items", "reviews"):
             value = parsed.get(key)
-            if isinstance(value, list):
+            if _is_findings_list(value):
                 return value
     return None
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if isinstance(parsed, dict):
for key in ("findings", "results", "items", "reviews"):
value = parsed.get(key)
if isinstance(value, list):
return value
def _is_findings_list(value):
required = {"file", "line", "side", "severity", "body"}
return isinstance(value, list) and all(
isinstance(item, dict) and required.issubset(item) for item in value
)
if _is_findings_list(parsed):
return parsed
if isinstance(parsed, dict):
for key in ("findings", "results", "items", "reviews"):
value = parsed.get(key)
if _is_findings_list(value):
return value
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In @.github/cursor-review/extract-findings.py around lines 120 - 124, The
wrapper-unwrapping logic in coerce_findings_list() is too permissive because it
accepts any list under keys like results, items, or reviews, which lets
malformed payloads through as status="ok". Tighten the validation in
coerce_findings_list() so it only returns lists whose contents match the
expected findings schema, and otherwise let main() treat the payload as a
parse/schema failure instead of a successful result.

return None


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--raw", required=True, help="Path to raw cursor-agent output")
Expand All @@ -75,7 +150,7 @@ def main():
json.dump(record, f)
return

findings = parse_json_findings(raw)
findings = coerce_findings_list(parse_json_findings(raw))

if findings is None:
# Truncate raw so artifacts stay small even on chatty parse failures.
Expand All @@ -84,12 +159,6 @@ def main():
error=f"Could not parse JSON findings from output. First 500 chars:\n{raw[:500]}",
findings=[],
)
elif not isinstance(findings, list):
record.update(
status="parse_error",
error=f"Output parsed but is not an array (got {type(findings).__name__}).",
findings=[],
)
else:
record.update(status="ok", findings=findings)

Expand Down
9 changes: 9 additions & 0 deletions .github/cursor-review/post-review.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,19 @@ def main():
parser.add_argument("--commit-sha", required=True)
parser.add_argument("--triggered-by", default=None)
parser.add_argument("--error-message", default=None, help="If set, post an error review with this message")
parser.add_argument(
"--notice",
default=None,
help="Banner prepended to the review body (e.g. a judge-failed degradation note).",
)
args = parser.parse_args()

attribution = f"\n\n_Triggered by @{args.triggered_by}._" if args.triggered_by else ""
header = f"## 🔍 Cursor Review — Consolidated panel{attribution}"
if args.notice:
# Surface a degradation banner (judge failed → raw panel findings) right
# under the title so every rendered body carries it.
header += f"\n\n{neutralize_mentions(args.notice)}"

if args.error_message:
post_error_review(args.repo, args.pr_number, args.commit_sha, header, args.error_message)
Expand Down
5 changes: 5 additions & 0 deletions .github/cursor-review/prompt-judge.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
RESPOND WITH ONLY A JSON ARRAY. Your entire response must be a single JSON
array of finding objects (or `[]` if none) — no prose, no preamble, no
explanation, no markdown code fences before or after it. Do not narrate your
reasoning. The exact element schema is specified at the end of this prompt.

You are a senior software engineer adjudicating findings from a panel of AI
code reviewers. The panel ran a 4-lab × 2-review-type matrix (8 cells total):
- Labs: OpenAI, Anthropic, Google, Moonshot
Expand Down
148 changes: 148 additions & 0 deletions .github/cursor-review/tests/test_extract_findings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""Regression tests for extract-findings.py JSON recovery.

The judge/consolidate step intermittently returns valid findings JSON wrapped
in a sentence or two of prose. Discarding the whole run on that (the BE-1916
parse_error class — hit on ComfyUI #487 and Alex's PR) is the bug these tests
guard against: prose-wrapped JSON must be recovered, not thrown away.

Run: python3 .github/cursor-review/tests/test_extract_findings.py
"""

import importlib.util
import json
import os
import tempfile
import unittest

# extract-findings.py has a hyphen, so import it by path rather than `import`.
_MODULE_PATH = os.path.join(os.path.dirname(__file__), "..", "extract-findings.py")
_spec = importlib.util.spec_from_file_location("extract_findings", _MODULE_PATH)
ef = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(ef)


# A real-finding payload reused across cases.
FINDINGS = [
{
"file": "internal/api/handler.go",
"line": 42,
"side": "RIGHT",
"severity": "high",
"body": "User-supplied filename reaches os.Open without traversal checks.",
},
{
"file": "internal/worker/upload.go",
"line": 118,
"side": "RIGHT",
"severity": "medium",
"body": "Context is cancelled before the upload completes [see hunk], losing data.",
},
]


def _findings(raw):
"""Mirror main()'s parse → coerce pipeline."""
return ef.coerce_findings_list(ef.parse_json_findings(raw))


class ParseJsonFindingsTest(unittest.TestCase):
def test_bare_array(self):
self.assertEqual(_findings(json.dumps(FINDINGS)), FINDINGS)

def test_empty_array(self):
self.assertEqual(_findings("[]"), [])

def test_prose_wrapped_array_487(self):
# The #487 failure shape: a prose verdict, THEN the JSON array. The
# prose even contains a bracket pair, which is what broke the old naive
# first-`[`/last-`]` slice.
raw = (
"Based on my review of the actual code, I can adjudicate the two "
"findings [both raised by multiple reviewers] as follows. Here is "
"the consolidated result:\n\n" + json.dumps(FINDINGS) + "\n\n"
"These are the only items that rise to the bar."
)
self.assertEqual(_findings(raw), FINDINGS)

def test_fenced_json_block(self):
raw = (
"Sure — here are the findings:\n\n```json\n"
+ json.dumps(FINDINGS)
+ "\n```\nLet me know if you need more."
)
self.assertEqual(_findings(raw), FINDINGS)

def test_fenced_block_no_lang(self):
raw = "Result:\n\n```\n" + json.dumps(FINDINGS) + "\n```\n"
self.assertEqual(_findings(raw), FINDINGS)

def test_object_wrapped_findings(self):
raw = "Here you go:\n" + json.dumps({"findings": FINDINGS})
self.assertEqual(_findings(raw), FINDINGS)

def test_brackets_inside_string_literals(self):
# Brackets inside a body string must not confuse the balanced scanner.
payload = [{"file": "a.py", "line": 1, "side": "RIGHT", "body": "uses arr[i] and m['k']"}]
raw = "Verdict below.\n" + json.dumps(payload)
self.assertEqual(_findings(raw), payload)

def test_genuinely_malformed_returns_none(self):
self.assertIsNone(_findings("I reviewed the code and found nothing actionable."))

def test_truncated_json_returns_none(self):
# An unterminated array can't be recovered — must fail, not half-parse.
self.assertIsNone(_findings('[{"file": "a.py", "line": 1, "body": "oops"'))

def test_scalar_is_not_findings(self):
# A bare number is valid JSON but never a findings payload.
self.assertIsNone(_findings("42"))

def test_object_without_findings_key(self):
self.assertIsNone(_findings('{"summary": "looks good", "count": 0}'))


class MainEndToEndTest(unittest.TestCase):
"""Drive main() the way the workflow does, asserting the status field."""

def _run(self, raw_text):
with tempfile.TemporaryDirectory() as d:
raw_path = os.path.join(d, "raw.txt")
out_path = os.path.join(d, "out.json")
with open(raw_path, "w", encoding="utf-8") as f:
f.write(raw_text)
import sys

argv = sys.argv
sys.argv = [
"extract-findings.py",
"--raw", raw_path,
"--out", out_path,
"--model", "judge-model",
"--review-type", "judge",
]
try:
ef.main()
finally:
sys.argv = argv
with open(out_path, encoding="utf-8") as f:
return json.load(f)

def test_prose_wrapped_parses_ok(self):
raw = "After review, my verdict is:\n\n" + json.dumps(FINDINGS)
record = self._run(raw)
self.assertEqual(record["status"], "ok")
self.assertEqual(record["findings"], FINDINGS)

def test_malformed_is_parse_error(self):
record = self._run("I could not find anything worth flagging.")
self.assertEqual(record["status"], "parse_error")
self.assertEqual(record["findings"], [])

def test_empty_is_empty(self):
record = self._run(" \n ")
self.assertEqual(record["status"], "empty")


if __name__ == "__main__":
unittest.main()
Loading