"""scripts/calibration_collect.py M3.3 Phase A: load every persisted ResearchResult under ~/.marchwarden/traces/*.result.json and emit a markdown rating worksheet to docs/stress-tests/M3.3-rating-worksheet.md. The worksheet has one row per run with the model's self-reported confidence and a blank `actual_rating` column for human review (Phase B). After rating is complete, scripts/calibration_analyze.py (Phase C) will load the same file with the rating column populated and compute calibration error. Usage: .venv/bin/python scripts/calibration_collect.py Optional env: TRACE_DIR — override default ~/.marchwarden/traces OUT — override default docs/stress-tests/M3.3-rating-worksheet.md """ from __future__ import annotations import json import os import sys from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(REPO_ROOT)) from researchers.web.models import ResearchResult # noqa: E402 def _load_results(trace_dir: Path) -> list[tuple[Path, ResearchResult]]: """Load every .result.json under trace_dir, sorted by mtime.""" files = sorted(trace_dir.glob("*.result.json"), key=lambda p: p.stat().st_mtime) out: list[tuple[Path, ResearchResult]] = [] for f in files: try: result = ResearchResult.model_validate_json(f.read_text(encoding="utf-8")) except Exception as exc: print(f"warning: skipping {f.name}: {exc}", file=sys.stderr) continue out.append((f, result)) return out def _gap_summary(result: ResearchResult) -> str: """Render gap categories with counts, e.g. 'source_not_found(2), scope_exceeded(1)'.""" if not result.gaps: return "—" counts: dict[str, int] = {} for g in result.gaps: cat = g.category.value if hasattr(g.category, "value") else str(g.category) counts[cat] = counts.get(cat, 0) + 1 return ", ".join(f"{k}({v})" for k, v in sorted(counts.items())) def _category_map(runs_dir: Path) -> dict[str, str]: """Map trace_id -> category by parsing scripts/calibration_runner.sh log files. Each log file is named like ``01-factual.log`` and contains a final ``trace_id: `` line emitted by the CLI. """ out: dict[str, str] = {} if not runs_dir.exists(): return out for log in runs_dir.glob("*.log"): # filename format: NN-category.log stem = log.stem parts = stem.split("-", 1) if len(parts) != 2: continue category = parts[1] try: text = log.read_text(encoding="utf-8") except Exception: continue # Find the last "trace_id: " line trace_id = None for line in text.splitlines(): if "trace_id:" in line: # Strip ANSI / rich markup if present token = line.split("trace_id:")[-1].strip() # Take only the UUID portion token = token.split()[0] if token else "" # Strip any surrounding rich markup token = token.replace("[/dim]", "").replace("[dim]", "") if token: trace_id = token if trace_id: out[trace_id] = category return out def _question_from_trace(trace_dir: Path, trace_id: str) -> str: """Recover the original question from the trace JSONL's `start` event.""" jsonl = trace_dir / f"{trace_id}.jsonl" if not jsonl.exists(): return "(question not recoverable — trace missing)" try: for line in jsonl.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue entry = json.loads(line) if entry.get("action") == "start": return entry.get("question", "(no question field)") except Exception as exc: return f"(parse error: {exc})" return "(no start event)" def _build_worksheet( rows: list[tuple[Path, ResearchResult]], trace_dir: Path, category_map: dict[str, str], ) -> str: """Render the markdown worksheet.""" lines: list[str] = [] lines.append("# M3.3 Calibration Rating Worksheet") lines.append("") lines.append("Issue: #46 (Phase B — human rating)") lines.append("") lines.append( "## How to use this worksheet" ) lines.append("") lines.append( "For each run below, read the answer + citations from the persisted " "result file (path in the **Result file** column). Score the answer's " "*actual* correctness on a 0.0–1.0 scale, **independent** of the " "model's self-reported confidence. Fill in the **actual_rating** " "column. Add notes in the **notes** column for anything unusual." ) lines.append("") lines.append("Rating rubric:") lines.append("") lines.append("- **1.0** — Answer is fully correct, well-supported by cited sources, no material gaps or hallucinations.") lines.append("- **0.8** — Mostly correct; minor inaccuracies or omissions that don't change the substance.") lines.append("- **0.6** — Substantively right but with notable errors, missing context, or weak citations.") lines.append("- **0.4** — Mixed: some right, some wrong; or right answer for wrong reasons.") lines.append("- **0.2** — Mostly wrong, misleading, or hallucinated despite confident framing.") lines.append("- **0.0** — Completely wrong, fabricated, or refuses to answer a tractable question.") lines.append("") lines.append("After rating all rows, save this file and run:") lines.append("") lines.append("```") lines.append(".venv/bin/python scripts/calibration_analyze.py") lines.append("```") lines.append("") lines.append(f"## Runs ({len(rows)} total)") lines.append("") lines.append( "| # | trace_id | category | question | model_conf | corrob | authority | contradiction | budget | recency | gaps | citations | discoveries | tokens | actual_rating | notes |" ) lines.append( "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|" ) for i, (path, result) in enumerate(rows, 1): cf = result.confidence_factors cm = result.cost_metadata question = _question_from_trace(trace_dir, result.trace_id).replace("|", "\\|") # Truncate long questions for table readability if len(question) > 80: question = question[:77] + "..." gaps = _gap_summary(result).replace("|", "\\|") contradiction = "yes" if cf.contradiction_detected else "no" budget = "spent" if cf.budget_exhausted else "under" recency = cf.recency or "—" category = category_map.get(result.trace_id, "ad-hoc") lines.append( f"| {i} " f"| `{result.trace_id[:8]}` " f"| {category} " f"| {question} " f"| {result.confidence:.2f} " f"| {cf.num_corroborating_sources} " f"| {cf.source_authority} " f"| {contradiction} " f"| {budget} " f"| {recency} " f"| {gaps} " f"| {len(result.citations)} " f"| {len(result.discovery_events)} " f"| {cm.tokens_used} " f"| " f"| |" ) lines.append("") lines.append("## Result files (full content for review)") lines.append("") for i, (path, result) in enumerate(rows, 1): lines.append(f"{i}. `{path}`") lines.append("") return "\n".join(lines) def main() -> int: trace_dir = Path( os.environ.get("TRACE_DIR", os.path.expanduser("~/.marchwarden/traces")) ) out_path = Path( os.environ.get("OUT", REPO_ROOT / "docs/stress-tests/M3.3-rating-worksheet.md") ) out_path.parent.mkdir(parents=True, exist_ok=True) rows = _load_results(trace_dir) if not rows: print(f"No result files found under {trace_dir}", file=sys.stderr) return 1 runs_dir = REPO_ROOT / "docs/stress-tests/M3.3-runs" category_map = _category_map(runs_dir) out_path.write_text( _build_worksheet(rows, trace_dir, category_map), encoding="utf-8" ) print(f"Wrote {len(rows)}-row worksheet to {out_path}") return 0 if __name__ == "__main__": raise SystemExit(main())