# marchwarden/scripts/calibration_collect.py
"""scripts/calibration_collect.py
M3.3 Phase A: load every persisted ResearchResult under
~/.marchwarden/traces/*.result.json and emit a markdown rating worksheet
to docs/stress-tests/M3.3-rating-worksheet.md.
The worksheet has one row per run with the model's self-reported confidence
and a blank `actual_rating` column for human review (Phase B). After rating
is complete, scripts/calibration_analyze.py (Phase C) will load the same
file with the rating column populated and compute calibration error.
Usage:
.venv/bin/python scripts/calibration_collect.py
Optional env:
TRACE_DIR override default ~/.marchwarden/traces
OUT override default docs/stress-tests/M3.3-rating-worksheet.md
"""
from __future__ import annotations

import json
import os
import sys
from collections import Counter
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))

from researchers.web.models import ResearchResult  # noqa: E402
def _load_results(trace_dir: Path) -> list[tuple[Path, ResearchResult]]:
    """Load every <id>.result.json under trace_dir, sorted by mtime.

    Files that fail validation are skipped with a warning on stderr rather
    than aborting the whole collection run.
    """
    by_mtime = sorted(trace_dir.glob("*.result.json"), key=lambda p: p.stat().st_mtime)
    loaded: list[tuple[Path, ResearchResult]] = []
    for path in by_mtime:
        try:
            parsed = ResearchResult.model_validate_json(path.read_text(encoding="utf-8"))
        except Exception as exc:
            # Best-effort: a single corrupt result file must not block the rest.
            print(f"warning: skipping {path.name}: {exc}", file=sys.stderr)
        else:
            loaded.append((path, parsed))
    return loaded
def _gap_summary(result: ResearchResult) -> str:
"""Render gap categories with counts, e.g. 'source_not_found(2), scope_exceeded(1)'."""
if not result.gaps:
return ""
counts: dict[str, int] = {}
for g in result.gaps:
cat = g.category.value if hasattr(g.category, "value") else str(g.category)
counts[cat] = counts.get(cat, 0) + 1
return ", ".join(f"{k}({v})" for k, v in sorted(counts.items()))
def _category_map(runs_dir: Path) -> dict[str, str]:
"""Map trace_id -> category by parsing scripts/calibration_runner.sh log files.
Each log file is named like ``01-factual.log`` and contains a final
``trace_id: <uuid>`` line emitted by the CLI.
"""
out: dict[str, str] = {}
if not runs_dir.exists():
return out
for log in runs_dir.glob("*.log"):
# filename format: NN-category.log
stem = log.stem
parts = stem.split("-", 1)
if len(parts) != 2:
continue
category = parts[1]
try:
text = log.read_text(encoding="utf-8")
except Exception:
continue
# Find the last "trace_id: <uuid>" line
trace_id = None
for line in text.splitlines():
if "trace_id:" in line:
# Strip ANSI / rich markup if present
token = line.split("trace_id:")[-1].strip()
# Take only the UUID portion
token = token.split()[0] if token else ""
# Strip any surrounding rich markup
token = token.replace("[/dim]", "").replace("[dim]", "")
if token:
trace_id = token
if trace_id:
out[trace_id] = category
return out
def _question_from_trace(trace_dir: Path, trace_id: str) -> str:
"""Recover the original question from the trace JSONL's `start` event."""
jsonl = trace_dir / f"{trace_id}.jsonl"
if not jsonl.exists():
return "(question not recoverable — trace missing)"
try:
for line in jsonl.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line:
continue
entry = json.loads(line)
if entry.get("action") == "start":
return entry.get("question", "(no question field)")
except Exception as exc:
return f"(parse error: {exc})"
return "(no start event)"
def _build_worksheet(
    rows: list[tuple[Path, ResearchResult]],
    trace_dir: Path,
    category_map: dict[str, str],
) -> str:
    """Render the markdown worksheet.

    Args:
        rows: (result-file path, parsed result) pairs, in display order.
        trace_dir: directory holding <trace_id>.jsonl traces, used to recover
            each run's original question.
        category_map: trace_id -> category; runs absent from the map are
            labelled "ad-hoc".

    Returns:
        The complete worksheet as one markdown string (no trailing newline).
    """
    lines: list[str] = []
    lines.append("# M3.3 Calibration Rating Worksheet")
    lines.append("")
    lines.append("Issue: #46 (Phase B — human rating)")
    lines.append("")
    lines.append("## How to use this worksheet")
    lines.append("")
    lines.append(
        # FIX: range previously rendered garbled as "0.01.0" (dropped en dash).
        "For each run below, read the answer + citations from the persisted "
        "result file (path in the **Result file** column). Score the answer's "
        "*actual* correctness on a 0.0–1.0 scale, **independent** of the "
        "model's self-reported confidence. Fill in the **actual_rating** "
        "column. Add notes in the **notes** column for anything unusual."
    )
    lines.append("")
    lines.append("Rating rubric:")
    lines.append("")
    lines.append("- **1.0** — Answer is fully correct, well-supported by cited sources, no material gaps or hallucinations.")
    lines.append("- **0.8** — Mostly correct; minor inaccuracies or omissions that don't change the substance.")
    lines.append("- **0.6** — Substantively right but with notable errors, missing context, or weak citations.")
    lines.append("- **0.4** — Mixed: some right, some wrong; or right answer for wrong reasons.")
    lines.append("- **0.2** — Mostly wrong, misleading, or hallucinated despite confident framing.")
    lines.append("- **0.0** — Completely wrong, fabricated, or refuses to answer a tractable question.")
    lines.append("")
    lines.append("After rating all rows, save this file and run:")
    lines.append("")
    lines.append("```")
    lines.append(".venv/bin/python scripts/calibration_analyze.py")
    lines.append("```")
    lines.append("")
    lines.append(f"## Runs ({len(rows)} total)")
    lines.append("")
    lines.append(
        "| # | trace_id | category | question | model_conf | corrob | authority | contradiction | budget | recency | gaps | citations | discoveries | tokens | actual_rating | notes |"
    )
    lines.append(
        "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|"
    )
    for i, (path, result) in enumerate(rows, 1):
        cf = result.confidence_factors
        cm = result.cost_metadata
        # Escape pipes so the question can't break the markdown table.
        question = _question_from_trace(trace_dir, result.trace_id).replace("|", "\\|")
        # Truncate long questions for table readability
        if len(question) > 80:
            question = question[:77] + "..."
        gaps = _gap_summary(result).replace("|", "\\|")
        contradiction = "yes" if cf.contradiction_detected else "no"
        budget = "spent" if cf.budget_exhausted else "under"
        recency = cf.recency or ""
        category = category_map.get(result.trace_id, "ad-hoc")
        # actual_rating and notes columns are deliberately left blank for
        # the human reviewer (Phase B).
        lines.append(
            f"| {i} "
            f"| `{result.trace_id[:8]}` "
            f"| {category} "
            f"| {question} "
            f"| {result.confidence:.2f} "
            f"| {cf.num_corroborating_sources} "
            f"| {cf.source_authority} "
            f"| {contradiction} "
            f"| {budget} "
            f"| {recency} "
            f"| {gaps} "
            f"| {len(result.citations)} "
            f"| {len(result.discovery_events)} "
            f"| {cm.tokens_used} "
            f"| "
            f"| |"
        )
    lines.append("")
    lines.append("## Result files (full content for review)")
    lines.append("")
    for i, (path, result) in enumerate(rows, 1):
        lines.append(f"{i}. `{path}`")
    lines.append("")
    return "\n".join(lines)
def main() -> int:
    """Collect all persisted results and emit the rating worksheet.

    Returns 0 on success, 1 when no result files are found.
    """
    trace_dir = Path(
        os.environ.get("TRACE_DIR", os.path.expanduser("~/.marchwarden/traces"))
    )
    default_out = REPO_ROOT / "docs/stress-tests/M3.3-rating-worksheet.md"
    out_path = Path(os.environ.get("OUT", default_out))
    # Create the output directory up front, even if collection later fails.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    results = _load_results(trace_dir)
    if not results:
        print(f"No result files found under {trace_dir}", file=sys.stderr)
        return 1

    categories = _category_map(REPO_ROOT / "docs/stress-tests/M3.3-runs")
    out_path.write_text(
        _build_worksheet(results, trace_dir, categories), encoding="utf-8"
    )
    print(f"Wrote {len(results)}-row worksheet to {out_path}")
    return 0
if __name__ == "__main__":
    # sys.exit(...) raises SystemExit, identical to the explicit raise.
    sys.exit(main())