marchwarden/scripts/calibration_collect.py
Jeff Smith 13215d7ddb docs(stress-tests): M3.3 Phase A — calibration data collection
Issue #46 (Phase A only — Phase B human rating still pending, issue stays open).

Adds the data-collection half of the calibration milestone:

- scripts/calibration_runner.sh — runs 20 fixed balanced-depth queries
  across 4 categories (factual, comparative, contradiction-prone,
  scope-edge), 5 each, capturing per-run logs to docs/stress-tests/M3.3-runs/.
- scripts/calibration_collect.py — loads every persisted ResearchResult
  under ~/.marchwarden/traces/*.result.json and emits a markdown rating
  worksheet with one row per run. Recovers question text from each
  trace's start event and category from the run-log filename.
- docs/stress-tests/M3.3-rating-worksheet.md — 22 runs (20 calibration
  + caffeine smoke + M3.2 multi-axis), with empty actual_rating columns
  for the human-in-the-loop scoring step.
- docs/stress-tests/M3.3-runs/*.log — runtime logs from the calibration
  runner, kept as provenance. Gitignore updated with an exception
  carving stress-test logs out of the global *.log ignore.

Note: M3.1's 4 runs predate #54 (full result persistence) and so are
unrecoverable to the worksheet — only post-#54 runs have a result.json
sibling. 22 rateable runs is still within the milestone target of 20–30.

Phases B (human rating) and C (analysis + rubric + wiki update) follow
in a later session. This issue stays open until both are done.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 20:21:47 -06:00

225 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""scripts/calibration_collect.py
M3.3 Phase A: load every persisted ResearchResult under
~/.marchwarden/traces/*.result.json and emit a markdown rating worksheet
to docs/stress-tests/M3.3-rating-worksheet.md.
The worksheet has one row per run with the model's self-reported confidence
and a blank `actual_rating` column for human review (Phase B). After rating
is complete, scripts/calibration_analyze.py (Phase C) will load the same
file with the rating column populated and compute calibration error.
Usage:
.venv/bin/python scripts/calibration_collect.py
Optional env:
TRACE_DIR — override default ~/.marchwarden/traces
OUT — override default docs/stress-tests/M3.3-rating-worksheet.md
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
from researchers.web.models import ResearchResult # noqa: E402
def _load_results(trace_dir: Path) -> list[tuple[Path, ResearchResult]]:
    """Load every ``<id>.result.json`` under *trace_dir*, sorted by mtime.

    Files that fail validation are skipped with a warning on stderr rather
    than aborting the whole collection run.
    """
    loaded: list[tuple[Path, ResearchResult]] = []
    candidates = sorted(
        trace_dir.glob("*.result.json"), key=lambda p: p.stat().st_mtime
    )
    for candidate in candidates:
        try:
            parsed = ResearchResult.model_validate_json(
                candidate.read_text(encoding="utf-8")
            )
        except Exception as exc:
            print(f"warning: skipping {candidate.name}: {exc}", file=sys.stderr)
        else:
            loaded.append((candidate, parsed))
    return loaded
def _gap_summary(result: ResearchResult) -> str:
"""Render gap categories with counts, e.g. 'source_not_found(2), scope_exceeded(1)'."""
if not result.gaps:
return ""
counts: dict[str, int] = {}
for g in result.gaps:
cat = g.category.value if hasattr(g.category, "value") else str(g.category)
counts[cat] = counts.get(cat, 0) + 1
return ", ".join(f"{k}({v})" for k, v in sorted(counts.items()))
def _category_map(runs_dir: Path) -> dict[str, str]:
"""Map trace_id -> category by parsing scripts/calibration_runner.sh log files.
Each log file is named like ``01-factual.log`` and contains a final
``trace_id: <uuid>`` line emitted by the CLI.
"""
out: dict[str, str] = {}
if not runs_dir.exists():
return out
for log in runs_dir.glob("*.log"):
# filename format: NN-category.log
stem = log.stem
parts = stem.split("-", 1)
if len(parts) != 2:
continue
category = parts[1]
try:
text = log.read_text(encoding="utf-8")
except Exception:
continue
# Find the last "trace_id: <uuid>" line
trace_id = None
for line in text.splitlines():
if "trace_id:" in line:
# Strip ANSI / rich markup if present
token = line.split("trace_id:")[-1].strip()
# Take only the UUID portion
token = token.split()[0] if token else ""
# Strip any surrounding rich markup
token = token.replace("[/dim]", "").replace("[dim]", "")
if token:
trace_id = token
if trace_id:
out[trace_id] = category
return out
def _question_from_trace(trace_dir: Path, trace_id: str) -> str:
"""Recover the original question from the trace JSONL's `start` event."""
jsonl = trace_dir / f"{trace_id}.jsonl"
if not jsonl.exists():
return "(question not recoverable — trace missing)"
try:
for line in jsonl.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line:
continue
entry = json.loads(line)
if entry.get("action") == "start":
return entry.get("question", "(no question field)")
except Exception as exc:
return f"(parse error: {exc})"
return "(no start event)"
def _build_worksheet(
    rows: list[tuple[Path, ResearchResult]],
    trace_dir: Path,
    category_map: dict[str, str],
) -> str:
    """Render the markdown worksheet.

    Args:
        rows: (result-file path, parsed result) pairs from ``_load_results``.
        trace_dir: directory holding ``<trace_id>.jsonl`` traces; used to
            recover each run's original question text.
        category_map: trace_id -> runner category from ``_category_map``;
            ids missing from the map are labelled "ad-hoc".

    Returns:
        The complete worksheet as one markdown string.
    """
    lines: list[str] = []
    lines.append("# M3.3 Calibration Rating Worksheet")
    lines.append("")
    lines.append("Issue: #46 (Phase B — human rating)")
    lines.append("")
    lines.append(
        "## How to use this worksheet"
    )
    lines.append("")
    # NOTE(review): "0.01.0" below looks like a garbled "0.0-1.0" range —
    # confirm the intended rubric scale before relying on this text.
    lines.append(
        "For each run below, read the answer + citations from the persisted "
        "result file (path in the **Result file** column). Score the answer's "
        "*actual* correctness on a 0.01.0 scale, **independent** of the "
        "model's self-reported confidence. Fill in the **actual_rating** "
        "column. Add notes in the **notes** column for anything unusual."
    )
    lines.append("")
    lines.append("Rating rubric:")
    lines.append("")
    lines.append("- **1.0** — Answer is fully correct, well-supported by cited sources, no material gaps or hallucinations.")
    lines.append("- **0.8** — Mostly correct; minor inaccuracies or omissions that don't change the substance.")
    lines.append("- **0.6** — Substantively right but with notable errors, missing context, or weak citations.")
    lines.append("- **0.4** — Mixed: some right, some wrong; or right answer for wrong reasons.")
    lines.append("- **0.2** — Mostly wrong, misleading, or hallucinated despite confident framing.")
    lines.append("- **0.0** — Completely wrong, fabricated, or refuses to answer a tractable question.")
    lines.append("")
    lines.append("After rating all rows, save this file and run:")
    lines.append("")
    lines.append("```")
    lines.append(".venv/bin/python scripts/calibration_analyze.py")
    lines.append("```")
    lines.append("")
    lines.append(f"## Runs ({len(rows)} total)")
    lines.append("")
    # Markdown table header; one row per run is appended in the loop below.
    lines.append(
        "| # | trace_id | category | question | model_conf | corrob | authority | contradiction | budget | recency | gaps | citations | discoveries | tokens | actual_rating | notes |"
    )
    lines.append(
        "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|"
    )
    for i, (path, result) in enumerate(rows, 1):
        cf = result.confidence_factors
        cm = result.cost_metadata
        # Escape pipes so free-text questions cannot break the table layout.
        question = _question_from_trace(trace_dir, result.trace_id).replace("|", "\\|")
        # Truncate long questions for table readability
        if len(question) > 80:
            question = question[:77] + "..."
        gaps = _gap_summary(result).replace("|", "\\|")
        contradiction = "yes" if cf.contradiction_detected else "no"
        budget = "spent" if cf.budget_exhausted else "under"
        recency = cf.recency or ""
        # Runs not launched by the calibration runner have no log entry.
        category = category_map.get(result.trace_id, "ad-hoc")
        # actual_rating and notes columns are left empty for Phase B scoring.
        lines.append(
            f"| {i} "
            f"| `{result.trace_id[:8]}` "
            f"| {category} "
            f"| {question} "
            f"| {result.confidence:.2f} "
            f"| {cf.num_corroborating_sources} "
            f"| {cf.source_authority} "
            f"| {contradiction} "
            f"| {budget} "
            f"| {recency} "
            f"| {gaps} "
            f"| {len(result.citations)} "
            f"| {len(result.discovery_events)} "
            f"| {cm.tokens_used} "
            f"| "
            f"| |"
        )
    lines.append("")
    lines.append("## Result files (full content for review)")
    lines.append("")
    # Full paths so the rater can open each persisted result during review.
    for i, (path, result) in enumerate(rows, 1):
        lines.append(f"{i}. `{path}`")
    lines.append("")
    return "\n".join(lines)
def main() -> int:
    """Collect persisted results and write the rating worksheet.

    Honours the TRACE_DIR and OUT environment overrides documented in the
    module docstring. Returns 0 on success, 1 when no results are found.
    """
    env = os.environ
    trace_dir = Path(
        env.get("TRACE_DIR", os.path.expanduser("~/.marchwarden/traces"))
    )
    out_path = Path(
        env.get("OUT", REPO_ROOT / "docs/stress-tests/M3.3-rating-worksheet.md")
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows = _load_results(trace_dir)
    if not rows:
        print(f"No result files found under {trace_dir}", file=sys.stderr)
        return 1

    categories = _category_map(REPO_ROOT / "docs/stress-tests/M3.3-runs")
    worksheet = _build_worksheet(rows, trace_dir, categories)
    out_path.write_text(worksheet, encoding="utf-8")
    print(f"Wrote {len(rows)}-row worksheet to {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())