"""scripts/calibration_collect.py

M3.3 Phase A: load every persisted ResearchResult under
~/.marchwarden/traces/*.result.json and emit a markdown rating worksheet
to docs/stress-tests/M3.3-rating-worksheet.md.

The worksheet has one row per run with the model's self-reported confidence
and a blank `actual_rating` column for human review (Phase B). After rating
is complete, scripts/calibration_analyze.py (Phase C) will load the same
file with the rating column populated and compute calibration error.

Usage:
    .venv/bin/python scripts/calibration_collect.py

Optional env:
    TRACE_DIR — override default ~/.marchwarden/traces
    OUT — override default docs/stress-tests/M3.3-rating-worksheet.md
"""
from __future__ import annotations

import json
import os
import sys
from collections import Counter
from pathlib import Path

# Make the repo root importable so the script runs from any CWD.
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))

from researchers.web.models import ResearchResult  # noqa: E402
def _load_results(trace_dir: Path) -> list[tuple[Path, ResearchResult]]:
|
|||
|
|
"""Load every <id>.result.json under trace_dir, sorted by mtime."""
|
|||
|
|
files = sorted(trace_dir.glob("*.result.json"), key=lambda p: p.stat().st_mtime)
|
|||
|
|
out: list[tuple[Path, ResearchResult]] = []
|
|||
|
|
for f in files:
|
|||
|
|
try:
|
|||
|
|
result = ResearchResult.model_validate_json(f.read_text(encoding="utf-8"))
|
|||
|
|
except Exception as exc:
|
|||
|
|
print(f"warning: skipping {f.name}: {exc}", file=sys.stderr)
|
|||
|
|
continue
|
|||
|
|
out.append((f, result))
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _gap_summary(result: ResearchResult) -> str:
|
|||
|
|
"""Render gap categories with counts, e.g. 'source_not_found(2), scope_exceeded(1)'."""
|
|||
|
|
if not result.gaps:
|
|||
|
|
return "—"
|
|||
|
|
counts: dict[str, int] = {}
|
|||
|
|
for g in result.gaps:
|
|||
|
|
cat = g.category.value if hasattr(g.category, "value") else str(g.category)
|
|||
|
|
counts[cat] = counts.get(cat, 0) + 1
|
|||
|
|
return ", ".join(f"{k}({v})" for k, v in sorted(counts.items()))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _category_map(runs_dir: Path) -> dict[str, str]:
|
|||
|
|
"""Map trace_id -> category by parsing scripts/calibration_runner.sh log files.
|
|||
|
|
|
|||
|
|
Each log file is named like ``01-factual.log`` and contains a final
|
|||
|
|
``trace_id: <uuid>`` line emitted by the CLI.
|
|||
|
|
"""
|
|||
|
|
out: dict[str, str] = {}
|
|||
|
|
if not runs_dir.exists():
|
|||
|
|
return out
|
|||
|
|
for log in runs_dir.glob("*.log"):
|
|||
|
|
# filename format: NN-category.log
|
|||
|
|
stem = log.stem
|
|||
|
|
parts = stem.split("-", 1)
|
|||
|
|
if len(parts) != 2:
|
|||
|
|
continue
|
|||
|
|
category = parts[1]
|
|||
|
|
try:
|
|||
|
|
text = log.read_text(encoding="utf-8")
|
|||
|
|
except Exception:
|
|||
|
|
continue
|
|||
|
|
# Find the last "trace_id: <uuid>" line
|
|||
|
|
trace_id = None
|
|||
|
|
for line in text.splitlines():
|
|||
|
|
if "trace_id:" in line:
|
|||
|
|
# Strip ANSI / rich markup if present
|
|||
|
|
token = line.split("trace_id:")[-1].strip()
|
|||
|
|
# Take only the UUID portion
|
|||
|
|
token = token.split()[0] if token else ""
|
|||
|
|
# Strip any surrounding rich markup
|
|||
|
|
token = token.replace("[/dim]", "").replace("[dim]", "")
|
|||
|
|
if token:
|
|||
|
|
trace_id = token
|
|||
|
|
if trace_id:
|
|||
|
|
out[trace_id] = category
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _question_from_trace(trace_dir: Path, trace_id: str) -> str:
|
|||
|
|
"""Recover the original question from the trace JSONL's `start` event."""
|
|||
|
|
jsonl = trace_dir / f"{trace_id}.jsonl"
|
|||
|
|
if not jsonl.exists():
|
|||
|
|
return "(question not recoverable — trace missing)"
|
|||
|
|
try:
|
|||
|
|
for line in jsonl.read_text(encoding="utf-8").splitlines():
|
|||
|
|
line = line.strip()
|
|||
|
|
if not line:
|
|||
|
|
continue
|
|||
|
|
entry = json.loads(line)
|
|||
|
|
if entry.get("action") == "start":
|
|||
|
|
return entry.get("question", "(no question field)")
|
|||
|
|
except Exception as exc:
|
|||
|
|
return f"(parse error: {exc})"
|
|||
|
|
return "(no start event)"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_worksheet(
    rows: list[tuple[Path, ResearchResult]],
    trace_dir: Path,
    category_map: dict[str, str],
) -> str:
    """Render the markdown worksheet.

    Args:
        rows: (result-file path, parsed result) pairs, one per run.
        trace_dir: directory holding the per-run trace JSONL files.
        category_map: trace_id -> category label parsed from runner logs.

    Returns:
        The complete worksheet as one markdown string.
    """
    lines: list[str] = [
        "# M3.3 Calibration Rating Worksheet",
        "",
        "Issue: #46 (Phase B — human rating)",
        "",
        "## How to use this worksheet",
        "",
        "For each run below, read the answer + citations from the persisted "
        "result file (path in the **Result file** column). Score the answer's "
        "*actual* correctness on a 0.0–1.0 scale, **independent** of the "
        "model's self-reported confidence. Fill in the **actual_rating** "
        "column. Add notes in the **notes** column for anything unusual.",
        "",
        "Rating rubric:",
        "",
        "- **1.0** — Answer is fully correct, well-supported by cited sources, no material gaps or hallucinations.",
        "- **0.8** — Mostly correct; minor inaccuracies or omissions that don't change the substance.",
        "- **0.6** — Substantively right but with notable errors, missing context, or weak citations.",
        "- **0.4** — Mixed: some right, some wrong; or right answer for wrong reasons.",
        "- **0.2** — Mostly wrong, misleading, or hallucinated despite confident framing.",
        "- **0.0** — Completely wrong, fabricated, or refuses to answer a tractable question.",
        "",
        "After rating all rows, save this file and run:",
        "",
        "```",
        ".venv/bin/python scripts/calibration_analyze.py",
        "```",
        "",
        f"## Runs ({len(rows)} total)",
        "",
        "| # | trace_id | category | question | model_conf | corrob | authority | contradiction | budget | recency | gaps | citations | discoveries | tokens | actual_rating | notes |",
        "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|",
    ]

    for idx, (path, result) in enumerate(rows, 1):
        cf = result.confidence_factors
        cm = result.cost_metadata
        question = _question_from_trace(trace_dir, result.trace_id).replace("|", "\\|")
        # Truncate long questions for table readability
        if len(question) > 80:
            question = question[:77] + "..."
        cells = [
            f"{idx}",
            f"`{result.trace_id[:8]}`",
            category_map.get(result.trace_id, "ad-hoc"),
            question,
            f"{result.confidence:.2f}",
            f"{cf.num_corroborating_sources}",
            f"{cf.source_authority}",
            "yes" if cf.contradiction_detected else "no",
            "spent" if cf.budget_exhausted else "under",
            f"{cf.recency or '—'}",
            _gap_summary(result).replace("|", "\\|"),
            f"{len(result.citations)}",
            f"{len(result.discovery_events)}",
            f"{cm.tokens_used}",
        ]
        # The trailing two columns (actual_rating, notes) stay blank for Phase B.
        lines.append("| " + " | ".join(cells) + " | | |")

    lines.append("")
    lines.append("## Result files (full content for review)")
    lines.append("")
    for idx, (path, _result) in enumerate(rows, 1):
        lines.append(f"{idx}. `{path}`")
    lines.append("")
    return "\n".join(lines)
def main() -> int:
    """Collect persisted results and write the Phase-B rating worksheet.

    Returns:
        Process exit code: 0 on success, 1 when no result files were found.
    """
    default_traces = os.path.expanduser("~/.marchwarden/traces")
    trace_dir = Path(os.environ.get("TRACE_DIR", default_traces))

    default_out = REPO_ROOT / "docs/stress-tests/M3.3-rating-worksheet.md"
    out_path = Path(os.environ.get("OUT", default_out))
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows = _load_results(trace_dir)
    if not rows:
        print(f"No result files found under {trace_dir}", file=sys.stderr)
        return 1

    category_map = _category_map(REPO_ROOT / "docs/stress-tests/M3.3-runs")
    worksheet = _build_worksheet(rows, trace_dir, category_map)
    out_path.write_text(worksheet, encoding="utf-8")
    print(f"Wrote {len(rows)}-row worksheet to {out_path}")
    return 0
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return value as the exit code.
    sys.exit(main())