marchwarden/scripts/calibration_collect.py
Jeff Smith 13215d7ddb docs(stress-tests): M3.3 Phase A — calibration data collection
Issue #46 (Phase A only — Phase B human rating still pending, issue stays open).

Adds the data-collection half of the calibration milestone:

- scripts/calibration_runner.sh — runs 20 fixed balanced-depth queries
  across 4 categories (factual, comparative, contradiction-prone,
  scope-edge), 5 each, capturing per-run logs to docs/stress-tests/M3.3-runs/.
- scripts/calibration_collect.py — loads every persisted ResearchResult
  under ~/.marchwarden/traces/*.result.json and emits a markdown rating
  worksheet with one row per run. Recovers question text from each
  trace's start event and category from the run-log filename.
- docs/stress-tests/M3.3-rating-worksheet.md — 22 runs (20 calibration
  + caffeine smoke + M3.2 multi-axis), with empty actual_rating columns
  for the human-in-the-loop scoring step.
- docs/stress-tests/M3.3-runs/*.log — runtime logs from the calibration
  runner, kept as provenance. Gitignore updated with an exception
  carving stress-test logs out of the global *.log ignore.

Note: M3.1's 4 runs predate #54 (full result persistence) and so are
unrecoverable to the worksheet — only post-#54 runs have a result.json
sibling. 22 rateable runs is still within the milestone target of 20–30.

Phases B (human rating) and C (analysis + rubric + wiki update) follow
in a later session. This issue stays open until both are done.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 20:21:47 -06:00

225 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""scripts/calibration_collect.py
M3.3 Phase A: load every persisted ResearchResult under
~/.marchwarden/traces/*.result.json and emit a markdown rating worksheet
to docs/stress-tests/M3.3-rating-worksheet.md.
The worksheet has one row per run with the model's self-reported confidence
and a blank `actual_rating` column for human review (Phase B). After rating
is complete, scripts/calibration_analyze.py (Phase C) will load the same
file with the rating column populated and compute calibration error.
Usage:
.venv/bin/python scripts/calibration_collect.py
Optional env:
TRACE_DIR — override default ~/.marchwarden/traces
OUT — override default docs/stress-tests/M3.3-rating-worksheet.md
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
from researchers.web.models import ResearchResult # noqa: E402
def _load_results(trace_dir: Path) -> list[tuple[Path, ResearchResult]]:
    """Load every ``<id>.result.json`` under *trace_dir*, sorted by mtime.

    Files that fail validation are skipped with a warning on stderr rather
    than aborting the whole collection run.
    """
    loaded: list[tuple[Path, ResearchResult]] = []
    candidates = sorted(
        trace_dir.glob("*.result.json"), key=lambda p: p.stat().st_mtime
    )
    for candidate in candidates:
        try:
            parsed = ResearchResult.model_validate_json(
                candidate.read_text(encoding="utf-8")
            )
        except Exception as exc:
            print(f"warning: skipping {candidate.name}: {exc}", file=sys.stderr)
        else:
            loaded.append((candidate, parsed))
    return loaded
def _gap_summary(result: ResearchResult) -> str:
"""Render gap categories with counts, e.g. 'source_not_found(2), scope_exceeded(1)'."""
if not result.gaps:
return ""
counts: dict[str, int] = {}
for g in result.gaps:
cat = g.category.value if hasattr(g.category, "value") else str(g.category)
counts[cat] = counts.get(cat, 0) + 1
return ", ".join(f"{k}({v})" for k, v in sorted(counts.items()))
def _category_map(runs_dir: Path) -> dict[str, str]:
"""Map trace_id -> category by parsing scripts/calibration_runner.sh log files.
Each log file is named like ``01-factual.log`` and contains a final
``trace_id: <uuid>`` line emitted by the CLI.
"""
out: dict[str, str] = {}
if not runs_dir.exists():
return out
for log in runs_dir.glob("*.log"):
# filename format: NN-category.log
stem = log.stem
parts = stem.split("-", 1)
if len(parts) != 2:
continue
category = parts[1]
try:
text = log.read_text(encoding="utf-8")
except Exception:
continue
# Find the last "trace_id: <uuid>" line
trace_id = None
for line in text.splitlines():
if "trace_id:" in line:
# Strip ANSI / rich markup if present
token = line.split("trace_id:")[-1].strip()
# Take only the UUID portion
token = token.split()[0] if token else ""
# Strip any surrounding rich markup
token = token.replace("[/dim]", "").replace("[dim]", "")
if token:
trace_id = token
if trace_id:
out[trace_id] = category
return out
def _question_from_trace(trace_dir: Path, trace_id: str) -> str:
"""Recover the original question from the trace JSONL's `start` event."""
jsonl = trace_dir / f"{trace_id}.jsonl"
if not jsonl.exists():
return "(question not recoverable — trace missing)"
try:
for line in jsonl.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line:
continue
entry = json.loads(line)
if entry.get("action") == "start":
return entry.get("question", "(no question field)")
except Exception as exc:
return f"(parse error: {exc})"
return "(no start event)"
def _build_worksheet(
    rows: list[tuple[Path, ResearchResult]],
    trace_dir: Path,
    category_map: dict[str, str],
) -> str:
    """Render the markdown worksheet.

    Args:
        rows: (result-file path, parsed result) pairs from ``_load_results``.
        trace_dir: directory holding ``<trace_id>.jsonl`` traces; used to
            recover each run's original question text.
        category_map: trace_id -> runner category from ``_category_map``;
            ids missing from the map are labelled "ad-hoc".

    Returns:
        The complete worksheet as one markdown string.
    """
    lines: list[str] = []
    lines.append("# M3.3 Calibration Rating Worksheet")
    lines.append("")
    lines.append("Issue: #46 (Phase B — human rating)")
    lines.append("")
    lines.append(
        "## How to use this worksheet"
    )
    lines.append("")
    # NOTE(review): "0.01.0" below looks like a garbled "0.0-1.0" range —
    # confirm the intended rubric scale before relying on this text.
    lines.append(
        "For each run below, read the answer + citations from the persisted "
        "result file (path in the **Result file** column). Score the answer's "
        "*actual* correctness on a 0.01.0 scale, **independent** of the "
        "model's self-reported confidence. Fill in the **actual_rating** "
        "column. Add notes in the **notes** column for anything unusual."
    )
    lines.append("")
    lines.append("Rating rubric:")
    lines.append("")
    lines.append("- **1.0** — Answer is fully correct, well-supported by cited sources, no material gaps or hallucinations.")
    lines.append("- **0.8** — Mostly correct; minor inaccuracies or omissions that don't change the substance.")
    lines.append("- **0.6** — Substantively right but with notable errors, missing context, or weak citations.")
    lines.append("- **0.4** — Mixed: some right, some wrong; or right answer for wrong reasons.")
    lines.append("- **0.2** — Mostly wrong, misleading, or hallucinated despite confident framing.")
    lines.append("- **0.0** — Completely wrong, fabricated, or refuses to answer a tractable question.")
    lines.append("")
    lines.append("After rating all rows, save this file and run:")
    lines.append("")
    lines.append("```")
    lines.append(".venv/bin/python scripts/calibration_analyze.py")
    lines.append("```")
    lines.append("")
    lines.append(f"## Runs ({len(rows)} total)")
    lines.append("")
    # Markdown table header; one row per run is appended in the loop below.
    lines.append(
        "| # | trace_id | category | question | model_conf | corrob | authority | contradiction | budget | recency | gaps | citations | discoveries | tokens | actual_rating | notes |"
    )
    lines.append(
        "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|"
    )
    for i, (path, result) in enumerate(rows, 1):
        cf = result.confidence_factors
        cm = result.cost_metadata
        # Escape pipes so free-text questions cannot break the table layout.
        question = _question_from_trace(trace_dir, result.trace_id).replace("|", "\\|")
        # Truncate long questions for table readability
        if len(question) > 80:
            question = question[:77] + "..."
        gaps = _gap_summary(result).replace("|", "\\|")
        contradiction = "yes" if cf.contradiction_detected else "no"
        budget = "spent" if cf.budget_exhausted else "under"
        recency = cf.recency or ""
        # Runs not launched by the calibration runner have no log entry.
        category = category_map.get(result.trace_id, "ad-hoc")
        # actual_rating and notes columns are left empty for Phase B scoring.
        lines.append(
            f"| {i} "
            f"| `{result.trace_id[:8]}` "
            f"| {category} "
            f"| {question} "
            f"| {result.confidence:.2f} "
            f"| {cf.num_corroborating_sources} "
            f"| {cf.source_authority} "
            f"| {contradiction} "
            f"| {budget} "
            f"| {recency} "
            f"| {gaps} "
            f"| {len(result.citations)} "
            f"| {len(result.discovery_events)} "
            f"| {cm.tokens_used} "
            f"| "
            f"| |"
        )
    lines.append("")
    lines.append("## Result files (full content for review)")
    lines.append("")
    # Full paths so the rater can open each persisted result during review.
    for i, (path, result) in enumerate(rows, 1):
        lines.append(f"{i}. `{path}`")
    lines.append("")
    return "\n".join(lines)
def main() -> int:
    """Collect persisted results and write the rating worksheet.

    Honours the TRACE_DIR and OUT environment overrides documented in the
    module docstring. Returns 0 on success, 1 when no results are found.
    """
    env = os.environ
    trace_dir = Path(
        env.get("TRACE_DIR", os.path.expanduser("~/.marchwarden/traces"))
    )
    out_path = Path(
        env.get("OUT", REPO_ROOT / "docs/stress-tests/M3.3-rating-worksheet.md")
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows = _load_results(trace_dir)
    if not rows:
        print(f"No result files found under {trace_dir}", file=sys.stderr)
        return 1

    categories = _category_map(REPO_ROOT / "docs/stress-tests/M3.3-runs")
    worksheet = _build_worksheet(rows, trace_dir, categories)
    out_path.write_text(worksheet, encoding="utf-8")
    print(f"Wrote {len(rows)}-row worksheet to {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())