"""scripts/calibration_collect.py

M3.3 Phase A: load every persisted ResearchResult under
~/.marchwarden/traces/*.result.json and emit a markdown rating worksheet
to docs/stress-tests/M3.3-rating-worksheet.md.

The worksheet has one row per run with the model's self-reported confidence
and a blank `actual_rating` column for human review (Phase B). After rating
is complete, scripts/calibration_analyze.py (Phase C) will load the same
file with the rating column populated and compute calibration error.

Usage:
    .venv/bin/python scripts/calibration_collect.py

Optional env:
    TRACE_DIR — override default ~/.marchwarden/traces
    OUT — override default docs/stress-tests/M3.3-rating-worksheet.md
"""
from __future__ import annotations

import json
import os
import sys
from collections import Counter
from pathlib import Path

# Make the repo root importable so the script runs from any CWD.
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))

from researchers.web.models import ResearchResult  # noqa: E402
def _load_results(trace_dir: Path) -> list[tuple[Path, ResearchResult]]:
|
|||
|
|
"""Load every <id>.result.json under trace_dir, sorted by mtime."""
|
|||
|
|
files = sorted(trace_dir.glob("*.result.json"), key=lambda p: p.stat().st_mtime)
|
|||
|
|
out: list[tuple[Path, ResearchResult]] = []
|
|||
|
|
for f in files:
|
|||
|
|
try:
|
|||
|
|
result = ResearchResult.model_validate_json(f.read_text(encoding="utf-8"))
|
|||
|
|
except Exception as exc:
|
|||
|
|
print(f"warning: skipping {f.name}: {exc}", file=sys.stderr)
|
|||
|
|
continue
|
|||
|
|
out.append((f, result))
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _gap_summary(result: ResearchResult) -> str:
|
|||
|
|
"""Render gap categories with counts, e.g. 'source_not_found(2), scope_exceeded(1)'."""
|
|||
|
|
if not result.gaps:
|
|||
|
|
return "—"
|
|||
|
|
counts: dict[str, int] = {}
|
|||
|
|
for g in result.gaps:
|
|||
|
|
cat = g.category.value if hasattr(g.category, "value") else str(g.category)
|
|||
|
|
counts[cat] = counts.get(cat, 0) + 1
|
|||
|
|
return ", ".join(f"{k}({v})" for k, v in sorted(counts.items()))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _category_map(runs_dir: Path) -> dict[str, str]:
|
|||
|
|
"""Map trace_id -> category by parsing scripts/calibration_runner.sh log files.
|
|||
|
|
|
|||
|
|
Each log file is named like ``01-factual.log`` and contains a final
|
|||
|
|
``trace_id: <uuid>`` line emitted by the CLI.
|
|||
|
|
"""
|
|||
|
|
out: dict[str, str] = {}
|
|||
|
|
if not runs_dir.exists():
|
|||
|
|
return out
|
|||
|
|
for log in runs_dir.glob("*.log"):
|
|||
|
|
# filename format: NN-category.log
|
|||
|
|
stem = log.stem
|
|||
|
|
parts = stem.split("-", 1)
|
|||
|
|
if len(parts) != 2:
|
|||
|
|
continue
|
|||
|
|
category = parts[1]
|
|||
|
|
try:
|
|||
|
|
text = log.read_text(encoding="utf-8")
|
|||
|
|
except Exception:
|
|||
|
|
continue
|
|||
|
|
# Find the last "trace_id: <uuid>" line
|
|||
|
|
trace_id = None
|
|||
|
|
for line in text.splitlines():
|
|||
|
|
if "trace_id:" in line:
|
|||
|
|
# Strip ANSI / rich markup if present
|
|||
|
|
token = line.split("trace_id:")[-1].strip()
|
|||
|
|
# Take only the UUID portion
|
|||
|
|
token = token.split()[0] if token else ""
|
|||
|
|
# Strip any surrounding rich markup
|
|||
|
|
token = token.replace("[/dim]", "").replace("[dim]", "")
|
|||
|
|
if token:
|
|||
|
|
trace_id = token
|
|||
|
|
if trace_id:
|
|||
|
|
out[trace_id] = category
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _question_from_trace(trace_dir: Path, trace_id: str) -> str:
|
|||
|
|
"""Recover the original question from the trace JSONL's `start` event."""
|
|||
|
|
jsonl = trace_dir / f"{trace_id}.jsonl"
|
|||
|
|
if not jsonl.exists():
|
|||
|
|
return "(question not recoverable — trace missing)"
|
|||
|
|
try:
|
|||
|
|
for line in jsonl.read_text(encoding="utf-8").splitlines():
|
|||
|
|
line = line.strip()
|
|||
|
|
if not line:
|
|||
|
|
continue
|
|||
|
|
entry = json.loads(line)
|
|||
|
|
if entry.get("action") == "start":
|
|||
|
|
return entry.get("question", "(no question field)")
|
|||
|
|
except Exception as exc:
|
|||
|
|
return f"(parse error: {exc})"
|
|||
|
|
return "(no start event)"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_worksheet(
    rows: list[tuple[Path, ResearchResult]],
    trace_dir: Path,
    category_map: dict[str, str],
) -> str:
    """Render the markdown worksheet.

    Args:
        rows: (result-file path, parsed result) pairs, one per run.
        trace_dir: directory holding the per-run trace JSONL files.
        category_map: trace_id -> category label parsed from runner logs.

    Returns:
        The complete worksheet as one markdown string.
    """
    lines: list[str] = [
        "# M3.3 Calibration Rating Worksheet",
        "",
        "Issue: #46 (Phase B — human rating)",
        "",
        "## How to use this worksheet",
        "",
        "For each run below, read the answer + citations from the persisted "
        "result file (path in the **Result file** column). Score the answer's "
        "*actual* correctness on a 0.0–1.0 scale, **independent** of the "
        "model's self-reported confidence. Fill in the **actual_rating** "
        "column. Add notes in the **notes** column for anything unusual.",
        "",
        "Rating rubric:",
        "",
        "- **1.0** — Answer is fully correct, well-supported by cited sources, no material gaps or hallucinations.",
        "- **0.8** — Mostly correct; minor inaccuracies or omissions that don't change the substance.",
        "- **0.6** — Substantively right but with notable errors, missing context, or weak citations.",
        "- **0.4** — Mixed: some right, some wrong; or right answer for wrong reasons.",
        "- **0.2** — Mostly wrong, misleading, or hallucinated despite confident framing.",
        "- **0.0** — Completely wrong, fabricated, or refuses to answer a tractable question.",
        "",
        "After rating all rows, save this file and run:",
        "",
        "```",
        ".venv/bin/python scripts/calibration_analyze.py",
        "```",
        "",
        f"## Runs ({len(rows)} total)",
        "",
        "| # | trace_id | category | question | model_conf | corrob | authority | contradiction | budget | recency | gaps | citations | discoveries | tokens | actual_rating | notes |",
        "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|",
    ]

    for idx, (path, result) in enumerate(rows, 1):
        cf = result.confidence_factors
        cm = result.cost_metadata
        question = _question_from_trace(trace_dir, result.trace_id).replace("|", "\\|")
        # Truncate long questions for table readability
        if len(question) > 80:
            question = question[:77] + "..."
        cells = [
            f"{idx}",
            f"`{result.trace_id[:8]}`",
            category_map.get(result.trace_id, "ad-hoc"),
            question,
            f"{result.confidence:.2f}",
            f"{cf.num_corroborating_sources}",
            f"{cf.source_authority}",
            "yes" if cf.contradiction_detected else "no",
            "spent" if cf.budget_exhausted else "under",
            f"{cf.recency or '—'}",
            _gap_summary(result).replace("|", "\\|"),
            f"{len(result.citations)}",
            f"{len(result.discovery_events)}",
            f"{cm.tokens_used}",
        ]
        # The trailing two columns (actual_rating, notes) stay blank for Phase B.
        lines.append("| " + " | ".join(cells) + " | | |")

    lines.append("")
    lines.append("## Result files (full content for review)")
    lines.append("")
    for idx, (path, _result) in enumerate(rows, 1):
        lines.append(f"{idx}. `{path}`")
    lines.append("")
    return "\n".join(lines)
def main() -> int:
    """Collect persisted results and write the Phase-B rating worksheet.

    Returns:
        Process exit code: 0 on success, 1 when no result files were found.
    """
    default_traces = os.path.expanduser("~/.marchwarden/traces")
    trace_dir = Path(os.environ.get("TRACE_DIR", default_traces))

    default_out = REPO_ROOT / "docs/stress-tests/M3.3-rating-worksheet.md"
    out_path = Path(os.environ.get("OUT", default_out))
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows = _load_results(trace_dir)
    if not rows:
        print(f"No result files found under {trace_dir}", file=sys.stderr)
        return 1

    category_map = _category_map(REPO_ROOT / "docs/stress-tests/M3.3-runs")
    worksheet = _build_worksheet(rows, trace_dir, category_map)
    out_path.write_text(worksheet, encoding="utf-8")
    print(f"Wrote {len(rows)}-row worksheet to {out_path}")
    return 0
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return value as the exit code.
    sys.exit(main())