Merge pull request 'M2.5.3: marchwarden costs CLI command' (#29) from feat/costs-command into main

Reviewed-on: #29 Reviewed-by: archeious <archeious@unbiasedgeek.com>
M2.5.3: marchwarden costs CLI command (#26)
2026-04-08 21:59:07 +00:00 · 2026-04-08 15:57:39 -06:00
3 changed files with 352 additions and 1 deletion
--- a/1
+++ b/1
@ -13,6 +13,7 @@ RUN pip install --upgrade pip
 # Copy the project and install editable with dev extras.
 COPY cli ./cli
 COPY obs ./obs
 COPY researchers ./researchers
 COPY orchestrator ./orchestrator
 COPY tests ./tests
--- a/cli/main.py
+++ b/cli/main.py
@ -7,7 +7,10 @@ ResearchResult contracts to the terminal.
 import asyncio
 import json
 import os
 import re
 import sys
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Optional
@ -20,6 +23,7 @@ from rich.table import Table
 from rich.text import Text
 from obs import configure_logging, get_logger
 from obs.costs import DEFAULT_LEDGER_PATH
 from researchers.web.models import ResearchResult
@ -298,5 +302,215 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None:
    render_trace(entries, trace_id, console)
 # ---------------------------------------------------------------------------
 # costs command
 # ---------------------------------------------------------------------------
 _RELATIVE_RE = re.compile(r"^(\d+)([dwhm])$")
 def _parse_when(value: str) -> datetime:
    """Parse an ISO date or a relative shorthand like '7d', '24h'."""
    m = _RELATIVE_RE.match(value)
    if m:
        n = int(m.group(1))
        unit = m.group(2)
        delta = {
            "h": timedelta(hours=n),
            "d": timedelta(days=n),
            "w": timedelta(weeks=n),
            "m": timedelta(days=30 * n),
        }[unit]
        return datetime.now(timezone.utc) - delta
    # Otherwise treat as ISO date / datetime
    dt = datetime.fromisoformat(value)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt
 def _load_ledger(path: Path) -> list[dict]:
    if not path.exists():
        return []
    entries: list[dict] = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip a corrupt line rather than blow up the whole report
                continue
    return entries
 def _filter_entries(
    entries: list[dict],
    since: Optional[datetime],
    until: Optional[datetime],
    model: Optional[str],
 ) -> list[dict]:
    out = []
    for e in entries:
        ts_str = e.get("timestamp", "")
        try:
            ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
        except ValueError:
            continue
        if since and ts < since:
            continue
        if until and ts > until:
            continue
        if model and e.get("model_id") != model:
            continue
        out.append(e)
    return out
 def _costs_rollup(entries: list[dict], key_of) -> dict:
     """Total calls, tokens and spend for entries grouped by ``key_of(entry)``."""
     buckets: dict[str, dict] = defaultdict(
         lambda: {"calls": 0, "tokens": 0, "spend": 0.0}
     )
     for e in entries:
         bucket = buckets[key_of(e)]
         bucket["calls"] += 1
         # `or` (rather than a .get default) also covers keys that are
         # present in the entry but null.
         bucket["tokens"] += e.get("tokens_used") or 0
         bucket["spend"] += e.get("estimated_cost_usd") or 0.0
     return buckets


 def _costs_table(title: str, label: str, buckets: dict, label_style=None) -> Table:
     """Build a calls/tokens/spend table with one row per bucket, sorted by key."""
     table = Table(title=title, show_lines=False, expand=True)
     table.add_column(label, style=label_style)
     table.add_column("Calls", justify="right")
     table.add_column("Tokens", justify="right")
     table.add_column("Spend (USD)", justify="right", style="green")
     for key in sorted(buckets):
         d = buckets[key]
         table.add_row(
             key, str(d["calls"]), f"{d['tokens']:,}", f"${d['spend']:.4f}"
         )
     return table


 def render_costs(entries: list[dict], console: Console) -> None:
     """Render a cost summary from filtered ledger entries.

     Prints a summary panel, a per-day table, a per-model table and a panel
     describing the most expensive call. When ``entries`` is empty only a
     placeholder line is printed.

     Missing or null numeric fields count as zero and a missing or null
     day/model label is shown as "(unknown)". Entries whose
     ``estimated_cost_usd`` is null add nothing to the spend totals and are
     counted separately as calls with an unknown model price.

     Args:
         entries: Ledger entries to summarise.
         console: Rich console that receives the output.
     """
     if not entries:
         console.print("[dim]No cost data yet.[/dim]")
         return

     total_calls = len(entries)
     total_tokens = sum(e.get("tokens_used") or 0 for e in entries)
     total_input = sum(e.get("tokens_input") or 0 for e in entries)
     total_output = sum(e.get("tokens_output") or 0 for e in entries)
     total_tavily = sum(e.get("tavily_searches") or 0 for e in entries)
     total_spend = sum(e.get("estimated_cost_usd") or 0.0 for e in entries)
     unknown_cost_calls = sum(
         1 for e in entries if e.get("estimated_cost_usd") is None
     )

     # Summary panel
     summary = Text()
     summary.append(f"Calls: {total_calls}\n", style="bold")
     summary.append(f"Total spend: ${total_spend:.4f}\n", style="bold green")
     summary.append(f"Total tokens: {total_tokens:,} ")
     summary.append(f"(in {total_input:,} / out {total_output:,})\n", style="dim")
     summary.append(f"Tavily searches: {total_tavily}\n")
     if unknown_cost_calls:
         summary.append(
             f"Calls with unknown model price: {unknown_cost_calls}\n",
             style="yellow",
         )
     console.print(Panel(summary, title="Cost Summary", border_style="green"))

     # Per-day breakdown: the first 10 characters of the timestamp string are
     # used as the day label.
     per_day = _costs_rollup(
         entries,
         lambda e: str(e.get("timestamp") or "")[:10] or "(unknown)",
     )
     console.print(_costs_table("Per Day", "Date", per_day, label_style="dim"))

     # Per-model breakdown. Labels are coerced to str so that sorting cannot
     # fail on a mix of strings and null model ids.
     per_model = _costs_rollup(
         entries, lambda e: str(e.get("model_id") or "(unknown)")
     )
     console.print(_costs_table("Per Model", "Model", per_model))

     # Highest-cost call, chosen among entries that actually carry a price.
     costed = [e for e in entries if e.get("estimated_cost_usd") is not None]
     if costed:
         top = max(costed, key=lambda e: e["estimated_cost_usd"])
         top_text = Text()
         top_text.append(f"trace_id: {top.get('trace_id') or '?'}\n")
         top_text.append(f"question: {str(top.get('question') or '')[:120]}\n")
         top_text.append(f"model: {top.get('model_id') or '?'}\n")
         top_text.append(f"tokens: {top.get('tokens_used') or 0:,}\n")
         top_text.append(
             f"spend: ${top['estimated_cost_usd']:.4f}\n",
             style="bold green",
         )
         console.print(
             Panel(top_text, title="Highest-Cost Call", border_style="yellow")
         )
@cli.command()
@click.option(
    "--since",
    default=None,
    help="Filter by start time. ISO date or relative (e.g. 7d, 24h, 2w).",
)
@click.option(
    "--until",
    default=None,
    help="Filter by end time. ISO date or relative.",
)
@click.option(
    "--model",
    default=None,
    help="Filter to a specific model_id.",
)
@click.option(
    "--json",
    "as_json",
    is_flag=True,
    default=False,
    help="Emit raw filtered ledger entries as JSON instead of the table.",
)
@click.option(
    "--ledger",
    default=None,
    help=f"Override ledger path (default: {DEFAULT_LEDGER_PATH}).",
)
def costs(
    since: Optional[str],
    until: Optional[str],
    model: Optional[str],
    as_json: bool,
    ledger: Optional[str],
) -> None:
    """Show cost summary from the research ledger."""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Flow: resolve the ledger path (expanding "~"), parse the optional time
    # bounds, load and filter the ledger, then either dump the matching
    # entries one JSON object per line (--json) or render the rich summary.
    console = Console()
    path = Path(os.path.expanduser(ledger or DEFAULT_LEDGER_PATH))

    # Turn an unparseable --since/--until into a normal usage error instead
    # of letting the parser's exception escape as a traceback.
    bounds: dict[str, Optional[datetime]] = {}
    for flag, raw in (("--since", since), ("--until", until)):
        if not raw:
            bounds[flag] = None
            continue
        try:
            bounds[flag] = _parse_when(raw)
        except (ValueError, OverflowError) as err:
            raise click.BadParameter(str(err), param_hint=flag) from err

    entries = _load_ledger(path)
    filtered = _filter_entries(
        entries, bounds["--since"], bounds["--until"], model
    )

    if as_json:
        for e in filtered:
            click.echo(json.dumps(e))
        return

    render_costs(filtered, console)
 # Run the `cli` entry point when this file is executed directly rather than
 # imported as a module.
 if __name__ == "__main__":
    cli()
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -4,7 +4,7 @@ from unittest.mock import patch
 from click.testing import CliRunner
-from cli.main import cli, render_result, render_trace
+from cli.main import cli, render_costs, render_result, render_trace
 from researchers.web.models import (
    Citation,
    ConfidenceFactors,
@ -186,3 +186,139 @@ class TestReplayCommand:
        out = console.export_text()
        assert "empty-trace" in out
        assert "empty" in out.lower()
 # ---------------------------------------------------------------------------
 # costs command
 # ---------------------------------------------------------------------------
 import json as _json
 def _write_ledger(path, entries):
    path.write_text("\n".join(_json.dumps(e) for e in entries) + "\n")
 def _ledger_fixture(tmp_path):
     """Create ``costs.jsonl`` under *tmp_path* with three sample entries.

     The entries cover three consecutive days and three models: a small
     priced call (t1), a large priced call (t2), and a call whose
     ``estimated_cost_usd`` is null (t3). Returns the path of the ledger.
     """
     # Column order matters: it fixes the key order of every written object.
     columns = (
         "timestamp",
         "trace_id",
         "question",
         "model_id",
         "tokens_used",
         "tokens_input",
         "tokens_output",
         "iterations_run",
         "wall_time_sec",
         "tavily_searches",
         "estimated_cost_usd",
         "budget_exhausted",
         "confidence",
     )
     rows = (
         (
             "2026-04-06T10:00:00Z", "t1", "What is X?", "claude-sonnet-4-6",
             1000, 800, 200, 1, 5.0, 1, 0.005, False, 0.9,
         ),
         (
             "2026-04-07T11:00:00Z", "t2", "Bigger query", "claude-opus-4-6",
             50000, 40000, 10000, 5, 120.0, 8, 1.25, True, 0.7,
         ),
         (
             "2026-04-08T12:00:00Z", "t3", "Unknown model run", "future-model-7",
             500, 400, 100, 1, 2.0, 0, None, False, 0.5,
         ),
     )
     ledger_path = tmp_path / "costs.jsonl"
     _write_ledger(ledger_path, [dict(zip(columns, row)) for row in rows])
     return ledger_path
 class TestCostsCommand:
     """Checks for the ``costs`` CLI command and the ``render_costs`` renderer."""

     @staticmethod
     def _invoke(*cli_args):
         """Run ``costs`` with *cli_args* through a fresh CliRunner."""
         return CliRunner().invoke(cli, ["costs", *cli_args])

     def test_renders_summary(self, tmp_path):
         ledger = _ledger_fixture(tmp_path)
         outcome = self._invoke("--ledger", str(ledger))
         assert outcome.exit_code == 0, outcome.output
         expected_fragments = (
             # Summary
             "Calls: 3",
             "$1.2550",
             # Per-day rows
             "2026-04-06",
             "2026-04-07",
             "2026-04-08",
             # Per-model rows
             "claude-sonnet-4-6",
             "claude-opus-4-6",
             # Highest-cost panel
             "t2",
             # Unknown model warning
             "unknown model price",
         )
         for fragment in expected_fragments:
             assert fragment in outcome.output

     def test_filter_by_model(self, tmp_path):
         ledger = _ledger_fixture(tmp_path)
         outcome = self._invoke(
             "--ledger", str(ledger), "--model", "claude-opus-4-6"
         )
         assert outcome.exit_code == 0
         assert "Calls: 1" in outcome.output
         assert "claude-sonnet-4-6" not in outcome.output

     def test_filter_by_since_iso(self, tmp_path):
         ledger = _ledger_fixture(tmp_path)
         outcome = self._invoke("--ledger", str(ledger), "--since", "2026-04-08")
         assert outcome.exit_code == 0
         assert "Calls: 1" in outcome.output
         assert "future-model-7" in outcome.output
         assert "claude-sonnet-4-6" not in outcome.output

     def test_json_output(self, tmp_path):
         ledger = _ledger_fixture(tmp_path)
         outcome = self._invoke("--ledger", str(ledger), "--json")
         assert outcome.exit_code == 0
         # One JSON object per non-empty output line, in ledger order.
         records = list(filter(None, outcome.output.strip().splitlines()))
         assert len(records) == 3
         assert _json.loads(records[0])["trace_id"] == "t1"

     def test_empty_ledger(self, tmp_path):
         # The ledger file is never created: a missing file counts as empty.
         absent = tmp_path / "missing.jsonl"
         outcome = self._invoke("--ledger", str(absent))
         assert outcome.exit_code == 0
         assert "No cost data yet" in outcome.output

     def test_render_costs_handles_empty(self):
         recorder = Console(record=True, width=120)
         render_costs([], recorder)
         assert "No cost data yet" in recorder.export_text()