# marchwarden/tests/test_cli.py

"""Tests for the marchwarden CLI."""

from unittest.mock import patch

from click.testing import CliRunner

from cli.main import cli, render_costs, render_result, render_trace
from researchers.web.models import (
    Citation,
    ConfidenceFactors,
    CostMetadata,
    DiscoveryEvent,
    Gap,
    GapCategory,
    OpenQuestion,
    ResearchResult,
)
from rich.console import Console


def _fixture_result() -> ResearchResult:
    """Build the fully populated ``ResearchResult`` shared by the tests below.

    Each list field (citations, gaps, discovery events, open questions)
    holds exactly one entry, and a fresh object is returned on every call.
    """
    citation = Citation(
        source="web",
        locator="https://extension.usu.edu/yard-and-garden",
        title="USU Extension — Yard and Garden",
        snippet="USU recommends warm-season crops for Utah's climate.",
        raw_excerpt="Tomatoes, peppers, and squash thrive in Utah summers.",
        confidence=0.9,
    )
    gap = Gap(
        topic="Microclimate variation",
        category=GapCategory.SCOPE_EXCEEDED,
        detail="Did not investigate elevation-specific recommendations.",
    )
    discovery = DiscoveryEvent(
        type="related_research",
        suggested_researcher="docs",
        query="Utah USDA hardiness zones",
        reason="Zone-specific guidance would improve answer.",
    )
    follow_up = OpenQuestion(
        question="What are the best cool-season crops?",
        context="Answer focused on warm-season crops.",
        priority="medium",
    )
    factors = ConfidenceFactors(
        num_corroborating_sources=3,
        source_authority="high",
        contradiction_detected=False,
        query_specificity_match=0.85,
        budget_exhausted=False,
        recency="current",
    )
    cost = CostMetadata(
        tokens_used=4321,
        iterations_run=3,
        wall_time_sec=12.5,
        budget_exhausted=False,
        model_id="claude-sonnet-4-6",
    )
    return ResearchResult(
        answer="Tomatoes, peppers, squash, and beans grow well in Utah.",
        citations=[citation],
        gaps=[gap],
        discovery_events=[discovery],
        open_questions=[follow_up],
        confidence=0.82,
        confidence_factors=factors,
        cost_metadata=cost,
        trace_id="trace-abc-123",
    )


class TestRenderResult:
    """Checks ``render_result`` output against the shared fixture result."""

    def test_renders_all_sections(self):
        """Text from every part of the fixture appears in the exported output."""
        recorder = Console(record=True, width=120)
        render_result(_fixture_result(), recorder)
        rendered = recorder.export_text()

        expected_fragments = (
            "Tomatoes",           # from the answer text
            "USU Extension",      # from the citation title
            "scope_exceeded",     # gap category
            "related_research",   # discovery event type
            "cool-season",        # from the open question
            "Confidence",         # confidence label
            "claude-sonnet-4-6",  # model id in the cost metadata
            "trace-abc-123",      # trace id
        )
        for fragment in expected_fragments:
            assert fragment in rendered


class TestAskCommand:
    """Tests for the ``ask`` subcommand with ``cli.main.call_research_tool`` patched."""

    def test_ask_invokes_mcp_and_renders(self):
        """Command-line options reach the research call and its result is printed."""
        runner = CliRunner()
        fixture = _fixture_result()

        async def fake_call(question, depth, max_iterations, token_budget):
            # Stand-in for the real research call: check the values the CLI
            # hands over, then return the canned result.
            assert question == "What grows in Utah?"
            assert depth == "shallow"
            assert max_iterations == 2
            assert token_budget == 5000
            return fixture

        with patch("cli.main.call_research_tool", side_effect=fake_call):
            result = runner.invoke(
                cli,
                [
                    "ask",
                    "What grows in Utah?",
                    "--depth",
                    "shallow",
                    "--max-iterations",
                    "2",
                    "--budget",
                    "5000",
                ],
            )

        # Include the captured output so a failure shows what the CLI printed.
        assert result.exit_code == 0, result.output
        assert "Tomatoes" in result.output
        assert "trace-abc-123" in result.output

    def test_ask_handles_error(self):
        """A failing research call exits with code 1 and reports the error text."""
        runner = CliRunner()

        async def boom(*args, **kwargs):
            # Accept any calling convention so the only failure the CLI can see
            # is the RuntimeError below, never a TypeError from a signature
            # mismatch between this stub and the call site.
            raise RuntimeError("mcp went sideways")

        with patch("cli.main.call_research_tool", side_effect=boom):
            result = runner.invoke(cli, ["ask", "anything"])

        assert result.exit_code == 1, result.output
        assert "mcp went sideways" in result.output


class TestReplayCommand:
    """Tests for the ``replay`` subcommand and the ``render_trace`` helper."""

    def _write_trace(self, tmp_path, trace_id="trace-xyz"):
        """Write a three-step JSONL trace to ``<tmp_path>/<trace_id>.jsonl``.

        The steps are ``search``, ``fetch_url`` (carrying a ``content_hash``)
        and ``synthesize``. Returns the path of the file that was written.
        """
        path = tmp_path / f"{trace_id}.jsonl"
        path.write_text(
            '{"step": 1, "action": "search", "decision": "initial query", '
            '"timestamp": "2026-04-08T00:00:00Z", "query": "utah crops"}\n'
            '{"step": 2, "action": "fetch_url", "decision": "promising source", '
            '"timestamp": "2026-04-08T00:00:01Z", "url": "https://example.com", '
            '"content_hash": "sha256:deadbeef"}\n'
            '{"step": 3, "action": "synthesize", "decision": "have enough", '
            '"timestamp": "2026-04-08T00:00:02Z"}\n'
        )
        return path

    def test_replay_renders_trace(self, tmp_path):
        """Replaying a known trace prints its id, each action and the extra fields."""
        runner = CliRunner()
        self._write_trace(tmp_path)
        result = runner.invoke(
            cli,
            ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
        )
        assert result.exit_code == 0, result.output
        assert "trace-xyz" in result.output
        assert "search" in result.output
        assert "fetch_url" in result.output
        assert "synthesize" in result.output
        assert "sha256:deadbeef" in result.output
        assert "utah crops" in result.output

    def test_replay_unknown_trace_id(self, tmp_path):
        """A trace id with no file in the trace directory exits 1 with a message."""
        runner = CliRunner()
        result = runner.invoke(
            cli,
            ["replay", "missing-id", "--trace-dir", str(tmp_path)],
        )
        assert result.exit_code == 1, result.output
        assert "no trace file found" in result.output

    def test_replay_invalid_json(self, tmp_path):
        """A trace file containing a malformed line exits 1 and says so."""
        runner = CliRunner()
        (tmp_path / "broken.jsonl").write_text("{not json}\n")
        result = runner.invoke(
            cli,
            ["replay", "broken", "--trace-dir", str(tmp_path)],
        )
        assert result.exit_code == 1, result.output
        assert "invalid JSON" in result.output

    def test_replay_renders_persisted_result(self, tmp_path):
        """Issue #54: replay loads <id>.result.json sibling and renders it."""
        runner = CliRunner()
        self._write_trace(tmp_path)
        result_payload = {
            "answer": "Test answer about Utah crops.",
            "citations": [
                {
                    "source": "web",
                    "locator": "https://example.com/utah",
                    "title": "Utah Guide",
                    "snippet": None,
                    "raw_excerpt": "raw excerpt content",
                    "confidence": 0.9,
                }
            ],
            "gaps": [
                {
                    "topic": "irrigation",
                    "category": "scope_exceeded",
                    "detail": "out of scope",
                }
            ],
            "discovery_events": [],
            "open_questions": [],
            "confidence": 0.8,
            "confidence_factors": {
                "num_corroborating_sources": 2,
                "source_authority": "high",
                "contradiction_detected": False,
                "query_specificity_match": 0.8,
                "budget_exhausted": False,
                "recency": "current",
            },
            "cost_metadata": {
                "tokens_used": 1000,
                "iterations_run": 2,
                "wall_time_sec": 12.5,
                "budget_exhausted": False,
                "model_id": "claude-test",
            },
            "trace_id": "trace-xyz",
        }
        # Uses the module-level ``_json`` alias (imported further down this
        # file) instead of a second, function-local import of the same module.
        (tmp_path / "trace-xyz.result.json").write_text(_json.dumps(result_payload))

        result = runner.invoke(
            cli,
            ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
        )
        assert result.exit_code == 0, result.output
        # Step log still rendered
        assert "search" in result.output
        # Persisted result also rendered
        assert "Test answer about Utah crops" in result.output
        assert "scope_exceeded" in result.output
        assert "irrigation" in result.output

    def test_replay_without_result_file_notes_absence(self, tmp_path):
        """With only the JSONL trace present, replay succeeds and notes the gap."""
        runner = CliRunner()
        self._write_trace(tmp_path)
        result = runner.invoke(
            cli,
            ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
        )
        assert result.exit_code == 0, result.output
        assert "No persisted result file" in result.output

    def test_render_trace_empty(self):
        """Rendering an empty step list still shows the trace id."""
        console = Console(record=True, width=120)
        render_trace([], "empty-trace", console)
        out = console.export_text()
        assert "empty-trace" in out
        # NOTE(review): this check is already satisfied by the trace id itself
        # ("empty-trace" contains "empty"), so it adds nothing beyond the
        # assertion above. Consider a trace id without "empty" once the exact
        # wording emitted by render_trace has been confirmed.
        assert "empty" in out.lower()


# ---------------------------------------------------------------------------
# costs command
# ---------------------------------------------------------------------------


import json as _json


def _write_ledger(path, entries):
    """Write *entries* to *path* as JSON Lines.

    Each entry is serialized with ``json.dumps`` onto its own line and the
    file always ends with a newline (an empty *entries* yields a lone
    newline).
    """
    serialized = [_json.dumps(entry) for entry in entries]
    payload = "\n".join(serialized) + "\n"
    path.write_text(payload)


def _ledger_fixture(tmp_path):
    """Create ``costs.jsonl`` under *tmp_path* with three ledger entries.

    The entries fall on three consecutive days and use three different model
    ids; the last one has ``estimated_cost_usd`` set to ``None``. Returns the
    path of the ledger file.
    """
    sonnet_call = dict(
        timestamp="2026-04-06T10:00:00Z",
        trace_id="t1",
        question="What is X?",
        model_id="claude-sonnet-4-6",
        tokens_used=1000,
        tokens_input=800,
        tokens_output=200,
        iterations_run=1,
        wall_time_sec=5.0,
        tavily_searches=1,
        estimated_cost_usd=0.005,
        budget_exhausted=False,
        confidence=0.9,
    )
    opus_call = dict(
        timestamp="2026-04-07T11:00:00Z",
        trace_id="t2",
        question="Bigger query",
        model_id="claude-opus-4-6",
        tokens_used=50000,
        tokens_input=40000,
        tokens_output=10000,
        iterations_run=5,
        wall_time_sec=120.0,
        tavily_searches=8,
        estimated_cost_usd=1.25,
        budget_exhausted=True,
        confidence=0.7,
    )
    unpriced_call = dict(
        timestamp="2026-04-08T12:00:00Z",
        trace_id="t3",
        question="Unknown model run",
        model_id="future-model-7",
        tokens_used=500,
        tokens_input=400,
        tokens_output=100,
        iterations_run=1,
        wall_time_sec=2.0,
        tavily_searches=0,
        estimated_cost_usd=None,
        budget_exhausted=False,
        confidence=0.5,
    )
    ledger_path = tmp_path / "costs.jsonl"
    _write_ledger(ledger_path, [sonnet_call, opus_call, unpriced_call])
    return ledger_path


class TestCostsCommand:
    """Tests for the ``costs`` subcommand and the ``render_costs`` helper."""

    def test_renders_summary(self, tmp_path):
        """The unfiltered report shows totals, per-day and per-model data."""
        ledger = _ledger_fixture(tmp_path)
        outcome = CliRunner().invoke(cli, ["costs", "--ledger", str(ledger)])
        assert outcome.exit_code == 0, outcome.output

        expected_fragments = (
            # Summary
            "Calls: 3",
            "$1.2550",
            # Per-day rows
            "2026-04-06",
            "2026-04-07",
            "2026-04-08",
            # Per-model rows
            "claude-sonnet-4-6",
            "claude-opus-4-6",
            # Highest-cost panel
            "t2",
            # Unknown model warning
            "unknown model price",
        )
        for fragment in expected_fragments:
            assert fragment in outcome.output

    def test_filter_by_model(self, tmp_path):
        """``--model`` restricts the report to the entry with that model id."""
        ledger = _ledger_fixture(tmp_path)
        argv = ["costs", "--ledger", str(ledger), "--model", "claude-opus-4-6"]
        outcome = CliRunner().invoke(cli, argv)
        assert outcome.exit_code == 0
        assert "Calls: 1" in outcome.output
        assert "claude-sonnet-4-6" not in outcome.output

    def test_filter_by_since_iso(self, tmp_path):
        """``--since`` with an ISO date leaves only the entry from that day."""
        ledger = _ledger_fixture(tmp_path)
        argv = ["costs", "--ledger", str(ledger), "--since", "2026-04-08"]
        outcome = CliRunner().invoke(cli, argv)
        assert outcome.exit_code == 0
        assert "Calls: 1" in outcome.output
        assert "future-model-7" in outcome.output
        assert "claude-sonnet-4-6" not in outcome.output

    def test_json_output(self, tmp_path):
        """``--json`` prints one JSON object per ledger entry, in ledger order."""
        ledger = _ledger_fixture(tmp_path)
        argv = ["costs", "--ledger", str(ledger), "--json"]
        outcome = CliRunner().invoke(cli, argv)
        assert outcome.exit_code == 0
        # Drop blank lines before counting the emitted records.
        records = list(filter(None, outcome.output.strip().splitlines()))
        assert len(records) == 3
        head = _json.loads(records[0])
        assert head["trace_id"] == "t1"

    def test_empty_ledger(self, tmp_path):
        """A ledger path that was never created yields the friendly message."""
        absent = tmp_path / "missing.jsonl"
        outcome = CliRunner().invoke(cli, ["costs", "--ledger", str(absent)])
        assert outcome.exit_code == 0
        assert "No cost data yet" in outcome.output

    def test_render_costs_handles_empty(self):
        """``render_costs`` given no entries prints the same friendly message."""
        recorder = Console(record=True, width=120)
        render_costs([], recorder)
        rendered = recorder.export_text()
        assert "No cost data yet" in rendered
M2.1: marchwarden ask CLI command (#8) Click app with `ask` subcommand that spawns the web researcher MCP server over stdio, calls the research tool, and pretty-prints the ResearchResult contract using rich (panels for answer/confidence/cost, tables for citations, gaps, discovery events, and open questions). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-08 20:51:40 +00:00			`"""Tests for the marchwarden CLI."""`

			`from unittest.mock import patch`

			`from click.testing import CliRunner`

M2.5.3: marchwarden costs CLI command (#26) Adds operator-facing `marchwarden costs` subcommand that reads the JSONL ledger from M2.5.2 and pretty-prints a rich summary: - Cost Summary panel: total calls, total spend, total tokens (input/ output split), Tavily search count, warning for any calls with unknown model prices - Per-Day table sorted by date - Per-Model table sorted by model id - Highest-Cost Call panel with trace_id and question Flags: --since ISO date or relative shorthand (7d, 24h, 2w, 1m) --until same --model filter to a specific model_id --json emit raw filtered ledger entries instead of the table --ledger override default path (mostly for tests) Also fixes a Dockerfile gap: the obs/ package added in M2.5.1 was not being COPYed into the image, so the installed `marchwarden` entry point couldn't import it. Tests had been passing because they mounted /app over the install. Adding `COPY obs ./obs` restores parity. Tests cover summary rendering, model filter, since-date filter, JSON output, and the empty-ledger friendly path. 110/110 passing. End-to-end verified against the real cost ledger. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-08 21:57:39 +00:00			`from cli.main import cli, render_costs, render_result, render_trace`
M2.1: marchwarden ask CLI command (#8) Click app with `ask` subcommand that spawns the web researcher MCP server over stdio, calls the research tool, and pretty-prints the ResearchResult contract using rich (panels for answer/confidence/cost, tables for citations, gaps, discovery events, and open questions). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-08 20:51:40 +00:00			`from researchers.web.models import (`
			`Citation,`
			`ConfidenceFactors,`
			`CostMetadata,`
			`DiscoveryEvent,`
			`Gap,`
			`GapCategory,`
			`OpenQuestion,`
			`ResearchResult,`
			`)`
			`from rich.console import Console`


			`def _fixture_result() -> ResearchResult:`
			`return ResearchResult(`
			`answer="Tomatoes, peppers, squash, and beans grow well in Utah.",`
			`citations=[`
			`Citation(`
			`source="web",`
			`locator="https://extension.usu.edu/yard-and-garden",`
			`title="USU Extension — Yard and Garden",`
			`snippet="USU recommends warm-season crops for Utah's climate.",`
			`raw_excerpt="Tomatoes, peppers, and squash thrive in Utah summers.",`
			`confidence=0.9,`
			`),`
			`],`
			`gaps=[`
			`Gap(`
			`topic="Microclimate variation",`
			`category=GapCategory.SCOPE_EXCEEDED,`
			`detail="Did not investigate elevation-specific recommendations.",`
			`),`
			`],`
			`discovery_events=[`
			`DiscoveryEvent(`
			`type="related_research",`
			`suggested_researcher="docs",`
			`query="Utah USDA hardiness zones",`
			`reason="Zone-specific guidance would improve answer.",`
			`),`
			`],`
			`open_questions=[`
			`OpenQuestion(`
			`question="What are the best cool-season crops?",`
			`context="Answer focused on warm-season crops.",`
			`priority="medium",`
			`),`
			`],`
			`confidence=0.82,`
			`confidence_factors=ConfidenceFactors(`
			`num_corroborating_sources=3,`
			`source_authority="high",`
			`contradiction_detected=False,`
			`query_specificity_match=0.85,`
			`budget_exhausted=False,`
			`recency="current",`
			`),`
			`cost_metadata=CostMetadata(`
			`tokens_used=4321,`
			`iterations_run=3,`
			`wall_time_sec=12.5,`
			`budget_exhausted=False,`
			`model_id="claude-sonnet-4-6",`
			`),`
			`trace_id="trace-abc-123",`
			`)`


			`class TestRenderResult:`
			`def test_renders_all_sections(self):`
			`console = Console(record=True, width=120)`
			`render_result(_fixture_result(), console)`
			`out = console.export_text()`
			`assert "Tomatoes" in out`
			`assert "USU Extension" in out`
			`assert "scope_exceeded" in out`
			`assert "related_research" in out`
			`assert "cool-season" in out`
			`assert "Confidence" in out`
			`assert "claude-sonnet-4-6" in out`
			`assert "trace-abc-123" in out`


			`class TestAskCommand:`
			`def test_ask_invokes_mcp_and_renders(self):`
			`runner = CliRunner()`
			`fixture = _fixture_result()`

			`async def fake_call(question, depth, max_iterations, token_budget):`
			`assert question == "What grows in Utah?"`
			`assert depth == "shallow"`
			`assert max_iterations == 2`
			`assert token_budget == 5000`
			`return fixture`

			`with patch("cli.main.call_research_tool", side_effect=fake_call):`
			`result = runner.invoke(`
			`cli,`
			`[`
			`"ask",`
			`"What grows in Utah?",`
			`"--depth",`
			`"shallow",`
			`"--max-iterations",`
			`"2",`
			`"--budget",`
			`"5000",`
			`],`
			`)`

			`assert result.exit_code == 0, result.output`
			`assert "Tomatoes" in result.output`
			`assert "trace-abc-123" in result.output`

			`def test_ask_handles_error(self):`
			`runner = CliRunner()`

			`async def boom(**kwargs):`
			`raise RuntimeError("mcp went sideways")`

			`with patch("cli.main.call_research_tool", side_effect=boom):`
			`result = runner.invoke(cli, ["ask", "anything"])`

			`assert result.exit_code == 1`
			`assert "mcp went sideways" in result.output`
M2.2: marchwarden replay CLI command (#9) Adds `marchwarden replay <trace_id>` to pretty-print a prior research run from its JSONL trace file. Resolves the trace under ~/.marchwarden/traces/ by default; --trace-dir overrides for tests and custom locations. Renders each step as a row with action, decision, extra fields, and content_hash. Friendly errors for unknown trace_id and malformed JSON lines. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-08 20:57:37 +00:00

			`class TestReplayCommand:`
			`def _write_trace(self, tmp_path, trace_id="trace-xyz"):`
			`path = tmp_path / f"{trace_id}.jsonl"`
			`path.write_text(`
			`'{"step": 1, "action": "search", "decision": "initial query", '`
			`'"timestamp": "2026-04-08T00:00:00Z", "query": "utah crops"}\n'`
			`'{"step": 2, "action": "fetch_url", "decision": "promising source", '`
			`'"timestamp": "2026-04-08T00:00:01Z", "url": "https://example.com", '`
			`'"content_hash": "sha256:deadbeef"}\n'`
			`'{"step": 3, "action": "synthesize", "decision": "have enough", '`
			`'"timestamp": "2026-04-08T00:00:02Z"}\n'`
			`)`
			`return path`

			`def test_replay_renders_trace(self, tmp_path):`
			`runner = CliRunner()`
			`self._write_trace(tmp_path)`
			`result = runner.invoke(`
			`cli,`
			`["replay", "trace-xyz", "--trace-dir", str(tmp_path)],`
			`)`
			`assert result.exit_code == 0, result.output`
			`assert "trace-xyz" in result.output`
			`assert "search" in result.output`
			`assert "fetch_url" in result.output`
			`assert "synthesize" in result.output`
			`assert "sha256:deadbeef" in result.output`
			`assert "utah crops" in result.output`

			`def test_replay_unknown_trace_id(self, tmp_path):`
			`runner = CliRunner()`
			`result = runner.invoke(`
			`cli,`
			`["replay", "missing-id", "--trace-dir", str(tmp_path)],`
			`)`
			`assert result.exit_code == 1`
			`assert "no trace file found" in result.output`

			`def test_replay_invalid_json(self, tmp_path):`
			`runner = CliRunner()`
			`(tmp_path / "broken.jsonl").write_text("{not json}\n")`
			`result = runner.invoke(`
			`cli,`
			`["replay", "broken", "--trace-dir", str(tmp_path)],`
			`)`
			`assert result.exit_code == 1`
			`assert "invalid JSON" in result.output`

fix(observability): persist full ResearchResult and per-item trace events Closes #54. The JSONL trace previously stored only counts on the `complete` event (gap_count, citation_count, discovery_count). Replay could re-render the step log but could not recover which gaps fired or which sources were cited, blocking M3.2/M3.3 stress-testing and calibration work. Two complementary fixes: 1. (a) TraceLogger.write_result() dumps the pydantic ResearchResult to `<trace_id>.result.json` next to the JSONL trace. The agent calls it right before emitting the `complete` step. `cli replay` now loads the sibling result file when present and renders the structured tables under the trace step log. 2. (b) The agent emits one `gap_recorded`, `citation_recorded`, or `discovery_recorded` trace event per item from the final result. This gives the JSONL stream a queryable timeline of what was kept, with categories and topics in-band, without needing to load the result sibling. Tests: 4 added (127 total passing). Smoke-tested live with a real ask; both files written and replay rendering verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 01:27:33 +00:00			`def test_replay_renders_persisted_result(self, tmp_path):`
			`"""Issue #54: replay loads <id>.result.json sibling and renders it."""`
			`runner = CliRunner()`
			`self._write_trace(tmp_path)`
			`result_payload = {`
			`"answer": "Test answer about Utah crops.",`
			`"citations": [`
			`{`
			`"source": "web",`
			`"locator": "https://example.com/utah",`
			`"title": "Utah Guide",`
			`"snippet": None,`
			`"raw_excerpt": "raw excerpt content",`
			`"confidence": 0.9,`
			`}`
			`],`
			`"gaps": [`
			`{`
			`"topic": "irrigation",`
			`"category": "scope_exceeded",`
			`"detail": "out of scope",`
			`}`
			`],`
			`"discovery_events": [],`
			`"open_questions": [],`
			`"confidence": 0.8,`
			`"confidence_factors": {`
			`"num_corroborating_sources": 2,`
			`"source_authority": "high",`
			`"contradiction_detected": False,`
			`"query_specificity_match": 0.8,`
			`"budget_exhausted": False,`
			`"recency": "current",`
			`},`
			`"cost_metadata": {`
			`"tokens_used": 1000,`
			`"iterations_run": 2,`
			`"wall_time_sec": 12.5,`
			`"budget_exhausted": False,`
			`"model_id": "claude-test",`
			`},`
			`"trace_id": "trace-xyz",`
			`}`
			`import json as _j`
			`(tmp_path / "trace-xyz.result.json").write_text(_j.dumps(result_payload))`

			`result = runner.invoke(`
			`cli,`
			`["replay", "trace-xyz", "--trace-dir", str(tmp_path)],`
			`)`
			`assert result.exit_code == 0, result.output`
			`# Step log still rendered`
			`assert "search" in result.output`
			`# Persisted result also rendered`
			`assert "Test answer about Utah crops" in result.output`
			`assert "scope_exceeded" in result.output`
			`assert "irrigation" in result.output`

			`def test_replay_without_result_file_notes_absence(self, tmp_path):`
			`runner = CliRunner()`
			`self._write_trace(tmp_path)`
			`result = runner.invoke(`
			`cli,`
			`["replay", "trace-xyz", "--trace-dir", str(tmp_path)],`
			`)`
			`assert result.exit_code == 0`
			`assert "No persisted result file" in result.output`

M2.2: marchwarden replay CLI command (#9) Adds `marchwarden replay <trace_id>` to pretty-print a prior research run from its JSONL trace file. Resolves the trace under ~/.marchwarden/traces/ by default; --trace-dir overrides for tests and custom locations. Renders each step as a row with action, decision, extra fields, and content_hash. Friendly errors for unknown trace_id and malformed JSON lines. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-08 20:57:37 +00:00			`def test_render_trace_empty(self):`
			`console = Console(record=True, width=120)`
			`render_trace([], "empty-trace", console)`
			`out = console.export_text()`
			`assert "empty-trace" in out`
			`assert "empty" in out.lower()`
M2.5.3: marchwarden costs CLI command (#26) Adds operator-facing `marchwarden costs` subcommand that reads the JSONL ledger from M2.5.2 and pretty-prints a rich summary: - Cost Summary panel: total calls, total spend, total tokens (input/ output split), Tavily search count, warning for any calls with unknown model prices - Per-Day table sorted by date - Per-Model table sorted by model id - Highest-Cost Call panel with trace_id and question Flags: --since ISO date or relative shorthand (7d, 24h, 2w, 1m) --until same --model filter to a specific model_id --json emit raw filtered ledger entries instead of the table --ledger override default path (mostly for tests) Also fixes a Dockerfile gap: the obs/ package added in M2.5.1 was not being COPYed into the image, so the installed `marchwarden` entry point couldn't import it. Tests had been passing because they mounted /app over the install. Adding `COPY obs ./obs` restores parity. Tests cover summary rendering, model filter, since-date filter, JSON output, and the empty-ledger friendly path. 110/110 passing. End-to-end verified against the real cost ledger. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-08 21:57:39 +00:00

			`# ---------------------------------------------------------------------------`
			`# costs command`
			`# ---------------------------------------------------------------------------`


			`import json as _json`


			`def _write_ledger(path, entries):`
			`path.write_text("\n".join(_json.dumps(e) for e in entries) + "\n")`


			`def _ledger_fixture(tmp_path):`
			`path = tmp_path / "costs.jsonl"`
			`entries = [`
			`{`
			`"timestamp": "2026-04-06T10:00:00Z",`
			`"trace_id": "t1",`
			`"question": "What is X?",`
			`"model_id": "claude-sonnet-4-6",`
			`"tokens_used": 1000,`
			`"tokens_input": 800,`
			`"tokens_output": 200,`
			`"iterations_run": 1,`
			`"wall_time_sec": 5.0,`
			`"tavily_searches": 1,`
			`"estimated_cost_usd": 0.005,`
			`"budget_exhausted": False,`
			`"confidence": 0.9,`
			`},`
			`{`
			`"timestamp": "2026-04-07T11:00:00Z",`
			`"trace_id": "t2",`
			`"question": "Bigger query",`
			`"model_id": "claude-opus-4-6",`
			`"tokens_used": 50000,`
			`"tokens_input": 40000,`
			`"tokens_output": 10000,`
			`"iterations_run": 5,`
			`"wall_time_sec": 120.0,`
			`"tavily_searches": 8,`
			`"estimated_cost_usd": 1.25,`
			`"budget_exhausted": True,`
			`"confidence": 0.7,`
			`},`
			`{`
			`"timestamp": "2026-04-08T12:00:00Z",`
			`"trace_id": "t3",`
			`"question": "Unknown model run",`
			`"model_id": "future-model-7",`
			`"tokens_used": 500,`
			`"tokens_input": 400,`
			`"tokens_output": 100,`
			`"iterations_run": 1,`
			`"wall_time_sec": 2.0,`
			`"tavily_searches": 0,`
			`"estimated_cost_usd": None,`
			`"budget_exhausted": False,`
			`"confidence": 0.5,`
			`},`
			`]`
			`_write_ledger(path, entries)`
			`return path`


			`class TestCostsCommand:`
			`def test_renders_summary(self, tmp_path):`
			`path = _ledger_fixture(tmp_path)`
			`runner = CliRunner()`
			`result = runner.invoke(cli, ["costs", "--ledger", str(path)])`
			`assert result.exit_code == 0, result.output`
			`# Summary`
			`assert "Calls: 3" in result.output`
			`assert "$1.2550" in result.output`
			`# Per-day rows`
			`assert "2026-04-06" in result.output`
			`assert "2026-04-07" in result.output`
			`assert "2026-04-08" in result.output`
			`# Per-model rows`
			`assert "claude-sonnet-4-6" in result.output`
			`assert "claude-opus-4-6" in result.output`
			`# Highest-cost panel`
			`assert "t2" in result.output`
			`# Unknown model warning`
			`assert "unknown model price" in result.output`

			`def test_filter_by_model(self, tmp_path):`
			`path = _ledger_fixture(tmp_path)`
			`runner = CliRunner()`
			`result = runner.invoke(`
			`cli,`
			`["costs", "--ledger", str(path), "--model", "claude-opus-4-6"],`
			`)`
			`assert result.exit_code == 0`
			`assert "Calls: 1" in result.output`
			`assert "claude-sonnet-4-6" not in result.output`

			`def test_filter_by_since_iso(self, tmp_path):`
			`path = _ledger_fixture(tmp_path)`
			`runner = CliRunner()`
			`result = runner.invoke(`
			`cli,`
			`["costs", "--ledger", str(path), "--since", "2026-04-08"],`
			`)`
			`assert result.exit_code == 0`
			`assert "Calls: 1" in result.output`
			`assert "future-model-7" in result.output`
			`assert "claude-sonnet-4-6" not in result.output`

			`def test_json_output(self, tmp_path):`
			`path = _ledger_fixture(tmp_path)`
			`runner = CliRunner()`
			`result = runner.invoke(`
			`cli,`
			`["costs", "--ledger", str(path), "--json"],`
			`)`
			`assert result.exit_code == 0`
			`lines = [l for l in result.output.strip().splitlines() if l]`
			`assert len(lines) == 3`
			`first = _json.loads(lines[0])`
			`assert first["trace_id"] == "t1"`

			`def test_empty_ledger(self, tmp_path):`
			`path = tmp_path / "missing.jsonl"`
			`runner = CliRunner()`
			`result = runner.invoke(cli, ["costs", "--ledger", str(path)])`
			`assert result.exit_code == 0`
			`assert "No cost data yet" in result.output`

			`def test_render_costs_handles_empty(self):`
			`console = Console(record=True, width=120)`
			`render_costs([], console)`
			`out = console.export_text()`
			`assert "No cost data yet" in out`