2026-04-08 20:51:40 +00:00
|
|
|
"""Tests for the marchwarden CLI."""
|
|
|
|
|
|
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
|
|
|
|
from click.testing import CliRunner
|
|
|
|
|
|
M2.5.3: marchwarden costs CLI command (#26)
Adds operator-facing `marchwarden costs` subcommand that reads the
JSONL ledger from M2.5.2 and pretty-prints a rich summary:
- Cost Summary panel: total calls, total spend, total tokens (input/
output split), Tavily search count, warning for any calls with
unknown model prices
- Per-Day table sorted by date
- Per-Model table sorted by model id
- Highest-Cost Call panel with trace_id and question
Flags:
--since ISO date or relative shorthand (7d, 24h, 2w, 1m)
--until same
--model filter to a specific model_id
--json emit raw filtered ledger entries instead of the table
--ledger override default path (mostly for tests)
Also fixes a Dockerfile gap: the obs/ package added in M2.5.1 was
not being COPYed into the image, so the installed `marchwarden`
entry point couldn't import it. Tests had been passing because
they mounted /app over the install. Adding `COPY obs ./obs`
restores parity.
Tests cover summary rendering, model filter, since-date filter,
JSON output, and the empty-ledger friendly path. 110/110 passing.
End-to-end verified against the real cost ledger.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 21:57:39 +00:00
|
|
|
from cli.main import cli, render_costs, render_result, render_trace
|
2026-04-08 20:51:40 +00:00
|
|
|
from researchers.web.models import (
|
|
|
|
|
Citation,
|
|
|
|
|
ConfidenceFactors,
|
|
|
|
|
CostMetadata,
|
|
|
|
|
DiscoveryEvent,
|
|
|
|
|
Gap,
|
|
|
|
|
GapCategory,
|
|
|
|
|
OpenQuestion,
|
|
|
|
|
ResearchResult,
|
|
|
|
|
)
|
|
|
|
|
from rich.console import Console
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fixture_result() -> ResearchResult:
    """Build the canned ``ResearchResult`` shared by the rendering and ``ask`` tests.

    The result carries exactly one citation, one gap, one discovery event and
    one open question, plus confidence factors and cost metadata, so each
    section of the rendered output has something to show.
    """
    citation = Citation(
        source="web",
        locator="https://extension.usu.edu/yard-and-garden",
        title="USU Extension — Yard and Garden",
        snippet="USU recommends warm-season crops for Utah's climate.",
        raw_excerpt="Tomatoes, peppers, and squash thrive in Utah summers.",
        confidence=0.9,
    )
    gap = Gap(
        topic="Microclimate variation",
        category=GapCategory.SCOPE_EXCEEDED,
        detail="Did not investigate elevation-specific recommendations.",
    )
    discovery = DiscoveryEvent(
        type="related_research",
        suggested_researcher="docs",
        query="Utah USDA hardiness zones",
        reason="Zone-specific guidance would improve answer.",
    )
    follow_up = OpenQuestion(
        question="What are the best cool-season crops?",
        context="Answer focused on warm-season crops.",
        priority="medium",
    )
    factors = ConfidenceFactors(
        num_corroborating_sources=3,
        source_authority="high",
        contradiction_detected=False,
        query_specificity_match=0.85,
        budget_exhausted=False,
        recency="current",
    )
    cost = CostMetadata(
        tokens_used=4321,
        iterations_run=3,
        wall_time_sec=12.5,
        budget_exhausted=False,
        model_id="claude-sonnet-4-6",
    )
    return ResearchResult(
        answer="Tomatoes, peppers, squash, and beans grow well in Utah.",
        citations=[citation],
        gaps=[gap],
        discovery_events=[discovery],
        open_questions=[follow_up],
        confidence=0.82,
        confidence_factors=factors,
        cost_metadata=cost,
        trace_id="trace-abc-123",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestRenderResult:
    """Rendering checks for ``render_result``."""

    def test_renders_all_sections(self):
        """A marker from every section of the fixture shows up in the exported text."""
        recorder = Console(record=True, width=120)
        render_result(_fixture_result(), recorder)
        rendered = recorder.export_text()

        # One marker per section, checked in the same order as the fixture fields.
        expected_fragments = (
            "Tomatoes",
            "USU Extension",
            "scope_exceeded",
            "related_research",
            "cool-season",
            "Confidence",
            "claude-sonnet-4-6",
            "trace-abc-123",
        )
        for fragment in expected_fragments:
            assert fragment in rendered
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestAskCommand:
    """Tests for the ``ask`` subcommand with ``call_research_tool`` patched out."""

    def test_ask_invokes_mcp_and_renders(self):
        """CLI options reach the research call and the returned result is rendered."""
        expected_result = _fixture_result()

        async def fake_call(question, depth, max_iterations, token_budget):
            # The stub doubles as the check that the CLI forwarded its options.
            received = (question, depth, max_iterations, token_budget)
            assert received == ("What grows in Utah?", "shallow", 2, 5000)
            return expected_result

        argv = [
            "ask",
            "What grows in Utah?",
            "--depth",
            "shallow",
            "--max-iterations",
            "2",
            "--budget",
            "5000",
        ]
        with patch("cli.main.call_research_tool", side_effect=fake_call):
            outcome = CliRunner().invoke(cli, argv)

        assert outcome.exit_code == 0, outcome.output
        for fragment in ("Tomatoes", "trace-abc-123"):
            assert fragment in outcome.output

    def test_ask_handles_error(self):
        """A failure in the research call yields exit code 1 and surfaces its message."""

        async def boom(**kwargs):
            raise RuntimeError("mcp went sideways")

        with patch("cli.main.call_research_tool", side_effect=boom):
            outcome = CliRunner().invoke(cli, ["ask", "anything"])

        assert outcome.exit_code == 1
        assert "mcp went sideways" in outcome.output
|
2026-04-08 20:57:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestReplayCommand:
    """Tests for the ``replay`` subcommand, plus an empty-input check of ``render_trace``."""

    def _write_trace(self, tmp_path, trace_id="trace-xyz"):
        """Write a three-step JSONL trace to ``<tmp_path>/<trace_id>.jsonl`` and return its path."""
        trace_file = tmp_path / f"{trace_id}.jsonl"
        contents = (
            '{"step": 1, "action": "search", "decision": "initial query", '
            '"timestamp": "2026-04-08T00:00:00Z", "query": "utah crops"}\n'
            '{"step": 2, "action": "fetch_url", "decision": "promising source", '
            '"timestamp": "2026-04-08T00:00:01Z", "url": "https://example.com", '
            '"content_hash": "sha256:deadbeef"}\n'
            '{"step": 3, "action": "synthesize", "decision": "have enough", '
            '"timestamp": "2026-04-08T00:00:02Z"}\n'
        )
        trace_file.write_text(contents)
        return trace_file

    @staticmethod
    def _replay(trace_id, trace_dir):
        """Run ``replay <trace_id> --trace-dir <trace_dir>`` on a fresh ``CliRunner``."""
        argv = ["replay", trace_id, "--trace-dir", str(trace_dir)]
        return CliRunner().invoke(cli, argv)

    def test_replay_renders_trace(self, tmp_path):
        """Actions and step details from the trace file appear in the output."""
        self._write_trace(tmp_path)
        outcome = self._replay("trace-xyz", tmp_path)

        assert outcome.exit_code == 0, outcome.output
        expected_fragments = (
            "trace-xyz",
            "search",
            "fetch_url",
            "synthesize",
            "sha256:deadbeef",
            "utah crops",
        )
        for fragment in expected_fragments:
            assert fragment in outcome.output

    def test_replay_unknown_trace_id(self, tmp_path):
        """An id with no trace file in the directory exits 1 with a not-found message."""
        outcome = self._replay("missing-id", tmp_path)

        assert outcome.exit_code == 1
        assert "no trace file found" in outcome.output

    def test_replay_invalid_json(self, tmp_path):
        """A trace file holding a non-JSON line exits 1 and reports invalid JSON."""
        (tmp_path / "broken.jsonl").write_text("{not json}\n")
        outcome = self._replay("broken", tmp_path)

        assert outcome.exit_code == 1
        assert "invalid JSON" in outcome.output

    def test_replay_renders_persisted_result(self, tmp_path):
        """Issue #54: a sibling ``<id>.result.json`` is loaded and rendered by replay."""
        import json

        self._write_trace(tmp_path)
        persisted = {
            "answer": "Test answer about Utah crops.",
            "citations": [
                {
                    "source": "web",
                    "locator": "https://example.com/utah",
                    "title": "Utah Guide",
                    "snippet": None,
                    "raw_excerpt": "raw excerpt content",
                    "confidence": 0.9,
                }
            ],
            "gaps": [
                {
                    "topic": "irrigation",
                    "category": "scope_exceeded",
                    "detail": "out of scope",
                }
            ],
            "discovery_events": [],
            "open_questions": [],
            "confidence": 0.8,
            "confidence_factors": {
                "num_corroborating_sources": 2,
                "source_authority": "high",
                "contradiction_detected": False,
                "query_specificity_match": 0.8,
                "budget_exhausted": False,
                "recency": "current",
            },
            "cost_metadata": {
                "tokens_used": 1000,
                "iterations_run": 2,
                "wall_time_sec": 12.5,
                "budget_exhausted": False,
                "model_id": "claude-test",
            },
            "trace_id": "trace-xyz",
        }
        result_file = tmp_path / "trace-xyz.result.json"
        result_file.write_text(json.dumps(persisted))

        outcome = self._replay("trace-xyz", tmp_path)

        assert outcome.exit_code == 0, outcome.output
        # The step log from the trace file is still shown...
        assert "search" in outcome.output
        # ...and the persisted result is shown as well.
        for fragment in ("Test answer about Utah crops", "scope_exceeded", "irrigation"):
            assert fragment in outcome.output

    def test_replay_without_result_file_notes_absence(self, tmp_path):
        """With only the trace file present, replay succeeds and notes the missing result."""
        self._write_trace(tmp_path)
        outcome = self._replay("trace-xyz", tmp_path)

        assert outcome.exit_code == 0
        assert "No persisted result file" in outcome.output

    def test_render_trace_empty(self):
        """Rendering an empty entry list still prints the trace id and mentions emptiness."""
        recorder = Console(record=True, width=120)
        render_trace([], "empty-trace", recorder)
        rendered = recorder.export_text()

        assert "empty-trace" in rendered
        assert "empty" in rendered.lower()
|
M2.5.3: marchwarden costs CLI command (#26)
Adds operator-facing `marchwarden costs` subcommand that reads the
JSONL ledger from M2.5.2 and pretty-prints a rich summary:
- Cost Summary panel: total calls, total spend, total tokens (input/
output split), Tavily search count, warning for any calls with
unknown model prices
- Per-Day table sorted by date
- Per-Model table sorted by model id
- Highest-Cost Call panel with trace_id and question
Flags:
--since ISO date or relative shorthand (7d, 24h, 2w, 1m)
--until same
--model filter to a specific model_id
--json emit raw filtered ledger entries instead of the table
--ledger override default path (mostly for tests)
Also fixes a Dockerfile gap: the obs/ package added in M2.5.1 was
not being COPYed into the image, so the installed `marchwarden`
entry point couldn't import it. Tests had been passing because
they mounted /app over the install. Adding `COPY obs ./obs`
restores parity.
Tests cover summary rendering, model filter, since-date filter,
JSON output, and the empty-ledger friendly path. 110/110 passing.
End-to-end verified against the real cost ledger.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 21:57:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# costs command
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json as _json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _write_ledger(path, entries):
|
|
|
|
|
path.write_text("\n".join(_json.dumps(e) for e in entries) + "\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _ledger_fixture(tmp_path):
    """Create ``costs.jsonl`` under *tmp_path* holding three ledger entries and return its path.

    The entries fall on three consecutive days and use three different model
    ids; the last one has ``estimated_cost_usd`` set to ``None``.
    """
    ledger_path = tmp_path / "costs.jsonl"
    sonnet_call = {
        "timestamp": "2026-04-06T10:00:00Z",
        "trace_id": "t1",
        "question": "What is X?",
        "model_id": "claude-sonnet-4-6",
        "tokens_used": 1000,
        "tokens_input": 800,
        "tokens_output": 200,
        "iterations_run": 1,
        "wall_time_sec": 5.0,
        "tavily_searches": 1,
        "estimated_cost_usd": 0.005,
        "budget_exhausted": False,
        "confidence": 0.9,
    }
    opus_call = {
        "timestamp": "2026-04-07T11:00:00Z",
        "trace_id": "t2",
        "question": "Bigger query",
        "model_id": "claude-opus-4-6",
        "tokens_used": 50000,
        "tokens_input": 40000,
        "tokens_output": 10000,
        "iterations_run": 5,
        "wall_time_sec": 120.0,
        "tavily_searches": 8,
        "estimated_cost_usd": 1.25,
        "budget_exhausted": True,
        "confidence": 0.7,
    }
    unpriced_call = {
        "timestamp": "2026-04-08T12:00:00Z",
        "trace_id": "t3",
        "question": "Unknown model run",
        "model_id": "future-model-7",
        "tokens_used": 500,
        "tokens_input": 400,
        "tokens_output": 100,
        "iterations_run": 1,
        "wall_time_sec": 2.0,
        "tavily_searches": 0,
        "estimated_cost_usd": None,
        "budget_exhausted": False,
        "confidence": 0.5,
    }
    _write_ledger(ledger_path, [sonnet_call, opus_call, unpriced_call])
    return ledger_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestCostsCommand:
    """Tests for the ``costs`` subcommand and for ``render_costs`` on empty input."""

    @staticmethod
    def _run_costs(ledger_path, *extra_args):
        """Run ``costs --ledger <ledger_path>`` plus *extra_args* on a fresh ``CliRunner``."""
        argv = ["costs", "--ledger", str(ledger_path), *extra_args]
        return CliRunner().invoke(cli, argv)

    def test_renders_summary(self, tmp_path):
        """The unfiltered view shows totals, each day, priced models, top call and warning."""
        outcome = self._run_costs(_ledger_fixture(tmp_path))

        assert outcome.exit_code == 0, outcome.output
        expected_fragments = (
            # Summary
            "Calls: 3",
            "$1.2550",
            # Per-day rows
            "2026-04-06",
            "2026-04-07",
            "2026-04-08",
            # Per-model rows
            "claude-sonnet-4-6",
            "claude-opus-4-6",
            # Highest-cost panel
            "t2",
            # Unknown model warning
            "unknown model price",
        )
        for fragment in expected_fragments:
            assert fragment in outcome.output

    def test_filter_by_model(self, tmp_path):
        """``--model`` keeps only the entries recorded for that model id."""
        outcome = self._run_costs(
            _ledger_fixture(tmp_path), "--model", "claude-opus-4-6"
        )

        assert outcome.exit_code == 0
        assert "Calls: 1" in outcome.output
        assert "claude-sonnet-4-6" not in outcome.output

    def test_filter_by_since_iso(self, tmp_path):
        """``--since`` with an ISO date leaves only the 2026-04-08 entry."""
        outcome = self._run_costs(
            _ledger_fixture(tmp_path), "--since", "2026-04-08"
        )

        assert outcome.exit_code == 0
        assert "Calls: 1" in outcome.output
        assert "future-model-7" in outcome.output
        assert "claude-sonnet-4-6" not in outcome.output

    def test_json_output(self, tmp_path):
        """``--json`` prints one JSON document per ledger entry, first entry first."""
        outcome = self._run_costs(_ledger_fixture(tmp_path), "--json")

        assert outcome.exit_code == 0
        json_lines = [
            line for line in outcome.output.strip().splitlines() if line
        ]
        assert len(json_lines) == 3
        first_entry = _json.loads(json_lines[0])
        assert first_entry["trace_id"] == "t1"

    def test_empty_ledger(self, tmp_path):
        """A ledger path that was never created gives the friendly no-data message."""
        never_written = tmp_path / "missing.jsonl"
        outcome = self._run_costs(never_written)

        assert outcome.exit_code == 0
        assert "No cost data yet" in outcome.output

    def test_render_costs_handles_empty(self):
        """``render_costs`` given no entries prints the no-data message."""
        recorder = Console(record=True, width=120)
        render_costs([], recorder)

        assert "No cost data yet" in recorder.export_text()
|