marchwarden/tests/test_costs.py

180 lines
6 KiB
Python
Raw Normal View History

M2.5.2: Cost ledger with price table (#25) Adds an append-only JSONL ledger of every research() call at ~/.marchwarden/costs.jsonl, supplementing (not replacing) the per-call cost_metadata field returned to callers. The ledger is the operator-facing source of truth for spend tracking, queryable via the upcoming `marchwarden costs` command (M2.5.3). Fields per entry: timestamp, trace_id, question (truncated 200ch), model_id, tokens_used, tokens_input, tokens_output, iterations_run, wall_time_sec, tavily_searches, estimated_cost_usd, budget_exhausted, confidence. Cost estimation reads ~/.marchwarden/prices.toml, which is auto-created with seed values for current Anthropic + Tavily rates on first run. Operators are expected to update prices.toml manually when upstream rates change — there is no automatic fetching. Existing files are never overwritten. Unknown models log a WARN and record estimated_cost_usd: null instead of crashing. Each ledger write also emits a structured `cost_recorded` log line via the M2.5.1 logger, so cost data ships to OpenSearch alongside the ledger file with no extra plumbing. Tracking changes in agent.py: - Track tokens_input / tokens_output split (not just total) - Count tavily_searches across iterations - _synthesize now returns (result, synth_in, synth_out) so the caller can attribute synthesis tokens to the running counters - Ledger.record() called after research_completed log; failures are caught and warn-logged so a ledger write can never poison a successful research call Tests cover: price table seeding, no-overwrite of existing files, cost estimation for known/unknown models, tavily-only cost, ledger appends, question truncation, env var override. End-to-end verified with a real Anthropic+Tavily call: 9107 input + 1140 output tokens, 1 tavily search, $0.049 estimated. 104/104 tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 21:52:25 +00:00
"""Tests for the obs.costs cost ledger and price table."""
import json
from pathlib import Path
import pytest
from obs.costs import (
DEFAULT_PRICES_PATH,
SEED_PRICES_TOML,
CostLedger,
PriceTable,
)
class TestPriceTable:
    """Tests for PriceTable: prices-file seeding and per-call cost estimation."""

    def test_seeds_missing_file(self, tmp_path):
        """Constructing a PriceTable on a missing path creates and loads the file."""
        prices_path = tmp_path / "prices.toml"
        assert not prices_path.exists()

        table = PriceTable(path=str(prices_path))

        assert prices_path.exists()
        assert "claude-sonnet-4-6" in prices_path.read_text()
        # Loaded into memory
        assert table._data["models"]["claude-sonnet-4-6"]["input_per_mtok_usd"] == 3.00

    def test_does_not_overwrite_existing_file(self, tmp_path):
        """A pre-existing prices file is loaded as-is and left untouched on disk."""
        prices_path = tmp_path / "prices.toml"
        original = (
            '[models."custom-model"]\n'
            'input_per_mtok_usd = 1.23\n'
            'output_per_mtok_usd = 4.56\n'
        )
        prices_path.write_text(original)

        table = PriceTable(path=str(prices_path))

        # Check the file itself, not only the in-memory view: the property
        # under test is that the operator's file is never rewritten.
        assert prices_path.read_text() == original
        assert table._data["models"]["custom-model"]["input_per_mtok_usd"] == 1.23
        assert "claude-sonnet-4-6" not in table._data.get("models", {})

    def test_estimates_known_model(self, tmp_path):
        """Token costs for a model present in the price table are summed."""
        table = PriceTable(path=str(tmp_path / "prices.toml"))
        # 1M input @ $3 + 1M output @ $15 = $18, no tavily
        cost = table.estimate_call_usd(
            model_id="claude-sonnet-4-6",
            tokens_input=1_000_000,
            tokens_output=1_000_000,
            tavily_searches=0,
        )
        # Computed float: compare with a tolerance rather than exact equality.
        assert cost == pytest.approx(18.00, abs=1e-6)

    def test_estimates_with_tavily(self, tmp_path):
        """Tavily searches contribute to the estimate even with zero tokens."""
        table = PriceTable(path=str(tmp_path / "prices.toml"))
        cost = table.estimate_call_usd(
            model_id="claude-sonnet-4-6",
            tokens_input=0,
            tokens_output=0,
            tavily_searches=10,
        )
        # 10 * $0.005 = $0.05
        assert cost == pytest.approx(0.05, abs=1e-6)

    def test_unknown_model_returns_none(self, tmp_path):
        """A model id absent from the price table yields None, not an error."""
        table = PriceTable(path=str(tmp_path / "prices.toml"))
        cost = table.estimate_call_usd(
            model_id="some-future-model",
            tokens_input=1000,
            tokens_output=1000,
            tavily_searches=0,
        )
        assert cost is None
class TestCostLedger:
    """Tests for CostLedger: JSONL writes, appends, null cost, truncation, env path."""

    def _ledger(self, tmp_path):
        """Return a CostLedger whose ledger and prices files live under tmp_path."""
        ledger_file = str(tmp_path / "costs.jsonl")
        prices_file = str(tmp_path / "prices.toml")
        return CostLedger(
            ledger_path=ledger_file,
            price_table=PriceTable(path=prices_file),
        )

    def test_record_writes_jsonl(self, tmp_path):
        """record() writes one JSON line equal to the entry it returns."""
        call = {
            "trace_id": "abc-123",
            "question": "What grows in Utah?",
            "model_id": "claude-sonnet-4-6",
            "tokens_used": 10_000,
            "tokens_input": 8_000,
            "tokens_output": 2_000,
            "iterations_run": 3,
            "wall_time_sec": 42.5,
            "tavily_searches": 4,
            "budget_exhausted": False,
            "confidence": 0.9,
        }
        returned = self._ledger(tmp_path).record(**call)

        # File contains one JSON line
        raw_lines = (tmp_path / "costs.jsonl").read_text().strip().splitlines()
        assert len(raw_lines) == 1
        stored = json.loads(raw_lines[0])
        assert stored == returned

        # All required fields present and shaped correctly
        echoed_fields = (
            "trace_id",
            "question",
            "model_id",
            "tokens_used",
            "tokens_input",
            "tokens_output",
            "iterations_run",
            "wall_time_sec",
            "tavily_searches",
        )
        for field in echoed_fields:
            assert stored[field] == call[field]
        # Identity check: the flag must come back as a real boolean.
        assert stored["budget_exhausted"] is False
        assert stored["confidence"] == 0.9
        assert "timestamp" in stored
        # 8000 input @ $3/Mtok + 2000 output @ $15/Mtok + 4 * $0.005 = $0.074
        assert stored["estimated_cost_usd"] == pytest.approx(0.074, abs=1e-6)

    def test_record_appends(self, tmp_path):
        """Successive record() calls add lines in call order."""
        ledger = self._ledger(tmp_path)
        shared = {
            "model_id": "claude-sonnet-4-6",
            "tokens_used": 100,
            "tokens_input": 80,
            "tokens_output": 20,
            "iterations_run": 1,
            "wall_time_sec": 1.0,
            "tavily_searches": 0,
            "budget_exhausted": False,
            "confidence": 0.5,
        }
        for n in range(3):
            ledger.record(trace_id=f"trace-{n}", question=f"q{n}", **shared)

        rows = (tmp_path / "costs.jsonl").read_text().strip().splitlines()
        assert len(rows) == 3
        assert json.loads(rows[0])["trace_id"] == "trace-0"
        assert json.loads(rows[2])["trace_id"] == "trace-2"

    def test_unknown_model_records_null_cost(self, tmp_path):
        """A model missing from the price table is recorded with a None cost."""
        call = {
            "trace_id": "abc",
            "question": "q",
            "model_id": "some-future-model",
            "tokens_used": 1000,
            "tokens_input": 500,
            "tokens_output": 500,
            "iterations_run": 1,
            "wall_time_sec": 1.0,
            "tavily_searches": 0,
            "budget_exhausted": False,
            "confidence": 0.5,
        }
        returned = self._ledger(tmp_path).record(**call)
        assert returned["estimated_cost_usd"] is None

    def test_question_is_truncated(self, tmp_path):
        """A 1000-character question is stored as 200 characters."""
        oversized_question = "x" * 1000
        returned = self._ledger(tmp_path).record(
            trace_id="abc",
            question=oversized_question,
            model_id="claude-sonnet-4-6",
            tokens_used=10,
            tokens_input=5,
            tokens_output=5,
            iterations_run=1,
            wall_time_sec=0.1,
            tavily_searches=0,
            budget_exhausted=False,
            confidence=0.5,
        )
        assert len(returned["question"]) == 200

    def test_env_var_override(self, tmp_path, monkeypatch):
        """Without ledger_path, the path comes from MARCHWARDEN_COST_LEDGER."""
        override_path = tmp_path / "custom-ledger.jsonl"
        monkeypatch.setenv("MARCHWARDEN_COST_LEDGER", str(override_path))

        prices = PriceTable(path=str(tmp_path / "prices.toml"))
        # No ledger_path argument: the constructor has to resolve it itself.
        ledger = CostLedger(price_table=prices)

        assert ledger.path == override_path