fix(ai): correct context budget metric — track per-call, not sum (#44)
The dir loop was exiting early on small targets (a 13-file Python lib hit the budget at 92k–139k cumulative tokens) because _TokenTracker compared the SUM of input_tokens across all turns to the context window size. The input_tokens value in each API response is the size of the full prompt sent on that turn (system + every prior message + new tool results), so summing across turns counts the same content again on every turn. The real per-call context size never approached the limit.

Verified empirically: on luminos_lib pre-fix, the loop bailed when the most recent call's input_tokens was 20,535 (~10% of Sonnet's 200k window) but the cumulative sum was 134,983.

Changes:
- _TokenTracker now tracks last_input (the most recent call's input_tokens), separate from the cumulative loop_input/total_input used for cost reporting.
- budget_exceeded() returns last_input > CONTEXT_BUDGET, not the cumulative sum.
- MAX_CONTEXT bumped from 180_000 to 200_000 (Sonnet 4's real context window). CONTEXT_BUDGET stays at 70% of MAX_CONTEXT = 140,000.
- Early-exit message now shows context size, threshold, and cumulative spend separately so future debugging is unambiguous.

Smoke test on luminos_lib: investigation completes without early exit (~$0.37).

6 unit tests added covering the new semantics, including the key regression: a sequence of small calls whose sum exceeds the budget must NOT trip the check.

Wiki Architecture page updated. #51 filed for the separate message-history-growth issue.
This commit is contained in:
parent
157ac3f606
commit
036c3a934a
2 changed files with 80 additions and 5 deletions
|
|
@ -31,8 +31,12 @@ from luminos_lib.tree import build_tree, render_tree
|
||||||
|
|
||||||
MODEL = "claude-sonnet-4-20250514"
|
MODEL = "claude-sonnet-4-20250514"
|
||||||
|
|
||||||
# Context budget: trigger early exit at 70% of Sonnet's context window.
|
# Context budget: trigger early exit when a single API call's input_tokens
|
||||||
MAX_CONTEXT = 180_000
|
# (the actual size of the context window in use, NOT the cumulative sum
|
||||||
|
# across turns) approaches the model's real context limit. Sonnet 4 has
|
||||||
|
# a 200k context window; we leave a 30% safety margin for the response
|
||||||
|
# and any tool result we're about to append.
|
||||||
|
MAX_CONTEXT = 200_000
|
||||||
CONTEXT_BUDGET = int(MAX_CONTEXT * 0.70)
|
CONTEXT_BUDGET = int(MAX_CONTEXT * 0.70)
|
||||||
|
|
||||||
# Pricing per 1M tokens (Claude Sonnet).
|
# Pricing per 1M tokens (Claude Sonnet).
|
||||||
|
|
@ -88,13 +92,25 @@ def _should_skip_dir(name):
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
class _TokenTracker:
|
class _TokenTracker:
|
||||||
"""Track cumulative token usage across API calls."""
|
"""Track token usage across API calls.
|
||||||
|
|
||||||
|
Two distinct quantities are tracked:
|
||||||
|
- cumulative totals (total_*, loop_*) — for cost reporting
|
||||||
|
- last_input — the size of the context window on the most recent
|
||||||
|
call, used to detect approaching the model's context limit
|
||||||
|
|
||||||
|
Cumulative input is NOT a meaningful proxy for context size: each
|
||||||
|
turn's input_tokens already includes the full message history, so
|
||||||
|
summing across turns double-counts everything. Use last_input for
|
||||||
|
budget decisions, totals for billing. (See #44.)
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.total_input = 0
|
self.total_input = 0
|
||||||
self.total_output = 0
|
self.total_output = 0
|
||||||
self.loop_input = 0
|
self.loop_input = 0
|
||||||
self.loop_output = 0
|
self.loop_output = 0
|
||||||
|
self.last_input = 0
|
||||||
|
|
||||||
def record(self, usage):
|
def record(self, usage):
|
||||||
"""Record usage from a single API call."""
|
"""Record usage from a single API call."""
|
||||||
|
|
@ -104,18 +120,21 @@ class _TokenTracker:
|
||||||
self.total_output += out
|
self.total_output += out
|
||||||
self.loop_input += inp
|
self.loop_input += inp
|
||||||
self.loop_output += out
|
self.loop_output += out
|
||||||
|
self.last_input = inp
|
||||||
|
|
||||||
def reset_loop(self):
|
def reset_loop(self):
|
||||||
"""Reset per-loop counters (called between directory loops)."""
|
"""Reset per-loop counters (called between directory loops)."""
|
||||||
self.loop_input = 0
|
self.loop_input = 0
|
||||||
self.loop_output = 0
|
self.loop_output = 0
|
||||||
|
self.last_input = 0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def loop_total(self):
|
def loop_total(self):
|
||||||
return self.loop_input + self.loop_output
|
return self.loop_input + self.loop_output
|
||||||
|
|
||||||
def budget_exceeded(self):
|
def budget_exceeded(self):
|
||||||
return self.loop_total > CONTEXT_BUDGET
|
"""True when the most recent call's context exceeded the budget."""
|
||||||
|
return self.last_input > CONTEXT_BUDGET
|
||||||
|
|
||||||
def summary(self):
|
def summary(self):
|
||||||
cost_in = self.total_input * INPUT_PRICE_PER_M / 1_000_000
|
cost_in = self.total_input * INPUT_PRICE_PER_M / 1_000_000
|
||||||
|
|
@ -862,7 +881,10 @@ def _run_dir_loop(client, target, cache, tracker, dir_path, max_turns=14,
|
||||||
# Check context budget
|
# Check context budget
|
||||||
if tracker.budget_exceeded():
|
if tracker.budget_exceeded():
|
||||||
print(f" [AI] Context budget reached — exiting early "
|
print(f" [AI] Context budget reached — exiting early "
|
||||||
f"({tracker.loop_total:,} tokens used)", file=sys.stderr)
|
f"(context size {tracker.last_input:,} > "
|
||||||
|
f"{CONTEXT_BUDGET:,} budget; "
|
||||||
|
f"loop spend {tracker.loop_total:,} tokens)",
|
||||||
|
file=sys.stderr)
|
||||||
# Flush a partial directory summary from cached file entries
|
# Flush a partial directory summary from cached file entries
|
||||||
if not cache.has_entry("dir", dir_path):
|
if not cache.has_entry("dir", dir_path):
|
||||||
dir_real = os.path.realpath(dir_path)
|
dir_real = os.path.realpath(dir_path)
|
||||||
|
|
|
||||||
|
|
@ -108,6 +108,59 @@ class FormatSurveyBlockTests(unittest.TestCase):
|
||||||
self.assertNotIn("Skip tools", block)
|
self.assertNotIn("Skip tools", block)
|
||||||
|
|
||||||
|
|
||||||
|
class TokenTrackerTests(unittest.TestCase):
|
||||||
|
def _usage(self, inp, out):
|
||||||
|
u = MagicMock()
|
||||||
|
u.input_tokens = inp
|
||||||
|
u.output_tokens = out
|
||||||
|
return u
|
||||||
|
|
||||||
|
def test_record_updates_cumulative_and_last(self):
|
||||||
|
t = ai._TokenTracker()
|
||||||
|
t.record(self._usage(100, 20))
|
||||||
|
t.record(self._usage(200, 30))
|
||||||
|
self.assertEqual(t.total_input, 300)
|
||||||
|
self.assertEqual(t.total_output, 50)
|
||||||
|
self.assertEqual(t.loop_input, 300)
|
||||||
|
self.assertEqual(t.loop_output, 50)
|
||||||
|
self.assertEqual(t.last_input, 200) # last call only
|
||||||
|
|
||||||
|
def test_budget_uses_last_input_not_sum(self):
|
||||||
|
t = ai._TokenTracker()
|
||||||
|
# Many small calls whose sum exceeds the budget but whose
|
||||||
|
# last input is well under the budget should NOT trip.
|
||||||
|
for _ in range(20):
|
||||||
|
t.record(self._usage(10_000, 100))
|
||||||
|
self.assertGreater(t.loop_input, ai.CONTEXT_BUDGET)
|
||||||
|
self.assertLess(t.last_input, ai.CONTEXT_BUDGET)
|
||||||
|
self.assertFalse(t.budget_exceeded())
|
||||||
|
|
||||||
|
def test_budget_trips_when_last_input_over_threshold(self):
|
||||||
|
t = ai._TokenTracker()
|
||||||
|
t.record(self._usage(ai.CONTEXT_BUDGET + 1, 100))
|
||||||
|
self.assertTrue(t.budget_exceeded())
|
||||||
|
|
||||||
|
def test_reset_loop_clears_loop_and_last(self):
|
||||||
|
t = ai._TokenTracker()
|
||||||
|
t.record(self._usage(500, 50))
|
||||||
|
t.reset_loop()
|
||||||
|
self.assertEqual(t.loop_input, 0)
|
||||||
|
self.assertEqual(t.loop_output, 0)
|
||||||
|
self.assertEqual(t.last_input, 0)
|
||||||
|
# Cumulative totals are NOT reset
|
||||||
|
self.assertEqual(t.total_input, 500)
|
||||||
|
self.assertEqual(t.total_output, 50)
|
||||||
|
|
||||||
|
def test_loop_total_property_still_works(self):
|
||||||
|
t = ai._TokenTracker()
|
||||||
|
t.record(self._usage(100, 25))
|
||||||
|
t.record(self._usage(200, 50))
|
||||||
|
self.assertEqual(t.loop_total, 375)
|
||||||
|
|
||||||
|
def test_max_context_is_sonnet_real_window(self):
|
||||||
|
self.assertEqual(ai.MAX_CONTEXT, 200_000)
|
||||||
|
|
||||||
|
|
||||||
class DefaultSurveyTests(unittest.TestCase):
|
class DefaultSurveyTests(unittest.TestCase):
|
||||||
def test_has_all_required_keys(self):
|
def test_has_all_required_keys(self):
|
||||||
survey = ai._default_survey()
|
survey = ai._default_survey()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue