From 036c3a934a4e50e835c93ef38f799bd2b39a3986 Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Mon, 6 Apr 2026 22:49:25 -0600 Subject: [PATCH] =?UTF-8?q?fix(ai):=20correct=20context=20budget=20metric?= =?UTF-8?q?=20=E2=80=94=20track=20per-call,=20not=20sum=20(#44)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dir loop was exiting early on small targets (a 13-file Python lib hit the budget at 92k–139k cumulative tokens) because _TokenTracker compared the cumulative loop total (the SUM of input_tokens plus output_tokens across all turns) to the context budget. input_tokens from each API response is the size of the full prompt sent on that turn (system + every prior message + new tool results), so summing across turns multi-counts everything. The real per-call context size never approached the limit. Verified empirically: on luminos_lib pre-fix, the loop bailed when the most recent call's input_tokens was 20,535 (~10% of Sonnet's 200k window) but the cumulative sum was 134,983. Changes: - _TokenTracker now tracks last_input (the most recent call's input_tokens), separate from the cumulative loop_input/total_input used for cost reporting. - budget_exceeded() returns last_input > CONTEXT_BUDGET, not the cumulative sum. - MAX_CONTEXT bumped from 180_000 to 200_000 (Sonnet 4's real context window). CONTEXT_BUDGET stays at 70% of MAX_CONTEXT, so its value rises from 126,000 to 140,000. - Early-exit message now shows context size, threshold, AND cumulative spend separately so future debugging is unambiguous. Smoke test on luminos_lib: investigation completes without early exit (~$0.37). 6 unit tests added covering the new semantics, including the key regression: a sequence of small calls whose sum exceeds the budget must NOT trip the check. Wiki Architecture page updated. #51 filed for the separate message-history-growth issue. 
--- luminos_lib/ai.py | 32 +++++++++++++++++++++---- tests/test_ai_filter.py | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/luminos_lib/ai.py b/luminos_lib/ai.py index a240ec1..cbe2353 100644 --- a/luminos_lib/ai.py +++ b/luminos_lib/ai.py @@ -31,8 +31,12 @@ from luminos_lib.tree import build_tree, render_tree MODEL = "claude-sonnet-4-20250514" -# Context budget: trigger early exit at 70% of Sonnet's context window. -MAX_CONTEXT = 180_000 +# Context budget: trigger early exit when a single API call's input_tokens +# (the actual size of the context window in use, NOT the cumulative sum +# across turns) approaches the model's real context limit. Sonnet 4 has +# a 200k context window; we leave a 30% safety margin for the response +# and any tool result we're about to append. +MAX_CONTEXT = 200_000 CONTEXT_BUDGET = int(MAX_CONTEXT * 0.70) # Pricing per 1M tokens (Claude Sonnet). @@ -88,13 +92,25 @@ def _should_skip_dir(name): # --------------------------------------------------------------------------- class _TokenTracker: - """Track cumulative token usage across API calls.""" + """Track token usage across API calls. + + Two distinct quantities are tracked: + - cumulative totals (total_*, loop_*) — for cost reporting + - last_input — the size of the context window on the most recent + call, used to detect approaching the model's context limit + + Cumulative input is NOT a meaningful proxy for context size: each + turn's input_tokens already includes the full message history, so + summing across turns double-counts everything. Use last_input for + budget decisions, totals for billing. (See #44.) 
+ """ def __init__(self): self.total_input = 0 self.total_output = 0 self.loop_input = 0 self.loop_output = 0 + self.last_input = 0 def record(self, usage): """Record usage from a single API call.""" @@ -104,18 +120,21 @@ class _TokenTracker: self.total_output += out self.loop_input += inp self.loop_output += out + self.last_input = inp def reset_loop(self): """Reset per-loop counters (called between directory loops).""" self.loop_input = 0 self.loop_output = 0 + self.last_input = 0 @property def loop_total(self): return self.loop_input + self.loop_output def budget_exceeded(self): - return self.loop_total > CONTEXT_BUDGET + """True when the most recent call's context exceeded the budget.""" + return self.last_input > CONTEXT_BUDGET def summary(self): cost_in = self.total_input * INPUT_PRICE_PER_M / 1_000_000 @@ -862,7 +881,10 @@ def _run_dir_loop(client, target, cache, tracker, dir_path, max_turns=14, # Check context budget if tracker.budget_exceeded(): print(f" [AI] Context budget reached — exiting early " - f"({tracker.loop_total:,} tokens used)", file=sys.stderr) + f"(context size {tracker.last_input:,} > " + f"{CONTEXT_BUDGET:,} budget; " + f"loop spend {tracker.loop_total:,} tokens)", + file=sys.stderr) # Flush a partial directory summary from cached file entries if not cache.has_entry("dir", dir_path): dir_real = os.path.realpath(dir_path) diff --git a/tests/test_ai_filter.py b/tests/test_ai_filter.py index 466755d..5c20fef 100644 --- a/tests/test_ai_filter.py +++ b/tests/test_ai_filter.py @@ -108,6 +108,59 @@ class FormatSurveyBlockTests(unittest.TestCase): self.assertNotIn("Skip tools", block) +class TokenTrackerTests(unittest.TestCase): + def _usage(self, inp, out): + u = MagicMock() + u.input_tokens = inp + u.output_tokens = out + return u + + def test_record_updates_cumulative_and_last(self): + t = ai._TokenTracker() + t.record(self._usage(100, 20)) + t.record(self._usage(200, 30)) + self.assertEqual(t.total_input, 300) + 
self.assertEqual(t.total_output, 50) + self.assertEqual(t.loop_input, 300) + self.assertEqual(t.loop_output, 50) + self.assertEqual(t.last_input, 200) # last call only + + def test_budget_uses_last_input_not_sum(self): + t = ai._TokenTracker() + # Many small calls whose sum exceeds the budget but whose + # last input is well under the budget should NOT trip. + for _ in range(20): + t.record(self._usage(10_000, 100)) + self.assertGreater(t.loop_input, ai.CONTEXT_BUDGET) + self.assertLess(t.last_input, ai.CONTEXT_BUDGET) + self.assertFalse(t.budget_exceeded()) + + def test_budget_trips_when_last_input_over_threshold(self): + t = ai._TokenTracker() + t.record(self._usage(ai.CONTEXT_BUDGET + 1, 100)) + self.assertTrue(t.budget_exceeded()) + + def test_reset_loop_clears_loop_and_last(self): + t = ai._TokenTracker() + t.record(self._usage(500, 50)) + t.reset_loop() + self.assertEqual(t.loop_input, 0) + self.assertEqual(t.loop_output, 0) + self.assertEqual(t.last_input, 0) + # Cumulative totals are NOT reset + self.assertEqual(t.total_input, 500) + self.assertEqual(t.total_output, 50) + + def test_loop_total_property_still_works(self): + t = ai._TokenTracker() + t.record(self._usage(100, 25)) + t.record(self._usage(200, 50)) + self.assertEqual(t.loop_total, 375) + + def test_max_context_is_sonnet_real_window(self): + self.assertEqual(ai.MAX_CONTEXT, 200_000) + + class DefaultSurveyTests(unittest.TestCase): def test_has_all_required_keys(self): survey = ai._default_survey()