From 036c3a934a4e50e835c93ef38f799bd2b39a3986 Mon Sep 17 00:00:00 2001
From: Jeff Smith <jeff@unbiasedgeek.com>
Date: Mon, 6 Apr 2026 22:49:25 -0600
Subject: [PATCH] =?UTF-8?q?fix(ai):=20correct=20context=20budget=20metric?=
 =?UTF-8?q?=20=E2=80=94=20track=20per-call,=20not=20sum=20(#44)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dir loop was exiting early on small targets (a 13-file Python lib
hit the budget at 92k–139k cumulative tokens) because _TokenTracker
compared the cumulative loop total (loop_input + loop_output, summed
across all turns) to CONTEXT_BUDGET. input_tokens from each API
response is the size of the full prompt sent on that turn (system +
every prior message + new tool results), so summing across turns
counts the earlier history again on every turn. The real per-call
context size never approached the limit.

Verified empirically: on luminos_lib pre-fix, the loop bailed when
the most recent call's input_tokens was 20,535 (~10% of Sonnet's
200k window) but the cumulative sum was 134,983.

Changes:
- _TokenTracker now tracks last_input (the most recent call's
  input_tokens), separate from the cumulative loop_input/total_input
  used for cost reporting.
- budget_exceeded() returns last_input > CONTEXT_BUDGET, not the
  cumulative sum.
- MAX_CONTEXT bumped from 180_000 to 200_000 (Sonnet 4's real
  context window). CONTEXT_BUDGET is still 70% of MAX_CONTEXT, so
  the threshold rises from ~126,000 to 140,000.
- Early-exit message now shows context size, threshold, AND
  per-loop cumulative spend (loop_total) separately so future
  debugging is unambiguous.

Smoke test on luminos_lib: investigation completes without early
exit (~$0.37). 6 unit tests added covering the new semantics,
including the key regression: a sequence of small calls whose sum
exceeds the budget must NOT trip the check.

Wiki Architecture page updated.

#51 filed for the separate message-history-growth issue.
---
 luminos_lib/ai.py       | 32 +++++++++++++++++++++----
 tests/test_ai_filter.py | 53 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/luminos_lib/ai.py b/luminos_lib/ai.py
index a240ec1..cbe2353 100644
--- a/luminos_lib/ai.py
+++ b/luminos_lib/ai.py
@@ -31,8 +31,12 @@ from luminos_lib.tree import build_tree, render_tree
 
 MODEL = "claude-sonnet-4-20250514"
 
-# Context budget: trigger early exit at 70% of Sonnet's context window.
-MAX_CONTEXT = 180_000
+# Context budget: trigger early exit when a single API call's input_tokens
+# (the actual size of the context window in use, NOT the cumulative sum
+# across turns) approaches the model's real context limit. Sonnet 4 has
+# a 200k context window; we leave a 30% safety margin for the response
+# and any tool result we're about to append.
+MAX_CONTEXT = 200_000
 CONTEXT_BUDGET = int(MAX_CONTEXT * 0.70)
 
 # Pricing per 1M tokens (Claude Sonnet).
@@ -88,13 +92,25 @@ def _should_skip_dir(name):
 # ---------------------------------------------------------------------------
 
 class _TokenTracker:
-    """Track cumulative token usage across API calls."""
+    """Track token usage across API calls.
+
+    Two distinct quantities are tracked:
+      - cumulative totals (total_*, loop_*) — for cost reporting
+      - last_input — the size of the context window on the most recent
+        call, used to detect approaching the model's context limit
+
+    Cumulative input is NOT a meaningful proxy for context size: each
+    turn's input_tokens already includes the full message history, so
+    summing across turns double-counts everything. Use last_input for
+    budget decisions, totals for billing. (See #44.)
+    """
 
     def __init__(self):
         self.total_input = 0
         self.total_output = 0
         self.loop_input = 0
         self.loop_output = 0
+        self.last_input = 0
 
     def record(self, usage):
         """Record usage from a single API call."""
@@ -104,18 +120,21 @@ class _TokenTracker:
         self.total_output += out
         self.loop_input += inp
         self.loop_output += out
+        self.last_input = inp
 
     def reset_loop(self):
         """Reset per-loop counters (called between directory loops)."""
         self.loop_input = 0
         self.loop_output = 0
+        self.last_input = 0
 
     @property
     def loop_total(self):
         return self.loop_input + self.loop_output
 
     def budget_exceeded(self):
-        return self.loop_total > CONTEXT_BUDGET
+        """True when the most recent call's context exceeded the budget."""
+        return self.last_input > CONTEXT_BUDGET
 
     def summary(self):
         cost_in = self.total_input * INPUT_PRICE_PER_M / 1_000_000
@@ -862,7 +881,10 @@ def _run_dir_loop(client, target, cache, tracker, dir_path, max_turns=14,
         # Check context budget
         if tracker.budget_exceeded():
             print(f"  [AI]   Context budget reached — exiting early "
-                  f"({tracker.loop_total:,} tokens used)", file=sys.stderr)
+                  f"(context size {tracker.last_input:,} > "
+                  f"{CONTEXT_BUDGET:,} budget; "
+                  f"loop spend {tracker.loop_total:,} tokens)",
+                  file=sys.stderr)
             # Flush a partial directory summary from cached file entries
             if not cache.has_entry("dir", dir_path):
                 dir_real = os.path.realpath(dir_path)
diff --git a/tests/test_ai_filter.py b/tests/test_ai_filter.py
index 466755d..5c20fef 100644
--- a/tests/test_ai_filter.py
+++ b/tests/test_ai_filter.py
@@ -108,6 +108,59 @@ class FormatSurveyBlockTests(unittest.TestCase):
         self.assertNotIn("Skip tools", block)
 
 
+class TokenTrackerTests(unittest.TestCase):
+    def _usage(self, inp, out):
+        u = MagicMock()
+        u.input_tokens = inp
+        u.output_tokens = out
+        return u
+
+    def test_record_updates_cumulative_and_last(self):
+        t = ai._TokenTracker()
+        t.record(self._usage(100, 20))
+        t.record(self._usage(200, 30))
+        self.assertEqual(t.total_input, 300)
+        self.assertEqual(t.total_output, 50)
+        self.assertEqual(t.loop_input, 300)
+        self.assertEqual(t.loop_output, 50)
+        self.assertEqual(t.last_input, 200)  # last call only
+
+    def test_budget_uses_last_input_not_sum(self):
+        t = ai._TokenTracker()
+        # Many small calls whose sum exceeds the budget but whose
+        # last input is well under the budget should NOT trip.
+        for _ in range(20):
+            t.record(self._usage(10_000, 100))
+        self.assertGreater(t.loop_input, ai.CONTEXT_BUDGET)
+        self.assertLess(t.last_input, ai.CONTEXT_BUDGET)
+        self.assertFalse(t.budget_exceeded())
+
+    def test_budget_trips_when_last_input_over_threshold(self):
+        t = ai._TokenTracker()
+        t.record(self._usage(ai.CONTEXT_BUDGET + 1, 100))
+        self.assertTrue(t.budget_exceeded())
+
+    def test_reset_loop_clears_loop_and_last(self):
+        t = ai._TokenTracker()
+        t.record(self._usage(500, 50))
+        t.reset_loop()
+        self.assertEqual(t.loop_input, 0)
+        self.assertEqual(t.loop_output, 0)
+        self.assertEqual(t.last_input, 0)
+        # Cumulative totals are NOT reset
+        self.assertEqual(t.total_input, 500)
+        self.assertEqual(t.total_output, 50)
+
+    def test_loop_total_property_still_works(self):
+        t = ai._TokenTracker()
+        t.record(self._usage(100, 25))
+        t.record(self._usage(200, 50))
+        self.assertEqual(t.loop_total, 375)
+
+    def test_max_context_is_sonnet_real_window(self):
+        self.assertEqual(ai.MAX_CONTEXT, 200_000)
+
+
 class DefaultSurveyTests(unittest.TestCase):
     def test_has_all_required_keys(self):
         survey = ai._default_survey()