From 8fb2f90678a3afbeb75eb7cd8f83f520a15f6367 Mon Sep 17 00:00:00 2001
From: Jeff Smith <jeff@unbiasedgeek.com>
Date: Mon, 6 Apr 2026 22:19:25 -0600
Subject: [PATCH] feat(ai): skip survey pass for tiny targets (#7)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a gate in _run_investigation that skips the survey API call when
a target has both fewer than _SURVEY_MIN_FILES (5) files AND fewer
than _SURVEY_MIN_DIRS (2) directories. Requiring both conditions (AND,
not OR) handles the deep-narrow edge case: a target with 4 files spread
across 50 directories still gets a survey, because the cost of the
survey call is amortized across 50 dir loops.

When the survey is skipped, _default_survey() supplies a synthetic dict
with confidence=0.0 — below _SURVEY_CONFIDENCE_THRESHOLD (0.5), chosen
specifically so _filter_dir_tools() never enforces skip_tools from a
synthetic value. The dir loop receives a generic "small target, read
everything" framing in its prompt and keeps its full toolbox.

Moves the _discover_directories() call ahead of the survey gate so
total_dirs is available without a second walk.

Issue #46 tracks revisiting the threshold values with empirical data
after Phase 2 ships and we've run --ai on a variety of real targets.

Smoke tested on a 2-file target: gate triggers, default survey
substituted, dir loop completes normally. Adds 4 unit tests for
_default_survey() covering schema, confidence guard, filter
interaction, and empty skip_tools.
---
 luminos_lib/ai.py       | 44 ++++++++++++++++++++++++++++++++++++-----
 tests/test_ai_filter.py | 24 ++++++++++++++++++++++
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/luminos_lib/ai.py b/luminos_lib/ai.py
index bc330a1..4115957 100644
--- a/luminos_lib/ai.py
+++ b/luminos_lib/ai.py
@@ -756,6 +756,30 @@ def _get_child_summaries(dir_path, cache):
 _SURVEY_CONFIDENCE_THRESHOLD = 0.5
 _PROTECTED_DIR_TOOLS = {"submit_report"}
 
+# Survey-skip thresholds. Skip the survey only when BOTH are below.
+# See #46 for the plan to revisit these with empirical data.
+_SURVEY_MIN_FILES = 5
+_SURVEY_MIN_DIRS = 2
+
+
+def _default_survey():
+    """Synthetic survey for targets too small to justify the API call.
+
+    confidence=0.0 ensures _filter_dir_tools() never enforces skip_tools
+    based on this synthetic value — the dir loop keeps its full toolbox.
+    """
+    return {
+        "description": "Small target — survey skipped.",
+        "approach": (
+            "The target is small enough to investigate exhaustively. "
+            "Read every file directly."
+        ),
+        "relevant_tools": [],
+        "skip_tools": [],
+        "domain_notes": "",
+        "confidence": 0.0,
+    }
+
 
 def _format_survey_block(survey):
     """Render survey output as a labeled text block for the dir prompt."""
@@ -1228,8 +1252,21 @@ def _run_investigation(client, target, report, show_hidden=False,
           f"{'' if is_new else ' (resumed)'}", file=sys.stderr)
     print(f"  [AI] Cache: {cache.root}/", file=sys.stderr)
 
-    print("  [AI] Survey pass...", file=sys.stderr)
-    survey = _run_survey(client, target, report, tracker, verbose=verbose)
+    all_dirs = _discover_directories(target, show_hidden=show_hidden,
+                                     exclude=exclude)
+
+    total_files = sum((report.get("file_categories") or {}).values())
+    total_dirs = len(all_dirs)
+    if total_files < _SURVEY_MIN_FILES and total_dirs < _SURVEY_MIN_DIRS:
+        print(
+            f"  [AI] Survey skipped — {total_files} files, {total_dirs} dirs "
+            f"(below threshold).",
+            file=sys.stderr,
+        )
+        survey = _default_survey()
+    else:
+        print("  [AI] Survey pass...", file=sys.stderr)
+        survey = _run_survey(client, target, report, tracker, verbose=verbose)
     if survey:
         print(
             f"  [AI] Survey: {survey['description']} "
@@ -1251,9 +1288,6 @@ def _run_investigation(client, target, report, show_hidden=False,
     else:
         print("  [AI] Survey unavailable — proceeding without it.", file=sys.stderr)
 
-    all_dirs = _discover_directories(target, show_hidden=show_hidden,
-                                     exclude=exclude)
-
     to_investigate = []
     cached_count = 0
     for d in all_dirs:
diff --git a/tests/test_ai_filter.py b/tests/test_ai_filter.py
index b396710..466755d 100644
--- a/tests/test_ai_filter.py
+++ b/tests/test_ai_filter.py
@@ -108,5 +108,29 @@ class FormatSurveyBlockTests(unittest.TestCase):
         self.assertNotIn("Skip tools", block)
 
 
+class DefaultSurveyTests(unittest.TestCase):
+    def test_has_all_required_keys(self):
+        survey = ai._default_survey()
+        for key in ("description", "approach", "relevant_tools",
+                    "skip_tools", "domain_notes", "confidence"):
+            self.assertIn(key, survey)
+
+    def test_confidence_below_filter_threshold(self):
+        # Must be < _SURVEY_CONFIDENCE_THRESHOLD so _filter_dir_tools()
+        # never enforces skip_tools from a synthetic survey.
+        self.assertLess(
+            ai._default_survey()["confidence"],
+            ai._SURVEY_CONFIDENCE_THRESHOLD,
+        )
+
+    def test_filter_returns_full_toolbox_for_default(self):
+        all_names = {t["name"] for t in ai._DIR_TOOLS}
+        result = {t["name"] for t in ai._filter_dir_tools(ai._default_survey())}
+        self.assertEqual(result, all_names)
+
+    def test_skip_tools_is_empty(self):
+        self.assertEqual(ai._default_survey()["skip_tools"], [])
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
2.45.2