Compare commits

...

2 commits

Author SHA1 Message Date
Jeff Smith
157ac3f606 merge: feat/issue-42-classifier-bias (#42) 2026-04-06 22:36:26 -06:00
Jeff Smith
f3abbce7d4 feat(filetypes): expose raw signals to survey, remove classifier bias (#42)
The survey pass no longer receives the bucketed file_categories
histogram, which was biased toward source-code targets and would
mislabel mail, notebooks, ledgers, and other non-code domains as
"source" via the file --brief "text" pattern fallback.

Adds filetypes.survey_signals(), which assembles raw signals from
the same `classified` data the bucketer already processes — no new
walks, no new dependencies:
  total_files       — total count
  extension_histogram — top 20 extensions, raw, no taxonomy
  file_descriptions   — top 20 `file --brief` outputs, by count
  filename_samples    — 20 names, evenly drawn (not first-20)

`file --brief` descriptions are truncated at 80 chars before
counting so prefixes group correctly without exploding key cardinality.

The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the
histogram was biased toward source code) is removed and replaced
with neutral guidance on how to read the raw signals together.
The {file_type_distribution} placeholder is renamed to
{survey_signals} to reflect the broader content.

luminos.py base scan computes survey_signals once and stores it on
report["survey_signals"]; AI consumers read from there.

summarize_categories() and report["file_categories"] are unchanged
— the terminal report still uses the bucketed view (#49 tracks
fixing that follow-up).

Smoke tested on two targets:
- luminos_lib: identical-quality survey ("Python library package",
  confidence 0.85), unchanged behavior on code targets.
- A synthetic Maildir of 8 messages with `:2,S` flag suffixes:
  survey now correctly identifies it as "A Maildir-format mailbox
  containing 8 email messages" with confidence 0.90, names the
  Maildir naming convention in domain_notes, and correctly marks
  parse_structure as a skip tool. Before #42 this would have been
  "8 source files."

Adds 8 unit tests for survey_signals covering empty input, extension
histogram, description aggregation/truncation, top-N cap, and
even-stride filename sampling.

#48 tracks the unit-of-analysis limitation (file is the wrong unit
for mbox, SQLite, archives, notebooks) — explicitly out of scope
for #42 and documented in survey_signals' docstring.
2026-04-06 22:36:14 -06:00
5 changed files with 180 additions and 21 deletions

View file

@ -8,7 +8,11 @@ import shutil
import sys import sys
from luminos_lib.tree import build_tree, render_tree from luminos_lib.tree import build_tree, render_tree
from luminos_lib.filetypes import classify_files, summarize_categories from luminos_lib.filetypes import (
classify_files,
summarize_categories,
survey_signals,
)
from luminos_lib.code import detect_languages, find_large_files from luminos_lib.code import detect_languages, find_large_files
from luminos_lib.recency import find_recent_files from luminos_lib.recency import find_recent_files
from luminos_lib.disk import get_disk_usage, top_directories from luminos_lib.disk import get_disk_usage, top_directories
@ -56,6 +60,7 @@ def scan(target, depth=3, show_hidden=False, exclude=None):
finish() finish()
report["file_categories"] = summarize_categories(classified) report["file_categories"] = summarize_categories(classified)
report["classified_files"] = classified report["classified_files"] = classified
report["survey_signals"] = survey_signals(classified)
on_file, finish = _progress("Counting lines") on_file, finish = _progress("Counting lines")
languages, loc = detect_languages(classified, on_file=on_file) languages, loc = detect_languages(classified, on_file=on_file)

View file

@ -996,21 +996,44 @@ def _block_to_dict(block):
# Synthesis pass # Synthesis pass
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _format_survey_signals(signals):
"""Render the survey_signals dict as a labeled text block."""
if not signals or not signals.get("total_files"):
return "(no files classified)"
lines = [f"Total files: {signals.get('total_files', 0)}", ""]
ext_hist = signals.get("extension_histogram") or {}
if ext_hist:
lines.append("Extensions (top, by count):")
for ext, n in ext_hist.items():
lines.append(f" {ext}: {n}")
lines.append("")
descs = signals.get("file_descriptions") or {}
if descs:
lines.append("file --brief output (top, by count):")
for desc, n in descs.items():
lines.append(f" {desc}: {n}")
lines.append("")
samples = signals.get("filename_samples") or []
if samples:
lines.append("Filename samples (evenly drawn):")
for name in samples:
lines.append(f" {name}")
return "\n".join(lines).rstrip()
def _run_survey(client, target, report, tracker, max_turns=3, verbose=False): def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
"""Run the reconnaissance survey pass. """Run the reconnaissance survey pass.
Returns a survey dict on success, or None on failure / out-of-turns. Returns a survey dict on success, or None on failure / out-of-turns.
Survey is advisory callers must treat None as "no survey context". Survey is advisory callers must treat None as "no survey context".
""" """
categories = report.get("file_categories", {}) or {} signals = report.get("survey_signals") or {}
if categories: survey_signals_text = _format_survey_signals(signals)
ftd_lines = [
f" {cat}: {n}"
for cat, n in sorted(categories.items(), key=lambda kv: -kv[1])
]
file_type_distribution = "\n".join(ftd_lines)
else:
file_type_distribution = " (no files classified)"
try: try:
tree_node = build_tree(target, max_depth=2) tree_node = build_tree(target, max_depth=2)
@ -1023,7 +1046,7 @@ def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
system = _SURVEY_SYSTEM_PROMPT.format( system = _SURVEY_SYSTEM_PROMPT.format(
target=target, target=target,
file_type_distribution=file_type_distribution, survey_signals=survey_signals_text,
tree_preview=tree_preview, tree_preview=tree_preview,
available_tools=available_tools, available_tools=available_tools,
) )

View file

@ -128,3 +128,64 @@ def summarize_categories(classified):
cat = f["category"] cat = f["category"]
summary[cat] = summary.get(cat, 0) + 1 summary[cat] = summary.get(cat, 0) + 1
return summary return summary
_SURVEY_TOP_N = 20
_SURVEY_DESC_TRUNCATE = 80


def survey_signals(classified, max_samples=20):
    """Return raw, unbucketed signals for the AI survey pass.

    Unlike `summarize_categories`, which collapses files into a small
    biased taxonomy, this exposes the primary signals so the survey
    LLM can characterize the target without being misled by the
    classifier's source-code bias.

    See #42 for the rationale and #48 for the unit-of-analysis
    limitation: the unit here is still "file" — containers like mbox,
    SQLite, and zip will under-count, while dense file collections
    like Maildir will over-count.

    Returns a dict with:
        total_files          -- total count
        extension_histogram  -- {ext: count}, top _SURVEY_TOP_N by count
        file_descriptions    -- {description: count}, top _SURVEY_TOP_N
        filename_samples     -- up to max_samples filenames, evenly drawn
    """
    total = len(classified)

    by_extension = {}
    by_description = {}
    for entry in classified:
        # Extensions are lowercased; extensionless files share one bucket.
        suffix = os.path.splitext(entry.get("name", ""))[1].lower() or "(none)"
        by_extension[suffix] = by_extension.get(suffix, 0) + 1

        brief = (entry.get("description") or "").strip()
        if not brief:
            continue
        # Truncate before counting so long descriptions group by prefix
        # instead of exploding key cardinality.
        if len(brief) > _SURVEY_DESC_TRUNCATE:
            brief = brief[:_SURVEY_DESC_TRUNCATE] + "..."
        by_description[brief] = by_description.get(brief, 0) + 1

    def _top_counts(counts):
        # Highest count first; ties broken alphabetically for determinism.
        ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))
        return dict(ranked[:_SURVEY_TOP_N])

    samples = []
    if total > 0 and max_samples > 0:
        if total <= max_samples:
            samples = [entry.get("name", "") for entry in classified]
        else:
            # Even stride across the whole listing, not just the first N.
            step = total / max_samples
            samples = [
                classified[int(k * step)].get("name", "")
                for k in range(max_samples)
            ]

    return {
        "total_files": total,
        "extension_histogram": _top_counts(by_extension),
        "file_descriptions": _top_counts(by_description),
        "filename_samples": samples,
    }

View file

@ -131,17 +131,22 @@ Answer three questions about the target: {target}
## Inputs ## Inputs
You have exactly two signals. Do not ask for more. You have exactly two signals. Do not ask for more.
File type distribution (counts by category): File-level signals (raw, unbucketed):
{file_type_distribution} {survey_signals}
IMPORTANT: the file type distribution is produced by a classifier These signals are intentionally raw. The extension histogram and
that is biased toward source code. Its categories are: source, the `file --brief` descriptions reflect what is actually on disk,
config, data, document, media, archive, unknown. It has NO concept without any taxonomy collapsing distinct content into one bucket.
of mail, notebooks, calendars, contacts, ledgers, photo libraries, Use them together: an extension alone can mislead (`.txt` could be
or other personal-data domains anything text-shaped tends to be notes, logs, or message bodies); the `file` command output and
labeled `source` even when it is not code. If the tree preview filename samples disambiguate.
suggests a non-code target, trust the tree over the histogram and
say so in `domain_notes`. Note on units: each signal counts filesystem files. Some targets
have a different natural unit a Maildir is one logical mailbox
with thousands of message files; an mbox is one file containing
many messages; an archive is one file containing many entries. If
the signals point at a container shape, name it in `description`
and `domain_notes` even though the count is in files.
Top-level tree (2 levels deep): Top-level tree (2 levels deep):
{tree_preview} {tree_preview}

View file

@ -10,6 +10,7 @@ from luminos_lib.filetypes import (
_classify_one, _classify_one,
classify_files, classify_files,
summarize_categories, summarize_categories,
survey_signals,
) )
@ -140,5 +141,69 @@ class TestClassifyFiles(unittest.TestCase):
self.assertGreater(item["size"], 0) self.assertGreater(item["size"], 0)
class TestSurveySignals(unittest.TestCase):
    """Unit tests for filetypes.survey_signals (added in #42)."""

    def _f(self, name, description="", category="source"):
        # Minimal classified-file record shaped like classify_files output.
        return {
            "name": name,
            "path": f"/x/{name}",
            "category": category,
            "size": 10,
            "description": description,
        }

    def test_empty_input(self):
        signals = survey_signals([])
        self.assertEqual(signals["total_files"], 0)
        for key in ("extension_histogram", "file_descriptions"):
            self.assertEqual(signals[key], {})
        self.assertEqual(signals["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        records = [
            self._f("a.PY"),
            self._f("b.py"),
            self._f("c.py"),
            self._f("README"),
            self._f("Makefile"),
        ]
        hist = survey_signals(records)["extension_histogram"]
        self.assertEqual(hist[".py"], 3)
        self.assertEqual(hist["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        long_desc = "x" * 200
        records = [
            self._f("a.py", "Python script, ASCII text"),
            self._f("b.py", "Python script, ASCII text"),
            self._f("c.bin", long_desc),
        ]
        descs = survey_signals(records)["file_descriptions"]
        self.assertEqual(descs["Python script, ASCII text"], 2)
        # The long description was truncated and still counted once
        truncated = [
            key for key in descs
            if key.startswith("xxx") and key.endswith("...")
        ]
        self.assertEqual(len(truncated), 1)
        self.assertLessEqual(len(truncated[0]), 84)  # 80 + "..."

    def test_descriptions_skipped_when_empty(self):
        records = [self._f("a.py", ""), self._f("b.py", None)]
        self.assertEqual(survey_signals(records)["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        records = [self._f(f"f{i}.ext{i}") for i in range(50)]
        hist = survey_signals(records)["extension_histogram"]
        self.assertEqual(len(hist), 20)

    def test_filename_samples_evenly_drawn(self):
        records = [self._f(f"file_{i:04d}.txt") for i in range(100)]
        samples = survey_signals(records, max_samples=10)["filename_samples"]
        self.assertEqual(len(samples), 10)
        # First sample is the first file (stride 10, index 0)
        self.assertEqual(samples[0], "file_0000.txt")
        # Last sample is around index 90, not 99
        self.assertTrue(samples[-1].startswith("file_009"))

    def test_filename_samples_returns_all_when_under_cap(self):
        records = [self._f(f"f{i}.txt") for i in range(5)]
        samples = survey_signals(records, max_samples=20)["filename_samples"]
        self.assertEqual(len(samples), 5)

    def test_total_files_matches_input(self):
        records = [self._f(f"f{i}.py") for i in range(7)]
        self.assertEqual(survey_signals(records)["total_files"], 7)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()