merge: feat/issue-42-classifier-bias (#42)
This commit is contained in:
commit
157ac3f606
5 changed files with 180 additions and 21 deletions
|
|
@ -8,7 +8,11 @@ import shutil
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from luminos_lib.tree import build_tree, render_tree
|
from luminos_lib.tree import build_tree, render_tree
|
||||||
from luminos_lib.filetypes import classify_files, summarize_categories
|
from luminos_lib.filetypes import (
|
||||||
|
classify_files,
|
||||||
|
summarize_categories,
|
||||||
|
survey_signals,
|
||||||
|
)
|
||||||
from luminos_lib.code import detect_languages, find_large_files
|
from luminos_lib.code import detect_languages, find_large_files
|
||||||
from luminos_lib.recency import find_recent_files
|
from luminos_lib.recency import find_recent_files
|
||||||
from luminos_lib.disk import get_disk_usage, top_directories
|
from luminos_lib.disk import get_disk_usage, top_directories
|
||||||
|
|
@ -56,6 +60,7 @@ def scan(target, depth=3, show_hidden=False, exclude=None):
|
||||||
finish()
|
finish()
|
||||||
report["file_categories"] = summarize_categories(classified)
|
report["file_categories"] = summarize_categories(classified)
|
||||||
report["classified_files"] = classified
|
report["classified_files"] = classified
|
||||||
|
report["survey_signals"] = survey_signals(classified)
|
||||||
|
|
||||||
on_file, finish = _progress("Counting lines")
|
on_file, finish = _progress("Counting lines")
|
||||||
languages, loc = detect_languages(classified, on_file=on_file)
|
languages, loc = detect_languages(classified, on_file=on_file)
|
||||||
|
|
|
||||||
|
|
@ -996,21 +996,44 @@ def _block_to_dict(block):
|
||||||
# Synthesis pass
|
# Synthesis pass
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _format_survey_signals(signals):
|
||||||
|
"""Render the survey_signals dict as a labeled text block."""
|
||||||
|
if not signals or not signals.get("total_files"):
|
||||||
|
return "(no files classified)"
|
||||||
|
|
||||||
|
lines = [f"Total files: {signals.get('total_files', 0)}", ""]
|
||||||
|
|
||||||
|
ext_hist = signals.get("extension_histogram") or {}
|
||||||
|
if ext_hist:
|
||||||
|
lines.append("Extensions (top, by count):")
|
||||||
|
for ext, n in ext_hist.items():
|
||||||
|
lines.append(f" {ext}: {n}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
descs = signals.get("file_descriptions") or {}
|
||||||
|
if descs:
|
||||||
|
lines.append("file --brief output (top, by count):")
|
||||||
|
for desc, n in descs.items():
|
||||||
|
lines.append(f" {desc}: {n}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
samples = signals.get("filename_samples") or []
|
||||||
|
if samples:
|
||||||
|
lines.append("Filename samples (evenly drawn):")
|
||||||
|
for name in samples:
|
||||||
|
lines.append(f" {name}")
|
||||||
|
|
||||||
|
return "\n".join(lines).rstrip()
|
||||||
|
|
||||||
|
|
||||||
def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
||||||
"""Run the reconnaissance survey pass.
|
"""Run the reconnaissance survey pass.
|
||||||
|
|
||||||
Returns a survey dict on success, or None on failure / out-of-turns.
|
Returns a survey dict on success, or None on failure / out-of-turns.
|
||||||
Survey is advisory — callers must treat None as "no survey context".
|
Survey is advisory — callers must treat None as "no survey context".
|
||||||
"""
|
"""
|
||||||
categories = report.get("file_categories", {}) or {}
|
signals = report.get("survey_signals") or {}
|
||||||
if categories:
|
survey_signals_text = _format_survey_signals(signals)
|
||||||
ftd_lines = [
|
|
||||||
f" {cat}: {n}"
|
|
||||||
for cat, n in sorted(categories.items(), key=lambda kv: -kv[1])
|
|
||||||
]
|
|
||||||
file_type_distribution = "\n".join(ftd_lines)
|
|
||||||
else:
|
|
||||||
file_type_distribution = " (no files classified)"
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tree_node = build_tree(target, max_depth=2)
|
tree_node = build_tree(target, max_depth=2)
|
||||||
|
|
@ -1023,7 +1046,7 @@ def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
||||||
|
|
||||||
system = _SURVEY_SYSTEM_PROMPT.format(
|
system = _SURVEY_SYSTEM_PROMPT.format(
|
||||||
target=target,
|
target=target,
|
||||||
file_type_distribution=file_type_distribution,
|
survey_signals=survey_signals_text,
|
||||||
tree_preview=tree_preview,
|
tree_preview=tree_preview,
|
||||||
available_tools=available_tools,
|
available_tools=available_tools,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -128,3 +128,64 @@ def summarize_categories(classified):
|
||||||
cat = f["category"]
|
cat = f["category"]
|
||||||
summary[cat] = summary.get(cat, 0) + 1
|
summary[cat] = summary.get(cat, 0) + 1
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
|
||||||
|
# Maximum number of distinct extensions / descriptions kept per histogram.
_SURVEY_TOP_N = 20
# Descriptions longer than this many characters are cut and suffixed "...".
_SURVEY_DESC_TRUNCATE = 80


def survey_signals(classified, max_samples=20):
    """Return raw, unbucketed signals for the AI survey pass.

    Unlike `summarize_categories`, which collapses files into a small
    biased taxonomy, this exposes the primary signals so the survey
    LLM can characterize the target without being misled by the
    classifier's source-code bias.

    See #42 for the rationale and #48 for the unit-of-analysis
    limitation: the unit here is still "file" — containers like mbox,
    SQLite, and zip will under-count, while dense file collections like
    Maildir will over-count.

    Args:
        classified: sequence of per-file dicts. Only the "name" and
            "description" keys are read; either may be missing or None.
        max_samples: upper bound on the number of filename samples.
            Zero or a negative value yields no samples.

    Returns a dict with:
      total_files         — total count
      extension_histogram — {ext: count}, top _SURVEY_TOP_N by count
      file_descriptions   — {description: count}, top _SURVEY_TOP_N by count
      filename_samples    — up to max_samples filenames, evenly drawn
    """
    total = len(classified)

    ext_counts = {}
    desc_counts = {}
    names = []
    for f in classified:
        # `or ""` covers both a missing key and an explicit None, so a
        # nameless record is counted under "(none)" instead of raising
        # inside os.path.splitext.
        name = f.get("name") or ""
        names.append(name)

        ext = os.path.splitext(name)[1].lower() or "(none)"
        ext_counts[ext] = ext_counts.get(ext, 0) + 1

        desc = (f.get("description") or "").strip()
        if desc:
            if len(desc) > _SURVEY_DESC_TRUNCATE:
                desc = desc[:_SURVEY_DESC_TRUNCATE] + "..."
            desc_counts[desc] = desc_counts.get(desc, 0) + 1

    def _top(d):
        # Highest count first; ties broken alphabetically by key so the
        # result is deterministic regardless of input order.
        items = sorted(d.items(), key=lambda kv: (-kv[1], kv[0]))
        return dict(items[:_SURVEY_TOP_N])

    if total > 0 and max_samples > 0:
        if total <= max_samples:
            samples = names
        else:
            # Integer arithmetic gives floor(i * total / max_samples)
            # exactly; a float stride could round just below a whole
            # number and pick the previous index.
            samples = [
                names[i * total // max_samples]
                for i in range(max_samples)
            ]
    else:
        samples = []

    return {
        "total_files": total,
        "extension_histogram": _top(ext_counts),
        "file_descriptions": _top(desc_counts),
        "filename_samples": samples,
    }
|
||||||
|
|
|
||||||
|
|
@ -131,17 +131,22 @@ Answer three questions about the target: {target}
|
||||||
## Inputs
|
## Inputs
|
||||||
You have exactly two signals. Do not ask for more.
|
You have exactly two signals. Do not ask for more.
|
||||||
|
|
||||||
File type distribution (counts by category):
|
File-level signals (raw, unbucketed):
|
||||||
{file_type_distribution}
|
{survey_signals}
|
||||||
|
|
||||||
IMPORTANT: the file type distribution is produced by a classifier
|
These signals are intentionally raw. The extension histogram and
|
||||||
that is biased toward source code. Its categories are: source,
|
the `file --brief` descriptions reflect what is actually on disk,
|
||||||
config, data, document, media, archive, unknown. It has NO concept
|
without any taxonomy collapsing distinct content into one bucket.
|
||||||
of mail, notebooks, calendars, contacts, ledgers, photo libraries,
|
Use them together: an extension alone can mislead (`.txt` could be
|
||||||
or other personal-data domains — anything text-shaped tends to be
|
notes, logs, or message bodies); the `file` command output and
|
||||||
labeled `source` even when it is not code. If the tree preview
|
filename samples disambiguate.
|
||||||
suggests a non-code target, trust the tree over the histogram and
|
|
||||||
say so in `domain_notes`.
|
Note on units: each signal counts filesystem files. Some targets
|
||||||
|
have a different natural unit — a Maildir is one logical mailbox
|
||||||
|
with thousands of message files; an mbox is one file containing
|
||||||
|
many messages; an archive is one file containing many entries. If
|
||||||
|
the signals point at a container shape, name it in `description`
|
||||||
|
and `domain_notes` even though the count is in files.
|
||||||
|
|
||||||
Top-level tree (2 levels deep):
|
Top-level tree (2 levels deep):
|
||||||
{tree_preview}
|
{tree_preview}
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from luminos_lib.filetypes import (
|
||||||
_classify_one,
|
_classify_one,
|
||||||
classify_files,
|
classify_files,
|
||||||
summarize_categories,
|
summarize_categories,
|
||||||
|
survey_signals,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -140,5 +141,69 @@ class TestClassifyFiles(unittest.TestCase):
|
||||||
self.assertGreater(item["size"], 0)
|
self.assertGreater(item["size"], 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSurveySignals(unittest.TestCase):
    """Unit tests for `survey_signals` using hand-built classified records."""

    def _f(self, name, description="", category="source"):
        """Build a minimal classified-file record for `name`."""
        return {"name": name, "path": f"/x/{name}", "category": category,
                "size": 10, "description": description}

    def test_empty_input(self):
        s = survey_signals([])
        self.assertEqual(s["total_files"], 0)
        self.assertEqual(s["extension_histogram"], {})
        self.assertEqual(s["file_descriptions"], {})
        self.assertEqual(s["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        files = [
            self._f("a.PY"), self._f("b.py"), self._f("c.py"),
            self._f("README"), self._f("Makefile"),
        ]
        s = survey_signals(files)
        self.assertEqual(s["extension_histogram"][".py"], 3)
        self.assertEqual(s["extension_histogram"]["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        long_desc = "x" * 200
        files = [
            self._f("a.py", "Python script, ASCII text"),
            self._f("b.py", "Python script, ASCII text"),
            self._f("c.bin", long_desc),
        ]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"]["Python script, ASCII text"], 2)
        # The long description was truncated and still counted once
        truncated_keys = [
            k for k in s["file_descriptions"]
            if k.startswith("xxx") and k.endswith("...")
        ]
        self.assertEqual(len(truncated_keys), 1)
        # Exactly 80 kept characters plus the 3-character "..." suffix
        self.assertEqual(len(truncated_keys[0]), 83)
        self.assertEqual(s["file_descriptions"][truncated_keys[0]], 1)

    def test_descriptions_skipped_when_empty(self):
        files = [self._f("a.py", ""), self._f("b.py", None)]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        files = [self._f(f"f{i}.ext{i}") for i in range(50)]
        s = survey_signals(files)
        self.assertEqual(len(s["extension_histogram"]), 20)

    def test_filename_samples_evenly_drawn(self):
        files = [self._f(f"file_{i:04d}.txt") for i in range(100)]
        s = survey_signals(files, max_samples=10)
        self.assertEqual(len(s["filename_samples"]), 10)
        # First sample is the first file (stride 10, index 0)
        self.assertEqual(s["filename_samples"][0], "file_0000.txt")
        # Last sample is index 90, not 99. A prefix match on "file_009"
        # would accept either, so compare the exact name.
        self.assertEqual(s["filename_samples"][-1], "file_0090.txt")
        # Every sample sits on the stride, not just the two endpoints
        self.assertEqual(
            s["filename_samples"],
            [f"file_{i:04d}.txt" for i in range(0, 100, 10)],
        )

    def test_filename_samples_returns_all_when_under_cap(self):
        files = [self._f(f"f{i}.txt") for i in range(5)]
        s = survey_signals(files, max_samples=20)
        self.assertEqual(len(s["filename_samples"]), 5)

    def test_total_files_matches_input(self):
        files = [self._f(f"f{i}.py") for i in range(7)]
        self.assertEqual(survey_signals(files)["total_files"], 7)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue