The survey pass no longer receives the bucketed file_categories
histogram, which was biased toward source-code targets and would
mislabel mail, notebooks, ledgers, and other non-code domains as
"source" via the file --brief "text" pattern fallback.
Adds filetypes.survey_signals(), which assembles raw signals from
the same `classified` data the bucketer already processes — no new
walks, no new dependencies:
total_files — total count
extension_histogram — top 20 extensions, raw, no taxonomy
file_descriptions — top 20 `file --brief` outputs, by count
filename_samples — 20 names, evenly drawn (not first-20)
`file --brief` descriptions are truncated at 80 chars before
counting so prefixes group correctly without exploding key cardinality.
The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the
histogram was biased toward source code) is removed and replaced
with neutral guidance on how to read the raw signals together.
The {file_type_distribution} placeholder is renamed to
{survey_signals} to reflect the broader content.
luminos.py base scan computes survey_signals once and stores it on
report["survey_signals"]; AI consumers read from there.
summarize_categories() and report["file_categories"] are unchanged
— the terminal report still uses the bucketed view (#49 tracks
fixing that as a follow-up).
Smoke tested on two targets:
- luminos_lib: identical-quality survey ("Python library package",
confidence 0.85), unchanged behavior on code targets.
- A synthetic Maildir of 8 messages with `:2,S` flag suffixes:
survey now correctly identifies it as "A Maildir-format mailbox
containing 8 email messages" with confidence 0.90, names the
Maildir naming convention in domain_notes, and correctly marks
parse_structure as a skip tool. Before #42 this would have been
"8 source files."
Adds 8 unit tests for survey_signals covering empty input, extension
histogram, description aggregation/truncation, top-N cap, and
even-stride filename sampling.
#48 tracks the unit-of-analysis limitation (file is the wrong unit
for mbox, SQLite, archives, notebooks) — explicitly out of scope
for #42 and documented in survey_signals' docstring.
209 lines
7.6 KiB
Python
209 lines
7.6 KiB
Python
"""Tests for luminos_lib/filetypes.py"""
|
|
|
|
import os
|
|
import tempfile
|
|
import unittest
|
|
from unittest.mock import patch
|
|
|
|
from luminos_lib.filetypes import (
|
|
EXTENSION_MAP,
|
|
_classify_one,
|
|
classify_files,
|
|
summarize_categories,
|
|
survey_signals,
|
|
)
|
|
|
|
|
|
class TestExtensionMap(unittest.TestCase):
    """Spot-check the category EXTENSION_MAP assigns to common extensions."""

    def _assert_category(self, extension, expected):
        """Assert that EXTENSION_MAP maps *extension* to *expected*."""
        self.assertEqual(EXTENSION_MAP[extension], expected)

    def test_python_is_source(self):
        self._assert_category(".py", "source")

    def test_json_is_config(self):
        self._assert_category(".json", "config")

    def test_csv_is_data(self):
        self._assert_category(".csv", "data")

    def test_png_is_media(self):
        self._assert_category(".png", "media")

    def test_md_is_document(self):
        self._assert_category(".md", "document")

    def test_zip_is_archive(self):
        self._assert_category(".zip", "archive")
|
|
|
|
|
|
class TestClassifyOne(unittest.TestCase):
    """Exercise _classify_one for mapped extensions and the file-command fallback."""

    # Dotted path of the helper that is stubbed out in the fallback tests.
    _FILE_COMMAND = "luminos_lib.filetypes._file_command"

    def test_known_extension(self):
        """A mapped extension yields its category and no description."""
        kind, detail = _classify_one("script.py")
        self.assertEqual(kind, "source")
        self.assertIsNone(detail)

    def test_known_extension_case_insensitive(self):
        """An upper-case extension is classified like its lower-case form."""
        kind, detail = _classify_one("image.PNG")
        self.assertEqual(kind, "media")
        self.assertIsNone(detail)

    def test_unknown_extension_falls_back_to_file_command(self):
        """With no extension, the stubbed file-command output drives the result."""
        with patch(self._FILE_COMMAND, return_value="ASCII text"):
            kind, detail = _classify_one("README")
        self.assertEqual(kind, "source")
        self.assertEqual(detail, "ASCII text")

    def test_unknown_extension_unrecognized_file_output(self):
        """A stubbed output of "data" leaves the file in the "unknown" category."""
        with patch(self._FILE_COMMAND, return_value="data"):
            kind, _ = _classify_one("somefile.xyz")
        self.assertEqual(kind, "unknown")

    def test_file_command_timeout_returns_unknown(self):
        """An empty stubbed output leaves the file in the "unknown" category."""
        with patch(self._FILE_COMMAND, return_value=""):
            kind, _ = _classify_one("oddfile")
        self.assertEqual(kind, "unknown")
|
|
|
|
|
|
class TestSummarizeCategories(unittest.TestCase):
    """Check the per-category counts returned by summarize_categories."""

    def test_empty(self):
        """No input files produce an empty mapping."""
        counts = summarize_categories([])
        self.assertEqual(counts, {})

    def test_single_category(self):
        """Two files of one category are counted together."""
        entries = [{"category": "source"} for _ in range(2)]
        self.assertEqual(summarize_categories(entries), {"source": 2})

    def test_multiple_categories(self):
        """Each category is tallied independently of the others."""
        labels = ("source", "config", "source", "media")
        counts = summarize_categories([{"category": label} for label in labels])
        for label, expected in (("source", 2), ("config", 1), ("media", 1)):
            self.assertEqual(counts[label], expected)
|
|
|
|
|
|
class TestClassifyFiles(unittest.TestCase):
    """Run classify_files against a throwaway directory on disk."""

    def setUp(self):
        """Create a fresh temporary directory and schedule its removal.

        The directory path is exposed as ``self.tmpdir``.
        """
        scratch = tempfile.TemporaryDirectory()
        # addCleanup runs whether the test passes or fails, so the directory
        # is always removed instead of accumulating in the system temp dir.
        self.addCleanup(scratch.cleanup)
        self.tmpdir = scratch.name

    def _make_file(self, name, content=""):
        """Write *content* to *name* inside the temp directory; return its path."""
        path = os.path.join(self.tmpdir, name)
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)
        return path

    def test_classifies_python_file(self):
        """A .py file is reported by name with the "source" category."""
        self._make_file("script.py", "print('hello')")
        results = classify_files(self.tmpdir)
        names = [r["name"] for r in results]
        self.assertIn("script.py", names)
        py = next(r for r in results if r["name"] == "script.py")
        self.assertEqual(py["category"], "source")

    def test_excludes_hidden_files_by_default(self):
        """Dot-files are omitted unless explicitly requested."""
        self._make_file(".hidden.py")
        self._make_file("visible.py")
        results = classify_files(self.tmpdir)
        names = [r["name"] for r in results]
        self.assertNotIn(".hidden.py", names)
        self.assertIn("visible.py", names)

    def test_includes_hidden_files_when_requested(self):
        """show_hidden=True brings dot-files into the results."""
        self._make_file(".hidden.py")
        results = classify_files(self.tmpdir, show_hidden=True)
        names = [r["name"] for r in results]
        self.assertIn(".hidden.py", names)

    def test_excludes_directories(self):
        """Files under a directory named in ``exclude`` are not reported."""
        excluded_dir = os.path.join(self.tmpdir, "node_modules")
        os.makedirs(excluded_dir)
        with open(os.path.join(excluded_dir, "pkg.js"), "w", encoding="utf-8") as f:
            f.write("")
        self._make_file("main.py")
        results = classify_files(self.tmpdir, exclude=["node_modules"])
        names = [r["name"] for r in results]
        self.assertNotIn("pkg.js", names)
        self.assertIn("main.py", names)

    def test_on_file_callback(self):
        """The on_file callback fires once per file in the directory."""
        self._make_file("a.py")
        self._make_file("b.py")
        seen = []
        classify_files(self.tmpdir, on_file=seen.append)
        self.assertEqual(len(seen), 2)

    def test_size_is_populated(self):
        """A non-empty file is reported with a positive size."""
        self._make_file("data.json", '{"key": "value"}')
        results = classify_files(self.tmpdir)
        item = next(r for r in results if r["name"] == "data.json")
        self.assertGreater(item["size"], 0)
|
|
|
|
|
|
class TestSurveySignals(unittest.TestCase):
    """Check the raw signals that survey_signals builds from classified files."""

    def _record(self, name, description="", category="source"):
        """Build one classified-file dict in the shape these tests feed in."""
        return {
            "name": name,
            "path": f"/x/{name}",
            "category": category,
            "size": 10,
            "description": description,
        }

    def test_empty_input(self):
        """An empty file list yields zero counts and empty collections."""
        signals = survey_signals([])
        self.assertEqual(signals["total_files"], 0)
        self.assertEqual(signals["extension_histogram"], {})
        self.assertEqual(signals["file_descriptions"], {})
        self.assertEqual(signals["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        """Extensions are folded to lower case; extensionless names count as "(none)"."""
        names = ("a.PY", "b.py", "c.py", "README", "Makefile")
        histogram = survey_signals(
            [self._record(name) for name in names]
        )["extension_histogram"]
        self.assertEqual(histogram[".py"], 3)
        self.assertEqual(histogram["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        """Equal descriptions are counted together; long ones are shortened."""
        python_desc = "Python script, ASCII text"
        entries = [
            self._record("a.py", python_desc),
            self._record("b.py", python_desc),
            self._record("c.bin", "x" * 200),
        ]
        descriptions = survey_signals(entries)["file_descriptions"]
        self.assertEqual(descriptions[python_desc], 2)
        # The 200-char description must appear exactly once, in shortened form.
        shortened = [
            key for key in descriptions
            if key.startswith("xxx") and key.endswith("...")
        ]
        self.assertEqual(len(shortened), 1)
        # 80-char cap plus the "..." suffix; the bound of 84 leaves one char of slack.
        self.assertLessEqual(len(shortened[0]), 84)

    def test_descriptions_skipped_when_empty(self):
        """Empty-string and None descriptions contribute nothing."""
        entries = [self._record("a.py", ""), self._record("b.py", None)]
        signals = survey_signals(entries)
        self.assertEqual(signals["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        """Fifty distinct extensions are cut down to twenty histogram entries."""
        entries = [self._record(f"f{i}.ext{i}") for i in range(50)]
        histogram = survey_signals(entries)["extension_histogram"]
        self.assertEqual(len(histogram), 20)

    def test_filename_samples_evenly_drawn(self):
        """Samples are spread across the input rather than taken from the front."""
        entries = [self._record(f"file_{i:04d}.txt") for i in range(100)]
        samples = survey_signals(entries, max_samples=10)["filename_samples"]
        self.assertEqual(len(samples), 10)
        # Sampling starts at the very first file.
        self.assertEqual(samples[0], "file_0000.txt")
        # The final sample comes from the last tenth of the input (file_009x).
        self.assertTrue(samples[-1].startswith("file_009"))

    def test_filename_samples_returns_all_when_under_cap(self):
        """With fewer files than max_samples, every file is sampled."""
        entries = [self._record(f"f{i}.txt") for i in range(5)]
        samples = survey_signals(entries, max_samples=20)["filename_samples"]
        self.assertEqual(len(samples), 5)

    def test_total_files_matches_input(self):
        """total_files equals the number of entries passed in."""
        entries = [self._record(f"f{i}.py") for i in range(7)]
        signals = survey_signals(entries)
        self.assertEqual(signals["total_files"], 7)
|
|
|
|
|
|
# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|