luminos/tests/test_filetypes.py

210 lines
7.6 KiB
Python
Raw Normal View History

"""Tests for luminos_lib/filetypes.py"""
import os
import tempfile
import unittest
from unittest.mock import patch
from luminos_lib.filetypes import (
EXTENSION_MAP,
_classify_one,
classify_files,
summarize_categories,
feat(filetypes): expose raw signals to survey, remove classifier bias (#42) The survey pass no longer receives the bucketed file_categories histogram, which was biased toward source-code targets and would mislabel mail, notebooks, ledgers, and other non-code domains as "source" via the file --brief "text" pattern fallback. Adds filetypes.survey_signals(), which assembles raw signals from the same `classified` data the bucketer already processes — no new walks, no new dependencies: total_files — total count extension_histogram — top 20 extensions, raw, no taxonomy file_descriptions — top 20 `file --brief` outputs, by count filename_samples — 20 names, evenly drawn (not first-20) `survey --brief` descriptions are truncated at 80 chars before counting so prefixes group correctly without exploding key cardinality. The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the histogram was biased toward source code) is removed and replaced with neutral guidance on how to read the raw signals together. The {file_type_distribution} placeholder is renamed to {survey_signals} to reflect the broader content. luminos.py base scan computes survey_signals once and stores it on report["survey_signals"]; AI consumers read from there. summarize_categories() and report["file_categories"] are unchanged — the terminal report still uses the bucketed view (#49 tracks fixing that follow-up). Smoke tested on two targets: - luminos_lib: identical-quality survey ("Python library package", confidence 0.85), unchanged behavior on code targets. - A synthetic Maildir of 8 messages with `:2,S` flag suffixes: survey now correctly identifies it as "A Maildir-format mailbox containing 8 email messages" with confidence 0.90, names the Maildir naming convention in domain_notes, and correctly marks parse_structure as a skip tool. Before #42 this would have been "8 source files." 
Adds 8 unit tests for survey_signals covering empty input, extension histogram, description aggregation/truncation, top-N cap, and even-stride filename sampling. #48 tracks the unit-of-analysis limitation (file is the wrong unit for mbox, SQLite, archives, notebooks) — explicitly out of scope for #42 and documented in survey_signals' docstring.
2026-04-07 04:36:14 +00:00
survey_signals,
)
class TestExtensionMap(unittest.TestCase):
    """Check that selected extensions map to the expected category names."""

    def _assert_category(self, extension, expected):
        """Assert that EXTENSION_MAP maps *extension* to *expected*."""
        self.assertEqual(EXTENSION_MAP[extension], expected)

    def test_python_is_source(self):
        self._assert_category(".py", "source")

    def test_json_is_config(self):
        self._assert_category(".json", "config")

    def test_csv_is_data(self):
        self._assert_category(".csv", "data")

    def test_png_is_media(self):
        self._assert_category(".png", "media")

    def test_md_is_document(self):
        self._assert_category(".md", "document")

    def test_zip_is_archive(self):
        self._assert_category(".zip", "archive")
class TestClassifyOne(unittest.TestCase):
    """Tests for _classify_one on mapped and unmapped file names."""

    # Dotted path of the helper that is stubbed out in the fallback tests.
    _FILE_COMMAND = "luminos_lib.filetypes._file_command"

    def test_known_extension(self):
        """A mapped extension yields its category and no description."""
        kind, description = _classify_one("script.py")
        self.assertEqual(kind, "source")
        self.assertIsNone(description)

    def test_known_extension_case_insensitive(self):
        """An upper-case extension is classified like its lower-case form."""
        kind, description = _classify_one("image.PNG")
        self.assertEqual(kind, "media")
        self.assertIsNone(description)

    def test_unknown_extension_falls_back_to_file_command(self):
        """With no extension, the stubbed _file_command output is used."""
        with patch(self._FILE_COMMAND, return_value="ASCII text"):
            kind, description = _classify_one("README")
        self.assertEqual(kind, "source")
        self.assertEqual(description, "ASCII text")

    def test_unknown_extension_unrecognized_file_output(self):
        """A stubbed output of "data" is classified as "unknown"."""
        with patch(self._FILE_COMMAND, return_value="data"):
            kind, _ = _classify_one("somefile.xyz")
        self.assertEqual(kind, "unknown")

    def test_file_command_timeout_returns_unknown(self):
        """An empty stubbed output is classified as "unknown".

        The empty string stands in for a timed-out command, going by the
        test name; the helper itself is not exercised here.
        """
        with patch(self._FILE_COMMAND, return_value=""):
            kind, _ = _classify_one("oddfile")
        self.assertEqual(kind, "unknown")
class TestSummarizeCategories(unittest.TestCase):
    """Tests for summarize_categories, which counts records per category."""

    def test_empty(self):
        """No records produce an empty mapping."""
        counts = summarize_categories([])
        self.assertEqual(counts, {})

    def test_single_category(self):
        """Two records of one category produce a single count of two."""
        records = [{"category": "source"} for _ in range(2)]
        self.assertEqual(summarize_categories(records), {"source": 2})

    def test_multiple_categories(self):
        """Each category is counted independently of the others."""
        order = ("source", "config", "source", "media")
        records = [{"category": label} for label in order]
        counts = summarize_categories(records)
        expectations = (("source", 2), ("config", 1), ("media", 1))
        for label, expected in expectations:
            self.assertEqual(counts[label], expected)
class TestClassifyFiles(unittest.TestCase):
    """Tests for classify_files run against a real temporary directory."""

    def setUp(self):
        """Create a fresh temporary directory that is removed after the test."""
        # A bare mkdtemp() left one directory (and its files) behind per test.
        # Registering the cleanup here removes it even when the test fails.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self.tmpdir = tmp.name

    def _make_file(self, name, content=""):
        """Write *content* to *name* inside the temp directory.

        Returns the full path of the file that was written.
        """
        path = os.path.join(self.tmpdir, name)
        with open(path, "w") as f:
            f.write(content)
        return path

    def test_classifies_python_file(self):
        """A .py file is listed by name and categorised as "source"."""
        self._make_file("script.py", "print('hello')")
        results = classify_files(self.tmpdir)
        names = [r["name"] for r in results]
        self.assertIn("script.py", names)
        py = next(r for r in results if r["name"] == "script.py")
        self.assertEqual(py["category"], "source")

    def test_excludes_hidden_files_by_default(self):
        """Dot-files are omitted unless explicitly requested."""
        self._make_file(".hidden.py")
        self._make_file("visible.py")
        results = classify_files(self.tmpdir)
        names = [r["name"] for r in results]
        self.assertNotIn(".hidden.py", names)
        self.assertIn("visible.py", names)

    def test_includes_hidden_files_when_requested(self):
        """show_hidden=True makes dot-files appear in the results."""
        self._make_file(".hidden.py")
        results = classify_files(self.tmpdir, show_hidden=True)
        names = [r["name"] for r in results]
        self.assertIn(".hidden.py", names)

    def test_excludes_directories(self):
        """Files under a directory named in *exclude* are not reported."""
        os.makedirs(os.path.join(self.tmpdir, "node_modules"))
        self._make_file(os.path.join("node_modules", "pkg.js"))
        self._make_file("main.py")
        results = classify_files(self.tmpdir, exclude=["node_modules"])
        names = [r["name"] for r in results]
        self.assertNotIn("pkg.js", names)
        self.assertIn("main.py", names)

    def test_on_file_callback(self):
        """The on_file callback is invoked once per file."""
        self._make_file("a.py")
        self._make_file("b.py")
        seen = []
        classify_files(self.tmpdir, on_file=seen.append)
        self.assertEqual(len(seen), 2)

    def test_size_is_populated(self):
        """A non-empty file is reported with a positive size."""
        self._make_file("data.json", '{"key": "value"}')
        results = classify_files(self.tmpdir)
        item = next(r for r in results if r["name"] == "data.json")
        self.assertGreater(item["size"], 0)
feat(filetypes): expose raw signals to survey, remove classifier bias (#42) The survey pass no longer receives the bucketed file_categories histogram, which was biased toward source-code targets and would mislabel mail, notebooks, ledgers, and other non-code domains as "source" via the file --brief "text" pattern fallback. Adds filetypes.survey_signals(), which assembles raw signals from the same `classified` data the bucketer already processes — no new walks, no new dependencies: total_files — total count extension_histogram — top 20 extensions, raw, no taxonomy file_descriptions — top 20 `file --brief` outputs, by count filename_samples — 20 names, evenly drawn (not first-20) `survey --brief` descriptions are truncated at 80 chars before counting so prefixes group correctly without exploding key cardinality. The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the histogram was biased toward source code) is removed and replaced with neutral guidance on how to read the raw signals together. The {file_type_distribution} placeholder is renamed to {survey_signals} to reflect the broader content. luminos.py base scan computes survey_signals once and stores it on report["survey_signals"]; AI consumers read from there. summarize_categories() and report["file_categories"] are unchanged — the terminal report still uses the bucketed view (#49 tracks fixing that follow-up). Smoke tested on two targets: - luminos_lib: identical-quality survey ("Python library package", confidence 0.85), unchanged behavior on code targets. - A synthetic Maildir of 8 messages with `:2,S` flag suffixes: survey now correctly identifies it as "A Maildir-format mailbox containing 8 email messages" with confidence 0.90, names the Maildir naming convention in domain_notes, and correctly marks parse_structure as a skip tool. Before #42 this would have been "8 source files." 
Adds 8 unit tests for survey_signals covering empty input, extension histogram, description aggregation/truncation, top-N cap, and even-stride filename sampling. #48 tracks the unit-of-analysis limitation (file is the wrong unit for mbox, SQLite, archives, notebooks) — explicitly out of scope for #42 and documented in survey_signals' docstring.
2026-04-07 04:36:14 +00:00
class TestSurveySignals(unittest.TestCase):
    """Tests for survey_signals, fed hand-built classified-file records."""

    def _f(self, name, description="", category="source"):
        """Build one record dict with the keys these tests pass in."""
        record = {
            "name": name,
            "path": f"/x/{name}",
            "category": category,
            "size": 10,
            "description": description,
        }
        return record

    def test_empty_input(self):
        """No records yield a zero total and empty collections."""
        signals = survey_signals([])
        self.assertEqual(signals["total_files"], 0)
        for key in ("extension_histogram", "file_descriptions"):
            self.assertEqual(signals[key], {})
        self.assertEqual(signals["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        """Extensions are lower-cased; extensionless names go to "(none)"."""
        names = ("a.PY", "b.py", "c.py", "README", "Makefile")
        signals = survey_signals([self._f(n) for n in names])
        histogram = signals["extension_histogram"]
        self.assertEqual(histogram[".py"], 3)
        self.assertEqual(histogram["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        """Equal descriptions are counted together; long ones are cut."""
        python_desc = "Python script, ASCII text"
        records = [
            self._f("a.py", python_desc),
            self._f("b.py", python_desc),
            self._f("c.bin", "x" * 200),
        ]
        descriptions = survey_signals(records)["file_descriptions"]
        self.assertEqual(descriptions[python_desc], 2)
        # The 200-character description must survive as exactly one key,
        # shortened and marked with a trailing ellipsis.
        shortened = [
            key for key in descriptions
            if key.startswith("xxx") and key.endswith("...")
        ]
        self.assertEqual(len(shortened), 1)
        self.assertLessEqual(len(shortened[0]), 84)  # 80 + "..."

    def test_descriptions_skipped_when_empty(self):
        """Empty-string and None descriptions are not counted."""
        records = [self._f("a.py", ""), self._f("b.py", None)]
        signals = survey_signals(records)
        self.assertEqual(signals["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        """Fifty distinct extensions are reduced to twenty entries."""
        records = [self._f(f"f{i}.ext{i}") for i in range(50)]
        histogram = survey_signals(records)["extension_histogram"]
        self.assertEqual(len(histogram), 20)

    def test_filename_samples_evenly_drawn(self):
        """Samples are spread over the input instead of taken from the front."""
        records = [self._f(f"file_{i:04d}.txt") for i in range(100)]
        samples = survey_signals(records, max_samples=10)["filename_samples"]
        self.assertEqual(len(samples), 10)
        # First sample is the first file (stride 10, index 0)
        self.assertEqual(samples[0], "file_0000.txt")
        # Last sample is around index 90, not 99
        self.assertTrue(samples[-1].startswith("file_009"))

    def test_filename_samples_returns_all_when_under_cap(self):
        """With fewer files than max_samples, every name is returned."""
        records = [self._f(f"f{i}.txt") for i in range(5)]
        samples = survey_signals(records, max_samples=20)["filename_samples"]
        self.assertEqual(len(samples), 5)

    def test_total_files_matches_input(self):
        """total_files equals the number of records passed in."""
        records = [self._f(f"f{i}.py") for i in range(7)]
        signals = survey_signals(records)
        self.assertEqual(signals["total_files"], 7)
if __name__ == "__main__":
    # Run every test case in this module when the file is executed directly.
    unittest.main()