luminos/tests/test_filetypes.py

"""Tests for luminos_lib/filetypes.py"""

import os
import tempfile
import unittest
from unittest.mock import patch

from luminos_lib.filetypes import (
    EXTENSION_MAP,
    _classify_one,
    classify_files,
    summarize_categories,
    survey_signals,
)


class TestExtensionMap(unittest.TestCase):
    def test_python_is_source(self):
        self.assertEqual(EXTENSION_MAP[".py"], "source")

    def test_json_is_config(self):
        self.assertEqual(EXTENSION_MAP[".json"], "config")

    def test_csv_is_data(self):
        self.assertEqual(EXTENSION_MAP[".csv"], "data")

    def test_png_is_media(self):
        self.assertEqual(EXTENSION_MAP[".png"], "media")

    def test_md_is_document(self):
        self.assertEqual(EXTENSION_MAP[".md"], "document")

    def test_zip_is_archive(self):
        self.assertEqual(EXTENSION_MAP[".zip"], "archive")


class TestClassifyOne(unittest.TestCase):
    def test_known_extension(self):
        category, desc = _classify_one("script.py")
        self.assertEqual(category, "source")
        self.assertIsNone(desc)

    def test_known_extension_case_insensitive(self):
        category, desc = _classify_one("image.PNG")
        self.assertEqual(category, "media")
        self.assertIsNone(desc)

    def test_unknown_extension_falls_back_to_file_command(self):
        with patch("luminos_lib.filetypes._file_command", return_value="ASCII text"):
            category, desc = _classify_one("README")
            self.assertEqual(category, "source")
            self.assertEqual(desc, "ASCII text")

    def test_unknown_extension_unrecognized_file_output(self):
        with patch("luminos_lib.filetypes._file_command", return_value="data"):
            category, desc = _classify_one("somefile.xyz")
            self.assertEqual(category, "unknown")

    def test_file_command_timeout_returns_unknown(self):
        with patch("luminos_lib.filetypes._file_command", return_value=""):
            category, desc = _classify_one("oddfile")
            self.assertEqual(category, "unknown")


class TestSummarizeCategories(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(summarize_categories([]), {})

    def test_single_category(self):
        files = [{"category": "source"}, {"category": "source"}]
        result = summarize_categories(files)
        self.assertEqual(result, {"source": 2})

    def test_multiple_categories(self):
        files = [
            {"category": "source"},
            {"category": "config"},
            {"category": "source"},
            {"category": "media"},
        ]
        result = summarize_categories(files)
        self.assertEqual(result["source"], 2)
        self.assertEqual(result["config"], 1)
        self.assertEqual(result["media"], 1)


class TestClassifyFiles(unittest.TestCase):
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()

    def _make_file(self, name, content=""):
        path = os.path.join(self.tmpdir, name)
        with open(path, "w") as f:
            f.write(content)
        return path

    def test_classifies_python_file(self):
        self._make_file("script.py", "print('hello')")
        results = classify_files(self.tmpdir)
        names = [r["name"] for r in results]
        self.assertIn("script.py", names)
        py = next(r for r in results if r["name"] == "script.py")
        self.assertEqual(py["category"], "source")

    def test_excludes_hidden_files_by_default(self):
        self._make_file(".hidden.py")
        self._make_file("visible.py")
        results = classify_files(self.tmpdir)
        names = [r["name"] for r in results]
        self.assertNotIn(".hidden.py", names)
        self.assertIn("visible.py", names)

    def test_includes_hidden_files_when_requested(self):
        self._make_file(".hidden.py")
        results = classify_files(self.tmpdir, show_hidden=True)
        names = [r["name"] for r in results]
        self.assertIn(".hidden.py", names)

    def test_excludes_directories(self):
        excluded_dir = os.path.join(self.tmpdir, "node_modules")
        os.makedirs(excluded_dir)
        with open(os.path.join(excluded_dir, "pkg.js"), "w") as f:
            f.write("")
        self._make_file("main.py")
        results = classify_files(self.tmpdir, exclude=["node_modules"])
        names = [r["name"] for r in results]
        self.assertNotIn("pkg.js", names)
        self.assertIn("main.py", names)

    def test_on_file_callback(self):
        self._make_file("a.py")
        self._make_file("b.py")
        seen = []
        classify_files(self.tmpdir, on_file=seen.append)
        self.assertEqual(len(seen), 2)

    def test_size_is_populated(self):
        self._make_file("data.json", '{"key": "value"}')
        results = classify_files(self.tmpdir)
        item = next(r for r in results if r["name"] == "data.json")
        self.assertGreater(item["size"], 0)


class TestSurveySignals(unittest.TestCase):
    def _f(self, name, description="", category="source"):
        return {"name": name, "path": f"/x/{name}", "category": category,
                "size": 10, "description": description}

    def test_empty_input(self):
        s = survey_signals([])
        self.assertEqual(s["total_files"], 0)
        self.assertEqual(s["extension_histogram"], {})
        self.assertEqual(s["file_descriptions"], {})
        self.assertEqual(s["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        files = [
            self._f("a.PY"), self._f("b.py"), self._f("c.py"),
            self._f("README"), self._f("Makefile"),
        ]
        s = survey_signals(files)
        self.assertEqual(s["extension_histogram"][".py"], 3)
        self.assertEqual(s["extension_histogram"]["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        long_desc = "x" * 200
        files = [
            self._f("a.py", "Python script, ASCII text"),
            self._f("b.py", "Python script, ASCII text"),
            self._f("c.bin", long_desc),
        ]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"]["Python script, ASCII text"], 2)
        # The long description was truncated and still counted once
        truncated_keys = [k for k in s["file_descriptions"] if k.startswith("xxx") and k.endswith("...")]
        self.assertEqual(len(truncated_keys), 1)
        self.assertLessEqual(len(truncated_keys[0]), 84)  # 80 + "..."

    def test_descriptions_skipped_when_empty(self):
        files = [self._f("a.py", ""), self._f("b.py", None)]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        files = [self._f(f"f{i}.ext{i}") for i in range(50)]
        s = survey_signals(files)
        self.assertEqual(len(s["extension_histogram"]), 20)

    def test_filename_samples_evenly_drawn(self):
        files = [self._f(f"file_{i:04d}.txt") for i in range(100)]
        s = survey_signals(files, max_samples=10)
        self.assertEqual(len(s["filename_samples"]), 10)
        # First sample is the first file (stride 10, index 0)
        self.assertEqual(s["filename_samples"][0], "file_0000.txt")
        # Last sample is around index 90, not 99
        self.assertTrue(s["filename_samples"][-1].startswith("file_009"))

    def test_filename_samples_returns_all_when_under_cap(self):
        files = [self._f(f"f{i}.txt") for i in range(5)]
        s = survey_signals(files, max_samples=20)
        self.assertEqual(len(s["filename_samples"]), 5)

    def test_total_files_matches_input(self):
        files = [self._f(f"f{i}.py") for i in range(7)]
        self.assertEqual(survey_signals(files)["total_files"], 7)


if __name__ == "__main__":
    unittest.main()
feat(tests): add unit test coverage for all testable modules (#37) 129 tests across cache, filetypes, code, disk, recency, tree, report, and capabilities. Uses stdlib unittest only — no new dependencies. Also updates CLAUDE.md development workflow to require test coverage for all future changes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-04-06 22:57:26 +00:00			`"""Tests for luminos_lib/filetypes.py"""`

			`import os`
			`import tempfile`
			`import unittest`
			`from unittest.mock import patch`

			`from luminos_lib.filetypes import (`
			`EXTENSION_MAP,`
			`_classify_one,`
			`classify_files,`
			`summarize_categories,`
feat(filetypes): expose raw signals to survey, remove classifier bias (#42) The survey pass no longer receives the bucketed file_categories histogram, which was biased toward source-code targets and would mislabel mail, notebooks, ledgers, and other non-code domains as "source" via the file --brief "text" pattern fallback. Adds filetypes.survey_signals(), which assembles raw signals from the same `classified` data the bucketer already processes — no new walks, no new dependencies: total_files — total count extension_histogram — top 20 extensions, raw, no taxonomy file_descriptions — top 20 `file --brief` outputs, by count filename_samples — 20 names, evenly drawn (not first-20) `survey --brief` descriptions are truncated at 80 chars before counting so prefixes group correctly without exploding key cardinality. The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the histogram was biased toward source code) is removed and replaced with neutral guidance on how to read the raw signals together. The {file_type_distribution} placeholder is renamed to {survey_signals} to reflect the broader content. luminos.py base scan computes survey_signals once and stores it on report["survey_signals"]; AI consumers read from there. summarize_categories() and report["file_categories"] are unchanged — the terminal report still uses the bucketed view (#49 tracks fixing that follow-up). Smoke tested on two targets: - luminos_lib: identical-quality survey ("Python library package", confidence 0.85), unchanged behavior on code targets. - A synthetic Maildir of 8 messages with `:2,S` flag suffixes: survey now correctly identifies it as "A Maildir-format mailbox containing 8 email messages" with confidence 0.90, names the Maildir naming convention in domain_notes, and correctly marks parse_structure as a skip tool. Before #42 this would have been "8 source files." Adds 8 unit tests for survey_signals covering empty input, extension histogram, description aggregation/truncation, top-N cap, and even-stride filename sampling. #48 tracks the unit-of-analysis limitation (file is the wrong unit for mbox, SQLite, archives, notebooks) — explicitly out of scope for #42 and documented in survey_signals' docstring. 2026-04-07 04:36:14 +00:00			`survey_signals,`
feat(tests): add unit test coverage for all testable modules (#37) 129 tests across cache, filetypes, code, disk, recency, tree, report, and capabilities. Uses stdlib unittest only — no new dependencies. Also updates CLAUDE.md development workflow to require test coverage for all future changes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-04-06 22:57:26 +00:00			`)`


			`class TestExtensionMap(unittest.TestCase):`
			`def test_python_is_source(self):`
			`self.assertEqual(EXTENSION_MAP[".py"], "source")`

			`def test_json_is_config(self):`
			`self.assertEqual(EXTENSION_MAP[".json"], "config")`

			`def test_csv_is_data(self):`
			`self.assertEqual(EXTENSION_MAP[".csv"], "data")`

			`def test_png_is_media(self):`
			`self.assertEqual(EXTENSION_MAP[".png"], "media")`

			`def test_md_is_document(self):`
			`self.assertEqual(EXTENSION_MAP[".md"], "document")`

			`def test_zip_is_archive(self):`
			`self.assertEqual(EXTENSION_MAP[".zip"], "archive")`


			`class TestClassifyOne(unittest.TestCase):`
			`def test_known_extension(self):`
			`category, desc = _classify_one("script.py")`
			`self.assertEqual(category, "source")`
			`self.assertIsNone(desc)`

			`def test_known_extension_case_insensitive(self):`
			`category, desc = _classify_one("image.PNG")`
			`self.assertEqual(category, "media")`
			`self.assertIsNone(desc)`

			`def test_unknown_extension_falls_back_to_file_command(self):`
			`with patch("luminos_lib.filetypes._file_command", return_value="ASCII text"):`
			`category, desc = _classify_one("README")`
			`self.assertEqual(category, "source")`
			`self.assertEqual(desc, "ASCII text")`

			`def test_unknown_extension_unrecognized_file_output(self):`
			`with patch("luminos_lib.filetypes._file_command", return_value="data"):`
			`category, desc = _classify_one("somefile.xyz")`
			`self.assertEqual(category, "unknown")`

			`def test_file_command_timeout_returns_unknown(self):`
			`with patch("luminos_lib.filetypes._file_command", return_value=""):`
			`category, desc = _classify_one("oddfile")`
			`self.assertEqual(category, "unknown")`


			`class TestSummarizeCategories(unittest.TestCase):`
			`def test_empty(self):`
			`self.assertEqual(summarize_categories([]), {})`

			`def test_single_category(self):`
			`files = [{"category": "source"}, {"category": "source"}]`
			`result = summarize_categories(files)`
			`self.assertEqual(result, {"source": 2})`

			`def test_multiple_categories(self):`
			`files = [`
			`{"category": "source"},`
			`{"category": "config"},`
			`{"category": "source"},`
			`{"category": "media"},`
			`]`
			`result = summarize_categories(files)`
			`self.assertEqual(result["source"], 2)`
			`self.assertEqual(result["config"], 1)`
			`self.assertEqual(result["media"], 1)`


			`class TestClassifyFiles(unittest.TestCase):`
			`def setUp(self):`
			`self.tmpdir = tempfile.mkdtemp()`

			`def _make_file(self, name, content=""):`
			`path = os.path.join(self.tmpdir, name)`
			`with open(path, "w") as f:`
			`f.write(content)`
			`return path`

			`def test_classifies_python_file(self):`
			`self._make_file("script.py", "print('hello')")`
			`results = classify_files(self.tmpdir)`
			`names = [r["name"] for r in results]`
			`self.assertIn("script.py", names)`
			`py = next(r for r in results if r["name"] == "script.py")`
			`self.assertEqual(py["category"], "source")`

			`def test_excludes_hidden_files_by_default(self):`
			`self._make_file(".hidden.py")`
			`self._make_file("visible.py")`
			`results = classify_files(self.tmpdir)`
			`names = [r["name"] for r in results]`
			`self.assertNotIn(".hidden.py", names)`
			`self.assertIn("visible.py", names)`

			`def test_includes_hidden_files_when_requested(self):`
			`self._make_file(".hidden.py")`
			`results = classify_files(self.tmpdir, show_hidden=True)`
			`names = [r["name"] for r in results]`
			`self.assertIn(".hidden.py", names)`

			`def test_excludes_directories(self):`
			`excluded_dir = os.path.join(self.tmpdir, "node_modules")`
			`os.makedirs(excluded_dir)`
			`with open(os.path.join(excluded_dir, "pkg.js"), "w") as f:`
			`f.write("")`
			`self._make_file("main.py")`
			`results = classify_files(self.tmpdir, exclude=["node_modules"])`
			`names = [r["name"] for r in results]`
			`self.assertNotIn("pkg.js", names)`
			`self.assertIn("main.py", names)`

			`def test_on_file_callback(self):`
			`self._make_file("a.py")`
			`self._make_file("b.py")`
			`seen = []`
			`classify_files(self.tmpdir, on_file=seen.append)`
			`self.assertEqual(len(seen), 2)`

			`def test_size_is_populated(self):`
			`self._make_file("data.json", '{"key": "value"}')`
			`results = classify_files(self.tmpdir)`
			`item = next(r for r in results if r["name"] == "data.json")`
			`self.assertGreater(item["size"], 0)`


feat(filetypes): expose raw signals to survey, remove classifier bias (#42) The survey pass no longer receives the bucketed file_categories histogram, which was biased toward source-code targets and would mislabel mail, notebooks, ledgers, and other non-code domains as "source" via the file --brief "text" pattern fallback. Adds filetypes.survey_signals(), which assembles raw signals from the same `classified` data the bucketer already processes — no new walks, no new dependencies: total_files — total count extension_histogram — top 20 extensions, raw, no taxonomy file_descriptions — top 20 `file --brief` outputs, by count filename_samples — 20 names, evenly drawn (not first-20) `survey --brief` descriptions are truncated at 80 chars before counting so prefixes group correctly without exploding key cardinality. The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the histogram was biased toward source code) is removed and replaced with neutral guidance on how to read the raw signals together. The {file_type_distribution} placeholder is renamed to {survey_signals} to reflect the broader content. luminos.py base scan computes survey_signals once and stores it on report["survey_signals"]; AI consumers read from there. summarize_categories() and report["file_categories"] are unchanged — the terminal report still uses the bucketed view (#49 tracks fixing that follow-up). Smoke tested on two targets: - luminos_lib: identical-quality survey ("Python library package", confidence 0.85), unchanged behavior on code targets. - A synthetic Maildir of 8 messages with `:2,S` flag suffixes: survey now correctly identifies it as "A Maildir-format mailbox containing 8 email messages" with confidence 0.90, names the Maildir naming convention in domain_notes, and correctly marks parse_structure as a skip tool. Before #42 this would have been "8 source files." Adds 8 unit tests for survey_signals covering empty input, extension histogram, description aggregation/truncation, top-N cap, and even-stride filename sampling. #48 tracks the unit-of-analysis limitation (file is the wrong unit for mbox, SQLite, archives, notebooks) — explicitly out of scope for #42 and documented in survey_signals' docstring. 2026-04-07 04:36:14 +00:00			`class TestSurveySignals(unittest.TestCase):`
			`def _f(self, name, description="", category="source"):`
			`return {"name": name, "path": f"/x/{name}", "category": category,`
			`"size": 10, "description": description}`

			`def test_empty_input(self):`
			`s = survey_signals([])`
			`self.assertEqual(s["total_files"], 0)`
			`self.assertEqual(s["extension_histogram"], {})`
			`self.assertEqual(s["file_descriptions"], {})`
			`self.assertEqual(s["filename_samples"], [])`

			`def test_extension_histogram_uses_lowercase_and_keeps_none(self):`
			`files = [`
			`self._f("a.PY"), self._f("b.py"), self._f("c.py"),`
			`self._f("README"), self._f("Makefile"),`
			`]`
			`s = survey_signals(files)`
			`self.assertEqual(s["extension_histogram"][".py"], 3)`
			`self.assertEqual(s["extension_histogram"]["(none)"], 2)`

			`def test_file_descriptions_aggregated_and_truncated(self):`
			`long_desc = "x" * 200`
			`files = [`
			`self._f("a.py", "Python script, ASCII text"),`
			`self._f("b.py", "Python script, ASCII text"),`
			`self._f("c.bin", long_desc),`
			`]`
			`s = survey_signals(files)`
			`self.assertEqual(s["file_descriptions"]["Python script, ASCII text"], 2)`
			`# The long description was truncated and still counted once`
			`truncated_keys = [k for k in s["file_descriptions"] if k.startswith("xxx") and k.endswith("...")]`
			`self.assertEqual(len(truncated_keys), 1)`
			`self.assertLessEqual(len(truncated_keys[0]), 84) # 80 + "..."`

			`def test_descriptions_skipped_when_empty(self):`
			`files = [self._f("a.py", ""), self._f("b.py", None)]`
			`s = survey_signals(files)`
			`self.assertEqual(s["file_descriptions"], {})`

			`def test_top_n_caps_at_20(self):`
			`files = [self._f(f"f{i}.ext{i}") for i in range(50)]`
			`s = survey_signals(files)`
			`self.assertEqual(len(s["extension_histogram"]), 20)`

			`def test_filename_samples_evenly_drawn(self):`
			`files = [self._f(f"file_{i:04d}.txt") for i in range(100)]`
			`s = survey_signals(files, max_samples=10)`
			`self.assertEqual(len(s["filename_samples"]), 10)`
			`# First sample is the first file (stride 10, index 0)`
			`self.assertEqual(s["filename_samples"][0], "file_0000.txt")`
			`# Last sample is around index 90, not 99`
			`self.assertTrue(s["filename_samples"][-1].startswith("file_009"))`

			`def test_filename_samples_returns_all_when_under_cap(self):`
			`files = [self._f(f"f{i}.txt") for i in range(5)]`
			`s = survey_signals(files, max_samples=20)`
			`self.assertEqual(len(s["filename_samples"]), 5)`

			`def test_total_files_matches_input(self):`
			`files = [self._f(f"f{i}.py") for i in range(7)]`
			`self.assertEqual(survey_signals(files)["total_files"], 7)`


feat(tests): add unit test coverage for all testable modules (#37) 129 tests across cache, filetypes, code, disk, recency, tree, report, and capabilities. Uses stdlib unittest only — no new dependencies. Also updates CLAUDE.md development workflow to require test coverage for all future changes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-04-06 22:57:26 +00:00			`if __name__ == "__main__":`
			`unittest.main()`