luminos/luminos_lib/filetypes.py

192 lines
6.4 KiB
Python
Raw Permalink Normal View History

"""File type intelligence — classify files by category."""
import os
import subprocess
# Extension-based classification.
# NOTE: ".sql" is listed only under "data". It previously appeared under
# both "source" and "data"; Python's last-key-wins dict-literal semantics
# meant the "data" entry silently overrode the "source" one, so the
# duplicate was removed to make the effective mapping explicit.
EXTENSION_MAP = {
    # Source code
    ".py": "source", ".js": "source", ".ts": "source", ".jsx": "source",
    ".tsx": "source", ".java": "source", ".c": "source", ".cpp": "source",
    ".cc": "source", ".h": "source", ".hpp": "source", ".go": "source",
    ".rs": "source", ".rb": "source", ".php": "source", ".swift": "source",
    ".kt": "source", ".scala": "source", ".sh": "source", ".bash": "source",
    ".zsh": "source", ".pl": "source", ".lua": "source", ".r": "source",
    ".m": "source", ".cs": "source", ".hs": "source", ".ex": "source",
    ".exs": "source", ".erl": "source", ".clj": "source", ".vim": "source",
    ".el": "source",
    # Config
    ".json": "config", ".yaml": "config", ".yml": "config", ".toml": "config",
    ".ini": "config", ".cfg": "config", ".conf": "config", ".xml": "config",
    ".env": "config", ".properties": "config", ".editorconfig": "config",
    # Data
    ".csv": "data", ".tsv": "data", ".parquet": "data", ".sqlite": "data",
    ".db": "data", ".sql": "data", ".ndjson": "data", ".jsonl": "data",
    # Media
    ".png": "media", ".jpg": "media", ".jpeg": "media", ".gif": "media",
    ".svg": "media", ".bmp": "media", ".ico": "media", ".webp": "media",
    ".mp3": "media", ".wav": "media", ".mp4": "media", ".avi": "media",
    ".mkv": "media", ".mov": "media", ".flac": "media", ".ogg": "media",
    # Documents
    ".md": "document", ".txt": "document", ".rst": "document",
    ".pdf": "document", ".doc": "document", ".docx": "document",
    ".odt": "document", ".rtf": "document", ".tex": "document",
    ".html": "document", ".htm": "document", ".css": "document",
    # Archives
    ".zip": "archive", ".tar": "archive", ".gz": "archive",
    ".bz2": "archive", ".xz": "archive", ".7z": "archive",
    ".rar": "archive", ".tgz": "archive",
}
# Patterns from `file` command output.
# Each substring is matched case-insensitively, in insertion order, against
# the `file --brief` description (see _classify_one). Order matters: the
# first matching pattern wins, so the generic "text" shadows later entries —
# e.g. a description like "XML 1.0 document text" classifies as "source",
# not "config" or "document".
FILE_CMD_PATTERNS = {
    "text": "source",
    "script": "source",
    "program": "source",
    "JSON": "config",
    "XML": "config",
    "image": "media",
    "audio": "media",
    "video": "media",
    "PDF": "document",
    "document": "document",
    "archive": "archive",
    "compressed": "archive",
}
def _file_command(path):
"""Run `file --brief` on a path and return the output."""
try:
result = subprocess.run(
["file", "--brief", path],
capture_output=True, text=True, timeout=5,
)
return result.stdout.strip()
except (subprocess.TimeoutExpired, FileNotFoundError):
return ""
def _classify_one(filepath):
    """Classify a single file. Returns (category, file_description).

    The extension lookup is tried first (description stays None); only
    unknown extensions fall back to the external `file` command.
    """
    extension = os.path.splitext(filepath)[1].lower()
    category = EXTENSION_MAP.get(extension)
    if category is not None:
        return category, None
    description = _file_command(filepath)
    # Lower-case once, outside the loop, for the case-insensitive match.
    haystack = description.lower()
    for needle, bucket in FILE_CMD_PATTERNS.items():
        if needle.lower() in haystack:
            return bucket, description
    return "unknown", description
def classify_files(target, show_hidden=False, exclude=None, on_file=None):
    """Walk the target directory and classify every file.

    Returns a list of dicts: {path, name, category, size, description}.
    on_file(path) is called after each file is classified, if provided.

    exclude is an optional list of directory names to skip entirely;
    hidden files and directories (dot-prefixed) are skipped unless
    show_hidden is true.
    """
    # The docstring previously sat *after* this statement, making it a
    # no-op string expression (classify_files.__doc__ was None); it now
    # leads the body. None/empty exclude means "no exclusions".
    exclude = exclude or []
    results = []
    for root, dirs, files in os.walk(target):
        # Prune excluded and (optionally) hidden directories in place so
        # os.walk never descends into them.
        dirs[:] = [d for d in dirs
                   if d not in exclude
                   and (show_hidden or not d.startswith("."))]
        if not show_hidden:
            files = [f for f in files if not f.startswith(".")]
        for fname in files:
            full = os.path.join(root, fname)
            # Skip broken symlinks and entries that vanished mid-walk.
            if not os.path.isfile(full):
                continue
            try:
                size = os.path.getsize(full)
            except OSError:
                # Unreadable or concurrently deleted; record size 0 rather
                # than aborting the scan.
                size = 0
            category, desc = _classify_one(full)
            results.append({
                "path": full,
                "name": fname,
                "category": category,
                "size": size,
                "description": desc,
            })
            if on_file:
                # Per-file progress callback for UIs / logging.
                on_file(full)
    return results
def summarize_categories(classified):
    """Return a dict mapping each category name to its file count."""
    tally = {}
    for entry in classified:
        key = entry["category"]
        tally.setdefault(key, 0)
        tally[key] += 1
    return tally
feat(filetypes): expose raw signals to survey, remove classifier bias (#42) The survey pass no longer receives the bucketed file_categories histogram, which was biased toward source-code targets and would mislabel mail, notebooks, ledgers, and other non-code domains as "source" via the file --brief "text" pattern fallback. Adds filetypes.survey_signals(), which assembles raw signals from the same `classified` data the bucketer already processes — no new walks, no new dependencies: total_files — total count extension_histogram — top 20 extensions, raw, no taxonomy file_descriptions — top 20 `file --brief` outputs, by count filename_samples — 20 names, evenly drawn (not first-20) `file --brief` descriptions are truncated at 80 chars before counting so prefixes group correctly without exploding key cardinality. The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the histogram was biased toward source code) is removed and replaced with neutral guidance on how to read the raw signals together. The {file_type_distribution} placeholder is renamed to {survey_signals} to reflect the broader content. luminos.py base scan computes survey_signals once and stores it on report["survey_signals"]; AI consumers read from there. summarize_categories() and report["file_categories"] are unchanged — the terminal report still uses the bucketed view (#49 tracks fixing that follow-up). Smoke tested on two targets: - luminos_lib: identical-quality survey ("Python library package", confidence 0.85), unchanged behavior on code targets. - A synthetic Maildir of 8 messages with `:2,S` flag suffixes: survey now correctly identifies it as "A Maildir-format mailbox containing 8 email messages" with confidence 0.90, names the Maildir naming convention in domain_notes, and correctly marks parse_structure as a skip tool. Before #42 this would have been "8 source files." 
Adds 8 unit tests for survey_signals covering empty input, extension histogram, description aggregation/truncation, top-N cap, and even-stride filename sampling. #48 tracks the unit-of-analysis limitation (file is the wrong unit for mbox, SQLite, archives, notebooks) — explicitly out of scope for #42 and documented in survey_signals' docstring.
2026-04-07 04:36:14 +00:00
_SURVEY_TOP_N = 20
_SURVEY_DESC_TRUNCATE = 80


def survey_signals(classified, max_samples=20):
    """Return raw, unbucketed signals for the AI survey pass.

    Unlike `summarize_categories`, which collapses files into a small,
    biased taxonomy, this exposes the primary signals so the survey LLM
    can characterize the target without being misled by the classifier's
    source-code bias.

    See #42 for the rationale and #48 for the unit-of-analysis
    limitation: the unit here is still "file" — containers like mbox,
    SQLite, and zip will under-count, while dense file collections like
    Maildir will over-count.

    Returns a dict with:
        total_files          total count
        extension_histogram  {ext: count}, top _SURVEY_TOP_N by count
        file_descriptions    {description: count}, top _SURVEY_TOP_N by count
        filename_samples     up to max_samples filenames, evenly drawn
    """
    total = len(classified)

    extension_counts = {}
    description_counts = {}
    for entry in classified:
        suffix = os.path.splitext(entry.get("name", ""))[1].lower()
        bucket = suffix if suffix else "(none)"
        extension_counts[bucket] = extension_counts.get(bucket, 0) + 1

        text = (entry.get("description") or "").strip()
        if not text:
            continue
        # Truncate before counting so long variants of the same prefix
        # collapse into one bucket.
        if len(text) > _SURVEY_DESC_TRUNCATE:
            text = text[:_SURVEY_DESC_TRUNCATE] + "..."
        description_counts[text] = description_counts.get(text, 0) + 1

    def _leaders(counts):
        # Highest count first; ties broken alphabetically for determinism.
        ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))
        return dict(ranked[:_SURVEY_TOP_N])

    samples = []
    if total and max_samples > 0:
        if total <= max_samples:
            samples = [entry.get("name", "") for entry in classified]
        else:
            # Even-stride sampling spans the whole listing instead of
            # clustering on the first max_samples entries.
            step = total / max_samples
            samples = [classified[int(k * step)].get("name", "")
                       for k in range(max_samples)]

    return {
        "total_files": total,
        "extension_histogram": _leaders(extension_counts),
        "file_descriptions": _leaders(description_counts),
        "filename_samples": samples,
    }