2026-03-30 15:57:11 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""Luminos — file system intelligence tool."""
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import json
|
|
|
|
|
import os
|
2026-04-06 20:26:37 +00:00
|
|
|
import shutil
|
|
|
|
|
import sys
|
2026-03-30 15:57:11 +00:00
|
|
|
|
|
|
|
|
from luminos_lib.tree import build_tree, render_tree
|
feat(filetypes): expose raw signals to survey, remove classifier bias (#42)
The survey pass no longer receives the bucketed file_categories
histogram, which was biased toward source-code targets and would
mislabel mail, notebooks, ledgers, and other non-code domains as
"source" via the file --brief "text" pattern fallback.
Adds filetypes.survey_signals(), which assembles raw signals from
the same `classified` data the bucketer already processes — no new
walks, no new dependencies:
total_files — total count
extension_histogram — top 20 extensions, raw, no taxonomy
file_descriptions — top 20 `file --brief` outputs, by count
filename_samples — 20 names, evenly drawn (not first-20)
`survey --brief` descriptions are truncated at 80 chars before
counting so prefixes group correctly without exploding key cardinality.
The Band-Aid in _SURVEY_SYSTEM_PROMPT (warning the LLM that the
histogram was biased toward source code) is removed and replaced
with neutral guidance on how to read the raw signals together.
The {file_type_distribution} placeholder is renamed to
{survey_signals} to reflect the broader content.
luminos.py base scan computes survey_signals once and stores it on
report["survey_signals"]; AI consumers read from there.
summarize_categories() and report["file_categories"] are unchanged
— the terminal report still uses the bucketed view (#49 tracks
fixing that follow-up).
Smoke tested on two targets:
- luminos_lib: identical-quality survey ("Python library package",
confidence 0.85), unchanged behavior on code targets.
- A synthetic Maildir of 8 messages with `:2,S` flag suffixes:
survey now correctly identifies it as "A Maildir-format mailbox
containing 8 email messages" with confidence 0.90, names the
Maildir naming convention in domain_notes, and correctly marks
parse_structure as a skip tool. Before #42 this would have been
"8 source files."
Adds 8 unit tests for survey_signals covering empty input, extension
histogram, description aggregation/truncation, top-N cap, and
even-stride filename sampling.
#48 tracks the unit-of-analysis limitation (file is the wrong unit
for mbox, SQLite, archives, notebooks) — explicitly out of scope
for #42 and documented in survey_signals' docstring.
2026-04-07 04:36:14 +00:00
|
|
|
from luminos_lib.filetypes import (
|
|
|
|
|
classify_files,
|
|
|
|
|
summarize_categories,
|
|
|
|
|
survey_signals,
|
|
|
|
|
)
|
2026-03-30 15:57:11 +00:00
|
|
|
from luminos_lib.code import detect_languages, find_large_files
|
|
|
|
|
from luminos_lib.recency import find_recent_files
|
|
|
|
|
from luminos_lib.disk import get_disk_usage, top_directories
|
|
|
|
|
from luminos_lib.watch import watch_loop
|
|
|
|
|
from luminos_lib.report import format_report
|
|
|
|
|
|
|
|
|
|
|
2026-04-06 20:26:37 +00:00
|
|
|
def _progress(label):
|
|
|
|
|
"""Return (on_file, finish) for in-place per-file progress on stderr.
|
|
|
|
|
|
|
|
|
|
on_file(path) overwrites the current line with the label and truncated path.
|
|
|
|
|
finish() finalises the line with a newline.
|
|
|
|
|
"""
|
|
|
|
|
cols = shutil.get_terminal_size((80, 20)).columns
|
|
|
|
|
prefix = f" [scan] {label}... "
|
|
|
|
|
available = max(cols - len(prefix), 10)
|
|
|
|
|
|
|
|
|
|
def on_file(path):
|
|
|
|
|
rel = os.path.relpath(path)
|
|
|
|
|
if len(rel) > available:
|
|
|
|
|
rel = "..." + rel[-(available - 3):]
|
|
|
|
|
print(f"\r{prefix}{rel}\033[K", end="", file=sys.stderr, flush=True)
|
|
|
|
|
|
|
|
|
|
def finish():
|
|
|
|
|
print(f"\r{prefix}done\033[K", file=sys.stderr, flush=True)
|
|
|
|
|
|
|
|
|
|
return on_file, finish
|
|
|
|
|
|
|
|
|
|
|
2026-04-06 20:32:12 +00:00
|
|
|
def scan(target, depth=3, show_hidden=False, exclude=None):
    """Run every analysis pass on *target* and return the results as a dict.

    Keys produced: tree, tree_rendered, file_categories, classified_files,
    survey_signals, languages, lines_of_code, large_files, recent_files,
    disk_usage, top_directories.  Progress messages go to stderr.
    """
    excluded = exclude or []
    results = {}

    # Directory tree: structure plus a pre-rendered text form.
    print(f" [scan] Building directory tree (depth={depth})...", file=sys.stderr)
    tree = build_tree(target, max_depth=depth, show_hidden=show_hidden,
                      exclude=excluded)
    results["tree"] = tree
    results["tree_rendered"] = render_tree(tree)

    # File classification feeds several of the later passes.
    on_file, finish = _progress("Classifying files")
    classified = classify_files(target, show_hidden=show_hidden,
                                exclude=excluded, on_file=on_file)
    finish()
    results["file_categories"] = summarize_categories(classified)
    results["classified_files"] = classified
    # Raw, unbucketed signals consumed by the AI survey pass.
    results["survey_signals"] = survey_signals(classified)

    # Language detection and line-of-code counting.
    on_file, finish = _progress("Counting lines")
    languages, loc = detect_languages(classified, on_file=on_file)
    finish()
    results["languages"] = languages
    results["lines_of_code"] = loc

    # Unusually large files.
    on_file, finish = _progress("Checking for large files")
    results["large_files"] = find_large_files(classified, on_file=on_file)
    finish()

    # Recently modified files.
    print(" [scan] Finding recently modified files...", file=sys.stderr)
    results["recent_files"] = find_recent_files(target, show_hidden=show_hidden,
                                                exclude=excluded)

    # Disk usage plus the heaviest directories.
    print(" [scan] Calculating disk usage...", file=sys.stderr)
    usage = get_disk_usage(target, show_hidden=show_hidden, exclude=excluded)
    results["disk_usage"] = usage
    results["top_directories"] = top_directories(usage, n=5)

    print(" [scan] Base scan complete.", file=sys.stderr)
    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the scan, and emit the report.

    Handles the early-exit utility flags (--install-extras, --clear-cache),
    validates the target directory, then either enters watch mode or runs a
    single scan (optionally followed by the AI analysis) and prints/writes
    the formatted or JSON report.
    """
    parser = argparse.ArgumentParser(
        prog="luminos",
        description="Luminos — file system intelligence tool. "
        "Explores a directory and produces a reconnaissance report.",
    )
    # target is optional at the parser level so utility flags like
    # --clear-cache can run without one; it is enforced manually below.
    parser.add_argument("target", nargs="?", help="Target directory to analyze")
    parser.add_argument("-d", "--depth", type=int, default=3,
                        help="Maximum tree depth (default: 3)")
    parser.add_argument("-a", "--all", action="store_true",
                        help="Include hidden files and directories")
    parser.add_argument("--json", action="store_true", dest="json_output",
                        help="Output report as JSON")
    parser.add_argument("-o", "--output", metavar="FILE",
                        help="Write report to a file")
    parser.add_argument("--ai", action="store_true",
                        help="Use Claude AI to analyze directory purpose "
                             "(requires ANTHROPIC_API_KEY)")
    parser.add_argument("--watch", action="store_true",
                        help="Re-scan every 30 seconds and show diffs")
    parser.add_argument("--clear-cache", action="store_true",
                        help="Clear the AI investigation cache (/tmp/luminos/)")
    parser.add_argument("--fresh", action="store_true",
                        help="Force a new AI investigation (ignore cached results)")
    parser.add_argument("--install-extras", action="store_true",
                        help="Show status of optional AI dependencies")
    parser.add_argument("-x", "--exclude", metavar="DIR", action="append",
                        default=[],
                        help="Exclude a directory name from scan and analysis "
                             "(repeatable, e.g. -x .git -x node_modules)")

    args = parser.parse_args()

    # --install-extras: show package status and exit
    if args.install_extras:
        from luminos_lib.capabilities import print_status
        print_status()
        return

    # --clear-cache: wipe /tmp/luminos/ (lazy import to avoid AI deps)
    if args.clear_cache:
        from luminos_lib.capabilities import clear_cache
        clear_cache()
        # Cache-clearing without a target is a complete operation; with a
        # target, fall through and scan as usual.
        if not args.target:
            return

    # target was declared optional above, so required-ness is enforced here.
    if not args.target:
        parser.error("the following arguments are required: target")

    target = os.path.abspath(args.target)
    if not os.path.isdir(target):
        print(f"Error: '{args.target}' is not a directory or does not exist.",
              file=sys.stderr)
        sys.exit(1)

    if args.exclude:
        print(f" [scan] Excluding: {', '.join(args.exclude)}", file=sys.stderr)

    if args.watch:
        # NOTE(review): watch mode does not forward args.exclude, while the
        # single-scan path below does — -x appears to be ignored under
        # --watch. Confirm watch_loop's signature before changing.
        watch_loop(target, depth=args.depth, show_hidden=args.all,
                   json_output=args.json_output)
        return

    report = scan(target, depth=args.depth, show_hidden=args.all,
                  exclude=args.exclude)

    # flags stays empty unless the AI pass produces findings; format_report
    # receives it either way.
    flags = []
    if args.ai:
        # Lazy import: AI dependencies are optional.
        from luminos_lib.ai import analyze_directory
        brief, detailed, flags = analyze_directory(
            report, target, fresh=args.fresh, exclude=args.exclude)
        report["ai_brief"] = brief
        report["ai_detailed"] = detailed
        report["flags"] = flags

    if args.json_output:
        # default=str makes non-JSON-native values (e.g. datetimes/paths)
        # serializable rather than raising.
        output = json.dumps(report, indent=2, default=str)
    else:
        output = format_report(report, target, flags=flags)

    if args.output:
        try:
            with open(args.output, "w") as f:
                f.write(output + "\n")
            print(f"Report written to {args.output}")
        except OSError as e:
            print(f"Error writing to '{args.output}': {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|