Compare commits
10 commits
8aa6c713db
...
d323190866
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d323190866 | ||
|
|
78f9a396dd | ||
|
|
78f80c31ed | ||
|
|
206d2d34f6 | ||
|
|
bbaf387cb7 | ||
|
|
ebc6b852f1 | ||
|
|
33df555a8c | ||
|
|
ea8c07a692 | ||
|
|
5c6124a715 | ||
|
|
0c49da23ab |
9 changed files with 510 additions and 326 deletions
68
luminos.py
68
luminos.py
|
|
@ -3,8 +3,9 @@
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
|
||||||
from luminos_lib.tree import build_tree, render_tree
|
from luminos_lib.tree import build_tree, render_tree
|
||||||
from luminos_lib.filetypes import classify_files, summarize_categories
|
from luminos_lib.filetypes import classify_files, summarize_categories
|
||||||
|
|
@ -15,29 +16,67 @@ from luminos_lib.watch import watch_loop
|
||||||
from luminos_lib.report import format_report
|
from luminos_lib.report import format_report
|
||||||
|
|
||||||
|
|
||||||
def scan(target, depth=3, show_hidden=False):
|
def _progress(label):
|
||||||
|
"""Return (on_file, finish) for in-place per-file progress on stderr.
|
||||||
|
|
||||||
|
on_file(path) overwrites the current line with the label and truncated path.
|
||||||
|
finish() finalises the line with a newline.
|
||||||
|
"""
|
||||||
|
cols = shutil.get_terminal_size((80, 20)).columns
|
||||||
|
prefix = f" [scan] {label}... "
|
||||||
|
available = max(cols - len(prefix), 10)
|
||||||
|
|
||||||
|
def on_file(path):
|
||||||
|
rel = os.path.relpath(path)
|
||||||
|
if len(rel) > available:
|
||||||
|
rel = "..." + rel[-(available - 3):]
|
||||||
|
print(f"\r{prefix}{rel}\033[K", end="", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
def finish():
|
||||||
|
print(f"\r{prefix}done\033[K", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
return on_file, finish
|
||||||
|
|
||||||
|
|
||||||
|
def scan(target, depth=3, show_hidden=False, exclude=None):
|
||||||
"""Run all analyses on the target directory and return a report dict."""
|
"""Run all analyses on the target directory and return a report dict."""
|
||||||
report = {}
|
report = {}
|
||||||
|
|
||||||
tree = build_tree(target, max_depth=depth, show_hidden=show_hidden)
|
exclude = exclude or []
|
||||||
|
|
||||||
|
print(f" [scan] Building directory tree (depth={depth})...", file=sys.stderr)
|
||||||
|
tree = build_tree(target, max_depth=depth, show_hidden=show_hidden,
|
||||||
|
exclude=exclude)
|
||||||
report["tree"] = tree
|
report["tree"] = tree
|
||||||
report["tree_rendered"] = render_tree(tree)
|
report["tree_rendered"] = render_tree(tree)
|
||||||
|
|
||||||
classified = classify_files(target, show_hidden=show_hidden)
|
on_file, finish = _progress("Classifying files")
|
||||||
|
classified = classify_files(target, show_hidden=show_hidden,
|
||||||
|
exclude=exclude, on_file=on_file)
|
||||||
|
finish()
|
||||||
report["file_categories"] = summarize_categories(classified)
|
report["file_categories"] = summarize_categories(classified)
|
||||||
report["classified_files"] = classified
|
report["classified_files"] = classified
|
||||||
|
|
||||||
languages, loc = detect_languages(classified)
|
on_file, finish = _progress("Counting lines")
|
||||||
|
languages, loc = detect_languages(classified, on_file=on_file)
|
||||||
|
finish()
|
||||||
report["languages"] = languages
|
report["languages"] = languages
|
||||||
report["lines_of_code"] = loc
|
report["lines_of_code"] = loc
|
||||||
report["large_files"] = find_large_files(classified)
|
|
||||||
|
|
||||||
report["recent_files"] = find_recent_files(target, show_hidden=show_hidden)
|
on_file, finish = _progress("Checking for large files")
|
||||||
|
report["large_files"] = find_large_files(classified, on_file=on_file)
|
||||||
|
finish()
|
||||||
|
|
||||||
usage = get_disk_usage(target, show_hidden=show_hidden)
|
print(" [scan] Finding recently modified files...", file=sys.stderr)
|
||||||
|
report["recent_files"] = find_recent_files(target, show_hidden=show_hidden,
|
||||||
|
exclude=exclude)
|
||||||
|
|
||||||
|
print(" [scan] Calculating disk usage...", file=sys.stderr)
|
||||||
|
usage = get_disk_usage(target, show_hidden=show_hidden, exclude=exclude)
|
||||||
report["disk_usage"] = usage
|
report["disk_usage"] = usage
|
||||||
report["top_directories"] = top_directories(usage, n=5)
|
report["top_directories"] = top_directories(usage, n=5)
|
||||||
|
|
||||||
|
print(" [scan] Base scan complete.", file=sys.stderr)
|
||||||
return report
|
return report
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -67,6 +106,10 @@ def main():
|
||||||
help="Force a new AI investigation (ignore cached results)")
|
help="Force a new AI investigation (ignore cached results)")
|
||||||
parser.add_argument("--install-extras", action="store_true",
|
parser.add_argument("--install-extras", action="store_true",
|
||||||
help="Show status of optional AI dependencies")
|
help="Show status of optional AI dependencies")
|
||||||
|
parser.add_argument("-x", "--exclude", metavar="DIR", action="append",
|
||||||
|
default=[],
|
||||||
|
help="Exclude a directory name from scan and analysis "
|
||||||
|
"(repeatable, e.g. -x .git -x node_modules)")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
@ -92,17 +135,22 @@ def main():
|
||||||
file=sys.stderr)
|
file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
if args.exclude:
|
||||||
|
print(f" [scan] Excluding: {', '.join(args.exclude)}", file=sys.stderr)
|
||||||
|
|
||||||
if args.watch:
|
if args.watch:
|
||||||
watch_loop(target, depth=args.depth, show_hidden=args.all,
|
watch_loop(target, depth=args.depth, show_hidden=args.all,
|
||||||
json_output=args.json_output)
|
json_output=args.json_output)
|
||||||
return
|
return
|
||||||
|
|
||||||
report = scan(target, depth=args.depth, show_hidden=args.all)
|
report = scan(target, depth=args.depth, show_hidden=args.all,
|
||||||
|
exclude=args.exclude)
|
||||||
|
|
||||||
flags = []
|
flags = []
|
||||||
if args.ai:
|
if args.ai:
|
||||||
from luminos_lib.ai import analyze_directory
|
from luminos_lib.ai import analyze_directory
|
||||||
brief, detailed, flags = analyze_directory(report, target, fresh=args.fresh)
|
brief, detailed, flags = analyze_directory(
|
||||||
|
report, target, fresh=args.fresh, exclude=args.exclude)
|
||||||
report["ai_brief"] = brief
|
report["ai_brief"] = brief
|
||||||
report["ai_detailed"] = detailed
|
report["ai_detailed"] = detailed
|
||||||
report["flags"] = flags
|
report["flags"] = flags
|
||||||
|
|
|
||||||
|
|
@ -19,14 +19,10 @@ from datetime import datetime, timezone
|
||||||
|
|
||||||
import anthropic
|
import anthropic
|
||||||
import magic
|
import magic
|
||||||
import tree_sitter
|
from luminos_lib.ast_parser import parse_structure
|
||||||
import tree_sitter_python
|
|
||||||
import tree_sitter_javascript
|
|
||||||
import tree_sitter_rust
|
|
||||||
import tree_sitter_go
|
|
||||||
|
|
||||||
from luminos_lib.cache import _CacheManager, _get_investigation_id
|
from luminos_lib.cache import _CacheManager, _get_investigation_id
|
||||||
from luminos_lib.capabilities import check_ai_dependencies
|
from luminos_lib.capabilities import check_ai_dependencies
|
||||||
|
from luminos_lib.prompts import _DIR_SYSTEM_PROMPT, _SYNTHESIS_SYSTEM_PROMPT
|
||||||
|
|
||||||
MODEL = "claude-sonnet-4-20250514"
|
MODEL = "claude-sonnet-4-20250514"
|
||||||
|
|
||||||
|
|
@ -48,33 +44,6 @@ _SKIP_DIRS = {
|
||||||
# Commands the run_command tool is allowed to execute.
|
# Commands the run_command tool is allowed to execute.
|
||||||
_COMMAND_WHITELIST = {"wc", "file", "grep", "head", "tail", "stat", "du", "find"}
|
_COMMAND_WHITELIST = {"wc", "file", "grep", "head", "tail", "stat", "du", "find"}
|
||||||
|
|
||||||
# tree-sitter language registry: extension → (grammar_module, language_name)
|
|
||||||
_TS_LANGUAGES = {
|
|
||||||
".py": (tree_sitter_python, "python"),
|
|
||||||
".js": (tree_sitter_javascript, "javascript"),
|
|
||||||
".jsx": (tree_sitter_javascript, "javascript"),
|
|
||||||
".mjs": (tree_sitter_javascript, "javascript"),
|
|
||||||
".rs": (tree_sitter_rust, "rust"),
|
|
||||||
".go": (tree_sitter_go, "go"),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Precompute Language objects once.
|
|
||||||
_TS_LANG_CACHE = {}
|
|
||||||
|
|
||||||
|
|
||||||
def _get_ts_parser(ext):
|
|
||||||
"""Return a (Parser, language_name) tuple for a file extension, or None."""
|
|
||||||
entry = _TS_LANGUAGES.get(ext)
|
|
||||||
if entry is None:
|
|
||||||
return None
|
|
||||||
module, lang_name = entry
|
|
||||||
if lang_name not in _TS_LANG_CACHE:
|
|
||||||
_TS_LANG_CACHE[lang_name] = tree_sitter.Language(module.language())
|
|
||||||
lang = _TS_LANG_CACHE[lang_name]
|
|
||||||
parser = tree_sitter.Parser(lang)
|
|
||||||
return parser, lang_name
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Helpers
|
# Helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -533,181 +502,7 @@ def _tool_parse_structure(args, target, _cache):
|
||||||
path = os.path.join(target, path)
|
path = os.path.join(target, path)
|
||||||
if not _path_is_safe(path, target):
|
if not _path_is_safe(path, target):
|
||||||
return f"Error: path '{path}' is outside the target directory."
|
return f"Error: path '{path}' is outside the target directory."
|
||||||
if not os.path.isfile(path):
|
return parse_structure(path)
|
||||||
return f"Error: '{path}' is not a file."
|
|
||||||
|
|
||||||
ext = os.path.splitext(path)[1].lower()
|
|
||||||
ts = _get_ts_parser(ext)
|
|
||||||
if ts is None:
|
|
||||||
return f"Error: no grammar for extension '{ext}'. Supported: {', '.join(sorted(_TS_LANGUAGES.keys()))}"
|
|
||||||
|
|
||||||
parser, lang_name = ts
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open(path, "rb") as f:
|
|
||||||
source = f.read()
|
|
||||||
except OSError as e:
|
|
||||||
return f"Error reading file: {e}"
|
|
||||||
|
|
||||||
tree = parser.parse(source)
|
|
||||||
root = tree.root_node
|
|
||||||
source_text = source.decode("utf-8", errors="replace")
|
|
||||||
lines = source_text.split("\n")
|
|
||||||
line_count = len(lines)
|
|
||||||
|
|
||||||
functions = []
|
|
||||||
classes = []
|
|
||||||
imports = []
|
|
||||||
has_docstrings = False
|
|
||||||
comment_lines = 0
|
|
||||||
|
|
||||||
def _walk(node):
|
|
||||||
nonlocal has_docstrings, comment_lines
|
|
||||||
for child in node.children:
|
|
||||||
ntype = child.type
|
|
||||||
|
|
||||||
# Comments
|
|
||||||
if ntype in ("comment", "line_comment", "block_comment"):
|
|
||||||
comment_lines += child.text.decode("utf-8", errors="replace").count("\n") + 1
|
|
||||||
|
|
||||||
# Python
|
|
||||||
if lang_name == "python":
|
|
||||||
if ntype == "function_definition":
|
|
||||||
functions.append(_py_func_sig(child))
|
|
||||||
elif ntype == "class_definition":
|
|
||||||
classes.append(_py_class(child))
|
|
||||||
elif ntype in ("import_statement", "import_from_statement"):
|
|
||||||
imports.append(child.text.decode("utf-8", errors="replace").strip())
|
|
||||||
elif ntype == "expression_statement":
|
|
||||||
first = child.children[0] if child.children else None
|
|
||||||
if first and first.type == "string":
|
|
||||||
has_docstrings = True
|
|
||||||
|
|
||||||
# JavaScript
|
|
||||||
elif lang_name == "javascript":
|
|
||||||
if ntype in ("function_declaration", "arrow_function",
|
|
||||||
"function"):
|
|
||||||
functions.append(_js_func_sig(child))
|
|
||||||
elif ntype == "class_declaration":
|
|
||||||
classes.append(_js_class(child))
|
|
||||||
elif ntype in ("import_statement",):
|
|
||||||
imports.append(child.text.decode("utf-8", errors="replace").strip())
|
|
||||||
|
|
||||||
# Rust
|
|
||||||
elif lang_name == "rust":
|
|
||||||
if ntype == "function_item":
|
|
||||||
functions.append(_rust_func_sig(child))
|
|
||||||
elif ntype in ("struct_item", "enum_item", "impl_item"):
|
|
||||||
classes.append(_rust_struct(child))
|
|
||||||
elif ntype == "use_declaration":
|
|
||||||
imports.append(child.text.decode("utf-8", errors="replace").strip())
|
|
||||||
|
|
||||||
# Go
|
|
||||||
elif lang_name == "go":
|
|
||||||
if ntype == "function_declaration":
|
|
||||||
functions.append(_go_func_sig(child))
|
|
||||||
elif ntype == "type_declaration":
|
|
||||||
classes.append(_go_type(child))
|
|
||||||
elif ntype == "import_declaration":
|
|
||||||
imports.append(child.text.decode("utf-8", errors="replace").strip())
|
|
||||||
|
|
||||||
_walk(child)
|
|
||||||
|
|
||||||
_walk(root)
|
|
||||||
|
|
||||||
code_lines = max(1, line_count - comment_lines)
|
|
||||||
result = {
|
|
||||||
"language": lang_name,
|
|
||||||
"functions": functions[:50],
|
|
||||||
"classes": classes[:30],
|
|
||||||
"imports": imports[:30],
|
|
||||||
"line_count": line_count,
|
|
||||||
"has_docstrings": has_docstrings,
|
|
||||||
"has_comments": comment_lines > 0,
|
|
||||||
"comment_to_code_ratio": round(comment_lines / code_lines, 2),
|
|
||||||
}
|
|
||||||
return json.dumps(result, indent=2)
|
|
||||||
|
|
||||||
|
|
||||||
# --- tree-sitter extraction helpers ---
|
|
||||||
|
|
||||||
def _child_by_type(node, *types):
|
|
||||||
for c in node.children:
|
|
||||||
if c.type in types:
|
|
||||||
return c
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _text(node):
|
|
||||||
return node.text.decode("utf-8", errors="replace") if node else ""
|
|
||||||
|
|
||||||
|
|
||||||
def _py_func_sig(node):
|
|
||||||
name = _text(_child_by_type(node, "identifier"))
|
|
||||||
params = _text(_child_by_type(node, "parameters"))
|
|
||||||
ret = _child_by_type(node, "type")
|
|
||||||
sig = f"{name}{params}"
|
|
||||||
if ret:
|
|
||||||
sig += f" -> {_text(ret)}"
|
|
||||||
return sig
|
|
||||||
|
|
||||||
|
|
||||||
def _py_class(node):
|
|
||||||
name = _text(_child_by_type(node, "identifier"))
|
|
||||||
methods = []
|
|
||||||
body = _child_by_type(node, "block")
|
|
||||||
if body:
|
|
||||||
for child in body.children:
|
|
||||||
if child.type == "function_definition":
|
|
||||||
methods.append(_py_func_sig(child))
|
|
||||||
return {"name": name, "methods": methods[:20]}
|
|
||||||
|
|
||||||
|
|
||||||
def _js_func_sig(node):
|
|
||||||
name = _text(_child_by_type(node, "identifier"))
|
|
||||||
params = _text(_child_by_type(node, "formal_parameters"))
|
|
||||||
return f"{name}{params}" if name else f"(anonymous){params}"
|
|
||||||
|
|
||||||
|
|
||||||
def _js_class(node):
|
|
||||||
name = _text(_child_by_type(node, "identifier"))
|
|
||||||
methods = []
|
|
||||||
body = _child_by_type(node, "class_body")
|
|
||||||
if body:
|
|
||||||
for child in body.children:
|
|
||||||
if child.type == "method_definition":
|
|
||||||
mname = _text(_child_by_type(child, "property_identifier"))
|
|
||||||
mparams = _text(_child_by_type(child, "formal_parameters"))
|
|
||||||
methods.append(f"{mname}{mparams}")
|
|
||||||
return {"name": name, "methods": methods[:20]}
|
|
||||||
|
|
||||||
|
|
||||||
def _rust_func_sig(node):
|
|
||||||
name = _text(_child_by_type(node, "identifier"))
|
|
||||||
params = _text(_child_by_type(node, "parameters"))
|
|
||||||
ret = _child_by_type(node, "type_identifier", "generic_type",
|
|
||||||
"reference_type", "scoped_type_identifier")
|
|
||||||
sig = f"{name}{params}"
|
|
||||||
if ret:
|
|
||||||
sig += f" -> {_text(ret)}"
|
|
||||||
return sig
|
|
||||||
|
|
||||||
|
|
||||||
def _rust_struct(node):
|
|
||||||
name = _text(_child_by_type(node, "type_identifier"))
|
|
||||||
return {"name": name or _text(node)[:60], "methods": []}
|
|
||||||
|
|
||||||
|
|
||||||
def _go_func_sig(node):
|
|
||||||
name = _text(_child_by_type(node, "identifier"))
|
|
||||||
params = _text(_child_by_type(node, "parameter_list"))
|
|
||||||
return f"{name}{params}"
|
|
||||||
|
|
||||||
|
|
||||||
def _go_type(node):
|
|
||||||
spec = _child_by_type(node, "type_spec")
|
|
||||||
name = _text(_child_by_type(spec, "type_identifier")) if spec else ""
|
|
||||||
return {"name": name or _text(node)[:60], "methods": []}
|
|
||||||
|
|
||||||
|
|
||||||
def _tool_write_cache(args, _target, cache):
|
def _tool_write_cache(args, _target, cache):
|
||||||
|
|
@ -848,14 +643,16 @@ def _call_api_streaming(client, system, messages, tools, tracker):
|
||||||
# Directory discovery
|
# Directory discovery
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _discover_directories(target, show_hidden=False):
|
def _discover_directories(target, show_hidden=False, exclude=None):
|
||||||
"""Walk the target and return all directories sorted leaves-first."""
|
"""Walk the target and return all directories sorted leaves-first."""
|
||||||
|
extra = set(exclude or [])
|
||||||
dirs = []
|
dirs = []
|
||||||
target_real = os.path.realpath(target)
|
target_real = os.path.realpath(target)
|
||||||
for root, subdirs, _files in os.walk(target_real, topdown=True):
|
for root, subdirs, _files in os.walk(target_real, topdown=True):
|
||||||
subdirs[:] = [
|
subdirs[:] = [
|
||||||
d for d in subdirs
|
d for d in subdirs
|
||||||
if not _should_skip_dir(d)
|
if not _should_skip_dir(d)
|
||||||
|
and d not in extra
|
||||||
and (show_hidden or not d.startswith("."))
|
and (show_hidden or not d.startswith("."))
|
||||||
]
|
]
|
||||||
dirs.append(root)
|
dirs.append(root)
|
||||||
|
|
@ -867,74 +664,6 @@ def _discover_directories(target, show_hidden=False):
|
||||||
# Per-directory agent loop
|
# Per-directory agent loop
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
_DIR_SYSTEM_PROMPT = """\
|
|
||||||
You are an expert analyst investigating a SINGLE directory on a file system.
|
|
||||||
Do NOT assume the type of content before investigating. Discover what this
|
|
||||||
directory contains from what you find.
|
|
||||||
|
|
||||||
## Your Task
|
|
||||||
Investigate the directory: {dir_path}
|
|
||||||
(relative to target: {dir_rel})
|
|
||||||
|
|
||||||
You must:
|
|
||||||
1. Read the important files in THIS directory (not subdirectories)
|
|
||||||
2. For each file you read, call write_cache to save a summary
|
|
||||||
3. Call write_cache for the directory itself with a synthesis
|
|
||||||
4. Call submit_report with a 1-3 sentence summary
|
|
||||||
|
|
||||||
## Tools
|
|
||||||
parse_structure gives you the skeleton of a file. It does NOT replace \
|
|
||||||
reading the file. Use parse_structure first to understand structure, then \
|
|
||||||
use read_file if you need to verify intent, check for anomalies, or \
|
|
||||||
understand content that structure cannot capture (comments, documentation, \
|
|
||||||
data files, config values). A file where structure and content appear to \
|
|
||||||
contradict each other is always worth reading in full.
|
|
||||||
|
|
||||||
Use the think tool when choosing which file or directory to investigate \
|
|
||||||
next — before starting a new file or switching investigation direction. \
|
|
||||||
Do NOT call think before every individual tool call in a sequence.
|
|
||||||
|
|
||||||
Use the checkpoint tool after completing investigation of a meaningful \
|
|
||||||
cluster of files. Not after every file — once or twice per directory \
|
|
||||||
loop at most.
|
|
||||||
|
|
||||||
Use the flag tool immediately when you find something notable, \
|
|
||||||
surprising, or concerning. Severity guide:
|
|
||||||
info = interesting but not problematic
|
|
||||||
concern = worth addressing
|
|
||||||
critical = likely broken or dangerous
|
|
||||||
|
|
||||||
## Step Numbering
|
|
||||||
Number your investigation steps as you go. Before starting each new \
|
|
||||||
file cluster or phase transition, output:
|
|
||||||
Step N: <what you are doing and why>
|
|
||||||
Output this as plain text before tool calls, not as a tool call itself.
|
|
||||||
|
|
||||||
## Efficiency Rules
|
|
||||||
- Batch multiple tool calls in a single turn whenever possible
|
|
||||||
- Skip binary/compiled/generated files (.pyc, .class, .o, .min.js, etc.)
|
|
||||||
- Skip files >100KB unless uniquely important
|
|
||||||
- Prioritize: README, index, main, config, schema, manifest files
|
|
||||||
- For source files: try parse_structure first, then read_file if needed
|
|
||||||
- If read_file returns truncated content, use a larger max_bytes or
|
|
||||||
run_command('tail ...') — NEVER retry the identical call
|
|
||||||
- You have only {max_turns} turns — be efficient
|
|
||||||
|
|
||||||
## Cache Schemas
|
|
||||||
File: {{path, relative_path, size_bytes, category, summary, notable,
|
|
||||||
notable_reason, cached_at}}
|
|
||||||
Dir: {{path, relative_path, child_count, summary, dominant_category,
|
|
||||||
notable_files, cached_at}}
|
|
||||||
|
|
||||||
category values: source, config, data, document, media, archive, unknown
|
|
||||||
|
|
||||||
## Context
|
|
||||||
{context}
|
|
||||||
|
|
||||||
## Child Directory Summaries (already investigated)
|
|
||||||
{child_summaries}"""
|
|
||||||
|
|
||||||
|
|
||||||
def _build_dir_context(dir_path):
|
def _build_dir_context(dir_path):
|
||||||
lines = []
|
lines = []
|
||||||
try:
|
try:
|
||||||
|
|
@ -1144,32 +873,6 @@ def _block_to_dict(block):
|
||||||
# Synthesis pass
|
# Synthesis pass
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
_SYNTHESIS_SYSTEM_PROMPT = """\
|
|
||||||
You are an expert analyst synthesizing a final report about a directory tree.
|
|
||||||
ALL directory summaries are provided below — you do NOT need to call
|
|
||||||
list_cache or read_cache. Just read the summaries and call submit_report
|
|
||||||
immediately in your first turn.
|
|
||||||
|
|
||||||
Do NOT assume the type of content. Let the summaries speak for themselves.
|
|
||||||
|
|
||||||
## Your Goal
|
|
||||||
Produce two outputs via the submit_report tool:
|
|
||||||
1. **brief**: A 2-4 sentence summary of what this directory tree is.
|
|
||||||
2. **detailed**: A thorough breakdown covering purpose, structure, key
|
|
||||||
components, technologies, notable patterns, and any concerns.
|
|
||||||
|
|
||||||
## Rules
|
|
||||||
- ALL summaries are below — call submit_report directly
|
|
||||||
- Be specific — reference actual directory and file names
|
|
||||||
- Do NOT call list_cache or read_cache
|
|
||||||
|
|
||||||
## Target
|
|
||||||
{target}
|
|
||||||
|
|
||||||
## Directory Summaries
|
|
||||||
{summaries_text}"""
|
|
||||||
|
|
||||||
|
|
||||||
def _run_synthesis(client, target, cache, tracker, max_turns=5, verbose=False):
|
def _run_synthesis(client, target, cache, tracker, max_turns=5, verbose=False):
|
||||||
"""Run the final synthesis pass. Returns (brief, detailed)."""
|
"""Run the final synthesis pass. Returns (brief, detailed)."""
|
||||||
dir_entries = cache.read_all_entries("dir")
|
dir_entries = cache.read_all_entries("dir")
|
||||||
|
|
@ -1300,7 +1003,7 @@ def _synthesize_from_cache(cache):
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _run_investigation(client, target, report, show_hidden=False,
|
def _run_investigation(client, target, report, show_hidden=False,
|
||||||
fresh=False, verbose=False):
|
fresh=False, verbose=False, exclude=None):
|
||||||
"""Orchestrate the multi-pass investigation. Returns (brief, detailed, flags)."""
|
"""Orchestrate the multi-pass investigation. Returns (brief, detailed, flags)."""
|
||||||
investigation_id, is_new = _get_investigation_id(target, fresh=fresh)
|
investigation_id, is_new = _get_investigation_id(target, fresh=fresh)
|
||||||
cache = _CacheManager(investigation_id, target)
|
cache = _CacheManager(investigation_id, target)
|
||||||
|
|
@ -1313,7 +1016,8 @@ def _run_investigation(client, target, report, show_hidden=False,
|
||||||
f"{'' if is_new else ' (resumed)'}", file=sys.stderr)
|
f"{'' if is_new else ' (resumed)'}", file=sys.stderr)
|
||||||
print(f" [AI] Cache: {cache.root}/", file=sys.stderr)
|
print(f" [AI] Cache: {cache.root}/", file=sys.stderr)
|
||||||
|
|
||||||
all_dirs = _discover_directories(target, show_hidden=show_hidden)
|
all_dirs = _discover_directories(target, show_hidden=show_hidden,
|
||||||
|
exclude=exclude)
|
||||||
|
|
||||||
to_investigate = []
|
to_investigate = []
|
||||||
cached_count = 0
|
cached_count = 0
|
||||||
|
|
@ -1386,7 +1090,8 @@ def _run_investigation(client, target, report, show_hidden=False,
|
||||||
# Public interface
|
# Public interface
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def analyze_directory(report, target, verbose_tools=False, fresh=False):
|
def analyze_directory(report, target, verbose_tools=False, fresh=False,
|
||||||
|
exclude=None):
|
||||||
"""Run AI analysis on the directory. Returns (brief, detailed, flags).
|
"""Run AI analysis on the directory. Returns (brief, detailed, flags).
|
||||||
|
|
||||||
Returns ("", "", []) if the API key is missing or dependencies are not met.
|
Returns ("", "", []) if the API key is missing or dependencies are not met.
|
||||||
|
|
@ -1405,6 +1110,7 @@ def analyze_directory(report, target, verbose_tools=False, fresh=False):
|
||||||
try:
|
try:
|
||||||
brief, detailed, flags = _run_investigation(
|
brief, detailed, flags = _run_investigation(
|
||||||
client, target, report, fresh=fresh, verbose=verbose_tools,
|
client, target, report, fresh=fresh, verbose=verbose_tools,
|
||||||
|
exclude=exclude,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Warning: AI analysis failed: {e}", file=sys.stderr)
|
print(f"Warning: AI analysis failed: {e}", file=sys.stderr)
|
||||||
|
|
|
||||||
314
luminos_lib/ast_parser.py
Normal file
314
luminos_lib/ast_parser.py
Normal file
|
|
@ -0,0 +1,314 @@
|
||||||
|
"""AST structure extraction for Luminos using tree-sitter."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
import tree_sitter
|
||||||
|
import tree_sitter_python
|
||||||
|
import tree_sitter_javascript
|
||||||
|
import tree_sitter_rust
|
||||||
|
import tree_sitter_go
|
||||||
|
|
||||||
|
# Extension → (grammar_module, language_name)
|
||||||
|
_TS_LANGUAGES = {
|
||||||
|
".py": (tree_sitter_python, "python"),
|
||||||
|
".js": (tree_sitter_javascript, "javascript"),
|
||||||
|
".jsx": (tree_sitter_javascript, "javascript"),
|
||||||
|
".mjs": (tree_sitter_javascript, "javascript"),
|
||||||
|
".rs": (tree_sitter_rust, "rust"),
|
||||||
|
".go": (tree_sitter_go, "go"),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Precomputed Language objects.
|
||||||
|
_TS_LANG_CACHE = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ts_parser(ext):
|
||||||
|
"""Return a (Parser, language_name) tuple for a file extension, or None."""
|
||||||
|
entry = _TS_LANGUAGES.get(ext)
|
||||||
|
if entry is None:
|
||||||
|
return None
|
||||||
|
module, lang_name = entry
|
||||||
|
if lang_name not in _TS_LANG_CACHE:
|
||||||
|
_TS_LANG_CACHE[lang_name] = tree_sitter.Language(module.language())
|
||||||
|
lang = _TS_LANG_CACHE[lang_name]
|
||||||
|
parser = tree_sitter.Parser(lang)
|
||||||
|
return parser, lang_name
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tree-sitter node helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _child_by_type(node, *types):
|
||||||
|
for c in node.children:
|
||||||
|
if c.type in types:
|
||||||
|
return c
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _text(node):
|
||||||
|
return node.text.decode("utf-8", errors="replace") if node else ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-language handlers: (root_node, source_bytes) -> dict
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _parse_python(root, source):
|
||||||
|
functions = []
|
||||||
|
classes = []
|
||||||
|
imports = []
|
||||||
|
has_docstrings = False
|
||||||
|
comment_lines = 0
|
||||||
|
|
||||||
|
def _walk(node):
|
||||||
|
nonlocal has_docstrings, comment_lines
|
||||||
|
for child in node.children:
|
||||||
|
ntype = child.type
|
||||||
|
|
||||||
|
if ntype in ("comment", "line_comment", "block_comment"):
|
||||||
|
comment_lines += child.text.decode("utf-8", errors="replace").count("\n") + 1
|
||||||
|
|
||||||
|
if ntype == "function_definition":
|
||||||
|
name = _text(_child_by_type(child, "identifier"))
|
||||||
|
params = _text(_child_by_type(child, "parameters"))
|
||||||
|
ret = _child_by_type(child, "type")
|
||||||
|
sig = f"{name}{params}"
|
||||||
|
if ret:
|
||||||
|
sig += f" -> {_text(ret)}"
|
||||||
|
functions.append(sig)
|
||||||
|
elif ntype == "class_definition":
|
||||||
|
name = _text(_child_by_type(child, "identifier"))
|
||||||
|
methods = []
|
||||||
|
body = _child_by_type(child, "block")
|
||||||
|
if body:
|
||||||
|
for c in body.children:
|
||||||
|
if c.type == "function_definition":
|
||||||
|
mname = _text(_child_by_type(c, "identifier"))
|
||||||
|
mparams = _text(_child_by_type(c, "parameters"))
|
||||||
|
mret = _child_by_type(c, "type")
|
||||||
|
msig = f"{mname}{mparams}"
|
||||||
|
if mret:
|
||||||
|
msig += f" -> {_text(mret)}"
|
||||||
|
methods.append(msig)
|
||||||
|
classes.append({"name": name, "methods": methods[:20]})
|
||||||
|
elif ntype in ("import_statement", "import_from_statement"):
|
||||||
|
imports.append(child.text.decode("utf-8", errors="replace").strip())
|
||||||
|
elif ntype == "expression_statement":
|
||||||
|
first = child.children[0] if child.children else None
|
||||||
|
if first and first.type == "string":
|
||||||
|
has_docstrings = True
|
||||||
|
|
||||||
|
_walk(child)
|
||||||
|
|
||||||
|
_walk(root)
|
||||||
|
|
||||||
|
source_text = source.decode("utf-8", errors="replace")
|
||||||
|
line_count = len(source_text.split("\n"))
|
||||||
|
code_lines = max(1, line_count - comment_lines)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"language": "python",
|
||||||
|
"functions": functions[:50],
|
||||||
|
"classes": classes[:30],
|
||||||
|
"imports": imports[:30],
|
||||||
|
"line_count": line_count,
|
||||||
|
"has_docstrings": has_docstrings,
|
||||||
|
"has_comments": comment_lines > 0,
|
||||||
|
"comment_to_code_ratio": round(comment_lines / code_lines, 2),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_javascript(root, source):
    """Summarize a JavaScript syntax tree.

    Walks the tree-sitter tree rooted at *root*, collecting function
    signatures, class declarations (with method signatures), import
    statements, and comment-line counts, then returns the same summary
    dict shape as the other language handlers.
    """
    funcs = []
    klasses = []
    import_lines = []
    comment_total = 0

    def visit(node):
        nonlocal comment_total
        for kid in node.children:
            kind = kid.type

            if kind in ("comment", "line_comment", "block_comment"):
                comment_total += kid.text.decode("utf-8", errors="replace").count("\n") + 1

            if kind in ("function_declaration", "arrow_function", "function"):
                fname = _text(_child_by_type(kid, "identifier"))
                fparams = _text(_child_by_type(kid, "formal_parameters"))
                # Arrow functions / function expressions may have no identifier.
                label = fname if fname else "(anonymous)"
                funcs.append(f"{label}{fparams}")
            elif kind == "class_declaration":
                cname = _text(_child_by_type(kid, "identifier"))
                member_sigs = []
                cbody = _child_by_type(kid, "class_body")
                if cbody:
                    member_sigs = [
                        f"{_text(_child_by_type(m, 'property_identifier'))}"
                        f"{_text(_child_by_type(m, 'formal_parameters'))}"
                        for m in cbody.children
                        if m.type == "method_definition"
                    ]
                klasses.append({"name": cname, "methods": member_sigs[:20]})
            elif kind == "import_statement":
                import_lines.append(kid.text.decode("utf-8", errors="replace").strip())

            visit(kid)

    visit(root)

    text = source.decode("utf-8", errors="replace")
    total_lines = len(text.split("\n"))
    # Guard against division by zero for all-comment files.
    code_lines = max(1, total_lines - comment_total)

    return {
        "language": "javascript",
        "functions": funcs[:50],
        "classes": klasses[:30],
        "imports": import_lines[:30],
        "line_count": total_lines,
        "has_docstrings": False,
        "has_comments": comment_total > 0,
        "comment_to_code_ratio": round(comment_total / code_lines, 2),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_rust(root, source):
    """Summarize a Rust syntax tree.

    Collects function signatures (with return types when present),
    struct/enum/impl items (as "classes" with empty method lists),
    `use` declarations, and comment-line counts, returning the common
    summary dict shape shared by the language handlers.
    """
    fn_sigs = []
    type_items = []
    use_lines = []
    comment_total = 0

    # Node types that can represent a function's return type.
    ret_types = ("type_identifier", "generic_type",
                 "reference_type", "scoped_type_identifier")

    def visit(node):
        nonlocal comment_total
        for kid in node.children:
            kind = kid.type

            if kind in ("comment", "line_comment", "block_comment"):
                comment_total += kid.text.decode("utf-8", errors="replace").count("\n") + 1

            if kind == "function_item":
                fname = _text(_child_by_type(kid, "identifier"))
                fparams = _text(_child_by_type(kid, "parameters"))
                ret_node = _child_by_type(kid, *ret_types)
                sig = f"{fname}{fparams}"
                if ret_node:
                    sig = f"{sig} -> {_text(ret_node)}"
                fn_sigs.append(sig)
            elif kind in ("struct_item", "enum_item", "impl_item"):
                tname = _text(_child_by_type(kid, "type_identifier"))
                # Fall back to a truncated snippet when no type name is found.
                type_items.append({"name": tname or _text(kid)[:60], "methods": []})
            elif kind == "use_declaration":
                use_lines.append(kid.text.decode("utf-8", errors="replace").strip())

            visit(kid)

    visit(root)

    text = source.decode("utf-8", errors="replace")
    total_lines = len(text.split("\n"))
    # Guard against division by zero for all-comment files.
    code_lines = max(1, total_lines - comment_total)

    return {
        "language": "rust",
        "functions": fn_sigs[:50],
        "classes": type_items[:30],
        "imports": use_lines[:30],
        "line_count": total_lines,
        "has_docstrings": False,
        "has_comments": comment_total > 0,
        "comment_to_code_ratio": round(comment_total / code_lines, 2),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_go(root, source):
    """Summarize a Go syntax tree.

    Collects function signatures, type declarations (as "classes" with
    empty method lists), import declarations, and comment-line counts,
    returning the common summary dict shape shared by the handlers.
    """
    fn_sigs = []
    type_decls = []
    import_lines = []
    comment_total = 0

    def visit(node):
        nonlocal comment_total
        for kid in node.children:
            kind = kid.type

            if kind in ("comment", "line_comment", "block_comment"):
                comment_total += kid.text.decode("utf-8", errors="replace").count("\n") + 1

            if kind == "function_declaration":
                fn_sigs.append(
                    f"{_text(_child_by_type(kid, 'identifier'))}"
                    f"{_text(_child_by_type(kid, 'parameter_list'))}"
                )
            elif kind == "type_declaration":
                spec = _child_by_type(kid, "type_spec")
                tname = _text(_child_by_type(spec, "type_identifier")) if spec else ""
                # Fall back to a truncated snippet when no type name is found.
                type_decls.append({"name": tname or _text(kid)[:60], "methods": []})
            elif kind == "import_declaration":
                import_lines.append(kid.text.decode("utf-8", errors="replace").strip())

            visit(kid)

    visit(root)

    text = source.decode("utf-8", errors="replace")
    total_lines = len(text.split("\n"))
    # Guard against division by zero for all-comment files.
    code_lines = max(1, total_lines - comment_total)

    return {
        "language": "go",
        "functions": fn_sigs[:50],
        "classes": type_decls[:30],
        "imports": import_lines[:30],
        "line_count": total_lines,
        "has_docstrings": False,
        "has_comments": comment_total > 0,
        "comment_to_code_ratio": round(comment_total / code_lines, 2),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Language handler registry
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Maps a tree-sitter language name (the `lang_name` produced by
# _get_ts_parser and looked up in parse_structure) to the handler that
# turns a parsed tree into the structure-summary dict.
_LANGUAGE_HANDLERS = {
    "python": _parse_python,
    "javascript": _parse_javascript,
    "rust": _parse_rust,
    "go": _parse_go,
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def parse_structure(path):
    """Parse a source file and return its structural skeleton as a JSON string.

    Takes an absolute path. Returns a JSON string of the structure dict
    (functions, classes, imports, line counts), or an "Error: ..." string
    when the path is not a file, no grammar/handler exists for it, the
    file cannot be read, or parsing itself fails.
    """
    if not os.path.isfile(path):
        return f"Error: '{path}' is not a file."

    ext = os.path.splitext(path)[1].lower()
    ts = _get_ts_parser(ext)
    if ts is None:
        return (f"Error: no grammar for extension '{ext}'. "
                f"Supported: {', '.join(sorted(_TS_LANGUAGES.keys()))}")

    parser, lang_name = ts

    handler = _LANGUAGE_HANDLERS.get(lang_name)
    if handler is None:
        return f"Error: no handler for language '{lang_name}'."

    try:
        with open(path, "rb") as f:
            source = f.read()
    except OSError as e:
        return f"Error reading file: {e}"

    # Keep the error-string contract: a failure inside the grammar or the
    # language handler must not escape as a raw traceback, since every
    # other failure mode above reports via an "Error: ..." string.
    try:
        tree = parser.parse(source)
        result = handler(tree.root_node, source)
    except Exception as e:
        return f"Error parsing file: {e}"
    return json.dumps(result, indent=2)
|
||||||
|
|
@ -34,10 +34,11 @@ def _count_lines(filepath):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def detect_languages(classified_files):
|
def detect_languages(classified_files, on_file=None):
|
||||||
"""Detect languages present and count lines of code per language.
|
"""Detect languages present and count lines of code per language.
|
||||||
|
|
||||||
Returns (languages_set, loc_by_language).
|
Returns (languages_set, loc_by_language).
|
||||||
|
on_file(path) is called per source file, if provided.
|
||||||
"""
|
"""
|
||||||
source_files = [f for f in classified_files if f["category"] == "source"]
|
source_files = [f for f in classified_files if f["category"] == "source"]
|
||||||
languages = set()
|
languages = set()
|
||||||
|
|
@ -49,12 +50,17 @@ def detect_languages(classified_files):
|
||||||
languages.add(lang)
|
languages.add(lang)
|
||||||
lines = _count_lines(f["path"])
|
lines = _count_lines(f["path"])
|
||||||
loc[lang] = loc.get(lang, 0) + lines
|
loc[lang] = loc.get(lang, 0) + lines
|
||||||
|
if on_file:
|
||||||
|
on_file(f["path"])
|
||||||
|
|
||||||
return sorted(languages), loc
|
return sorted(languages), loc
|
||||||
|
|
||||||
|
|
||||||
def find_large_files(classified_files):
|
def find_large_files(classified_files, on_file=None):
|
||||||
"""Find files that are unusually large (>1000 lines or >10MB)."""
|
"""Find files that are unusually large (>1000 lines or >10MB).
|
||||||
|
|
||||||
|
on_file(path) is called per source file checked, if provided.
|
||||||
|
"""
|
||||||
source_files = [f for f in classified_files if f["category"] == "source"]
|
source_files = [f for f in classified_files if f["category"] == "source"]
|
||||||
large = []
|
large = []
|
||||||
|
|
||||||
|
|
@ -68,5 +74,7 @@ def find_large_files(classified_files):
|
||||||
if reasons:
|
if reasons:
|
||||||
large.append({"path": f["path"], "name": f["name"],
|
large.append({"path": f["path"], "name": f["name"],
|
||||||
"reasons": reasons})
|
"reasons": reasons})
|
||||||
|
if on_file:
|
||||||
|
on_file(f["path"])
|
||||||
|
|
||||||
return large
|
return large
|
||||||
|
|
|
||||||
|
|
@ -3,12 +3,15 @@
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
def get_disk_usage(target, show_hidden=False):
|
def get_disk_usage(target, show_hidden=False, exclude=None):
|
||||||
"""Get per-directory disk usage via du.
|
"""Get per-directory disk usage via du.
|
||||||
|
|
||||||
Returns a list of dicts: {path, size_bytes, size_human}.
|
Returns a list of dicts: {path, size_bytes, size_human}.
|
||||||
"""
|
"""
|
||||||
cmd = ["du", "-b", "--max-depth=2", target]
|
cmd = ["du", "-b", "--max-depth=2"]
|
||||||
|
for name in (exclude or []):
|
||||||
|
cmd.append(f"--exclude={name}")
|
||||||
|
cmd.append(target)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
|
|
|
||||||
|
|
@ -86,15 +86,19 @@ def _classify_one(filepath):
|
||||||
return "unknown", desc
|
return "unknown", desc
|
||||||
|
|
||||||
|
|
||||||
def classify_files(target, show_hidden=False):
|
def classify_files(target, show_hidden=False, exclude=None, on_file=None):
|
||||||
|
exclude = exclude or []
|
||||||
"""Walk the target directory and classify every file.
|
"""Walk the target directory and classify every file.
|
||||||
|
|
||||||
Returns a list of dicts: {path, name, category, size, description}.
|
Returns a list of dicts: {path, name, category, size, description}.
|
||||||
|
on_file(path) is called after each file is classified, if provided.
|
||||||
"""
|
"""
|
||||||
results = []
|
results = []
|
||||||
for root, dirs, files in os.walk(target):
|
for root, dirs, files in os.walk(target):
|
||||||
|
dirs[:] = [d for d in dirs
|
||||||
|
if d not in exclude
|
||||||
|
and (show_hidden or not d.startswith("."))]
|
||||||
if not show_hidden:
|
if not show_hidden:
|
||||||
dirs[:] = [d for d in dirs if not d.startswith(".")]
|
|
||||||
files = [f for f in files if not f.startswith(".")]
|
files = [f for f in files if not f.startswith(".")]
|
||||||
for fname in files:
|
for fname in files:
|
||||||
full = os.path.join(root, fname)
|
full = os.path.join(root, fname)
|
||||||
|
|
@ -112,6 +116,8 @@ def classify_files(target, show_hidden=False):
|
||||||
"size": size,
|
"size": size,
|
||||||
"description": desc,
|
"description": desc,
|
||||||
})
|
})
|
||||||
|
if on_file:
|
||||||
|
on_file(full)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
93
luminos_lib/prompts.py
Normal file
93
luminos_lib/prompts.py
Normal file
|
|
@ -0,0 +1,93 @@
|
||||||
|
"""System prompt templates for the Luminos agent loops."""
|
||||||
|
|
||||||
|
_DIR_SYSTEM_PROMPT = """\
|
||||||
|
You are an expert analyst investigating a SINGLE directory on a file system.
|
||||||
|
Do NOT assume the type of content before investigating. Discover what this
|
||||||
|
directory contains from what you find.
|
||||||
|
|
||||||
|
## Your Task
|
||||||
|
Investigate the directory: {dir_path}
|
||||||
|
(relative to target: {dir_rel})
|
||||||
|
|
||||||
|
You must:
|
||||||
|
1. Read the important files in THIS directory (not subdirectories)
|
||||||
|
2. For each file you read, call write_cache to save a summary
|
||||||
|
3. Call write_cache for the directory itself with a synthesis
|
||||||
|
4. Call submit_report with a 1-3 sentence summary
|
||||||
|
|
||||||
|
## Tools
|
||||||
|
parse_structure gives you the skeleton of a file. It does NOT replace \
|
||||||
|
reading the file. Use parse_structure first to understand structure, then \
|
||||||
|
use read_file if you need to verify intent, check for anomalies, or \
|
||||||
|
understand content that structure cannot capture (comments, documentation, \
|
||||||
|
data files, config values). A file where structure and content appear to \
|
||||||
|
contradict each other is always worth reading in full.
|
||||||
|
|
||||||
|
Use the think tool when choosing which file or directory to investigate \
|
||||||
|
next — before starting a new file or switching investigation direction. \
|
||||||
|
Do NOT call think before every individual tool call in a sequence.
|
||||||
|
|
||||||
|
Use the checkpoint tool after completing investigation of a meaningful \
|
||||||
|
cluster of files. Not after every file — once or twice per directory \
|
||||||
|
loop at most.
|
||||||
|
|
||||||
|
Use the flag tool immediately when you find something notable, \
|
||||||
|
surprising, or concerning. Severity guide:
|
||||||
|
info = interesting but not problematic
|
||||||
|
concern = worth addressing
|
||||||
|
critical = likely broken or dangerous
|
||||||
|
|
||||||
|
## Step Numbering
|
||||||
|
Number your investigation steps as you go. Before starting each new \
|
||||||
|
file cluster or phase transition, output:
|
||||||
|
Step N: <what you are doing and why>
|
||||||
|
Output this as plain text before tool calls, not as a tool call itself.
|
||||||
|
|
||||||
|
## Efficiency Rules
|
||||||
|
- Batch multiple tool calls in a single turn whenever possible
|
||||||
|
- Skip binary/compiled/generated files (.pyc, .class, .o, .min.js, etc.)
|
||||||
|
- Skip files >100KB unless uniquely important
|
||||||
|
- Prioritize: README, index, main, config, schema, manifest files
|
||||||
|
- For source files: try parse_structure first, then read_file if needed
|
||||||
|
- If read_file returns truncated content, use a larger max_bytes or
|
||||||
|
run_command('tail ...') — NEVER retry the identical call
|
||||||
|
- You have only {max_turns} turns — be efficient
|
||||||
|
|
||||||
|
## Cache Schemas
|
||||||
|
File: {{path, relative_path, size_bytes, category, summary, notable,
|
||||||
|
notable_reason, cached_at}}
|
||||||
|
Dir: {{path, relative_path, child_count, summary, dominant_category,
|
||||||
|
notable_files, cached_at}}
|
||||||
|
|
||||||
|
category values: source, config, data, document, media, archive, unknown
|
||||||
|
|
||||||
|
## Context
|
||||||
|
{context}
|
||||||
|
|
||||||
|
## Child Directory Summaries (already investigated)
|
||||||
|
{child_summaries}"""
|
||||||
|
|
||||||
|
_SYNTHESIS_SYSTEM_PROMPT = """\
|
||||||
|
You are an expert analyst synthesizing a final report about a directory tree.
|
||||||
|
ALL directory summaries are provided below — you do NOT need to call
|
||||||
|
list_cache or read_cache. Just read the summaries and call submit_report
|
||||||
|
immediately in your first turn.
|
||||||
|
|
||||||
|
Do NOT assume the type of content. Let the summaries speak for themselves.
|
||||||
|
|
||||||
|
## Your Goal
|
||||||
|
Produce two outputs via the submit_report tool:
|
||||||
|
1. **brief**: A 2-4 sentence summary of what this directory tree is.
|
||||||
|
2. **detailed**: A thorough breakdown covering purpose, structure, key
|
||||||
|
components, technologies, notable patterns, and any concerns.
|
||||||
|
|
||||||
|
## Rules
|
||||||
|
- ALL summaries are below — call submit_report directly
|
||||||
|
- Be specific — reference actual directory and file names
|
||||||
|
- Do NOT call list_cache or read_cache
|
||||||
|
|
||||||
|
## Target
|
||||||
|
{target}
|
||||||
|
|
||||||
|
## Directory Summaries
|
||||||
|
{summaries_text}"""
|
||||||
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
def find_recent_files(target, n=10, show_hidden=False):
|
def find_recent_files(target, n=10, show_hidden=False, exclude=None):
|
||||||
"""Find the n most recently modified files using find and stat.
|
"""Find the n most recently modified files using find and stat.
|
||||||
|
|
||||||
Returns a list of dicts: {path, name, modified, modified_human}.
|
Returns a list of dicts: {path, name, modified, modified_human}.
|
||||||
|
|
@ -14,6 +14,9 @@ def find_recent_files(target, n=10, show_hidden=False):
|
||||||
cmd = ["find", target, "-type", "f"]
|
cmd = ["find", target, "-type", "f"]
|
||||||
if not show_hidden:
|
if not show_hidden:
|
||||||
cmd.extend(["-not", "-path", "*/.*"])
|
cmd.extend(["-not", "-path", "*/.*"])
|
||||||
|
for name in (exclude or []):
|
||||||
|
cmd.extend(["-not", "-path", f"*/{name}/*",
|
||||||
|
"-not", "-path", f"*/{name}"])
|
||||||
cmd.extend(["-printf", "%T@\t%p\n"])
|
cmd.extend(["-printf", "%T@\t%p\n"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
def build_tree(path, max_depth=3, show_hidden=False, _depth=0):
|
def build_tree(path, max_depth=3, show_hidden=False, exclude=None, _depth=0):
|
||||||
|
exclude = exclude or []
|
||||||
"""Build a nested dict representing the directory tree with file sizes."""
|
"""Build a nested dict representing the directory tree with file sizes."""
|
||||||
name = os.path.basename(path) or path
|
name = os.path.basename(path) or path
|
||||||
node = {"name": name, "path": path, "type": "directory", "children": []}
|
node = {"name": name, "path": path, "type": "directory", "children": []}
|
||||||
|
|
@ -17,10 +18,12 @@ def build_tree(path, max_depth=3, show_hidden=False, _depth=0):
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
if not show_hidden and entry.startswith("."):
|
if not show_hidden and entry.startswith("."):
|
||||||
continue
|
continue
|
||||||
|
if entry in exclude:
|
||||||
|
continue
|
||||||
full = os.path.join(path, entry)
|
full = os.path.join(path, entry)
|
||||||
if os.path.isdir(full):
|
if os.path.isdir(full):
|
||||||
if _depth < max_depth:
|
if _depth < max_depth:
|
||||||
child = build_tree(full, max_depth, show_hidden, _depth + 1)
|
child = build_tree(full, max_depth, show_hidden, exclude, _depth + 1)
|
||||||
node["children"].append(child)
|
node["children"].append(child)
|
||||||
else:
|
else:
|
||||||
node["children"].append({
|
node["children"].append({
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue