luminos/luminos_lib/ast_parser.py

315 lines
11 KiB
Python
Raw Normal View History

"""AST structure extraction for Luminos using tree-sitter."""
import json
import os
import tree_sitter
import tree_sitter_python
import tree_sitter_javascript
import tree_sitter_rust
import tree_sitter_go
# File extension -> (grammar_module, language_name).
# Several extensions may share one grammar (all JavaScript flavours do).
_TS_LANGUAGES = {
    # Python
    ".py": (tree_sitter_python, "python"),
    # JavaScript: plain, JSX and ES-module files
    ".js": (tree_sitter_javascript, "javascript"),
    ".jsx": (tree_sitter_javascript, "javascript"),
    ".mjs": (tree_sitter_javascript, "javascript"),
    # Rust
    ".rs": (tree_sitter_rust, "rust"),
    # Go
    ".go": (tree_sitter_go, "go"),
}

# language_name -> tree_sitter.Language. Starts empty and is filled on first
# use by _get_ts_parser, so each grammar is wrapped at most once.
_TS_LANG_CACHE = {}
def _get_ts_parser(ext):
    """Build a tree-sitter parser for the grammar registered under *ext*.

    Args:
        ext: File extension including the leading dot (a key of
            ``_TS_LANGUAGES``).

    Returns:
        A ``(Parser, language_name)`` tuple, or ``None`` when the extension
        has no registered grammar. A new ``Parser`` is created on every
        call; only the ``Language`` object is cached.
    """
    if ext not in _TS_LANGUAGES:
        return None

    grammar, lang_name = _TS_LANGUAGES[ext]
    try:
        language = _TS_LANG_CACHE[lang_name]
    except KeyError:
        # First request for this language: wrap the grammar and remember it.
        language = tree_sitter.Language(grammar.language())
        _TS_LANG_CACHE[lang_name] = language

    return tree_sitter.Parser(language), lang_name
# ---------------------------------------------------------------------------
# Tree-sitter node helpers
# ---------------------------------------------------------------------------
def _child_by_type(node, *types):
for c in node.children:
if c.type in types:
return c
return None
def _text(node):
return node.text.decode("utf-8", errors="replace") if node else ""
# ---------------------------------------------------------------------------
# Per-language handlers: (root_node, source_bytes) -> dict
# ---------------------------------------------------------------------------
def _parse_python(root, source):
    """Extract the structural skeleton of a Python module.

    Args:
        root: Root tree-sitter node of the parsed file.
        source: Raw file contents as bytes.

    Returns:
        A dict with these keys:
            "language": always "python".
            "functions": up to 50 signatures ("name(params) -> ret"). The
                walk visits every node, so methods and nested functions are
                listed here as well.
            "classes": up to 30 ``{"name", "methods"}`` dicts; "methods"
                holds up to 20 signatures of functions defined directly in
                the class body, decorated or not.
            "imports": up to 30 import statements as written.
            "line_count": number of lines in *source*.
            "has_docstrings": True if any expression statement starts with
                a string literal.
            "has_comments": True if at least one comment node was seen.
            "comment_to_code_ratio": comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    functions = []
    classes = []
    imports = []
    has_docstrings = False
    comment_lines = 0

    def _signature(fn_node):
        # "name(params)", plus " -> type" when a return annotation exists.
        name = _text(_child_by_type(fn_node, "identifier"))
        params = _text(_child_by_type(fn_node, "parameters"))
        ret = _child_by_type(fn_node, "type")
        sig = f"{name}{params}"
        if ret:
            sig += f" -> {_text(ret)}"
        return sig

    def _class_methods(class_node):
        body = _child_by_type(class_node, "block")
        if not body:
            return []
        methods = []
        for member in body.children:
            # A decorated method (@property, @staticmethod, ...) is wrapped
            # in a decorated_definition node; unwrap it so it is not lost.
            if member.type == "decorated_definition":
                member = _child_by_type(member, "function_definition")
            if member is not None and member.type == "function_definition":
                methods.append(_signature(member))
        return methods

    def _walk(node):
        nonlocal has_docstrings, comment_lines
        for child in node.children:
            ntype = child.type

            if ntype in ("comment", "line_comment", "block_comment"):
                comment_lines += _text(child).count("\n") + 1

            if ntype == "function_definition":
                functions.append(_signature(child))
            elif ntype == "class_definition":
                name = _text(_child_by_type(child, "identifier"))
                classes.append(
                    {"name": name, "methods": _class_methods(child)[:20]}
                )
            elif ntype in ("import_statement", "import_from_statement"):
                imports.append(_text(child).strip())
            elif ntype == "expression_statement":
                first = child.children[0] if child.children else None
                if first and first.type == "string":
                    has_docstrings = True

            # Always descend, so nested definitions are collected too.
            _walk(child)

    _walk(root)

    # Count newline-terminated lines, plus a final unterminated line if any.
    # Splitting on "\n" would report one line too many for every file that
    # ends with a newline.
    line_count = source.count(b"\n")
    if source and not source.endswith(b"\n"):
        line_count += 1
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "python",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": has_docstrings,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
def _parse_javascript(root, source):
    """Extract the structural skeleton of a JavaScript file.

    Args:
        root: Root tree-sitter node of the parsed file.
        source: Raw file contents as bytes.

    Returns:
        A dict with these keys:
            "language": always "javascript".
            "functions": up to 50 entries of the form "name(params)", or
                "(anonymous)(params)" when no name can be determined. The
                walk visits every node, so nested functions are included.
            "classes": up to 30 ``{"name", "methods"}`` dicts; "methods"
                holds up to 20 "name(params)" strings.
            "imports": up to 30 import statements as written.
            "line_count": number of lines in *source*.
            "has_docstrings": always False.
            "has_comments": True if at least one comment node was seen.
            "comment_to_code_ratio": comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    functions = []
    classes = []
    imports = []
    comment_lines = 0

    function_types = (
        "function_declaration",
        "generator_function_declaration",
        "function_expression",
        "generator_function",
        "arrow_function",
        # Kept from the original list. NOTE(review): believed to be the
        # node type older grammar releases used for function expressions.
        "function",
    )

    def _function_entry(fn_node):
        # Use grammar fields rather than "first identifier child": for an
        # arrow function such as `x => x + 1` the first identifier is the
        # parameter, not a name.
        name_node = fn_node.child_by_field_name("name")
        if name_node is None:
            # `const f = () => {}` / `const f = function () {}`: take the
            # name from the variable the function is assigned to.
            parent = fn_node.parent
            if parent is not None and parent.type == "variable_declarator":
                name_node = parent.child_by_field_name("name")

        params_node = fn_node.child_by_field_name("parameters")
        if params_node is not None:
            params = _text(params_node)
        else:
            # Arrow function with a single bare parameter (no parentheses).
            single = fn_node.child_by_field_name("parameter")
            params = f"({_text(single)})" if single is not None else ""

        name = _text(name_node)
        return f"{name}{params}" if name else f"(anonymous){params}"

    def _class_methods(class_node):
        body = _child_by_type(class_node, "class_body")
        if not body:
            return []
        methods = []
        for member in body.children:
            if member.type != "method_definition":
                continue
            # The name field also covers #private and computed names; fall
            # back to the plain property_identifier lookup if it is absent.
            name_node = member.child_by_field_name("name")
            if name_node is None:
                name_node = _child_by_type(member, "property_identifier")
            mparams = _text(_child_by_type(member, "formal_parameters"))
            methods.append(f"{_text(name_node)}{mparams}")
        return methods

    def _walk(node):
        nonlocal comment_lines
        for child in node.children:
            ntype = child.type

            if ntype in ("comment", "line_comment", "block_comment"):
                comment_lines += _text(child).count("\n") + 1

            # is_named excludes the anonymous `function` keyword token,
            # whose type string is also "function" and would otherwise add
            # a bogus "(anonymous)" entry for every function.
            if child.is_named and ntype in function_types:
                functions.append(_function_entry(child))
            elif ntype == "class_declaration":
                name = _text(_child_by_type(child, "identifier"))
                classes.append(
                    {"name": name, "methods": _class_methods(child)[:20]}
                )
            elif ntype == "import_statement":
                imports.append(_text(child).strip())

            # Always descend, so nested definitions are collected too.
            _walk(child)

    _walk(root)

    # Count newline-terminated lines, plus a final unterminated line if any.
    # Splitting on "\n" would report one line too many for every file that
    # ends with a newline.
    line_count = source.count(b"\n")
    if source and not source.endswith(b"\n"):
        line_count += 1
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "javascript",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": False,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
def _parse_rust(root, source):
    """Extract the structural skeleton of a Rust file.

    Args:
        root: Root tree-sitter node of the parsed file.
        source: Raw file contents as bytes.

    Returns:
        A dict with these keys:
            "language": always "rust".
            "functions": up to 50 signatures ("name(params) -> ret"). The
                walk visits every node, so functions inside impl blocks and
                nested functions are included.
            "classes": up to 30 ``{"name", "methods"}`` dicts, one per
                struct, enum or impl item; "methods" is always empty. An
                impl of a trait is named "Trait for Type".
            "imports": up to 30 ``use`` declarations as written.
            "line_count": number of lines in *source*.
            "has_docstrings": always False.
            "has_comments": True if at least one comment node was seen.
            "comment_to_code_ratio": comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    functions = []
    classes = []
    imports = []
    comment_lines = 0

    def _impl_name(impl_node):
        # Prefer the grammar fields: the implementing type may be generic
        # (`impl<T> Foo<T>`), and for `impl Trait for Type` the first
        # type_identifier child is the trait, not the type.
        target = _text(impl_node.child_by_field_name("type"))
        trait = _text(impl_node.child_by_field_name("trait"))
        if target and trait:
            return f"{trait} for {target}"
        return target or _text(_child_by_type(impl_node, "type_identifier"))

    def _walk(node):
        nonlocal comment_lines
        for child in node.children:
            ntype = child.type

            if ntype in ("comment", "line_comment", "block_comment"):
                comment_lines += _text(child).count("\n") + 1

            if ntype == "function_item":
                name = _text(_child_by_type(child, "identifier"))
                params = _text(_child_by_type(child, "parameters"))
                # The return_type field covers every kind of type node
                # (primitive, tuple, array, ...), not just a fixed few.
                ret = child.child_by_field_name("return_type")
                sig = f"{name}{params}"
                if ret is not None:
                    sig += f" -> {_text(ret)}"
                functions.append(sig)
            elif ntype in ("struct_item", "enum_item"):
                name = _text(_child_by_type(child, "type_identifier"))
                classes.append(
                    {"name": name or _text(child)[:60], "methods": []}
                )
            elif ntype == "impl_item":
                name = _impl_name(child)
                classes.append(
                    {"name": name or _text(child)[:60], "methods": []}
                )
            elif ntype == "use_declaration":
                imports.append(_text(child).strip())

            # Always descend, so nested items are collected too.
            _walk(child)

    _walk(root)

    # Count newline-terminated lines, plus a final unterminated line if any.
    # Splitting on "\n" would report one line too many for every file that
    # ends with a newline.
    line_count = source.count(b"\n")
    if source and not source.endswith(b"\n"):
        line_count += 1
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "rust",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": False,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
def _parse_go(root, source):
    """Extract the structural skeleton of a Go file.

    Args:
        root: Root tree-sitter node of the parsed file.
        source: Raw file contents as bytes.

    Returns:
        A dict with these keys:
            "language": always "go".
            "functions": up to 50 entries: "Name(params)" for plain
                functions and "(receiver) Name(params)" for methods.
            "classes": up to 30 ``{"name", "methods"}`` dicts, one per
                declared type (every member of a grouped ``type ( ... )``
                declaration gets its own entry); "methods" is always empty.
            "imports": up to 30 import declarations as written.
            "line_count": number of lines in *source*.
            "has_docstrings": always False.
            "has_comments": True if at least one comment node was seen.
            "comment_to_code_ratio": comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    functions = []
    classes = []
    imports = []
    comment_lines = 0

    def _walk(node):
        nonlocal comment_lines
        for child in node.children:
            ntype = child.type

            if ntype in ("comment", "line_comment", "block_comment"):
                comment_lines += _text(child).count("\n") + 1

            if ntype == "function_declaration":
                name = _text(_child_by_type(child, "identifier"))
                params = _text(_child_by_type(child, "parameter_list"))
                functions.append(f"{name}{params}")
            elif ntype == "method_declaration":
                # Methods carry two parameter lists (receiver, then
                # parameters), so use fields to tell them apart.
                receiver = _text(child.child_by_field_name("receiver"))
                name = _text(child.child_by_field_name("name"))
                params = _text(child.child_by_field_name("parameters"))
                functions.append(f"{receiver} {name}{params}".strip())
            elif ntype == "type_declaration":
                # A grouped declaration holds several specs; report each
                # one, not just the first.
                specs = [
                    c for c in child.children
                    if c.type in ("type_spec", "type_alias")
                ]
                if not specs:
                    classes.append({"name": _text(child)[:60], "methods": []})
                for spec in specs:
                    name = _text(_child_by_type(spec, "type_identifier"))
                    classes.append(
                        {"name": name or _text(spec)[:60], "methods": []}
                    )
            elif ntype == "import_declaration":
                imports.append(_text(child).strip())

            # Always descend, so nested declarations are collected too.
            _walk(child)

    _walk(root)

    # Count newline-terminated lines, plus a final unterminated line if any.
    # Splitting on "\n" would report one line too many for every file that
    # ends with a newline.
    line_count = source.count(b"\n")
    if source and not source.endswith(b"\n"):
        line_count += 1
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "go",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": False,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
# ---------------------------------------------------------------------------
# Language handler registry
# ---------------------------------------------------------------------------
# language_name -> handler taking (root_node, source_bytes) and returning the
# structure dict for that language.
_LANGUAGE_HANDLERS = {
    "python": _parse_python,
    "javascript": _parse_javascript,
    "rust": _parse_rust,
    "go": _parse_go,
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def parse_structure(path):
    """Parse a source file and return its structural skeleton as JSON.

    Args:
        path: Path to the source file. The original contract asks for an
            absolute path; that is not enforced here.

    Returns:
        A JSON string (indent=2) of the structure dict produced by the
        language handler, or a string starting with "Error" when the path
        is not a file, the extension has no grammar, no handler is
        registered for the language, the file cannot be read, or parsing
        fails.
    """
    if not os.path.isfile(path):
        return f"Error: '{path}' is not a file."

    ext = os.path.splitext(path)[1].lower()
    ts = _get_ts_parser(ext)
    if ts is None:
        return (f"Error: no grammar for extension '{ext}'. "
                f"Supported: {', '.join(sorted(_TS_LANGUAGES.keys()))}")

    parser, lang_name = ts
    handler = _LANGUAGE_HANDLERS.get(lang_name)
    if handler is None:
        return f"Error: no handler for language '{lang_name}'."

    try:
        with open(path, "rb") as f:
            source = f.read()
    except OSError as e:
        return f"Error reading file: {e}"

    # This function reports failures as strings, so a crash inside the
    # parser or a handler (e.g. RecursionError on a very deeply nested
    # tree) is turned into an error string instead of propagating.
    try:
        tree = parser.parse(source)
        result = handler(tree.root_node, source)
    except Exception as e:
        return f"Error parsing file: {type(e).__name__}: {e}"

    return json.dumps(result, indent=2)