"""AST structure extraction for Luminos using tree-sitter."""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
|
||
|
|
import tree_sitter
|
||
|
|
import tree_sitter_python
|
||
|
|
import tree_sitter_javascript
|
||
|
|
import tree_sitter_rust
|
||
|
|
import tree_sitter_go
|
||
|
|
|
||
|
|
# Extension → (grammar_module, language_name)
|
||
|
|
_TS_LANGUAGES = {
|
||
|
|
".py": (tree_sitter_python, "python"),
|
||
|
|
".js": (tree_sitter_javascript, "javascript"),
|
||
|
|
".jsx": (tree_sitter_javascript, "javascript"),
|
||
|
|
".mjs": (tree_sitter_javascript, "javascript"),
|
||
|
|
".rs": (tree_sitter_rust, "rust"),
|
||
|
|
".go": (tree_sitter_go, "go"),
|
||
|
|
}
|
||
|
|
|
||
|
|
# Precomputed Language objects.
|
||
|
|
_TS_LANG_CACHE = {}
|
||
|
|
|
||
|
|
|
||
|
|
def _get_ts_parser(ext):
    """Look up the tree-sitter parser for a file extension.

    Args:
        ext: File extension including the leading dot, as used for the keys
            of ``_TS_LANGUAGES`` (e.g. ``".py"``).

    Returns:
        A ``(Parser, language_name)`` tuple, or None when the extension has
        no entry in ``_TS_LANGUAGES``.  The Language object is built once
        per language and kept in ``_TS_LANG_CACHE``; a new Parser is created
        on every call.
    """
    try:
        grammar, language_name = _TS_LANGUAGES[ext]
    except KeyError:
        return None

    language = _TS_LANG_CACHE.get(language_name)
    if language is None:
        # First request for this language: wrap the grammar and remember it.
        language = tree_sitter.Language(grammar.language())
        _TS_LANG_CACHE[language_name] = language

    return tree_sitter.Parser(language), language_name
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Tree-sitter node helpers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def _child_by_type(node, *types):
    """Return the first direct child of *node* whose type is in *types*.

    Only ``node.children`` is searched (no recursion).  Returns None when
    no child matches.
    """
    return next(
        (candidate for candidate in node.children if candidate.type in types),
        None,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def _text(node):
    """Return the text of *node* decoded as UTF-8, or "" for a falsy node.

    Bytes that are not valid UTF-8 are replaced instead of raising.
    """
    if not node:
        return ""
    return node.text.decode("utf-8", errors="replace")
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Per-language handlers: (root_node, source_bytes) -> dict
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def _parse_python(root, source):
    """Summarise the structure of a parsed Python file.

    Every node below *root* is visited in document order.  Because the walk
    also descends into class and function bodies, methods and nested
    functions appear in ``functions`` as well as (for methods) under their
    class.

    Args:
        root: Root node of the tree-sitter parse tree.
        source: File contents as bytes; used only for the line count.

    Returns:
        A dict with the keys:
            language: always ``"python"``.
            functions: up to 50 signature strings (``name(params) -> ret``).
            classes: up to 30 ``{"name", "methods"}`` dicts, each with up to
                20 method signatures.
            imports: up to 30 import statements as written in the source.
            line_count: number of ``"\\n"``-separated segments in the file.
            has_docstrings: True if any expression statement starts with a
                string literal.
            has_comments: True if at least one comment node was seen.
            comment_to_code_ratio: comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    comment_types = ("comment", "line_comment", "block_comment")

    def _signature(fn_node):
        """Render a function_definition node as ``name(params) -> ret``."""
        name = _text(_child_by_type(fn_node, "identifier"))
        params = _text(_child_by_type(fn_node, "parameters"))
        ret = _child_by_type(fn_node, "type")
        sig = f"{name}{params}"
        if ret:
            sig += f" -> {_text(ret)}"
        return sig

    functions = []
    classes = []
    imports = []
    has_docstrings = False
    comment_lines = 0

    # Explicit stack instead of recursion so that deeply nested trees (long
    # operator chains, generated code) cannot hit Python's recursion limit.
    # Children are pushed in reverse so nodes are popped in document order.
    pending = list(reversed(root.children))
    while pending:
        node = pending.pop()
        ntype = node.type

        if ntype in comment_types:
            comment_lines += _text(node).count("\n") + 1
        elif ntype == "function_definition":
            functions.append(_signature(node))
        elif ntype == "class_definition":
            name = _text(_child_by_type(node, "identifier"))
            methods = []
            body = _child_by_type(node, "block")
            if body:
                for member in body.children:
                    # A decorated method (@property, @staticmethod, ...) is
                    # wrapped in a decorated_definition node; look inside it
                    # so the method is not dropped from the class summary.
                    if member.type == "decorated_definition":
                        member = _child_by_type(member, "function_definition")
                    if member is not None and member.type == "function_definition":
                        methods.append(_signature(member))
            classes.append({"name": name, "methods": methods[:20]})
        elif ntype in ("import_statement", "import_from_statement"):
            imports.append(_text(node).strip())
        elif ntype == "expression_statement":
            # A statement consisting of a bare string is treated as a docstring.
            if node.children and node.children[0].type == "string":
                has_docstrings = True

        pending.extend(reversed(node.children))

    source_text = source.decode("utf-8", errors="replace")
    line_count = len(source_text.split("\n"))
    # Clamp to 1 so the ratio below never divides by zero.
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "python",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": has_docstrings,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_javascript(root, source):
    """Summarise the structure of a parsed JavaScript file.

    Every node below *root* is visited in document order, so functions
    nested inside other functions or classes are listed too.

    Args:
        root: Root node of the tree-sitter parse tree.
        source: File contents as bytes; used only for the line count.

    Returns:
        A dict with the keys:
            language: always ``"javascript"``.
            functions: up to 50 entries of the form ``name(params)``, or
                ``(anonymous)(params)`` for unnamed and arrow functions.
            classes: up to 30 ``{"name", "methods"}`` dicts, each with up to
                20 ``name(params)`` method entries.
            imports: up to 30 import statements as written in the source.
            line_count: number of ``"\\n"``-separated segments in the file.
            has_docstrings: always False.
            has_comments: True if at least one comment node was seen.
            comment_to_code_ratio: comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    comment_types = ("comment", "line_comment", "block_comment")
    # "function" is the older grammar's name for a function expression;
    # newer grammar versions call it "function_expression".
    function_types = (
        "function_declaration",
        "generator_function_declaration",
        "function_expression",
        "generator_function",
        "function",
    )

    functions = []
    classes = []
    imports = []
    comment_lines = 0

    # Explicit stack instead of recursion so deeply nested trees cannot hit
    # Python's recursion limit.  Children are pushed in reverse so nodes are
    # popped in document order.
    pending = list(reversed(root.children))
    while pending:
        node = pending.pop()
        ntype = node.type

        if ntype in comment_types:
            comment_lines += _text(node).count("\n") + 1
        elif ntype == "arrow_function":
            # Arrow functions have no name of their own.  An identifier
            # child is either the single bare parameter (`x => ...`) or the
            # body expression, so it must not be reported as the name.
            param_list = _child_by_type(node, "formal_parameters")
            if param_list is not None:
                params = _text(param_list)
            else:
                params = f"({_text(_child_by_type(node, 'identifier'))})"
            functions.append(f"(anonymous){params}")
        elif ntype in function_types and node.is_named:
            # The is_named check skips the anonymous `function` keyword
            # token, whose type string is also "function".
            name = _text(_child_by_type(node, "identifier"))
            params = _text(_child_by_type(node, "formal_parameters"))
            functions.append(f"{name}{params}" if name else f"(anonymous){params}")
        elif ntype == "class_declaration":
            name = _text(_child_by_type(node, "identifier"))
            methods = []
            body = _child_by_type(node, "class_body")
            if body:
                for member in body.children:
                    if member.type == "method_definition":
                        mname = _text(_child_by_type(
                            member,
                            "property_identifier",
                            "private_property_identifier",
                        ))
                        mparams = _text(_child_by_type(member, "formal_parameters"))
                        methods.append(f"{mname}{mparams}")
            classes.append({"name": name, "methods": methods[:20]})
        elif ntype == "import_statement":
            imports.append(_text(node).strip())

        pending.extend(reversed(node.children))

    source_text = source.decode("utf-8", errors="replace")
    line_count = len(source_text.split("\n"))
    # Clamp to 1 so the ratio below never divides by zero.
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "javascript",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": False,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_rust(root, source):
    """Summarise the structure of a parsed Rust file.

    Every node below *root* is visited in document order, so functions
    inside ``impl`` blocks and modules are listed too.

    Args:
        root: Root node of the tree-sitter parse tree.
        source: File contents as bytes; used only for the line count.

    Returns:
        A dict with the keys:
            language: always ``"rust"``.
            functions: up to 50 signature strings (``name(params) -> ret``).
            classes: up to 30 ``{"name", "methods"}`` dicts, one per struct,
                enum and impl item; ``methods`` is always empty.
            imports: up to 30 ``use`` declarations as written in the source.
            line_count: number of ``"\\n"``-separated segments in the file.
            has_docstrings: always False.
            has_comments: True if at least one comment node was seen.
            comment_to_code_ratio: comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    comment_types = ("comment", "line_comment", "block_comment")

    functions = []
    classes = []
    imports = []
    comment_lines = 0

    # Explicit stack instead of recursion so deeply nested trees cannot hit
    # Python's recursion limit.  Children are pushed in reverse so nodes are
    # popped in document order.
    pending = list(reversed(root.children))
    while pending:
        node = pending.pop()
        ntype = node.type

        if ntype in comment_types:
            comment_lines += _text(node).count("\n") + 1
        elif ntype == "function_item":
            name = _text(_child_by_type(node, "identifier"))
            params = _text(_child_by_type(node, "parameters"))
            # Use the grammar's return_type field rather than a fixed list
            # of node types, so primitive, tuple, array, unit, ... return
            # types are reported as well.
            ret = node.child_by_field_name("return_type")
            sig = f"{name}{params}"
            if ret:
                sig += f" -> {_text(ret)}"
            functions.append(sig)
        elif ntype in ("struct_item", "enum_item"):
            name = _text(_child_by_type(node, "type_identifier"))
            classes.append({"name": name or _text(node)[:60], "methods": []})
        elif ntype == "impl_item":
            # For `impl Trait for Type` the first type identifier is the
            # trait; the `type` field is the type being implemented.
            name = _text(node.child_by_field_name("type"))
            classes.append({"name": name or _text(node)[:60], "methods": []})
        elif ntype == "use_declaration":
            imports.append(_text(node).strip())

        pending.extend(reversed(node.children))

    source_text = source.decode("utf-8", errors="replace")
    line_count = len(source_text.split("\n"))
    # Clamp to 1 so the ratio below never divides by zero.
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "rust",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": False,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_go(root, source):
    """Summarise the structure of a parsed Go file.

    Every node below *root* is visited in document order.

    Args:
        root: Root node of the tree-sitter parse tree.
        source: File contents as bytes; used only for the line count.

    Returns:
        A dict with the keys:
            language: always ``"go"``.
            functions: up to 50 entries; ``name(params)`` for functions and
                ``(receiver) name(params)`` for methods.
            classes: up to 30 ``{"name", "methods"}`` dicts, one per declared
                type; ``methods`` is always empty.
            imports: up to 30 import declarations as written in the source.
            line_count: number of ``"\\n"``-separated segments in the file.
            has_docstrings: always False.
            has_comments: True if at least one comment node was seen.
            comment_to_code_ratio: comment lines divided by the remaining
                lines (at least 1), rounded to two decimals.
    """
    comment_types = ("comment", "line_comment", "block_comment")

    functions = []
    classes = []
    imports = []
    comment_lines = 0

    # Explicit stack instead of recursion so deeply nested trees cannot hit
    # Python's recursion limit.  Children are pushed in reverse so nodes are
    # popped in document order.
    pending = list(reversed(root.children))
    while pending:
        node = pending.pop()
        ntype = node.type

        if ntype in comment_types:
            comment_lines += _text(node).count("\n") + 1
        elif ntype == "function_declaration":
            name = _text(_child_by_type(node, "identifier"))
            params = _text(_child_by_type(node, "parameter_list"))
            functions.append(f"{name}{params}")
        elif ntype == "method_declaration":
            # Functions with a receiver are a separate node type.  Their
            # first parameter_list is the receiver and the second holds the
            # parameters (a third, if present, is a multi-value result).
            param_lists = [c for c in node.children if c.type == "parameter_list"]
            receiver = _text(param_lists[0]) if param_lists else ""
            params = _text(param_lists[1]) if len(param_lists) > 1 else ""
            name = _text(_child_by_type(node, "field_identifier"))
            functions.append(f"{receiver} {name}{params}".strip())
        elif ntype == "type_declaration":
            # A grouped `type ( ... )` declaration holds several specs;
            # record every one of them, not just the first.
            names = []
            for spec in node.children:
                if spec.type in ("type_spec", "type_alias"):
                    spec_name = _text(_child_by_type(spec, "type_identifier"))
                    if spec_name:
                        names.append(spec_name)
            if not names:
                # No name found: fall back to the start of the declaration.
                names.append(_text(node)[:60])
            for type_name in names:
                classes.append({"name": type_name, "methods": []})
        elif ntype == "import_declaration":
            imports.append(_text(node).strip())

        pending.extend(reversed(node.children))

    source_text = source.decode("utf-8", errors="replace")
    line_count = len(source_text.split("\n"))
    # Clamp to 1 so the ratio below never divides by zero.
    code_lines = max(1, line_count - comment_lines)

    return {
        "language": "go",
        "functions": functions[:50],
        "classes": classes[:30],
        "imports": imports[:30],
        "line_count": line_count,
        "has_docstrings": False,
        "has_comments": comment_lines > 0,
        "comment_to_code_ratio": round(comment_lines / code_lines, 2),
    }
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Language handler registry
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Language name (second element of each _TS_LANGUAGES entry) -> handler
# function taking (root_node, source_bytes) and returning the structure dict.
_LANGUAGE_HANDLERS = dict(
    python=_parse_python,
    javascript=_parse_javascript,
    rust=_parse_rust,
    go=_parse_go,
)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Public API
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def parse_structure(path):
    """Parse a source file and return its structural skeleton as JSON.

    Args:
        path: Path of the file to analyse (the caller is expected to pass
            an absolute path).

    Returns:
        A JSON string (indent 2) of the structure dict produced by the
        language handler, or a plain string starting with ``"Error"`` when
        the path is not a file, the extension has no grammar, the language
        has no handler, or the file cannot be read.
    """
    if not os.path.isfile(path):
        return f"Error: '{path}' is not a file."

    _, suffix = os.path.splitext(path)
    ext = suffix.lower()

    parser_info = _get_ts_parser(ext)
    if parser_info is None:
        supported = ", ".join(sorted(_TS_LANGUAGES.keys()))
        return f"Error: no grammar for extension '{ext}'. Supported: {supported}"
    parser, lang_name = parser_info

    # Resolve the handler before touching the file so an unsupported
    # language is reported without any I/O.
    handler = _LANGUAGE_HANDLERS.get(lang_name)
    if handler is None:
        return f"Error: no handler for language '{lang_name}'."

    try:
        with open(path, "rb") as handle:
            source = handle.read()
    except OSError as exc:
        return f"Error reading file: {exc}"

    tree = parser.parse(source)
    structure = handler(tree.root_node, source)
    return json.dumps(structure, indent=2)
|