Compare commits
No commits in common. "157ac3f606bd41dc3a7650c0fe7238600ef4e7d8" and "55da7fa8dcc76c20592eef68572a7a29b449a163" have entirely different histories.
157ac3f606
...
55da7fa8dc
5 changed files with 21 additions and 180 deletions
|
|
@ -8,11 +8,7 @@ import shutil
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from luminos_lib.tree import build_tree, render_tree
|
from luminos_lib.tree import build_tree, render_tree
|
||||||
from luminos_lib.filetypes import (
|
from luminos_lib.filetypes import classify_files, summarize_categories
|
||||||
classify_files,
|
|
||||||
summarize_categories,
|
|
||||||
survey_signals,
|
|
||||||
)
|
|
||||||
from luminos_lib.code import detect_languages, find_large_files
|
from luminos_lib.code import detect_languages, find_large_files
|
||||||
from luminos_lib.recency import find_recent_files
|
from luminos_lib.recency import find_recent_files
|
||||||
from luminos_lib.disk import get_disk_usage, top_directories
|
from luminos_lib.disk import get_disk_usage, top_directories
|
||||||
|
|
@ -60,7 +56,6 @@ def scan(target, depth=3, show_hidden=False, exclude=None):
|
||||||
finish()
|
finish()
|
||||||
report["file_categories"] = summarize_categories(classified)
|
report["file_categories"] = summarize_categories(classified)
|
||||||
report["classified_files"] = classified
|
report["classified_files"] = classified
|
||||||
report["survey_signals"] = survey_signals(classified)
|
|
||||||
|
|
||||||
on_file, finish = _progress("Counting lines")
|
on_file, finish = _progress("Counting lines")
|
||||||
languages, loc = detect_languages(classified, on_file=on_file)
|
languages, loc = detect_languages(classified, on_file=on_file)
|
||||||
|
|
|
||||||
|
|
@ -996,44 +996,21 @@ def _block_to_dict(block):
|
||||||
# Synthesis pass
|
# Synthesis pass
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _format_survey_signals(signals):
|
|
||||||
"""Render the survey_signals dict as a labeled text block."""
|
|
||||||
if not signals or not signals.get("total_files"):
|
|
||||||
return "(no files classified)"
|
|
||||||
|
|
||||||
lines = [f"Total files: {signals.get('total_files', 0)}", ""]
|
|
||||||
|
|
||||||
ext_hist = signals.get("extension_histogram") or {}
|
|
||||||
if ext_hist:
|
|
||||||
lines.append("Extensions (top, by count):")
|
|
||||||
for ext, n in ext_hist.items():
|
|
||||||
lines.append(f" {ext}: {n}")
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
descs = signals.get("file_descriptions") or {}
|
|
||||||
if descs:
|
|
||||||
lines.append("file --brief output (top, by count):")
|
|
||||||
for desc, n in descs.items():
|
|
||||||
lines.append(f" {desc}: {n}")
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
samples = signals.get("filename_samples") or []
|
|
||||||
if samples:
|
|
||||||
lines.append("Filename samples (evenly drawn):")
|
|
||||||
for name in samples:
|
|
||||||
lines.append(f" {name}")
|
|
||||||
|
|
||||||
return "\n".join(lines).rstrip()
|
|
||||||
|
|
||||||
|
|
||||||
def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
||||||
"""Run the reconnaissance survey pass.
|
"""Run the reconnaissance survey pass.
|
||||||
|
|
||||||
Returns a survey dict on success, or None on failure / out-of-turns.
|
Returns a survey dict on success, or None on failure / out-of-turns.
|
||||||
Survey is advisory — callers must treat None as "no survey context".
|
Survey is advisory — callers must treat None as "no survey context".
|
||||||
"""
|
"""
|
||||||
signals = report.get("survey_signals") or {}
|
categories = report.get("file_categories", {}) or {}
|
||||||
survey_signals_text = _format_survey_signals(signals)
|
if categories:
|
||||||
|
ftd_lines = [
|
||||||
|
f" {cat}: {n}"
|
||||||
|
for cat, n in sorted(categories.items(), key=lambda kv: -kv[1])
|
||||||
|
]
|
||||||
|
file_type_distribution = "\n".join(ftd_lines)
|
||||||
|
else:
|
||||||
|
file_type_distribution = " (no files classified)"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tree_node = build_tree(target, max_depth=2)
|
tree_node = build_tree(target, max_depth=2)
|
||||||
|
|
@ -1046,7 +1023,7 @@ def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
||||||
|
|
||||||
system = _SURVEY_SYSTEM_PROMPT.format(
|
system = _SURVEY_SYSTEM_PROMPT.format(
|
||||||
target=target,
|
target=target,
|
||||||
survey_signals=survey_signals_text,
|
file_type_distribution=file_type_distribution,
|
||||||
tree_preview=tree_preview,
|
tree_preview=tree_preview,
|
||||||
available_tools=available_tools,
|
available_tools=available_tools,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -128,64 +128,3 @@ def summarize_categories(classified):
|
||||||
cat = f["category"]
|
cat = f["category"]
|
||||||
summary[cat] = summary.get(cat, 0) + 1
|
summary[cat] = summary.get(cat, 0) + 1
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
|
||||||
# Maximum number of entries kept in each histogram returned by survey_signals.
_SURVEY_TOP_N = 20
# Descriptions longer than this many characters are cut and suffixed with "...".
_SURVEY_DESC_TRUNCATE = 80


def survey_signals(classified, max_samples=20):
    """Return raw, unbucketed signals for the AI survey pass.

    Unlike `summarize_categories`, which collapses files into a small
    biased taxonomy, this exposes the primary signals so the survey
    LLM can characterize the target without being misled by the
    classifier's source-code bias.

    See #42 for the rationale and #48 for the unit-of-analysis
    limitation: the unit here is still "file" -- containers like mbox,
    SQLite, and zip will under-count, while dense file collections like
    Maildir will over-count.

    Args:
        classified: sequence of per-file dicts; only the "name" and
            "description" keys are read here, and either may be missing
            or None.
        max_samples: upper bound on the number of filename samples; a
            value <= 0 disables sampling.

    Returns a dict with:
      total_files         -- total count
      extension_histogram -- {ext: count}, top _SURVEY_TOP_N by count
      file_descriptions   -- {description: count}, top _SURVEY_TOP_N by count
      filename_samples    -- up to max_samples filenames, evenly drawn
    """
    total = len(classified)

    ext_counts = {}
    desc_counts = {}
    for f in classified:
        # `or ""` (rather than a .get default) also covers an explicit
        # None name, matching how descriptions are handled below.
        name = f.get("name") or ""
        ext = os.path.splitext(name)[1].lower() or "(none)"
        ext_counts[ext] = ext_counts.get(ext, 0) + 1

        desc = (f.get("description") or "").strip()
        if desc:
            if len(desc) > _SURVEY_DESC_TRUNCATE:
                desc = desc[:_SURVEY_DESC_TRUNCATE] + "..."
            desc_counts[desc] = desc_counts.get(desc, 0) + 1

    def _top(d):
        # Highest count first; ties broken alphabetically so the output
        # is deterministic.
        items = sorted(d.items(), key=lambda kv: (-kv[1], kv[0]))
        return dict(items[:_SURVEY_TOP_N])

    if total > 0 and max_samples > 0:
        if total <= max_samples:
            picked = classified
        else:
            # Integer floor division gives exactly floor(i * total /
            # max_samples) with no float rounding; the largest index is
            # (max_samples - 1) * total // max_samples, which is < total.
            picked = [
                classified[i * total // max_samples]
                for i in range(max_samples)
            ]
        samples = [f.get("name") or "" for f in picked]
    else:
        samples = []

    return {
        "total_files": total,
        "extension_histogram": _top(ext_counts),
        "file_descriptions": _top(desc_counts),
        "filename_samples": samples,
    }
|
|
||||||
|
|
|
||||||
|
|
@ -131,22 +131,17 @@ Answer three questions about the target: {target}
|
||||||
## Inputs
|
## Inputs
|
||||||
You have exactly two signals. Do not ask for more.
|
You have exactly two signals. Do not ask for more.
|
||||||
|
|
||||||
File-level signals (raw, unbucketed):
|
File type distribution (counts by category):
|
||||||
{survey_signals}
|
{file_type_distribution}
|
||||||
|
|
||||||
These signals are intentionally raw. The extension histogram and
|
IMPORTANT: the file type distribution is produced by a classifier
|
||||||
the `file --brief` descriptions reflect what is actually on disk,
|
that is biased toward source code. Its categories are: source,
|
||||||
without any taxonomy collapsing distinct content into one bucket.
|
config, data, document, media, archive, unknown. It has NO concept
|
||||||
Use them together: an extension alone can mislead (`.txt` could be
|
of mail, notebooks, calendars, contacts, ledgers, photo libraries,
|
||||||
notes, logs, or message bodies); the `file` command output and
|
or other personal-data domains — anything text-shaped tends to be
|
||||||
filename samples disambiguate.
|
labeled `source` even when it is not code. If the tree preview
|
||||||
|
suggests a non-code target, trust the tree over the histogram and
|
||||||
Note on units: each signal counts filesystem files. Some targets
|
say so in `domain_notes`.
|
||||||
have a different natural unit — a Maildir is one logical mailbox
|
|
||||||
with thousands of message files; an mbox is one file containing
|
|
||||||
many messages; an archive is one file containing many entries. If
|
|
||||||
the signals point at a container shape, name it in `description`
|
|
||||||
and `domain_notes` even though the count is in files.
|
|
||||||
|
|
||||||
Top-level tree (2 levels deep):
|
Top-level tree (2 levels deep):
|
||||||
{tree_preview}
|
{tree_preview}
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,6 @@ from luminos_lib.filetypes import (
|
||||||
_classify_one,
|
_classify_one,
|
||||||
classify_files,
|
classify_files,
|
||||||
summarize_categories,
|
summarize_categories,
|
||||||
survey_signals,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -141,69 +140,5 @@ class TestClassifyFiles(unittest.TestCase):
|
||||||
self.assertGreater(item["size"], 0)
|
self.assertGreater(item["size"], 0)
|
||||||
|
|
||||||
|
|
||||||
class TestSurveySignals(unittest.TestCase):
    """Unit tests for survey_signals: counts, histograms, and sampling."""

    def _f(self, name, description="", category="source"):
        """Build a minimal classified-file dict for use as test input."""
        return {"name": name, "path": f"/x/{name}", "category": category,
                "size": 10, "description": description}

    def test_empty_input(self):
        s = survey_signals([])
        self.assertEqual(s["total_files"], 0)
        self.assertEqual(s["extension_histogram"], {})
        self.assertEqual(s["file_descriptions"], {})
        self.assertEqual(s["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        files = [
            self._f("a.PY"), self._f("b.py"), self._f("c.py"),
            self._f("README"), self._f("Makefile"),
        ]
        s = survey_signals(files)
        self.assertEqual(s["extension_histogram"][".py"], 3)
        self.assertEqual(s["extension_histogram"]["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        long_desc = "x" * 200
        files = [
            self._f("a.py", "Python script, ASCII text"),
            self._f("b.py", "Python script, ASCII text"),
            self._f("c.bin", long_desc),
        ]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"]["Python script, ASCII text"], 2)
        # The long description was truncated and still counted once
        truncated_keys = [
            k for k in s["file_descriptions"]
            if k.startswith("xxx") and k.endswith("...")
        ]
        self.assertEqual(len(truncated_keys), 1)
        # Exactly 80 kept characters plus the 3-character "..." suffix.
        self.assertEqual(len(truncated_keys[0]), 83)
        self.assertEqual(s["file_descriptions"][truncated_keys[0]], 1)

    def test_descriptions_skipped_when_empty(self):
        files = [self._f("a.py", ""), self._f("b.py", None)]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        files = [self._f(f"f{i}.ext{i}") for i in range(50)]
        s = survey_signals(files)
        self.assertEqual(len(s["extension_histogram"]), 20)

    def test_filename_samples_evenly_drawn(self):
        files = [self._f(f"file_{i:04d}.txt") for i in range(100)]
        s = survey_signals(files, max_samples=10)
        # 100 files / 10 samples -> stride 10, so indices 0, 10, ..., 90.
        expected = [f"file_{i:04d}.txt" for i in range(0, 100, 10)]
        self.assertEqual(s["filename_samples"], expected)

    def test_filename_samples_returns_all_when_under_cap(self):
        files = [self._f(f"f{i}.txt") for i in range(5)]
        s = survey_signals(files, max_samples=20)
        # Under the cap every name comes back, in input order.
        self.assertEqual(
            s["filename_samples"], [f"f{i}.txt" for i in range(5)]
        )

    def test_total_files_matches_input(self):
        files = [self._f(f"f{i}.py") for i in range(7)]
        self.assertEqual(survey_signals(files)["total_files"], 7)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue