Compare commits
2 commits
55da7fa8dc
...
157ac3f606
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
157ac3f606 | ||
|
|
f3abbce7d4 |
5 changed files with 180 additions and 21 deletions
|
|
@ -8,7 +8,11 @@ import shutil
|
|||
import sys
|
||||
|
||||
from luminos_lib.tree import build_tree, render_tree
|
||||
from luminos_lib.filetypes import classify_files, summarize_categories
|
||||
from luminos_lib.filetypes import (
|
||||
classify_files,
|
||||
summarize_categories,
|
||||
survey_signals,
|
||||
)
|
||||
from luminos_lib.code import detect_languages, find_large_files
|
||||
from luminos_lib.recency import find_recent_files
|
||||
from luminos_lib.disk import get_disk_usage, top_directories
|
||||
|
|
@ -56,6 +60,7 @@ def scan(target, depth=3, show_hidden=False, exclude=None):
|
|||
finish()
|
||||
report["file_categories"] = summarize_categories(classified)
|
||||
report["classified_files"] = classified
|
||||
report["survey_signals"] = survey_signals(classified)
|
||||
|
||||
on_file, finish = _progress("Counting lines")
|
||||
languages, loc = detect_languages(classified, on_file=on_file)
|
||||
|
|
|
|||
|
|
@ -996,21 +996,44 @@ def _block_to_dict(block):
|
|||
# Synthesis pass
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _format_survey_signals(signals):
|
||||
"""Render the survey_signals dict as a labeled text block."""
|
||||
if not signals or not signals.get("total_files"):
|
||||
return "(no files classified)"
|
||||
|
||||
lines = [f"Total files: {signals.get('total_files', 0)}", ""]
|
||||
|
||||
ext_hist = signals.get("extension_histogram") or {}
|
||||
if ext_hist:
|
||||
lines.append("Extensions (top, by count):")
|
||||
for ext, n in ext_hist.items():
|
||||
lines.append(f" {ext}: {n}")
|
||||
lines.append("")
|
||||
|
||||
descs = signals.get("file_descriptions") or {}
|
||||
if descs:
|
||||
lines.append("file --brief output (top, by count):")
|
||||
for desc, n in descs.items():
|
||||
lines.append(f" {desc}: {n}")
|
||||
lines.append("")
|
||||
|
||||
samples = signals.get("filename_samples") or []
|
||||
if samples:
|
||||
lines.append("Filename samples (evenly drawn):")
|
||||
for name in samples:
|
||||
lines.append(f" {name}")
|
||||
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
|
||||
def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
||||
"""Run the reconnaissance survey pass.
|
||||
|
||||
Returns a survey dict on success, or None on failure / out-of-turns.
|
||||
Survey is advisory — callers must treat None as "no survey context".
|
||||
"""
|
||||
categories = report.get("file_categories", {}) or {}
|
||||
if categories:
|
||||
ftd_lines = [
|
||||
f" {cat}: {n}"
|
||||
for cat, n in sorted(categories.items(), key=lambda kv: -kv[1])
|
||||
]
|
||||
file_type_distribution = "\n".join(ftd_lines)
|
||||
else:
|
||||
file_type_distribution = " (no files classified)"
|
||||
signals = report.get("survey_signals") or {}
|
||||
survey_signals_text = _format_survey_signals(signals)
|
||||
|
||||
try:
|
||||
tree_node = build_tree(target, max_depth=2)
|
||||
|
|
@ -1023,7 +1046,7 @@ def _run_survey(client, target, report, tracker, max_turns=3, verbose=False):
|
|||
|
||||
system = _SURVEY_SYSTEM_PROMPT.format(
|
||||
target=target,
|
||||
file_type_distribution=file_type_distribution,
|
||||
survey_signals=survey_signals_text,
|
||||
tree_preview=tree_preview,
|
||||
available_tools=available_tools,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -128,3 +128,64 @@ def summarize_categories(classified):
|
|||
cat = f["category"]
|
||||
summary[cat] = summary.get(cat, 0) + 1
|
||||
return summary
|
||||
|
||||
|
||||
_SURVEY_TOP_N = 20  # max entries kept in each histogram
_SURVEY_DESC_TRUNCATE = 80  # description length cap before "..." is appended


def survey_signals(classified, max_samples=20):
    """Return raw, unbucketed signals for the AI survey pass.

    Unlike `summarize_categories`, which collapses files into a small
    biased taxonomy, this exposes the primary signals so the survey
    LLM can characterize the target without being misled by the
    classifier's source-code bias.

    See #42 for the rationale and #48 for the unit-of-analysis
    limitation: the unit here is still "file" — containers like mbox,
    SQLite, and zip will under-count, while dense file collections like
    Maildir will over-count.

    `classified` is a sequence of dicts; only the "name" and
    "description" keys are read. A missing or None value for either is
    treated as an empty string.

    Returns a dict with:
        total_files — total count
        extension_histogram — {ext: count}, top _SURVEY_TOP_N by count
        file_descriptions — {description: count}, top _SURVEY_TOP_N by count
        filename_samples — up to max_samples filenames, evenly drawn
    """
    total = len(classified)

    ext_counts = {}
    desc_counts = {}
    for f in classified:
        # `or ""` (not a .get default) so an explicit None name is
        # tolerated the same way an explicit None description is.
        name = f.get("name") or ""
        ext = os.path.splitext(name)[1].lower() or "(none)"
        ext_counts[ext] = ext_counts.get(ext, 0) + 1

        desc = (f.get("description") or "").strip()
        if desc:
            if len(desc) > _SURVEY_DESC_TRUNCATE:
                desc = desc[:_SURVEY_DESC_TRUNCATE] + "..."
            desc_counts[desc] = desc_counts.get(desc, 0) + 1

    def _top(d):
        # Highest count first; ties broken by key so output is stable.
        items = sorted(d.items(), key=lambda kv: (-kv[1], kv[0]))
        return dict(items[:_SURVEY_TOP_N])

    if total > 0 and max_samples > 0:
        if total <= max_samples:
            picked = classified
        else:
            # Integer floor division keeps every index exact; no float
            # stride whose rounding could select a neighbouring entry.
            picked = [
                classified[i * total // max_samples]
                for i in range(max_samples)
            ]
        samples = [f.get("name") or "" for f in picked]
    else:
        samples = []

    return {
        "total_files": total,
        "extension_histogram": _top(ext_counts),
        "file_descriptions": _top(desc_counts),
        "filename_samples": samples,
    }
|
||||
|
|
|
|||
|
|
@ -131,17 +131,22 @@ Answer three questions about the target: {target}
|
|||
## Inputs
|
||||
You have exactly two signals. Do not ask for more.
|
||||
|
||||
File type distribution (counts by category):
|
||||
{file_type_distribution}
|
||||
File-level signals (raw, unbucketed):
|
||||
{survey_signals}
|
||||
|
||||
IMPORTANT: the file type distribution is produced by a classifier
|
||||
that is biased toward source code. Its categories are: source,
|
||||
config, data, document, media, archive, unknown. It has NO concept
|
||||
of mail, notebooks, calendars, contacts, ledgers, photo libraries,
|
||||
or other personal-data domains — anything text-shaped tends to be
|
||||
labeled `source` even when it is not code. If the tree preview
|
||||
suggests a non-code target, trust the tree over the histogram and
|
||||
say so in `domain_notes`.
|
||||
These signals are intentionally raw. The extension histogram and
|
||||
the `file --brief` descriptions reflect what is actually on disk,
|
||||
without any taxonomy collapsing distinct content into one bucket.
|
||||
Use them together: an extension alone can mislead (`.txt` could be
|
||||
notes, logs, or message bodies); the `file` command output and
|
||||
filename samples disambiguate.
|
||||
|
||||
Note on units: each signal counts filesystem files. Some targets
|
||||
have a different natural unit — a Maildir is one logical mailbox
|
||||
with thousands of message files; an mbox is one file containing
|
||||
many messages; an archive is one file containing many entries. If
|
||||
the signals point at a container shape, name it in `description`
|
||||
and `domain_notes` even though the count is in files.
|
||||
|
||||
Top-level tree (2 levels deep):
|
||||
{tree_preview}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from luminos_lib.filetypes import (
|
|||
_classify_one,
|
||||
classify_files,
|
||||
summarize_categories,
|
||||
survey_signals,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -140,5 +141,69 @@ class TestClassifyFiles(unittest.TestCase):
|
|||
self.assertGreater(item["size"], 0)
|
||||
|
||||
|
||||
class TestSurveySignals(unittest.TestCase):
    """Unit tests for `survey_signals` using hand-built classified entries."""

    def _f(self, name, description="", category="source"):
        """Build a minimal classified-file dict for use as test input."""
        return {"name": name, "path": f"/x/{name}", "category": category,
                "size": 10, "description": description}

    def test_empty_input(self):
        s = survey_signals([])
        self.assertEqual(s["total_files"], 0)
        self.assertEqual(s["extension_histogram"], {})
        self.assertEqual(s["file_descriptions"], {})
        self.assertEqual(s["filename_samples"], [])

    def test_extension_histogram_uses_lowercase_and_keeps_none(self):
        files = [
            self._f("a.PY"), self._f("b.py"), self._f("c.py"),
            self._f("README"), self._f("Makefile"),
        ]
        s = survey_signals(files)
        self.assertEqual(s["extension_histogram"][".py"], 3)
        self.assertEqual(s["extension_histogram"]["(none)"], 2)

    def test_file_descriptions_aggregated_and_truncated(self):
        long_desc = "x" * 200
        files = [
            self._f("a.py", "Python script, ASCII text"),
            self._f("b.py", "Python script, ASCII text"),
            self._f("c.bin", long_desc),
        ]
        s = survey_signals(files)
        self.assertEqual(
            s["file_descriptions"]["Python script, ASCII text"], 2)
        # The long description was truncated and still counted once
        truncated_keys = [
            k for k in s["file_descriptions"]
            if k.startswith("xxx") and k.endswith("...")
        ]
        self.assertEqual(len(truncated_keys), 1)
        # Exactly 80 kept characters plus the 3-character "..." marker;
        # an upper bound alone would let an off-by-one slip through.
        self.assertEqual(len(truncated_keys[0]), 83)
        self.assertEqual(s["file_descriptions"][truncated_keys[0]], 1)

    def test_descriptions_skipped_when_empty(self):
        files = [self._f("a.py", ""), self._f("b.py", None)]
        s = survey_signals(files)
        self.assertEqual(s["file_descriptions"], {})

    def test_top_n_caps_at_20(self):
        # Distinct extension AND description per file, so both
        # histograms are pushed past the cap.
        files = [self._f(f"f{i}.ext{i}", f"desc {i}") for i in range(50)]
        s = survey_signals(files)
        self.assertEqual(len(s["extension_histogram"]), 20)
        self.assertEqual(len(s["file_descriptions"]), 20)

    def test_filename_samples_evenly_drawn(self):
        files = [self._f(f"file_{i:04d}.txt") for i in range(100)]
        s = survey_signals(files, max_samples=10)
        self.assertEqual(len(s["filename_samples"]), 10)
        # First sample is the first file (stride 10, index 0)
        self.assertEqual(s["filename_samples"][0], "file_0000.txt")
        # Last sample is around index 90, not 99
        self.assertTrue(s["filename_samples"][-1].startswith("file_009"))

    def test_filename_samples_returns_all_when_under_cap(self):
        files = [self._f(f"f{i}.txt") for i in range(5)]
        s = survey_signals(files, max_samples=20)
        self.assertEqual(len(s["filename_samples"]), 5)

    def test_total_files_matches_input(self):
        files = [self._f(f"f{i}.py") for i in range(7)]
        self.assertEqual(survey_signals(files)["total_files"], 7)


if __name__ == "__main__":
    unittest.main()
|
||||
|
|
|
|||
Loading…
Reference in a new issue