Compare commits
No commits in common. "f68bbb1052b1a35bc122aefcbbd2be4096ea49ae" and "5ea6879ba03005b941e363ba66e62449ad1da781" have entirely different histories.
f68bbb1052
...
5ea6879ba0
6 changed files with 0 additions and 192 deletions
21
cli/main.py
21
cli/main.py
|
|
@ -302,27 +302,6 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None:
|
||||||
|
|
||||||
render_trace(entries, trace_id, console)
|
render_trace(entries, trace_id, console)
|
||||||
|
|
||||||
# Issue #54: if the agent persisted a sibling .result.json, render
|
|
||||||
# the full structured ResearchResult underneath the step log so
|
|
||||||
# replay can show which gaps fired, which sources were cited, etc.
|
|
||||||
result_path = path.parent / f"{trace_id}.result.json"
|
|
||||||
if result_path.exists():
|
|
||||||
try:
|
|
||||||
result = ResearchResult.model_validate_json(
|
|
||||||
result_path.read_text(encoding="utf-8")
|
|
||||||
)
|
|
||||||
except Exception as exc:
|
|
||||||
console.print(
|
|
||||||
f"[yellow]warning:[/yellow] could not parse {result_path.name}: {exc}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
console.print()
|
|
||||||
render_result(result, console)
|
|
||||||
else:
|
|
||||||
console.print(
|
|
||||||
"[dim]No persisted result file alongside this trace.[/dim]"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# costs command
|
# costs command
|
||||||
|
|
|
||||||
|
|
@ -351,44 +351,6 @@ class WebResearcher:
|
||||||
tokens_input += synth_in
|
tokens_input += synth_in
|
||||||
tokens_output += synth_out
|
tokens_output += synth_out
|
||||||
|
|
||||||
# Issue #54 (b): emit one trace event per gap/citation/discovery so
|
|
||||||
# the JSONL stream contains the actual categories alongside the
|
|
||||||
# existing summary counts. Cheap and gives us a queryable timeline.
|
|
||||||
for c in result.citations:
|
|
||||||
trace.log_step(
|
|
||||||
"citation_recorded",
|
|
||||||
decision="Citation kept in final result",
|
|
||||||
source=c.source,
|
|
||||||
locator=c.locator,
|
|
||||||
title=c.title,
|
|
||||||
confidence=c.confidence,
|
|
||||||
)
|
|
||||||
for g in result.gaps:
|
|
||||||
trace.log_step(
|
|
||||||
"gap_recorded",
|
|
||||||
decision="Gap surfaced in final result",
|
|
||||||
category=g.category.value,
|
|
||||||
topic=g.topic,
|
|
||||||
detail=g.detail,
|
|
||||||
)
|
|
||||||
for d in result.discovery_events:
|
|
||||||
trace.log_step(
|
|
||||||
"discovery_recorded",
|
|
||||||
decision="Discovery event surfaced in final result",
|
|
||||||
type=d.type,
|
|
||||||
suggested_researcher=d.suggested_researcher,
|
|
||||||
query=d.query,
|
|
||||||
reason=d.reason,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Issue #54 (a): persist the full ResearchResult next to the trace
|
|
||||||
# so replay and downstream analysis can recover the structured
|
|
||||||
# contract, not just counts.
|
|
||||||
try:
|
|
||||||
trace.write_result(result)
|
|
||||||
except Exception as write_err:
|
|
||||||
log.warning("trace_result_write_failed", error=str(write_err))
|
|
||||||
|
|
||||||
trace.log_step(
|
trace.log_step(
|
||||||
"complete",
|
"complete",
|
||||||
decision="Research complete",
|
decision="Research complete",
|
||||||
|
|
|
||||||
|
|
@ -79,7 +79,6 @@ class TraceLogger:
|
||||||
)
|
)
|
||||||
self.trace_dir.mkdir(parents=True, exist_ok=True)
|
self.trace_dir.mkdir(parents=True, exist_ok=True)
|
||||||
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
|
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
|
||||||
self.result_path = self.trace_dir / f"{self.trace_id}.result.json"
|
|
||||||
self._step_counter = 0
|
self._step_counter = 0
|
||||||
self._file = None
|
self._file = None
|
||||||
# action_name -> monotonic start time, populated by starter
|
# action_name -> monotonic start time, populated by starter
|
||||||
|
|
@ -149,25 +148,6 @@ class TraceLogger:
|
||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
def write_result(self, result: Any) -> None:
|
|
||||||
"""Persist the final ResearchResult JSON next to the trace.
|
|
||||||
|
|
||||||
Issue #54: the JSONL trace records step events and final counts
|
|
||||||
only. Without the structured result on disk, replay can't show
|
|
||||||
which gaps fired or which sources were cited, and downstream
|
|
||||||
analysis (M3.2/M3.3) is impossible. We dump the pydantic model
|
|
||||||
to ``<trace_id>.result.json`` so the full contract survives the
|
|
||||||
process.
|
|
||||||
"""
|
|
||||||
# Accept either a pydantic model or a plain dict to keep the
|
|
||||||
# logger decoupled from the models module (avoids a circular
|
|
||||||
# import path).
|
|
||||||
if hasattr(result, "model_dump_json"):
|
|
||||||
payload = result.model_dump_json(indent=2)
|
|
||||||
else:
|
|
||||||
payload = json.dumps(result, indent=2, default=str)
|
|
||||||
self.result_path.write_text(payload, encoding="utf-8")
|
|
||||||
|
|
||||||
def read_entries(self) -> list[dict]:
|
def read_entries(self) -> list[dict]:
|
||||||
"""Read all entries from the trace file.
|
"""Read all entries from the trace file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -213,30 +213,6 @@ class TestWebResearcher:
|
||||||
assert result.cost_metadata.tokens_used > 0
|
assert result.cost_metadata.tokens_used > 0
|
||||||
assert result.trace_id is not None
|
assert result.trace_id is not None
|
||||||
|
|
||||||
# Issue #54 (a): full result is persisted next to the trace
|
|
||||||
from pathlib import Path
|
|
||||||
result_file = Path(tmp) / f"{result.trace_id}.result.json"
|
|
||||||
assert result_file.exists()
|
|
||||||
persisted = ResearchResult.model_validate_json(
|
|
||||||
result_file.read_text()
|
|
||||||
)
|
|
||||||
assert persisted.answer == result.answer
|
|
||||||
assert len(persisted.gaps) == 1
|
|
||||||
assert persisted.gaps[0].topic == "pest management"
|
|
||||||
|
|
||||||
# Issue #54 (b): per-item events are emitted in the trace
|
|
||||||
trace_file = Path(tmp) / f"{result.trace_id}.jsonl"
|
|
||||||
entries = [
|
|
||||||
json.loads(l) for l in trace_file.read_text().splitlines() if l
|
|
||||||
]
|
|
||||||
actions = [e["action"] for e in entries]
|
|
||||||
assert "gap_recorded" in actions
|
|
||||||
assert "citation_recorded" in actions
|
|
||||||
assert "discovery_recorded" in actions
|
|
||||||
gap_event = next(e for e in entries if e["action"] == "gap_recorded")
|
|
||||||
assert gap_event["category"] == "source_not_found"
|
|
||||||
assert gap_event["topic"] == "pest management"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_budget_exhaustion(self):
|
async def test_budget_exhaustion(self):
|
||||||
"""Test that the loop stops when token budget is reached."""
|
"""Test that the loop stops when token budget is reached."""
|
||||||
|
|
|
||||||
|
|
@ -180,74 +180,6 @@ class TestReplayCommand:
|
||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
assert "invalid JSON" in result.output
|
assert "invalid JSON" in result.output
|
||||||
|
|
||||||
def test_replay_renders_persisted_result(self, tmp_path):
|
|
||||||
"""Issue #54: replay loads <id>.result.json sibling and renders it."""
|
|
||||||
runner = CliRunner()
|
|
||||||
self._write_trace(tmp_path)
|
|
||||||
result_payload = {
|
|
||||||
"answer": "Test answer about Utah crops.",
|
|
||||||
"citations": [
|
|
||||||
{
|
|
||||||
"source": "web",
|
|
||||||
"locator": "https://example.com/utah",
|
|
||||||
"title": "Utah Guide",
|
|
||||||
"snippet": None,
|
|
||||||
"raw_excerpt": "raw excerpt content",
|
|
||||||
"confidence": 0.9,
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"gaps": [
|
|
||||||
{
|
|
||||||
"topic": "irrigation",
|
|
||||||
"category": "scope_exceeded",
|
|
||||||
"detail": "out of scope",
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"discovery_events": [],
|
|
||||||
"open_questions": [],
|
|
||||||
"confidence": 0.8,
|
|
||||||
"confidence_factors": {
|
|
||||||
"num_corroborating_sources": 2,
|
|
||||||
"source_authority": "high",
|
|
||||||
"contradiction_detected": False,
|
|
||||||
"query_specificity_match": 0.8,
|
|
||||||
"budget_exhausted": False,
|
|
||||||
"recency": "current",
|
|
||||||
},
|
|
||||||
"cost_metadata": {
|
|
||||||
"tokens_used": 1000,
|
|
||||||
"iterations_run": 2,
|
|
||||||
"wall_time_sec": 12.5,
|
|
||||||
"budget_exhausted": False,
|
|
||||||
"model_id": "claude-test",
|
|
||||||
},
|
|
||||||
"trace_id": "trace-xyz",
|
|
||||||
}
|
|
||||||
import json as _j
|
|
||||||
(tmp_path / "trace-xyz.result.json").write_text(_j.dumps(result_payload))
|
|
||||||
|
|
||||||
result = runner.invoke(
|
|
||||||
cli,
|
|
||||||
["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
|
|
||||||
)
|
|
||||||
assert result.exit_code == 0, result.output
|
|
||||||
# Step log still rendered
|
|
||||||
assert "search" in result.output
|
|
||||||
# Persisted result also rendered
|
|
||||||
assert "Test answer about Utah crops" in result.output
|
|
||||||
assert "scope_exceeded" in result.output
|
|
||||||
assert "irrigation" in result.output
|
|
||||||
|
|
||||||
def test_replay_without_result_file_notes_absence(self, tmp_path):
|
|
||||||
runner = CliRunner()
|
|
||||||
self._write_trace(tmp_path)
|
|
||||||
result = runner.invoke(
|
|
||||||
cli,
|
|
||||||
["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
|
|
||||||
)
|
|
||||||
assert result.exit_code == 0
|
|
||||||
assert "No persisted result file" in result.output
|
|
||||||
|
|
||||||
def test_render_trace_empty(self):
|
def test_render_trace_empty(self):
|
||||||
console = Console(record=True, width=120)
|
console = Console(record=True, width=120)
|
||||||
render_trace([], "empty-trace", console)
|
render_trace([], "empty-trace", console)
|
||||||
|
|
|
||||||
|
|
@ -34,27 +34,6 @@ class TestTraceLogger:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
logger = self._make_logger(tmp, trace_id="test-123")
|
logger = self._make_logger(tmp, trace_id="test-123")
|
||||||
assert str(logger.file_path).endswith("test-123.jsonl")
|
assert str(logger.file_path).endswith("test-123.jsonl")
|
||||||
assert str(logger.result_path).endswith("test-123.result.json")
|
|
||||||
|
|
||||||
def test_write_result_persists_pydantic_model(self):
|
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
|
||||||
logger = self._make_logger(tmp, trace_id="result-test")
|
|
||||||
|
|
||||||
class Stub:
|
|
||||||
def model_dump_json(self, indent=None):
|
|
||||||
return '{"answer": "hi", "gaps": []}'
|
|
||||||
|
|
||||||
logger.write_result(Stub())
|
|
||||||
assert logger.result_path.exists()
|
|
||||||
data = json.loads(logger.result_path.read_text())
|
|
||||||
assert data["answer"] == "hi"
|
|
||||||
assert data["gaps"] == []
|
|
||||||
|
|
||||||
def test_write_result_accepts_dict(self):
|
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
|
||||||
logger = self._make_logger(tmp, trace_id="dict-test")
|
|
||||||
logger.write_result({"foo": "bar"})
|
|
||||||
assert json.loads(logger.result_path.read_text()) == {"foo": "bar"}
|
|
||||||
|
|
||||||
def test_log_step_creates_file(self):
|
def test_log_step_creates_file(self):
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue