fix(observability): persist full ResearchResult and per-item trace events #56
6 changed files with 192 additions and 0 deletions
21
cli/main.py
21
cli/main.py
|
|
@@ -302,6 +302,27 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None:
|
||||||
|
|
||||||
render_trace(entries, trace_id, console)
|
render_trace(entries, trace_id, console)
|
||||||
|
|
||||||
|
# Issue #54: if the agent persisted a sibling .result.json, render
|
||||||
|
# the full structured ResearchResult underneath the step log so
|
||||||
|
# replay can show which gaps fired, which sources were cited, etc.
|
||||||
|
result_path = path.parent / f"{trace_id}.result.json"
|
||||||
|
if result_path.exists():
|
||||||
|
try:
|
||||||
|
result = ResearchResult.model_validate_json(
|
||||||
|
result_path.read_text(encoding="utf-8")
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
console.print(
|
||||||
|
f"[yellow]warning:[/yellow] could not parse {result_path.name}: {exc}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
console.print()
|
||||||
|
render_result(result, console)
|
||||||
|
else:
|
||||||
|
console.print(
|
||||||
|
"[dim]No persisted result file alongside this trace.[/dim]"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# costs command
|
# costs command
|
||||||
|
|
|
||||||
|
|
@@ -351,6 +351,44 @@ class WebResearcher:
|
||||||
tokens_input += synth_in
|
tokens_input += synth_in
|
||||||
tokens_output += synth_out
|
tokens_output += synth_out
|
||||||
|
|
||||||
|
# Issue #54 (b): emit one trace event per gap/citation/discovery so
|
||||||
|
# the JSONL stream contains the actual categories alongside the
|
||||||
|
# existing summary counts. Cheap and gives us a queryable timeline.
|
||||||
|
for c in result.citations:
|
||||||
|
trace.log_step(
|
||||||
|
"citation_recorded",
|
||||||
|
decision="Citation kept in final result",
|
||||||
|
source=c.source,
|
||||||
|
locator=c.locator,
|
||||||
|
title=c.title,
|
||||||
|
confidence=c.confidence,
|
||||||
|
)
|
||||||
|
for g in result.gaps:
|
||||||
|
trace.log_step(
|
||||||
|
"gap_recorded",
|
||||||
|
decision="Gap surfaced in final result",
|
||||||
|
category=g.category.value,
|
||||||
|
topic=g.topic,
|
||||||
|
detail=g.detail,
|
||||||
|
)
|
||||||
|
for d in result.discovery_events:
|
||||||
|
trace.log_step(
|
||||||
|
"discovery_recorded",
|
||||||
|
decision="Discovery event surfaced in final result",
|
||||||
|
type=d.type,
|
||||||
|
suggested_researcher=d.suggested_researcher,
|
||||||
|
query=d.query,
|
||||||
|
reason=d.reason,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Issue #54 (a): persist the full ResearchResult next to the trace
|
||||||
|
# so replay and downstream analysis can recover the structured
|
||||||
|
# contract, not just counts.
|
||||||
|
try:
|
||||||
|
trace.write_result(result)
|
||||||
|
except Exception as write_err:
|
||||||
|
log.warning("trace_result_write_failed", error=str(write_err))
|
||||||
|
|
||||||
trace.log_step(
|
trace.log_step(
|
||||||
"complete",
|
"complete",
|
||||||
decision="Research complete",
|
decision="Research complete",
|
||||||
|
|
|
||||||
|
|
@@ -79,6 +79,7 @@ class TraceLogger:
|
||||||
)
|
)
|
||||||
self.trace_dir.mkdir(parents=True, exist_ok=True)
|
self.trace_dir.mkdir(parents=True, exist_ok=True)
|
||||||
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
|
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
|
||||||
|
self.result_path = self.trace_dir / f"{self.trace_id}.result.json"
|
||||||
self._step_counter = 0
|
self._step_counter = 0
|
||||||
self._file = None
|
self._file = None
|
||||||
# action_name -> monotonic start time, populated by starter
|
# action_name -> monotonic start time, populated by starter
|
||||||
|
|
@@ -148,6 +149,25 @@ class TraceLogger:
|
||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
def write_result(self, result: Any) -> None:
    """Persist the final ResearchResult JSON next to the trace.

    Issue #54: the JSONL trace records step events and final counts
    only. Without the structured result on disk, replay can't show
    which gaps fired or which sources were cited, and downstream
    analysis (M3.2/M3.3) is impossible. We dump the pydantic model
    to ``<trace_id>.result.json`` so the full contract survives the
    process.

    Args:
        result: either a pydantic model (anything exposing
            ``model_dump_json``) or a plain JSON-serializable object.
    """
    # Accept either a pydantic model or a plain dict to keep the
    # logger decoupled from the models module (avoids a circular
    # import path).
    if hasattr(result, "model_dump_json"):
        payload = result.model_dump_json(indent=2)
    else:
        # default=str stringifies non-JSON types (datetimes, enums)
        # rather than raising, matching the best-effort intent here.
        payload = json.dumps(result, indent=2, default=str)
    self.result_path.write_text(payload, encoding="utf-8")
|
||||||
|
|
||||||
def read_entries(self) -> list[dict]:
|
def read_entries(self) -> list[dict]:
|
||||||
"""Read all entries from the trace file.
|
"""Read all entries from the trace file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -213,6 +213,30 @@ class TestWebResearcher:
|
||||||
assert result.cost_metadata.tokens_used > 0
|
assert result.cost_metadata.tokens_used > 0
|
||||||
assert result.trace_id is not None
|
assert result.trace_id is not None
|
||||||
|
|
||||||
|
# Issue #54 (a): full result is persisted next to the trace
|
||||||
|
from pathlib import Path
|
||||||
|
result_file = Path(tmp) / f"{result.trace_id}.result.json"
|
||||||
|
assert result_file.exists()
|
||||||
|
persisted = ResearchResult.model_validate_json(
|
||||||
|
result_file.read_text()
|
||||||
|
)
|
||||||
|
assert persisted.answer == result.answer
|
||||||
|
assert len(persisted.gaps) == 1
|
||||||
|
assert persisted.gaps[0].topic == "pest management"
|
||||||
|
|
||||||
|
# Issue #54 (b): per-item events are emitted in the trace
|
||||||
|
trace_file = Path(tmp) / f"{result.trace_id}.jsonl"
|
||||||
|
entries = [
|
||||||
|
json.loads(l) for l in trace_file.read_text().splitlines() if l
|
||||||
|
]
|
||||||
|
actions = [e["action"] for e in entries]
|
||||||
|
assert "gap_recorded" in actions
|
||||||
|
assert "citation_recorded" in actions
|
||||||
|
assert "discovery_recorded" in actions
|
||||||
|
gap_event = next(e for e in entries if e["action"] == "gap_recorded")
|
||||||
|
assert gap_event["category"] == "source_not_found"
|
||||||
|
assert gap_event["topic"] == "pest management"
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_budget_exhaustion(self):
|
async def test_budget_exhaustion(self):
|
||||||
"""Test that the loop stops when token budget is reached."""
|
"""Test that the loop stops when token budget is reached."""
|
||||||
|
|
|
||||||
|
|
@@ -180,6 +180,74 @@ class TestReplayCommand:
|
||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
assert "invalid JSON" in result.output
|
assert "invalid JSON" in result.output
|
||||||
|
|
||||||
|
def test_replay_renders_persisted_result(self, tmp_path):
    """Issue #54: replay loads <id>.result.json sibling and renders it."""
    runner = CliRunner()
    self._write_trace(tmp_path)
    # Full ResearchResult payload: one citation, one gap, plus the
    # confidence/cost metadata the model requires to validate.
    result_payload = {
        "answer": "Test answer about Utah crops.",
        "citations": [
            {
                "source": "web",
                "locator": "https://example.com/utah",
                "title": "Utah Guide",
                "snippet": None,
                "raw_excerpt": "raw excerpt content",
                "confidence": 0.9,
            }
        ],
        "gaps": [
            {
                "topic": "irrigation",
                "category": "scope_exceeded",
                "detail": "out of scope",
            }
        ],
        "discovery_events": [],
        "open_questions": [],
        "confidence": 0.8,
        "confidence_factors": {
            "num_corroborating_sources": 2,
            "source_authority": "high",
            "contradiction_detected": False,
            "query_specificity_match": 0.8,
            "budget_exhausted": False,
            "recency": "current",
        },
        "cost_metadata": {
            "tokens_used": 1000,
            "iterations_run": 2,
            "wall_time_sec": 12.5,
            "budget_exhausted": False,
            "model_id": "claude-test",
        },
        "trace_id": "trace-xyz",
    }
    import json as _j

    (tmp_path / "trace-xyz.result.json").write_text(_j.dumps(result_payload))

    result = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    assert result.exit_code == 0, result.output
    # Step log still rendered
    assert "search" in result.output
    # Persisted result also rendered
    assert "Test answer about Utah crops" in result.output
    assert "scope_exceeded" in result.output
    assert "irrigation" in result.output
|
||||||
|
|
||||||
|
def test_replay_without_result_file_notes_absence(self, tmp_path):
    """Issue #54: replay notes when no sibling .result.json exists."""
    runner = CliRunner()
    self._write_trace(tmp_path)
    result = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    assert result.exit_code == 0
    assert "No persisted result file" in result.output
|
||||||
|
|
||||||
def test_render_trace_empty(self):
|
def test_render_trace_empty(self):
|
||||||
console = Console(record=True, width=120)
|
console = Console(record=True, width=120)
|
||||||
render_trace([], "empty-trace", console)
|
render_trace([], "empty-trace", console)
|
||||||
|
|
|
||||||
|
|
@@ -34,6 +34,27 @@ class TestTraceLogger:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
logger = self._make_logger(tmp, trace_id="test-123")
|
logger = self._make_logger(tmp, trace_id="test-123")
|
||||||
assert str(logger.file_path).endswith("test-123.jsonl")
|
assert str(logger.file_path).endswith("test-123.jsonl")
|
||||||
|
assert str(logger.result_path).endswith("test-123.result.json")
|
||||||
|
|
||||||
|
def test_write_result_persists_pydantic_model(self):
    """write_result serializes via model_dump_json when available."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="result-test")

        # Minimal pydantic stand-in: only the duck-typed hook matters.
        class Stub:
            def model_dump_json(self, indent=None):
                return '{"answer": "hi", "gaps": []}'

        logger.write_result(Stub())
        assert logger.result_path.exists()
        data = json.loads(logger.result_path.read_text())
        assert data["answer"] == "hi"
        assert data["gaps"] == []
|
||||||
|
|
||||||
|
def test_write_result_accepts_dict(self):
    """write_result falls back to json.dumps for plain dicts."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="dict-test")
        logger.write_result({"foo": "bar"})
        assert json.loads(logger.result_path.read_text()) == {"foo": "bar"}
|
||||||
|
|
||||||
def test_log_step_creates_file(self):
|
def test_log_step_creates_file(self):
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue