fix(observability): persist full ResearchResult and per-item trace events

Closes #54.

The JSONL trace previously stored only counts on the `complete` event
(gap_count, citation_count, discovery_count). Replay could re-render the
step log but could not recover which gaps fired or which sources were
cited, blocking M3.2/M3.3 stress-testing and calibration work.

Two complementary fixes:

1. (a) TraceLogger.write_result() dumps the pydantic ResearchResult to
   `<trace_id>.result.json` next to the JSONL trace. The agent calls it
   right before emitting the `complete` step. `cli replay` now loads the
   sibling result file when present and renders the structured tables
   under the trace step log.

2. (b) The agent emits one `gap_recorded`, `citation_recorded`, or
   `discovery_recorded` trace event per item from the final result. This
   gives the JSONL stream a queryable timeline of what was kept, with
   categories and topics in-band, without needing to load the result
   sibling.

Tests: 4 added (127 total passing). Smoke-tested live with a real query;
both files written and replay rendering verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jeff Smith 2026-04-08 19:27:33 -06:00
parent 5ea6879ba0
commit 1203b07248
6 changed files with 192 additions and 0 deletions

View file

@ -302,6 +302,27 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None:
render_trace(entries, trace_id, console)
# Issue #54: if the agent persisted a sibling .result.json, render
# the full structured ResearchResult underneath the step log so
# replay can show which gaps fired, which sources were cited, etc.
result_path = path.parent / f"{trace_id}.result.json"
if result_path.exists():
try:
result = ResearchResult.model_validate_json(
result_path.read_text(encoding="utf-8")
)
except Exception as exc:
console.print(
f"[yellow]warning:[/yellow] could not parse {result_path.name}: {exc}"
)
else:
console.print()
render_result(result, console)
else:
console.print(
"[dim]No persisted result file alongside this trace.[/dim]"
)
# ---------------------------------------------------------------------------
# costs command

View file

@ -351,6 +351,44 @@ class WebResearcher:
tokens_input += synth_in
tokens_output += synth_out
# Issue #54 (b): emit one trace event per gap/citation/discovery so
# the JSONL stream contains the actual categories alongside the
# existing summary counts. Cheap and gives us a queryable timeline.
for c in result.citations:
trace.log_step(
"citation_recorded",
decision="Citation kept in final result",
source=c.source,
locator=c.locator,
title=c.title,
confidence=c.confidence,
)
for g in result.gaps:
trace.log_step(
"gap_recorded",
decision="Gap surfaced in final result",
category=g.category.value,
topic=g.topic,
detail=g.detail,
)
for d in result.discovery_events:
trace.log_step(
"discovery_recorded",
decision="Discovery event surfaced in final result",
type=d.type,
suggested_researcher=d.suggested_researcher,
query=d.query,
reason=d.reason,
)
# Issue #54 (a): persist the full ResearchResult next to the trace
# so replay and downstream analysis can recover the structured
# contract, not just counts.
try:
trace.write_result(result)
except Exception as write_err:
log.warning("trace_result_write_failed", error=str(write_err))
trace.log_step(
"complete",
decision="Research complete",

View file

@ -79,6 +79,7 @@ class TraceLogger:
)
self.trace_dir.mkdir(parents=True, exist_ok=True)
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
self.result_path = self.trace_dir / f"{self.trace_id}.result.json"
self._step_counter = 0
self._file = None
# action_name -> monotonic start time, populated by starter
@ -148,6 +149,25 @@ class TraceLogger:
return entry
def write_result(self, result: Any) -> None:
    """Persist the final ResearchResult JSON next to the trace.

    Issue #54: the JSONL trace records step events and final counts
    only. Without the structured result on disk, replay can't show
    which gaps fired or which sources were cited, and downstream
    analysis (M3.2/M3.3) is impossible. We dump the pydantic model
    to ``<trace_id>.result.json`` so the full contract survives the
    process.
    """
    # Duck-type on model_dump_json rather than importing the models
    # module: keeps the logger decoupled and avoids a circular import.
    # Plain dicts (or anything else) fall back to json.dumps.
    if not hasattr(result, "model_dump_json"):
        serialized = json.dumps(result, indent=2, default=str)
    else:
        serialized = result.model_dump_json(indent=2)
    self.result_path.write_text(serialized, encoding="utf-8")
def read_entries(self) -> list[dict]:
"""Read all entries from the trace file.

View file

@ -213,6 +213,30 @@ class TestWebResearcher:
assert result.cost_metadata.tokens_used > 0
assert result.trace_id is not None
# Issue #54 (a): full result is persisted next to the trace
from pathlib import Path
result_file = Path(tmp) / f"{result.trace_id}.result.json"
assert result_file.exists()
persisted = ResearchResult.model_validate_json(
result_file.read_text()
)
assert persisted.answer == result.answer
assert len(persisted.gaps) == 1
assert persisted.gaps[0].topic == "pest management"
# Issue #54 (b): per-item events are emitted in the trace
trace_file = Path(tmp) / f"{result.trace_id}.jsonl"
entries = [
json.loads(l) for l in trace_file.read_text().splitlines() if l
]
actions = [e["action"] for e in entries]
assert "gap_recorded" in actions
assert "citation_recorded" in actions
assert "discovery_recorded" in actions
gap_event = next(e for e in entries if e["action"] == "gap_recorded")
assert gap_event["category"] == "source_not_found"
assert gap_event["topic"] == "pest management"
@pytest.mark.asyncio
async def test_budget_exhaustion(self):
"""Test that the loop stops when token budget is reached."""

View file

@ -180,6 +180,74 @@ class TestReplayCommand:
assert result.exit_code == 1
assert "invalid JSON" in result.output
def test_replay_renders_persisted_result(self, tmp_path):
    """Issue #54: replay loads <id>.result.json sibling and renders it."""
    import json as _j

    runner = CliRunner()
    self._write_trace(tmp_path)
    # A minimal-but-complete ResearchResult payload, matching the
    # pydantic contract so replay's model_validate_json accepts it.
    payload = {
        "answer": "Test answer about Utah crops.",
        "citations": [
            {
                "source": "web",
                "locator": "https://example.com/utah",
                "title": "Utah Guide",
                "snippet": None,
                "raw_excerpt": "raw excerpt content",
                "confidence": 0.9,
            }
        ],
        "gaps": [
            {
                "topic": "irrigation",
                "category": "scope_exceeded",
                "detail": "out of scope",
            }
        ],
        "discovery_events": [],
        "open_questions": [],
        "confidence": 0.8,
        "confidence_factors": {
            "num_corroborating_sources": 2,
            "source_authority": "high",
            "contradiction_detected": False,
            "query_specificity_match": 0.8,
            "budget_exhausted": False,
            "recency": "current",
        },
        "cost_metadata": {
            "tokens_used": 1000,
            "iterations_run": 2,
            "wall_time_sec": 12.5,
            "budget_exhausted": False,
            "model_id": "claude-test",
        },
        "trace_id": "trace-xyz",
    }
    sibling = tmp_path / "trace-xyz.result.json"
    sibling.write_text(_j.dumps(payload))
    outcome = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    assert outcome.exit_code == 0, outcome.output
    # The ordinary step log still renders first...
    assert "search" in outcome.output
    # ...and the persisted structured result renders beneath it.
    assert "Test answer about Utah crops" in outcome.output
    assert "scope_exceeded" in outcome.output
    assert "irrigation" in outcome.output
def test_replay_without_result_file_notes_absence(self, tmp_path):
    """Replay degrades gracefully when no sibling result file exists."""
    runner = CliRunner()
    self._write_trace(tmp_path)
    outcome = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    # Still a success: the absence note is informational, not an error.
    assert outcome.exit_code == 0
    assert "No persisted result file" in outcome.output
def test_render_trace_empty(self):
console = Console(record=True, width=120)
render_trace([], "empty-trace", console)

View file

@ -34,6 +34,27 @@ class TestTraceLogger:
with tempfile.TemporaryDirectory() as tmp:
logger = self._make_logger(tmp, trace_id="test-123")
assert str(logger.file_path).endswith("test-123.jsonl")
assert str(logger.result_path).endswith("test-123.result.json")
def test_write_result_persists_pydantic_model(self):
    """write_result() uses model_dump_json for pydantic-like objects."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="result-test")

        # Anything exposing model_dump_json should be dumped via it.
        class FakeModel:
            def model_dump_json(self, indent=None):
                return '{"answer": "hi", "gaps": []}'

        logger.write_result(FakeModel())
        assert logger.result_path.exists()
        persisted = json.loads(logger.result_path.read_text())
        assert persisted["answer"] == "hi"
        assert persisted["gaps"] == []
def test_write_result_accepts_dict(self):
    """write_result() falls back to json.dumps for plain dicts."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="dict-test")
        logger.write_result({"foo": "bar"})
        round_tripped = json.loads(logger.result_path.read_text())
        assert round_tripped == {"foo": "bar"}
def test_log_step_creates_file(self):
with tempfile.TemporaryDirectory() as tmp: