From 1203b07248197db015f0c3249ecad7e8b1409cf8 Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Wed, 8 Apr 2026 19:27:33 -0600 Subject: [PATCH] fix(observability): persist full ResearchResult and per-item trace events Closes #54. The JSONL trace previously stored only counts on the `complete` event (gap_count, citation_count, discovery_count). Replay could re-render the step log but could not recover which gaps fired or which sources were cited, blocking M3.2/M3.3 stress-testing and calibration work. Two complementary fixes: (a) TraceLogger.write_result() dumps the pydantic ResearchResult to `<trace_id>.result.json` next to the JSONL trace. The agent calls it right before emitting the `complete` step. `cli replay` now loads the sibling result file when present and renders the structured tables under the trace step log. (b) The agent emits one `gap_recorded`, `citation_recorded`, or `discovery_recorded` trace event per item from the final result. This gives the JSONL stream a queryable timeline of what was kept, with categories and topics in-band, without needing to load the result sibling. Tests: 4 added (127 total passing). Smoke-tested live with a real query; both files written and replay rendering verified. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cli/main.py | 21 +++++++++++++ researchers/web/agent.py | 38 ++++++++++++++++++++++ researchers/web/trace.py | 20 ++++++++++++ tests/test_agent.py | 24 ++++++++++++++ tests/test_cli.py | 68 ++++++++++++++++++++++++++++++++++++++++ tests/test_trace.py | 21 +++++++++++++ 6 files changed, 192 insertions(+) diff --git a/cli/main.py b/cli/main.py index 16eafac..eef9ceb 100644 --- a/cli/main.py +++ b/cli/main.py @@ -302,6 +302,27 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None: render_trace(entries, trace_id, console) + # Issue #54: if the agent persisted a sibling .result.json, render + # the full structured ResearchResult underneath the step log so + # replay can show which gaps fired, which sources were cited, etc. + result_path = path.parent / f"{trace_id}.result.json" + if result_path.exists(): + try: + result = ResearchResult.model_validate_json( + result_path.read_text(encoding="utf-8") + ) + except Exception as exc: + console.print( + f"[yellow]warning:[/yellow] could not parse {result_path.name}: {exc}" + ) + else: + console.print() + render_result(result, console) + else: + console.print( + "[dim]No persisted result file alongside this trace.[/dim]" + ) + # --------------------------------------------------------------------------- # costs command diff --git a/researchers/web/agent.py b/researchers/web/agent.py index ba26a77..83ccb8b 100644 --- a/researchers/web/agent.py +++ b/researchers/web/agent.py @@ -351,6 +351,44 @@ class WebResearcher: tokens_input += synth_in tokens_output += synth_out + # Issue #54 (b): emit one trace event per gap/citation/discovery so + # the JSONL stream contains the actual categories alongside the + # existing summary counts. Cheap and gives us a queryable timeline. 
+ for c in result.citations: + trace.log_step( + "citation_recorded", + decision="Citation kept in final result", + source=c.source, + locator=c.locator, + title=c.title, + confidence=c.confidence, + ) + for g in result.gaps: + trace.log_step( + "gap_recorded", + decision="Gap surfaced in final result", + category=g.category.value, + topic=g.topic, + detail=g.detail, + ) + for d in result.discovery_events: + trace.log_step( + "discovery_recorded", + decision="Discovery event surfaced in final result", + type=d.type, + suggested_researcher=d.suggested_researcher, + query=d.query, + reason=d.reason, + ) + + # Issue #54 (a): persist the full ResearchResult next to the trace + # so replay and downstream analysis can recover the structured + # contract, not just counts. + try: + trace.write_result(result) + except Exception as write_err: + log.warning("trace_result_write_failed", error=str(write_err)) + trace.log_step( "complete", decision="Research complete", diff --git a/researchers/web/trace.py b/researchers/web/trace.py index 97c81ae..2a20bbc 100644 --- a/researchers/web/trace.py +++ b/researchers/web/trace.py @@ -79,6 +79,7 @@ class TraceLogger: ) self.trace_dir.mkdir(parents=True, exist_ok=True) self.file_path = self.trace_dir / f"{self.trace_id}.jsonl" + self.result_path = self.trace_dir / f"{self.trace_id}.result.json" self._step_counter = 0 self._file = None # action_name -> monotonic start time, populated by starter @@ -148,6 +149,25 @@ class TraceLogger: return entry + def write_result(self, result: Any) -> None: + """Persist the final ResearchResult JSON next to the trace. + + Issue #54: the JSONL trace records step events and final counts + only. Without the structured result on disk, replay can't show + which gaps fired or which sources were cited, and downstream + analysis (M3.2/M3.3) is impossible. We dump the pydantic model + to ``.result.json`` so the full contract survives the + process. 
+ """ + # Accept either a pydantic model or a plain dict to keep the + # logger decoupled from the models module (avoids a circular + # import path). + if hasattr(result, "model_dump_json"): + payload = result.model_dump_json(indent=2) + else: + payload = json.dumps(result, indent=2, default=str) + self.result_path.write_text(payload, encoding="utf-8") + def read_entries(self) -> list[dict]: """Read all entries from the trace file. diff --git a/tests/test_agent.py b/tests/test_agent.py index 604a5e5..e96437f 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -213,6 +213,30 @@ class TestWebResearcher: assert result.cost_metadata.tokens_used > 0 assert result.trace_id is not None + # Issue #54 (a): full result is persisted next to the trace + from pathlib import Path + result_file = Path(tmp) / f"{result.trace_id}.result.json" + assert result_file.exists() + persisted = ResearchResult.model_validate_json( + result_file.read_text() + ) + assert persisted.answer == result.answer + assert len(persisted.gaps) == 1 + assert persisted.gaps[0].topic == "pest management" + + # Issue #54 (b): per-item events are emitted in the trace + trace_file = Path(tmp) / f"{result.trace_id}.jsonl" + entries = [ + json.loads(l) for l in trace_file.read_text().splitlines() if l + ] + actions = [e["action"] for e in entries] + assert "gap_recorded" in actions + assert "citation_recorded" in actions + assert "discovery_recorded" in actions + gap_event = next(e for e in entries if e["action"] == "gap_recorded") + assert gap_event["category"] == "source_not_found" + assert gap_event["topic"] == "pest management" + @pytest.mark.asyncio async def test_budget_exhaustion(self): """Test that the loop stops when token budget is reached.""" diff --git a/tests/test_cli.py b/tests/test_cli.py index 6ba2c6b..f10f897 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -180,6 +180,74 @@ class TestReplayCommand: assert result.exit_code == 1 assert "invalid JSON" in result.output + def 
test_replay_renders_persisted_result(self, tmp_path): + """Issue #54: replay loads .result.json sibling and renders it.""" + runner = CliRunner() + self._write_trace(tmp_path) + result_payload = { + "answer": "Test answer about Utah crops.", + "citations": [ + { + "source": "web", + "locator": "https://example.com/utah", + "title": "Utah Guide", + "snippet": None, + "raw_excerpt": "raw excerpt content", + "confidence": 0.9, + } + ], + "gaps": [ + { + "topic": "irrigation", + "category": "scope_exceeded", + "detail": "out of scope", + } + ], + "discovery_events": [], + "open_questions": [], + "confidence": 0.8, + "confidence_factors": { + "num_corroborating_sources": 2, + "source_authority": "high", + "contradiction_detected": False, + "query_specificity_match": 0.8, + "budget_exhausted": False, + "recency": "current", + }, + "cost_metadata": { + "tokens_used": 1000, + "iterations_run": 2, + "wall_time_sec": 12.5, + "budget_exhausted": False, + "model_id": "claude-test", + }, + "trace_id": "trace-xyz", + } + import json as _j + (tmp_path / "trace-xyz.result.json").write_text(_j.dumps(result_payload)) + + result = runner.invoke( + cli, + ["replay", "trace-xyz", "--trace-dir", str(tmp_path)], + ) + assert result.exit_code == 0, result.output + # Step log still rendered + assert "search" in result.output + # Persisted result also rendered + assert "Test answer about Utah crops" in result.output + assert "scope_exceeded" in result.output + assert "irrigation" in result.output + + def test_replay_without_result_file_notes_absence(self, tmp_path): + runner = CliRunner() + self._write_trace(tmp_path) + result = runner.invoke( + cli, + ["replay", "trace-xyz", "--trace-dir", str(tmp_path)], + ) + assert result.exit_code == 0 + assert "No persisted result file" in result.output + def test_render_trace_empty(self): console = Console(record=True, width=120) render_trace([], "empty-trace", console) diff --git a/tests/test_trace.py b/tests/test_trace.py index 3c77699..dc015d5 
100644 --- a/tests/test_trace.py +++ b/tests/test_trace.py @@ -34,6 +34,27 @@ class TestTraceLogger: with tempfile.TemporaryDirectory() as tmp: logger = self._make_logger(tmp, trace_id="test-123") assert str(logger.file_path).endswith("test-123.jsonl") + assert str(logger.result_path).endswith("test-123.result.json") + + def test_write_result_persists_pydantic_model(self): + with tempfile.TemporaryDirectory() as tmp: + logger = self._make_logger(tmp, trace_id="result-test") + + class Stub: + def model_dump_json(self, indent=None): + return '{"answer": "hi", "gaps": []}' + + logger.write_result(Stub()) + assert logger.result_path.exists() + data = json.loads(logger.result_path.read_text()) + assert data["answer"] == "hi" + assert data["gaps"] == [] + + def test_write_result_accepts_dict(self): + with tempfile.TemporaryDirectory() as tmp: + logger = self._make_logger(tmp, trace_id="dict-test") + logger.write_result({"foo": "bar"}) + assert json.loads(logger.result_path.read_text()) == {"foo": "bar"} def test_log_step_creates_file(self): with tempfile.TemporaryDirectory() as tmp: