fix(observability): persist full ResearchResult and per-item trace events

Closes #54.

The JSONL trace previously stored only counts on the `complete` event
(gap_count, citation_count, discovery_count). Replay could re-render the
step log but could not recover which gaps fired or which sources were
cited, blocking M3.2/M3.3 stress-testing and calibration work.

Two complementary fixes:

1. (a) TraceLogger.write_result() dumps the pydantic ResearchResult to
   `<trace_id>.result.json` next to the JSONL trace. The agent calls it
   right before emitting the `complete` step. `cli replay` now loads the
   sibling result file when present and renders the structured tables
   under the trace step log.

2. (b) The agent emits one `gap_recorded`, `citation_recorded`, or
   `discovery_recorded` trace event per item from the final result. This
   gives the JSONL stream a queryable timeline of what was kept, with
   categories and topics in-band, without needing to load the result
   sibling.

Tests: 4 added (127 total passing). Smoke-tested live with a real query;
both files written and replay rendering verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jeff Smith 2026-04-08 19:27:33 -06:00
parent 5ea6879ba0
commit 1203b07248
6 changed files with 192 additions and 0 deletions

View file

@ -302,6 +302,27 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None:
render_trace(entries, trace_id, console)
# Issue #54: if the agent persisted a sibling .result.json, render
# the full structured ResearchResult underneath the step log so
# replay can show which gaps fired, which sources were cited, etc.
result_path = path.parent / f"{trace_id}.result.json"
if result_path.exists():
try:
result = ResearchResult.model_validate_json(
result_path.read_text(encoding="utf-8")
)
except Exception as exc:
console.print(
f"[yellow]warning:[/yellow] could not parse {result_path.name}: {exc}"
)
else:
console.print()
render_result(result, console)
else:
console.print(
"[dim]No persisted result file alongside this trace.[/dim]"
)
# ---------------------------------------------------------------------------
# costs command

View file

@ -351,6 +351,44 @@ class WebResearcher:
tokens_input += synth_in
tokens_output += synth_out
# Issue #54 (b): emit one trace event per gap/citation/discovery so
# the JSONL stream contains the actual categories alongside the
# existing summary counts. Cheap and gives us a queryable timeline.
for c in result.citations:
trace.log_step(
"citation_recorded",
decision="Citation kept in final result",
source=c.source,
locator=c.locator,
title=c.title,
confidence=c.confidence,
)
for g in result.gaps:
trace.log_step(
"gap_recorded",
decision="Gap surfaced in final result",
category=g.category.value,
topic=g.topic,
detail=g.detail,
)
for d in result.discovery_events:
trace.log_step(
"discovery_recorded",
decision="Discovery event surfaced in final result",
type=d.type,
suggested_researcher=d.suggested_researcher,
query=d.query,
reason=d.reason,
)
# Issue #54 (a): persist the full ResearchResult next to the trace
# so replay and downstream analysis can recover the structured
# contract, not just counts.
try:
trace.write_result(result)
except Exception as write_err:
log.warning("trace_result_write_failed", error=str(write_err))
trace.log_step(
"complete",
decision="Research complete",

View file

@ -79,6 +79,7 @@ class TraceLogger:
)
self.trace_dir.mkdir(parents=True, exist_ok=True)
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
self.result_path = self.trace_dir / f"{self.trace_id}.result.json"
self._step_counter = 0
self._file = None
# action_name -> monotonic start time, populated by starter
@ -148,6 +149,25 @@ class TraceLogger:
return entry
def write_result(self, result: Any) -> None:
    """Persist the final ResearchResult JSON next to the trace.

    Issue #54: the JSONL trace records step events and final counts
    only. Without the structured result on disk, replay can't show
    which gaps fired or which sources were cited, and downstream
    analysis (M3.2/M3.3) is impossible. We dump the pydantic model
    to ``<trace_id>.result.json`` so the full contract survives the
    process.
    """
    # Duck-type on model_dump_json rather than importing the models
    # module: keeps the logger decoupled and avoids a circular import.
    # Plain dicts (or anything else) fall back to json.dumps.
    if not hasattr(result, "model_dump_json"):
        serialized = json.dumps(result, indent=2, default=str)
    else:
        serialized = result.model_dump_json(indent=2)
    self.result_path.write_text(serialized, encoding="utf-8")
def read_entries(self) -> list[dict]:
"""Read all entries from the trace file.

View file

@ -213,6 +213,30 @@ class TestWebResearcher:
assert result.cost_metadata.tokens_used > 0
assert result.trace_id is not None
# Issue #54 (a): full result is persisted next to the trace
from pathlib import Path
result_file = Path(tmp) / f"{result.trace_id}.result.json"
assert result_file.exists()
persisted = ResearchResult.model_validate_json(
result_file.read_text()
)
assert persisted.answer == result.answer
assert len(persisted.gaps) == 1
assert persisted.gaps[0].topic == "pest management"
# Issue #54 (b): per-item events are emitted in the trace
trace_file = Path(tmp) / f"{result.trace_id}.jsonl"
entries = [
json.loads(l) for l in trace_file.read_text().splitlines() if l
]
actions = [e["action"] for e in entries]
assert "gap_recorded" in actions
assert "citation_recorded" in actions
assert "discovery_recorded" in actions
gap_event = next(e for e in entries if e["action"] == "gap_recorded")
assert gap_event["category"] == "source_not_found"
assert gap_event["topic"] == "pest management"
@pytest.mark.asyncio
async def test_budget_exhaustion(self):
"""Test that the loop stops when token budget is reached."""

View file

@ -180,6 +180,74 @@ class TestReplayCommand:
assert result.exit_code == 1
assert "invalid JSON" in result.output
def test_replay_renders_persisted_result(self, tmp_path):
    """Issue #54: replay loads <id>.result.json sibling and renders it."""
    import json as _j

    runner = CliRunner()
    self._write_trace(tmp_path)
    # A minimal-but-complete ResearchResult payload, matching the
    # pydantic contract so replay's model_validate_json accepts it.
    payload = {
        "answer": "Test answer about Utah crops.",
        "citations": [
            {
                "source": "web",
                "locator": "https://example.com/utah",
                "title": "Utah Guide",
                "snippet": None,
                "raw_excerpt": "raw excerpt content",
                "confidence": 0.9,
            }
        ],
        "gaps": [
            {
                "topic": "irrigation",
                "category": "scope_exceeded",
                "detail": "out of scope",
            }
        ],
        "discovery_events": [],
        "open_questions": [],
        "confidence": 0.8,
        "confidence_factors": {
            "num_corroborating_sources": 2,
            "source_authority": "high",
            "contradiction_detected": False,
            "query_specificity_match": 0.8,
            "budget_exhausted": False,
            "recency": "current",
        },
        "cost_metadata": {
            "tokens_used": 1000,
            "iterations_run": 2,
            "wall_time_sec": 12.5,
            "budget_exhausted": False,
            "model_id": "claude-test",
        },
        "trace_id": "trace-xyz",
    }
    sibling = tmp_path / "trace-xyz.result.json"
    sibling.write_text(_j.dumps(payload))
    outcome = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    assert outcome.exit_code == 0, outcome.output
    # The ordinary step log still renders first...
    assert "search" in outcome.output
    # ...and the persisted structured result renders beneath it.
    assert "Test answer about Utah crops" in outcome.output
    assert "scope_exceeded" in outcome.output
    assert "irrigation" in outcome.output
def test_replay_without_result_file_notes_absence(self, tmp_path):
    """Replay degrades gracefully when no sibling result file exists."""
    runner = CliRunner()
    self._write_trace(tmp_path)
    outcome = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    # Still a success: the absence note is informational, not an error.
    assert outcome.exit_code == 0
    assert "No persisted result file" in outcome.output
def test_render_trace_empty(self):
console = Console(record=True, width=120)
render_trace([], "empty-trace", console)

View file

@ -34,6 +34,27 @@ class TestTraceLogger:
with tempfile.TemporaryDirectory() as tmp:
logger = self._make_logger(tmp, trace_id="test-123")
assert str(logger.file_path).endswith("test-123.jsonl")
assert str(logger.result_path).endswith("test-123.result.json")
def test_write_result_persists_pydantic_model(self):
    """write_result() uses model_dump_json for pydantic-like objects."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="result-test")

        # Anything exposing model_dump_json should be dumped via it.
        class FakeModel:
            def model_dump_json(self, indent=None):
                return '{"answer": "hi", "gaps": []}'

        logger.write_result(FakeModel())
        assert logger.result_path.exists()
        persisted = json.loads(logger.result_path.read_text())
        assert persisted["answer"] == "hi"
        assert persisted["gaps"] == []
def test_write_result_accepts_dict(self):
    """write_result() falls back to json.dumps for plain dicts."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="dict-test")
        logger.write_result({"foo": "bar"})
        round_tripped = json.loads(logger.result_path.read_text())
        assert round_tripped == {"foo": "bar"}
def test_log_step_creates_file(self):
with tempfile.TemporaryDirectory() as tmp: