fix(observability): persist full ResearchResult and per-item trace events #56
6 changed files with 192 additions and 0 deletions
21
cli/main.py
21
cli/main.py
|
|
@@ -302,6 +302,27 @@ def replay(trace_id: str, trace_dir: Optional[str]) -> None:
|
||||||
|
|
||||||
render_trace(entries, trace_id, console)
|
render_trace(entries, trace_id, console)
|
||||||
|
|
||||||
|
# Issue #54: if the agent persisted a sibling .result.json, render
|
||||||
|
# the full structured ResearchResult underneath the step log so
|
||||||
|
# replay can show which gaps fired, which sources were cited, etc.
|
||||||
|
result_path = path.parent / f"{trace_id}.result.json"
|
||||||
|
if result_path.exists():
|
||||||
|
try:
|
||||||
|
result = ResearchResult.model_validate_json(
|
||||||
|
result_path.read_text(encoding="utf-8")
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
console.print(
|
||||||
|
f"[yellow]warning:[/yellow] could not parse {result_path.name}: {exc}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
console.print()
|
||||||
|
render_result(result, console)
|
||||||
|
else:
|
||||||
|
console.print(
|
||||||
|
"[dim]No persisted result file alongside this trace.[/dim]"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# costs command
|
# costs command
|
||||||
|
|
|
||||||
|
|
@@ -351,6 +351,44 @@ class WebResearcher:
|
||||||
tokens_input += synth_in
|
tokens_input += synth_in
|
||||||
tokens_output += synth_out
|
tokens_output += synth_out
|
||||||
|
|
||||||
|
# Issue #54 (b): emit one trace event per gap/citation/discovery so
|
||||||
|
# the JSONL stream contains the actual categories alongside the
|
||||||
|
# existing summary counts. Cheap and gives us a queryable timeline.
|
||||||
|
for c in result.citations:
|
||||||
|
trace.log_step(
|
||||||
|
"citation_recorded",
|
||||||
|
decision="Citation kept in final result",
|
||||||
|
source=c.source,
|
||||||
|
locator=c.locator,
|
||||||
|
title=c.title,
|
||||||
|
confidence=c.confidence,
|
||||||
|
)
|
||||||
|
for g in result.gaps:
|
||||||
|
trace.log_step(
|
||||||
|
"gap_recorded",
|
||||||
|
decision="Gap surfaced in final result",
|
||||||
|
category=g.category.value,
|
||||||
|
topic=g.topic,
|
||||||
|
detail=g.detail,
|
||||||
|
)
|
||||||
|
for d in result.discovery_events:
|
||||||
|
trace.log_step(
|
||||||
|
"discovery_recorded",
|
||||||
|
decision="Discovery event surfaced in final result",
|
||||||
|
type=d.type,
|
||||||
|
suggested_researcher=d.suggested_researcher,
|
||||||
|
query=d.query,
|
||||||
|
reason=d.reason,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Issue #54 (a): persist the full ResearchResult next to the trace
|
||||||
|
# so replay and downstream analysis can recover the structured
|
||||||
|
# contract, not just counts.
|
||||||
|
try:
|
||||||
|
trace.write_result(result)
|
||||||
|
except Exception as write_err:
|
||||||
|
log.warning("trace_result_write_failed", error=str(write_err))
|
||||||
|
|
||||||
trace.log_step(
|
trace.log_step(
|
||||||
"complete",
|
"complete",
|
||||||
decision="Research complete",
|
decision="Research complete",
|
||||||
|
|
|
||||||
|
|
@@ -79,6 +79,7 @@ class TraceLogger:
|
||||||
)
|
)
|
||||||
self.trace_dir.mkdir(parents=True, exist_ok=True)
|
self.trace_dir.mkdir(parents=True, exist_ok=True)
|
||||||
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
|
self.file_path = self.trace_dir / f"{self.trace_id}.jsonl"
|
||||||
|
self.result_path = self.trace_dir / f"{self.trace_id}.result.json"
|
||||||
self._step_counter = 0
|
self._step_counter = 0
|
||||||
self._file = None
|
self._file = None
|
||||||
# action_name -> monotonic start time, populated by starter
|
# action_name -> monotonic start time, populated by starter
|
||||||
|
|
@@ -148,6 +149,25 @@ class TraceLogger:
|
||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
def write_result(self, result: Any) -> None:
    """Persist the final ResearchResult JSON next to the trace.

    Issue #54: the JSONL trace records step events and final counts
    only. Without the structured result on disk, replay can't show
    which gaps fired or which sources were cited, and downstream
    analysis (M3.2/M3.3) is impossible. We dump the pydantic model
    to ``<trace_id>.result.json`` so the full contract survives the
    process.

    Args:
        result: either a pydantic model (anything exposing
            ``model_dump_json``) or a plain JSON-serializable object.
    """
    # Accept either a pydantic model or a plain dict to keep the
    # logger decoupled from the models module (avoids a circular
    # import path).
    if hasattr(result, "model_dump_json"):
        payload = result.model_dump_json(indent=2)
    else:
        # default=str stringifies non-JSON types (datetimes, enums)
        # rather than raising, matching the best-effort intent here.
        payload = json.dumps(result, indent=2, default=str)
    self.result_path.write_text(payload, encoding="utf-8")
|
||||||
|
|
||||||
def read_entries(self) -> list[dict]:
|
def read_entries(self) -> list[dict]:
|
||||||
"""Read all entries from the trace file.
|
"""Read all entries from the trace file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -213,6 +213,30 @@ class TestWebResearcher:
|
||||||
assert result.cost_metadata.tokens_used > 0
|
assert result.cost_metadata.tokens_used > 0
|
||||||
assert result.trace_id is not None
|
assert result.trace_id is not None
|
||||||
|
|
||||||
|
# Issue #54 (a): full result is persisted next to the trace
|
||||||
|
from pathlib import Path
|
||||||
|
result_file = Path(tmp) / f"{result.trace_id}.result.json"
|
||||||
|
assert result_file.exists()
|
||||||
|
persisted = ResearchResult.model_validate_json(
|
||||||
|
result_file.read_text()
|
||||||
|
)
|
||||||
|
assert persisted.answer == result.answer
|
||||||
|
assert len(persisted.gaps) == 1
|
||||||
|
assert persisted.gaps[0].topic == "pest management"
|
||||||
|
|
||||||
|
# Issue #54 (b): per-item events are emitted in the trace
|
||||||
|
trace_file = Path(tmp) / f"{result.trace_id}.jsonl"
|
||||||
|
entries = [
|
||||||
|
json.loads(l) for l in trace_file.read_text().splitlines() if l
|
||||||
|
]
|
||||||
|
actions = [e["action"] for e in entries]
|
||||||
|
assert "gap_recorded" in actions
|
||||||
|
assert "citation_recorded" in actions
|
||||||
|
assert "discovery_recorded" in actions
|
||||||
|
gap_event = next(e for e in entries if e["action"] == "gap_recorded")
|
||||||
|
assert gap_event["category"] == "source_not_found"
|
||||||
|
assert gap_event["topic"] == "pest management"
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_budget_exhaustion(self):
|
async def test_budget_exhaustion(self):
|
||||||
"""Test that the loop stops when token budget is reached."""
|
"""Test that the loop stops when token budget is reached."""
|
||||||
|
|
|
||||||
|
|
@@ -180,6 +180,74 @@ class TestReplayCommand:
|
||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
assert "invalid JSON" in result.output
|
assert "invalid JSON" in result.output
|
||||||
|
|
||||||
|
def test_replay_renders_persisted_result(self, tmp_path):
    """Issue #54: replay loads <id>.result.json sibling and renders it."""
    runner = CliRunner()
    self._write_trace(tmp_path)
    # Full ResearchResult payload: one citation, one gap, plus the
    # confidence/cost metadata the model requires to validate.
    result_payload = {
        "answer": "Test answer about Utah crops.",
        "citations": [
            {
                "source": "web",
                "locator": "https://example.com/utah",
                "title": "Utah Guide",
                "snippet": None,
                "raw_excerpt": "raw excerpt content",
                "confidence": 0.9,
            }
        ],
        "gaps": [
            {
                "topic": "irrigation",
                "category": "scope_exceeded",
                "detail": "out of scope",
            }
        ],
        "discovery_events": [],
        "open_questions": [],
        "confidence": 0.8,
        "confidence_factors": {
            "num_corroborating_sources": 2,
            "source_authority": "high",
            "contradiction_detected": False,
            "query_specificity_match": 0.8,
            "budget_exhausted": False,
            "recency": "current",
        },
        "cost_metadata": {
            "tokens_used": 1000,
            "iterations_run": 2,
            "wall_time_sec": 12.5,
            "budget_exhausted": False,
            "model_id": "claude-test",
        },
        "trace_id": "trace-xyz",
    }
    import json as _j

    (tmp_path / "trace-xyz.result.json").write_text(_j.dumps(result_payload))

    result = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    assert result.exit_code == 0, result.output
    # Step log still rendered
    assert "search" in result.output
    # Persisted result also rendered
    assert "Test answer about Utah crops" in result.output
    assert "scope_exceeded" in result.output
    assert "irrigation" in result.output
|
||||||
|
|
||||||
|
def test_replay_without_result_file_notes_absence(self, tmp_path):
    """Issue #54: replay notes when no sibling .result.json exists."""
    runner = CliRunner()
    self._write_trace(tmp_path)
    result = runner.invoke(
        cli,
        ["replay", "trace-xyz", "--trace-dir", str(tmp_path)],
    )
    assert result.exit_code == 0
    assert "No persisted result file" in result.output
|
||||||
|
|
||||||
def test_render_trace_empty(self):
|
def test_render_trace_empty(self):
|
||||||
console = Console(record=True, width=120)
|
console = Console(record=True, width=120)
|
||||||
render_trace([], "empty-trace", console)
|
render_trace([], "empty-trace", console)
|
||||||
|
|
|
||||||
|
|
@@ -34,6 +34,27 @@ class TestTraceLogger:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
logger = self._make_logger(tmp, trace_id="test-123")
|
logger = self._make_logger(tmp, trace_id="test-123")
|
||||||
assert str(logger.file_path).endswith("test-123.jsonl")
|
assert str(logger.file_path).endswith("test-123.jsonl")
|
||||||
|
assert str(logger.result_path).endswith("test-123.result.json")
|
||||||
|
|
||||||
|
def test_write_result_persists_pydantic_model(self):
    """write_result serializes via model_dump_json when available."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="result-test")

        # Minimal pydantic stand-in: only the duck-typed hook matters.
        class Stub:
            def model_dump_json(self, indent=None):
                return '{"answer": "hi", "gaps": []}'

        logger.write_result(Stub())
        assert logger.result_path.exists()
        data = json.loads(logger.result_path.read_text())
        assert data["answer"] == "hi"
        assert data["gaps"] == []
|
||||||
|
|
||||||
|
def test_write_result_accepts_dict(self):
    """write_result falls back to json.dumps for plain dicts."""
    with tempfile.TemporaryDirectory() as tmp:
        logger = self._make_logger(tmp, trace_id="dict-test")
        logger.write_result({"foo": "bar"})
        assert json.loads(logger.result_path.read_text()) == {"foo": "bar"}
|
||||||
|
|
||||||
def test_log_step_creates_file(self):
|
def test_log_step_creates_file(self):
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue