Closes #54. The JSONL trace previously stored only counts on the `complete` event (gap_count, citation_count, discovery_count). Replay could re-render the step log but could not recover which gaps fired or which sources were cited, blocking M3.2/M3.3 stress-testing and calibration work. Two complementary fixes: (a) TraceLogger.write_result() dumps the pydantic ResearchResult to `<trace_id>.result.json` next to the JSONL trace. The agent calls it right before emitting the `complete` step. `cli replay` now loads the sibling result file when present and renders the structured tables under the trace step log. (b) The agent emits one `gap_recorded`, `citation_recorded`, or `discovery_recorded` trace event per item from the final result. This gives the JSONL stream a queryable timeline of what was kept, with categories and topics in-band, without needing to load the sibling result file. Tests: 4 added (127 total passing). Smoke-tested live with a real ask; both files written and replay rendering verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
400 lines
15 KiB
Python
400 lines
15 KiB
Python
"""Tests for the web researcher agent."""
|
|
|
|
import json
|
|
import tempfile
|
|
from types import SimpleNamespace
|
|
from unittest.mock import MagicMock, patch, AsyncMock
|
|
|
|
import pytest
|
|
|
|
from researchers.web.agent import WebResearcher, _format_search_results
|
|
from researchers.web.models import ResearchConstraints, ResearchResult
|
|
from researchers.web.tools import SearchResult
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_anthropic_response(content_blocks, input_tokens=100, output_tokens=200):
    """Return a mock standing in for an Anthropic ``messages.create`` response.

    The mock exposes ``content`` (the given blocks, unchanged) and ``usage``
    (a namespace carrying ``input_tokens`` and ``output_tokens``).
    """
    token_usage = SimpleNamespace(
        input_tokens=input_tokens,
        output_tokens=output_tokens,
    )
    # Keyword arguments that are not Mock options are set as attributes.
    return MagicMock(content=content_blocks, usage=token_usage)
|
|
|
|
|
|
def _text_block(text):
    """Return a mock content block of type ``"text"`` carrying *text*."""
    # Keyword arguments that are not Mock options are set as attributes.
    return MagicMock(type="text", text=text)
|
|
|
|
|
|
def _tool_use_block(name, tool_input, tool_id="tool_1"):
    """Return a mock content block of type ``"tool_use"``.

    The block exposes ``name``, ``input`` (set to *tool_input*) and ``id``
    (set to *tool_id*).
    """
    tool_call = MagicMock(type="tool_use", input=tool_input, id=tool_id)
    # ``name`` must be assigned after construction: passing ``name=`` to the
    # Mock constructor names the mock itself instead of setting the attribute.
    tool_call.name = name
    return tool_call
|
|
|
|
|
|
# Payload used wherever a test needs the synthesis step to return parseable
# JSON. Key order is kept as-is so the serialized string stays stable.
_SYNTHESIS_PAYLOAD = {
    "answer": "Utah is ideal for cool-season crops at high elevation.",
    # One citation.
    "citations": [
        {
            "source": "web",
            "locator": "https://example.com/utah-crops",
            "title": "Utah Crop Guide",
            "snippet": "Cool-season crops thrive above 7000 ft.",
            "raw_excerpt": "In Utah's high-elevation gardens, cool-season vegetables such as peas, lettuce, and potatoes consistently outperform warm-season crops.",
            "confidence": 0.9,
        }
    ],
    # One gap.
    "gaps": [
        {
            "topic": "pest management",
            "category": "source_not_found",
            "detail": "No pest data found.",
        }
    ],
    # One discovery event.
    "discovery_events": [
        {
            "type": "related_research",
            "suggested_researcher": "database",
            "query": "Utah soil salinity data",
            "reason": "Multiple sources reference USU studies",
            "source_locator": "https://example.com/ref",
        }
    ],
    # One open question.
    "open_questions": [
        {
            "question": "What is the optimal irrigation schedule for high-elevation potatoes?",
            "context": "Multiple sources mention irrigation is critical but none specify schedules.",
            "priority": "medium",
            "source_locator": "https://example.com/utah-crops",
        }
    ],
    "confidence": 0.82,
    "confidence_factors": {
        "num_corroborating_sources": 3,
        "source_authority": "high",
        "contradiction_detected": False,
        "query_specificity_match": 0.85,
        "budget_exhausted": False,
        "recency": "current",
    },
}

# Serialized form handed to the mocked synthesis response.
VALID_SYNTHESIS_JSON = json.dumps(_SYNTHESIS_PAYLOAD)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _format_search_results
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestFormatSearchResults:
    """Checks on the text that ``_format_search_results`` produces."""

    @staticmethod
    def _hit(content, raw_content, score=0.9):
        """Build one ``SearchResult`` with the fixed URL, title and hash used here."""
        return SearchResult(
            url="https://example.com",
            title="Test",
            content=content,
            raw_content=raw_content,
            score=score,
            content_hash="sha256:abc",
        )

    def test_formats_results(self):
        """Title, URL, score and raw text all show up in the rendered output."""
        hits = [self._hit("Short summary", "Full text here", score=0.95)]
        rendered = _format_search_results(hits)
        assert "Test" in rendered
        assert "https://example.com" in rendered
        assert "0.95" in rendered
        assert "Full text here" in rendered

    def test_prefers_raw_content(self):
        """When ``raw_content`` is set, its text appears in the output."""
        hits = [self._hit("Short", "Much longer raw content")]
        rendered = _format_search_results(hits)
        assert "Much longer raw content" in rendered

    def test_falls_back_to_content(self):
        """With ``raw_content=None``, the ``content`` text appears instead."""
        hits = [self._hit("Only short content", None)]
        rendered = _format_search_results(hits)
        assert "Only short content" in rendered

    def test_empty_results(self):
        """An empty result list renders a message containing "No results"."""
        rendered = _format_search_results([])
        assert "No results" in rendered
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# WebResearcher — mocked tool loop
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWebResearcher:
    """Tests of ``WebResearcher.research`` with the LLM client and tools mocked."""

    @staticmethod
    def _build_researcher(trace_dir):
        """Create a ``WebResearcher`` with fake API keys and the given ``trace_dir``."""
        return WebResearcher(
            anthropic_api_key="fake",
            tavily_api_key="fake",
            model_id="claude-test",
            trace_dir=trace_dir,
        )

    @staticmethod
    def _script_llm(researcher, *responses):
        """Replace ``messages.create`` with a mock yielding *responses* in order."""
        researcher.client.messages.create = MagicMock(side_effect=list(responses))

    @pytest.mark.asyncio
    async def test_simple_research_loop(self):
        """Full loop: one search, the LLM stops, then synthesis.

        Also covers issue #54: the result is persisted beside the trace, and
        per-item events are written to the JSONL trace.
        """
        from pathlib import Path

        with tempfile.TemporaryDirectory() as tmp:
            researcher = self._build_researcher(tmp)

            # Scripted LLM turns, consumed in order:
            #   1. a web_search tool request
            #   2. a text-only reply (no tools), so the loop is done
            #   3. the synthesis JSON
            self._script_llm(
                researcher,
                _make_anthropic_response(
                    [_tool_use_block("web_search", {"query": "Utah crops"})],
                ),
                _make_anthropic_response(
                    [_text_block("I have enough information.")],
                ),
                _make_anthropic_response(
                    [_text_block(VALID_SYNTHESIS_JSON)],
                ),
            )

            search_hit = SearchResult(
                url="https://example.com/utah",
                title="Utah Gardening",
                content="Cool-season crops work well.",
                raw_content="Full content about Utah gardening.",
                score=0.95,
                content_hash="sha256:abc123",
            )

            with patch("researchers.web.agent.tavily_search") as mock_search:
                mock_search.return_value = [search_hit]
                result = await researcher.research(
                    "What are ideal crops for Utah?",
                    constraints=ResearchConstraints(max_iterations=3),
                )

            # The structured result mirrors the synthesis payload.
            assert isinstance(result, ResearchResult)
            assert "Utah" in result.answer

            citations = result.citations
            assert len(citations) == 1
            assert citations[0].locator == "https://example.com/utah-crops"
            assert citations[0].raw_excerpt.startswith("In Utah")

            assert len(result.gaps) == 1
            assert result.gaps[0].category == "source_not_found"
            assert len(result.discovery_events) == 1
            assert len(result.open_questions) == 1
            assert "irrigation" in result.open_questions[0].question

            assert result.confidence == 0.82
            assert result.confidence_factors.num_corroborating_sources == 3
            assert result.cost_metadata.model_id == "claude-test"
            assert result.cost_metadata.tokens_used > 0
            assert result.trace_id is not None

            # Issue #54 (a): full result is persisted next to the trace
            trace_root = Path(tmp)
            result_file = trace_root / f"{result.trace_id}.result.json"
            assert result_file.exists()
            persisted = ResearchResult.model_validate_json(result_file.read_text())
            assert persisted.answer == result.answer
            assert len(persisted.gaps) == 1
            assert persisted.gaps[0].topic == "pest management"

            # Issue #54 (b): per-item events are emitted in the trace
            trace_file = trace_root / f"{result.trace_id}.jsonl"
            entries = [
                json.loads(line)
                for line in trace_file.read_text().splitlines()
                if line
            ]
            actions = [entry["action"] for entry in entries]
            assert "gap_recorded" in actions
            assert "citation_recorded" in actions
            assert "discovery_recorded" in actions

            gap_event = next(
                entry for entry in entries if entry["action"] == "gap_recorded"
            )
            assert gap_event["category"] == "source_not_found"
            assert gap_event["topic"] == "pest management"

    @pytest.mark.asyncio
    async def test_budget_exhaustion(self):
        """Going over the token budget is reported as ``budget_exhausted``."""
        with tempfile.TemporaryDirectory() as tmp:
            researcher = self._build_researcher(tmp)

            # Each search turn reports 400 input + 200 output tokens, so two
            # turns total 1200, above the 1000-token budget set below.
            first_search = _make_anthropic_response(
                [_tool_use_block("web_search", {"query": "test"}, "t1")],
                input_tokens=400,
                output_tokens=200,
            )
            second_search = _make_anthropic_response(
                [_tool_use_block("web_search", {"query": "test2"}, "t2")],
                input_tokens=400,
                output_tokens=200,
            )
            synthesis = _make_anthropic_response(
                [_text_block(VALID_SYNTHESIS_JSON)],
                input_tokens=200,
                output_tokens=100,
            )
            self._script_llm(researcher, first_search, second_search, synthesis)

            search_hit = SearchResult(
                url="https://example.com",
                title="Test",
                content="Content",
                raw_content=None,
                score=0.9,
                content_hash="sha256:abc",
            )
            limits = ResearchConstraints(
                max_iterations=5,
                token_budget=1000,
            )

            with patch("researchers.web.agent.tavily_search") as mock_search:
                mock_search.return_value = [search_hit]
                result = await researcher.research(
                    "test question",
                    constraints=limits,
                )

            assert result.cost_metadata.budget_exhausted is True

    @pytest.mark.asyncio
    async def test_synthesis_failure_returns_fallback(self):
        """Unparseable synthesis output still yields a valid fallback result."""
        with tempfile.TemporaryDirectory() as tmp:
            researcher = self._build_researcher(tmp)

            # Turn 1: the LLM stops immediately without requesting a tool.
            # Turn 2: synthesis answers with text that is not JSON.
            self._script_llm(
                researcher,
                _make_anthropic_response([_text_block("Nothing to search.")]),
                _make_anthropic_response(
                    [_text_block("This is not valid JSON at all!!!")],
                ),
            )

            result = await researcher.research("test question")

            assert isinstance(result, ResearchResult)
            assert "synthesis failed" in result.answer.lower()
            assert result.confidence == 0.1
            assert len(result.gaps) == 1

    @pytest.mark.asyncio
    async def test_trace_file_created(self):
        """A run leaves a readable trace that starts with ``start`` and has ``complete``."""
        from researchers.web.trace import TraceLogger

        with tempfile.TemporaryDirectory() as tmp:
            researcher = self._build_researcher(tmp)
            self._script_llm(
                researcher,
                _make_anthropic_response([_text_block("Done.")]),
                _make_anthropic_response([_text_block(VALID_SYNTHESIS_JSON)]),
            )

            result = await researcher.research("test")

            # Re-open the trace by id and read back what was written.
            trace = TraceLogger(trace_id=result.trace_id, trace_dir=tmp)
            entries = trace.read_entries()

            # The original note names start, iteration_start, synthesis and
            # complete; only a lower bound of three entries is enforced.
            assert len(entries) >= 3
            assert entries[0]["action"] == "start"
            recorded_actions = [entry["action"] for entry in entries]
            assert "complete" in recorded_actions

    @pytest.mark.asyncio
    async def test_fetch_url_tool(self):
        """A ``fetch_url`` tool request is passed to the patched fetcher."""
        from researchers.web.tools import FetchResult

        with tempfile.TemporaryDirectory() as tmp:
            researcher = self._build_researcher(tmp)

            # Scripted turns: a fetch_url request, a text-only stop, synthesis.
            self._script_llm(
                researcher,
                _make_anthropic_response(
                    [_tool_use_block("fetch_url", {"url": "https://example.com/page"})],
                ),
                _make_anthropic_response([_text_block("Got it.")]),
                _make_anthropic_response([_text_block(VALID_SYNTHESIS_JSON)]),
            )

            fetched_page = FetchResult(
                url="https://example.com/page",
                text="Fetched page content about Utah gardening.",
                content_hash="sha256:def456",
                content_length=42,
                success=True,
            )

            with patch("researchers.web.agent.fetch_url") as mock_fetch:
                mock_fetch.return_value = fetched_page
                result = await researcher.research("test question")

            assert isinstance(result, ResearchResult)
            mock_fetch.assert_called_once_with("https://example.com/page")
|