From 7cb3fde90e762ad5b045f082bfa944c58a0906e6 Mon Sep 17 00:00:00 2001
From: Jeff Smith <jeff@unbiasedgeek.com>
Date: Wed, 8 Apr 2026 14:29:27 -0600
Subject: [PATCH] M1.3: Inner agent loop with tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WebResearcher — the core agentic research loop:
- Tool-use loop: Claude decides when to search (Tavily) and fetch (httpx)
- Budget enforcement: stops at max_iterations or token_budget
- Synthesis step: separate LLM call produces structured ResearchResult JSON
- Fallback: valid ResearchResult even when synthesis JSON is unparseable
- Full trace logging at every step (start, search, fetch, synthesis, complete)
- Populates all contract fields: raw_excerpt, categorized gaps,
  discovery_events, confidence_factors, cost_metadata with model_id

9 tests: complete research loop, budget exhaustion, synthesis failure
fallback, trace file creation, fetch_url tool integration, search
result formatting.

Refs: archeious/marchwarden#1

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 researchers/web/agent.py | 601 +++++++++++++++++++++++++++++++++++++++
 tests/test_agent.py      | 366 ++++++++++++++++++++++++
 2 files changed, 967 insertions(+)
 create mode 100644 researchers/web/agent.py
 create mode 100644 tests/test_agent.py

diff --git a/researchers/web/agent.py b/researchers/web/agent.py
new file mode 100644
index 0000000..dbb2e59
--- /dev/null
+++ b/researchers/web/agent.py
@@ -0,0 +1,601 @@
+"""Web researcher agent — the inner agentic loop.
+
+Takes a question, runs a plan→search→fetch→iterate→synthesize loop
+using Claude as the reasoning engine and Tavily/httpx as tools.
+Returns a ResearchResult conforming to the v1 contract.
+"""
+
+import asyncio
+import json
+import time
+from typing import Optional
+
+from anthropic import Anthropic
+
+from researchers.web.models import (
+    Citation,
+    ConfidenceFactors,
+    CostMetadata,
+    DiscoveryEvent,
+    Gap,
+    GapCategory,
+    ResearchConstraints,
+    ResearchResult,
+)
+from researchers.web.tools import SearchResult, fetch_url, tavily_search
+from researchers.web.trace import TraceLogger
+
+SYSTEM_PROMPT = """\
+You are a Marchwarden — a research specialist stationed at the frontier of knowledge. \
+Your job is to investigate a question thoroughly using web search and URL fetching, \
+then produce a grounded, evidence-based answer.
+
+## Your process
+
+1. **Plan**: Decide what to search for. Break complex questions into sub-queries.
+2. **Search**: Use the web_search tool to find relevant sources.
+3. **Fetch**: Use the fetch_url tool to get full content from promising URLs.
+4. **Iterate**: If you don't have enough evidence, search again with refined queries.
+5. **Stop**: When you have sufficient evidence OR you've exhausted your budget.
+
+## Rules
+
+- Every claim must be traceable to a source you actually fetched.
+- If you can't find information, say so — never fabricate.
+- If sources contradict each other, note the contradiction.
+- If the question requires expertise outside web search (academic papers, databases, \
+legal documents), note it as a discovery for another researcher.
+- Be efficient. Don't fetch URLs that are clearly irrelevant from their title/snippet.
+- Prefer authoritative sources (.gov, .edu, established organizations) over blogs/forums.
+"""
+
+SYNTHESIS_PROMPT = """\
+Based on the evidence gathered, produce a structured research result as JSON.
+
+## Evidence gathered
+{evidence}
+
+## Original question
+{question}
+
+## Context from caller
+{context}
+
+## Instructions
+
+Produce a JSON object with these exact fields:
+
+{{
+  "answer": "Your synthesized answer. Every claim must trace to a citation.",
+  "citations": [
+    {{
+      "source": "web",
+      "locator": "the exact URL",
+      "title": "page title",
+      "snippet": "your 50-200 char summary of why this source is relevant",
+      "raw_excerpt": "verbatim 100-500 char excerpt from the source that supports your claim",
+      "confidence": 0.0-1.0
+    }}
+  ],
+  "gaps": [
+    {{
+      "topic": "what wasn't resolved",
+      "category": "source_not_found|access_denied|budget_exhausted|contradictory_sources|scope_exceeded",
+      "detail": "human-readable explanation"
+    }}
+  ],
+  "discovery_events": [
+    {{
+      "type": "related_research|new_source|contradiction",
+      "suggested_researcher": "arxiv|database|legal|null",
+      "query": "suggested query for that researcher",
+      "reason": "why this matters",
+      "source_locator": "URL where you found this, or null"
+    }}
+  ],
+  "confidence": 0.0-1.0,
+  "confidence_factors": {{
+    "num_corroborating_sources": 0,
+    "source_authority": "high|medium|low",
+    "contradiction_detected": false,
+    "query_specificity_match": 0.0-1.0,
+    "budget_exhausted": false,
+    "recency": "current|recent|dated|null"
+  }}
+}}
+
+Respond with ONLY the JSON object, no markdown fences, no explanation.
+"""
+
+# Tool definitions for Claude's tool_use API
+TOOLS = [
+    {
+        "name": "web_search",
+        "description": (
+            "Search the web for information. Returns titles, URLs, snippets, "
+            "and sometimes full page content. Use this to find sources."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "query": {
+                    "type": "string",
+                    "description": "The search query.",
+                },
+                "max_results": {
+                    "type": "integer",
+                    "description": "Number of results (1-10). Default 5.",
+                    "default": 5,
+                },
+            },
+            "required": ["query"],
+        },
+    },
+    {
+        "name": "fetch_url",
+        "description": (
+            "Fetch the full text content of a URL. Use this when a search result "
+            "looks promising but the snippet isn't enough. Returns extracted text."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to fetch.",
+                },
+            },
+            "required": ["url"],
+        },
+    },
+]
+
+
+class WebResearcher:
+    """Agentic web researcher that searches, fetches, and synthesizes."""
+
+    def __init__(
+        self,
+        anthropic_api_key: str,
+        tavily_api_key: str,
+        model_id: str = "claude-sonnet-4-5-20250514",
+        trace_dir: Optional[str] = None,
+    ):
+        self.client = Anthropic(api_key=anthropic_api_key)
+        self.tavily_api_key = tavily_api_key
+        self.model_id = model_id
+        self.trace_dir = trace_dir
+
+    async def research(
+        self,
+        question: str,
+        context: Optional[str] = None,
+        depth: str = "balanced",
+        constraints: Optional[ResearchConstraints] = None,
+    ) -> ResearchResult:
+        """Run a full research loop on a question.
+
+        Args:
+            question: The question to investigate.
+            context: What the caller already knows (optional).
+            depth: "shallow", "balanced", or "deep".
+            constraints: Budget and iteration limits.
+
+        Returns:
+            A ResearchResult conforming to the v1 contract.
+        """
+        constraints = constraints or ResearchConstraints()
+        trace = TraceLogger(trace_dir=self.trace_dir)
+        start_time = time.time()
+        total_tokens = 0
+        iterations = 0
+        evidence: list[dict] = []
+        budget_exhausted = False
+
+        trace.log_step(
+            "start",
+            decision=f"Beginning research: depth={depth}",
+            question=question,
+            context=context or "",
+            max_iterations=constraints.max_iterations,
+            token_budget=constraints.token_budget,
+        )
+
+        # Build initial message
+        user_message = f"Research this question: {question}"
+        if context:
+            user_message += f"\n\nContext from the caller: {context}"
+        user_message += f"\n\nResearch depth: {depth}"
+
+        messages = [{"role": "user", "content": user_message}]
+
+        # --- Tool-use loop ---
+        while iterations < constraints.max_iterations:
+            iterations += 1
+
+            trace.log_step(
+                "iteration_start",
+                decision=f"Starting iteration {iterations}/{constraints.max_iterations}",
+                tokens_so_far=total_tokens,
+            )
+
+            response = self.client.messages.create(
+                model=self.model_id,
+                max_tokens=4096,
+                system=SYSTEM_PROMPT,
+                messages=messages,
+                tools=TOOLS,
+            )
+
+            # Track tokens
+            total_tokens += response.usage.input_tokens + response.usage.output_tokens
+
+            # Check if the model wants to use tools
+            tool_calls = [b for b in response.content if b.type == "tool_use"]
+
+            if not tool_calls:
+                # Model is done researching — extract any final text
+                text_blocks = [b.text for b in response.content if b.type == "text"]
+                if text_blocks:
+                    trace.log_step(
+                        "agent_message",
+                        decision="Agent finished tool use",
+                        message=text_blocks[0][:500],
+                    )
+                break
+
+            # Process each tool call
+            tool_results = []
+            for tool_call in tool_calls:
+                result_content = await self._execute_tool(
+                    tool_call.name,
+                    tool_call.input,
+                    evidence,
+                    trace,
+                    constraints,
+                )
+                tool_results.append(
+                    {
+                        "type": "tool_result",
+                        "tool_use_id": tool_call.id,
+                        "content": result_content,
+                    }
+                )
+
+            # Append assistant response + tool results to conversation
+            messages.append({"role": "assistant", "content": response.content})
+            messages.append({"role": "user", "content": tool_results})
+
+            # Check token budget
+            if total_tokens >= constraints.token_budget:
+                budget_exhausted = True
+                trace.log_step(
+                    "budget_exhausted",
+                    decision=f"Token budget reached: {total_tokens}/{constraints.token_budget}",
+                )
+                break
+
+        # --- Synthesis step ---
+        trace.log_step(
+            "synthesis_start",
+            decision="Beginning synthesis of gathered evidence",
+            evidence_count=len(evidence),
+            iterations_run=iterations,
+            tokens_used=total_tokens,
+        )
+
+        result = await self._synthesize(
+            question=question,
+            context=context,
+            evidence=evidence,
+            trace=trace,
+            total_tokens=total_tokens,
+            iterations=iterations,
+            start_time=start_time,
+            budget_exhausted=budget_exhausted,
+        )
+
+        trace.log_step(
+            "complete",
+            decision="Research complete",
+            confidence=result.confidence,
+            citation_count=len(result.citations),
+            gap_count=len(result.gaps),
+            discovery_count=len(result.discovery_events),
+        )
+        trace.close()
+
+        return result
+
+    async def _execute_tool(
+        self,
+        tool_name: str,
+        tool_input: dict,
+        evidence: list[dict],
+        trace: TraceLogger,
+        constraints: ResearchConstraints,
+    ) -> str:
+        """Execute a tool call and return the result as a string."""
+
+        if tool_name == "web_search":
+            query = tool_input.get("query", "")
+            max_results = min(
+                tool_input.get("max_results", 5),
+                constraints.max_sources,
+            )
+
+            trace.log_step(
+                "web_search",
+                decision=f"Searching: {query}",
+                query=query,
+                max_results=max_results,
+            )
+
+            results = tavily_search(
+                api_key=self.tavily_api_key,
+                query=query,
+                max_results=max_results,
+            )
+
+            # Store evidence
+            for r in results:
+                ev = {
+                    "type": "search_result",
+                    "url": r.url,
+                    "title": r.title,
+                    "content": r.content,
+                    "raw_content": r.raw_content,
+                    "content_hash": r.content_hash,
+                    "score": r.score,
+                }
+                evidence.append(ev)
+
+            trace.log_step(
+                "web_search_complete",
+                decision=f"Got {len(results)} results",
+                result_count=len(results),
+                urls=[r.url for r in results],
+            )
+
+            # Return results as text for the LLM
+            return _format_search_results(results)
+
+        elif tool_name == "fetch_url":
+            url = tool_input.get("url", "")
+
+            trace.log_step(
+                "fetch_url",
+                decision=f"Fetching: {url}",
+                url=url,
+            )
+
+            result = await fetch_url(url)
+
+            trace.log_step(
+                "fetch_url_complete",
+                decision="Fetch succeeded" if result.success else f"Fetch failed: {result.error}",
+                url=url,
+                content_hash=result.content_hash,
+                content_length=result.content_length,
+                success=result.success,
+            )
+
+            if result.success:
+                # Store evidence
+                evidence.append(
+                    {
+                        "type": "fetched_page",
+                        "url": url,
+                        "content": result.text[:10000],
+                        "content_hash": result.content_hash,
+                        "content_length": result.content_length,
+                    }
+                )
+                # Return truncated text for the LLM
+                return result.text[:8000]
+            else:
+                return f"Failed to fetch URL: {result.error}"
+
+        return f"Unknown tool: {tool_name}"
+
+    async def _synthesize(
+        self,
+        question: str,
+        context: Optional[str],
+        evidence: list[dict],
+        trace: TraceLogger,
+        total_tokens: int,
+        iterations: int,
+        start_time: float,
+        budget_exhausted: bool,
+    ) -> ResearchResult:
+        """Ask the LLM to synthesize evidence into a ResearchResult."""
+
+        # Format evidence for the synthesis prompt
+        evidence_text = ""
+        for i, ev in enumerate(evidence, 1):
+            if ev["type"] == "search_result":
+                content = ev.get("raw_content") or ev.get("content", "")
+                evidence_text += (
+                    f"\n--- Source {i} (search result) ---\n"
+                    f"URL: {ev['url']}\n"
+                    f"Title: {ev['title']}\n"
+                    f"Content hash: {ev['content_hash']}\n"
+                    f"Content: {content[:3000]}\n"
+                )
+            elif ev["type"] == "fetched_page":
+                evidence_text += (
+                    f"\n--- Source {i} (fetched page) ---\n"
+                    f"URL: {ev['url']}\n"
+                    f"Content hash: {ev['content_hash']}\n"
+                    f"Content: {ev['content'][:3000]}\n"
+                )
+
+        prompt = SYNTHESIS_PROMPT.format(
+            evidence=evidence_text or "(No evidence gathered)",
+            question=question,
+            context=context or "(No additional context)",
+        )
+
+        response = self.client.messages.create(
+            model=self.model_id,
+            max_tokens=4096,
+            messages=[{"role": "user", "content": prompt}],
+        )
+
+        total_tokens += response.usage.input_tokens + response.usage.output_tokens
+        wall_time = time.time() - start_time
+
+        # Parse the JSON response
+        raw_text = response.content[0].text.strip()
+        # Strip markdown fences if the model added them despite instructions
+        if raw_text.startswith("```"):
+            raw_text = raw_text.split("\n", 1)[1] if "\n" in raw_text else raw_text[3:]
+        if raw_text.endswith("```"):
+            raw_text = raw_text[:-3].strip()
+
+        try:
+            data = json.loads(raw_text)
+        except json.JSONDecodeError:
+            trace.log_step(
+                "synthesis_error",
+                decision="Failed to parse synthesis JSON, returning fallback",
+                raw_response=raw_text[:1000],
+            )
+            return self._fallback_result(
+                question, evidence, trace, total_tokens, iterations,
+                wall_time, budget_exhausted,
+            )
+
+        trace.log_step(
+            "synthesis_complete",
+            decision="Parsed synthesis JSON successfully",
+        )
+
+        # Build the ResearchResult from parsed JSON
+        try:
+            citations = [
+                Citation(
+                    source=c.get("source", "web"),
+                    locator=c.get("locator", ""),
+                    title=c.get("title"),
+                    snippet=c.get("snippet"),
+                    raw_excerpt=c.get("raw_excerpt", ""),
+                    confidence=c.get("confidence", 0.5),
+                )
+                for c in data.get("citations", [])
+            ]
+
+            gaps = [
+                Gap(
+                    topic=g.get("topic", ""),
+                    category=GapCategory(g.get("category", "source_not_found")),
+                    detail=g.get("detail", ""),
+                )
+                for g in data.get("gaps", [])
+            ]
+
+            discovery_events = [
+                DiscoveryEvent(
+                    type=d.get("type", "related_research"),
+                    suggested_researcher=d.get("suggested_researcher"),
+                    query=d.get("query", ""),
+                    reason=d.get("reason", ""),
+                    source_locator=d.get("source_locator"),
+                )
+                for d in data.get("discovery_events", [])
+            ]
+
+            cf = data.get("confidence_factors", {})
+            confidence_factors = ConfidenceFactors(
+                num_corroborating_sources=cf.get("num_corroborating_sources", 0),
+                source_authority=cf.get("source_authority", "low"),
+                contradiction_detected=cf.get("contradiction_detected", False),
+                query_specificity_match=cf.get("query_specificity_match", 0.5),
+                budget_exhausted=budget_exhausted or cf.get("budget_exhausted", False),
+                recency=cf.get("recency"),
+            )
+
+            return ResearchResult(
+                answer=data.get("answer", "No answer could be synthesized."),
+                citations=citations,
+                gaps=gaps,
+                discovery_events=discovery_events,
+                confidence=data.get("confidence", 0.5),
+                confidence_factors=confidence_factors,
+                cost_metadata=CostMetadata(
+                    tokens_used=total_tokens,
+                    iterations_run=iterations,
+                    wall_time_sec=wall_time,
+                    budget_exhausted=budget_exhausted,
+                    model_id=self.model_id,
+                ),
+                trace_id=trace.trace_id,
+            )
+        except Exception as e:
+            trace.log_step(
+                "synthesis_build_error",
+                decision=f"Failed to build ResearchResult: {e}",
+            )
+            return self._fallback_result(
+                question, evidence, trace, total_tokens, iterations,
+                wall_time, budget_exhausted,
+            )
+
+    def _fallback_result(
+        self,
+        question: str,
+        evidence: list[dict],
+        trace: TraceLogger,
+        total_tokens: int,
+        iterations: int,
+        wall_time: float,
+        budget_exhausted: bool,
+    ) -> ResearchResult:
+        """Produce a minimal valid ResearchResult when synthesis fails."""
+        return ResearchResult(
+            answer=f"Research on '{question}' completed but synthesis failed. {len(evidence)} sources were gathered.",
+            citations=[],
+            gaps=[
+                Gap(
+                    topic="synthesis",
+                    category=GapCategory.BUDGET_EXHAUSTED
+                    if budget_exhausted
+                    else GapCategory.SOURCE_NOT_FOUND,
+                    detail="The synthesis step failed to produce structured output.",
+                )
+            ],
+            discovery_events=[],
+            confidence=0.1,
+            confidence_factors=ConfidenceFactors(
+                num_corroborating_sources=0,
+                source_authority="low",
+                contradiction_detected=False,
+                query_specificity_match=0.0,
+                budget_exhausted=budget_exhausted,
+                recency=None,
+            ),
+            cost_metadata=CostMetadata(
+                tokens_used=total_tokens,
+                iterations_run=iterations,
+                wall_time_sec=wall_time,
+                budget_exhausted=budget_exhausted,
+                model_id=self.model_id,
+            ),
+            trace_id=trace.trace_id,
+        )
+
+
+def _format_search_results(results: list[SearchResult]) -> str:
+    """Format search results as readable text for the LLM."""
+    parts = []
+    for i, r in enumerate(results, 1):
+        content = r.raw_content or r.content
+        parts.append(
+            f"Result {i}:\n"
+            f"  Title: {r.title}\n"
+            f"  URL: {r.url}\n"
+            f"  Relevance: {r.score:.2f}\n"
+            f"  Content: {content[:2000]}\n"
+        )
+    return "\n".join(parts) if parts else "No results found."
diff --git a/tests/test_agent.py b/tests/test_agent.py
new file mode 100644
index 0000000..c3df74d
--- /dev/null
+++ b/tests/test_agent.py
@@ -0,0 +1,366 @@
+"""Tests for the web researcher agent."""
+
+import json
+import tempfile
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch, AsyncMock
+
+import pytest
+
+from researchers.web.agent import WebResearcher, _format_search_results
+from researchers.web.models import ResearchConstraints, ResearchResult
+from researchers.web.tools import SearchResult
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_anthropic_response(content_blocks, input_tokens=100, output_tokens=200):
+    """Build a mock Anthropic messages.create response."""
+    resp = MagicMock()
+    resp.content = content_blocks
+    resp.usage = SimpleNamespace(input_tokens=input_tokens, output_tokens=output_tokens)
+    return resp
+
+
+def _text_block(text):
+    block = MagicMock()
+    block.type = "text"
+    block.text = text
+    return block
+
+
+def _tool_use_block(name, tool_input, tool_id="tool_1"):
+    block = MagicMock()
+    block.type = "tool_use"
+    block.name = name
+    block.input = tool_input
+    block.id = tool_id
+    return block
+
+
+VALID_SYNTHESIS_JSON = json.dumps(
+    {
+        "answer": "Utah is ideal for cool-season crops at high elevation.",
+        "citations": [
+            {
+                "source": "web",
+                "locator": "https://example.com/utah-crops",
+                "title": "Utah Crop Guide",
+                "snippet": "Cool-season crops thrive above 7000 ft.",
+                "raw_excerpt": "In Utah's high-elevation gardens, cool-season vegetables such as peas, lettuce, and potatoes consistently outperform warm-season crops.",
+                "confidence": 0.9,
+            }
+        ],
+        "gaps": [
+            {
+                "topic": "pest management",
+                "category": "source_not_found",
+                "detail": "No pest data found.",
+            }
+        ],
+        "discovery_events": [
+            {
+                "type": "related_research",
+                "suggested_researcher": "database",
+                "query": "Utah soil salinity data",
+                "reason": "Multiple sources reference USU studies",
+                "source_locator": "https://example.com/ref",
+            }
+        ],
+        "confidence": 0.82,
+        "confidence_factors": {
+            "num_corroborating_sources": 3,
+            "source_authority": "high",
+            "contradiction_detected": False,
+            "query_specificity_match": 0.85,
+            "budget_exhausted": False,
+            "recency": "current",
+        },
+    }
+)
+
+
+# ---------------------------------------------------------------------------
+# _format_search_results
+# ---------------------------------------------------------------------------
+
+
+class TestFormatSearchResults:
+    def test_formats_results(self):
+        results = [
+            SearchResult(
+                url="https://example.com",
+                title="Test",
+                content="Short summary",
+                raw_content="Full text here",
+                score=0.95,
+                content_hash="sha256:abc",
+            )
+        ]
+        text = _format_search_results(results)
+        assert "Test" in text
+        assert "https://example.com" in text
+        assert "0.95" in text
+        assert "Full text here" in text
+
+    def test_prefers_raw_content(self):
+        results = [
+            SearchResult(
+                url="https://example.com",
+                title="Test",
+                content="Short",
+                raw_content="Much longer raw content",
+                score=0.9,
+                content_hash="sha256:abc",
+            )
+        ]
+        text = _format_search_results(results)
+        assert "Much longer raw content" in text
+
+    def test_falls_back_to_content(self):
+        results = [
+            SearchResult(
+                url="https://example.com",
+                title="Test",
+                content="Only short content",
+                raw_content=None,
+                score=0.9,
+                content_hash="sha256:abc",
+            )
+        ]
+        text = _format_search_results(results)
+        assert "Only short content" in text
+
+    def test_empty_results(self):
+        assert "No results" in _format_search_results([])
+
+
+# ---------------------------------------------------------------------------
+# WebResearcher — mocked tool loop
+# ---------------------------------------------------------------------------
+
+
+class TestWebResearcher:
+    @pytest.mark.asyncio
+    async def test_simple_research_loop(self):
+        """Test a complete loop: one search → LLM stops → synthesis."""
+        with tempfile.TemporaryDirectory() as tmp:
+            researcher = WebResearcher(
+                anthropic_api_key="fake",
+                tavily_api_key="fake",
+                model_id="claude-test",
+                trace_dir=tmp,
+            )
+
+            # First call: LLM requests a web_search
+            search_response = _make_anthropic_response(
+                [_tool_use_block("web_search", {"query": "Utah crops"})],
+            )
+            # Second call: LLM is done (text only, no tools)
+            done_response = _make_anthropic_response(
+                [_text_block("I have enough information.")],
+            )
+            # Third call: synthesis
+            synthesis_response = _make_anthropic_response(
+                [_text_block(VALID_SYNTHESIS_JSON)],
+            )
+
+            researcher.client.messages.create = MagicMock(
+                side_effect=[search_response, done_response, synthesis_response]
+            )
+
+            with patch("researchers.web.agent.tavily_search") as mock_search:
+                mock_search.return_value = [
+                    SearchResult(
+                        url="https://example.com/utah",
+                        title="Utah Gardening",
+                        content="Cool-season crops work well.",
+                        raw_content="Full content about Utah gardening.",
+                        score=0.95,
+                        content_hash="sha256:abc123",
+                    )
+                ]
+
+                result = await researcher.research(
+                    "What are ideal crops for Utah?",
+                    constraints=ResearchConstraints(max_iterations=3),
+                )
+
+            assert isinstance(result, ResearchResult)
+            assert "Utah" in result.answer
+            assert len(result.citations) == 1
+            assert result.citations[0].locator == "https://example.com/utah-crops"
+            assert result.citations[0].raw_excerpt.startswith("In Utah")
+            assert len(result.gaps) == 1
+            assert result.gaps[0].category == "source_not_found"
+            assert len(result.discovery_events) == 1
+            assert result.confidence == 0.82
+            assert result.confidence_factors.num_corroborating_sources == 3
+            assert result.cost_metadata.model_id == "claude-test"
+            assert result.cost_metadata.tokens_used > 0
+            assert result.trace_id is not None
+
+    @pytest.mark.asyncio
+    async def test_budget_exhaustion(self):
+        """Test that the loop stops when token budget is reached."""
+        with tempfile.TemporaryDirectory() as tmp:
+            researcher = WebResearcher(
+                anthropic_api_key="fake",
+                tavily_api_key="fake",
+                model_id="claude-test",
+                trace_dir=tmp,
+            )
+
+            # Each response uses 600 tokens — budget is 1000
+            search_response = _make_anthropic_response(
+                [_tool_use_block("web_search", {"query": "test"}, "t1")],
+                input_tokens=400,
+                output_tokens=200,
+            )
+            # Second search pushes over budget (600 + 600 = 1200 > 1000)
+            search_response_2 = _make_anthropic_response(
+                [_tool_use_block("web_search", {"query": "test2"}, "t2")],
+                input_tokens=400,
+                output_tokens=200,
+            )
+            synthesis_response = _make_anthropic_response(
+                [_text_block(VALID_SYNTHESIS_JSON)],
+                input_tokens=200,
+                output_tokens=100,
+            )
+
+            researcher.client.messages.create = MagicMock(
+                side_effect=[search_response, search_response_2, synthesis_response]
+            )
+
+            with patch("researchers.web.agent.tavily_search") as mock_search:
+                mock_search.return_value = [
+                    SearchResult(
+                        url="https://example.com",
+                        title="Test",
+                        content="Content",
+                        raw_content=None,
+                        score=0.9,
+                        content_hash="sha256:abc",
+                    )
+                ]
+
+                result = await researcher.research(
+                    "test question",
+                    constraints=ResearchConstraints(
+                        max_iterations=5,
+                        token_budget=1000,
+                    ),
+                )
+
+            assert result.cost_metadata.budget_exhausted is True
+
+    @pytest.mark.asyncio
+    async def test_synthesis_failure_returns_fallback(self):
+        """If synthesis JSON is unparseable, return a valid fallback."""
+        with tempfile.TemporaryDirectory() as tmp:
+            researcher = WebResearcher(
+                anthropic_api_key="fake",
+                tavily_api_key="fake",
+                model_id="claude-test",
+                trace_dir=tmp,
+            )
+
+            # LLM immediately stops (no tools)
+            done_response = _make_anthropic_response(
+                [_text_block("Nothing to search.")],
+            )
+            # Synthesis returns garbage
+            bad_synthesis = _make_anthropic_response(
+                [_text_block("This is not valid JSON at all!!!")],
+            )
+
+            researcher.client.messages.create = MagicMock(
+                side_effect=[done_response, bad_synthesis]
+            )
+
+            result = await researcher.research("test question")
+
+            assert isinstance(result, ResearchResult)
+            assert "synthesis failed" in result.answer.lower()
+            assert result.confidence == 0.1
+            assert len(result.gaps) == 1
+
+    @pytest.mark.asyncio
+    async def test_trace_file_created(self):
+        """Verify trace file is created and has entries."""
+        with tempfile.TemporaryDirectory() as tmp:
+            researcher = WebResearcher(
+                anthropic_api_key="fake",
+                tavily_api_key="fake",
+                model_id="claude-test",
+                trace_dir=tmp,
+            )
+
+            done_response = _make_anthropic_response(
+                [_text_block("Done.")],
+            )
+            synthesis_response = _make_anthropic_response(
+                [_text_block(VALID_SYNTHESIS_JSON)],
+            )
+
+            researcher.client.messages.create = MagicMock(
+                side_effect=[done_response, synthesis_response]
+            )
+
+            result = await researcher.research("test")
+
+            # Check trace file exists
+            from researchers.web.trace import TraceLogger
+
+            trace = TraceLogger(trace_id=result.trace_id, trace_dir=tmp)
+            entries = trace.read_entries()
+            assert len(entries) >= 3  # start, iteration_start, synthesis, complete
+            assert entries[0]["action"] == "start"
+            actions = [e["action"] for e in entries]
+            assert "complete" in actions
+
+    @pytest.mark.asyncio
+    async def test_fetch_url_tool(self):
+        """Test that fetch_url tool calls work in the loop."""
+        with tempfile.TemporaryDirectory() as tmp:
+            researcher = WebResearcher(
+                anthropic_api_key="fake",
+                tavily_api_key="fake",
+                model_id="claude-test",
+                trace_dir=tmp,
+            )
+
+            # LLM requests fetch_url
+            fetch_response = _make_anthropic_response(
+                [_tool_use_block("fetch_url", {"url": "https://example.com/page"})],
+            )
+            done_response = _make_anthropic_response(
+                [_text_block("Got it.")],
+            )
+            synthesis_response = _make_anthropic_response(
+                [_text_block(VALID_SYNTHESIS_JSON)],
+            )
+
+            researcher.client.messages.create = MagicMock(
+                side_effect=[fetch_response, done_response, synthesis_response]
+            )
+
+            with patch("researchers.web.agent.fetch_url") as mock_fetch:
+                from researchers.web.tools import FetchResult
+
+                mock_fetch.return_value = FetchResult(
+                    url="https://example.com/page",
+                    text="Fetched page content about Utah gardening.",
+                    content_hash="sha256:def456",
+                    content_length=42,
+                    success=True,
+                )
+
+                result = await researcher.research("test question")
+
+            assert isinstance(result, ResearchResult)
+            mock_fetch.assert_called_once_with("https://example.com/page")
-- 
2.45.2