182 lines
5.1 KiB
Python
182 lines
5.1 KiB
Python
|
|
"""Search and fetch tools for the web researcher.
|
||
|
|
|
||
|
|
Two tools:
|
||
|
|
- tavily_search: Web search via Tavily API, returns structured results
|
||
|
|
- fetch_url: Direct URL fetch with content extraction and hashing
|
||
|
|
"""
|
||
|
|
|
||
|
|
import hashlib
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
from tavily import TavilyClient
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class SearchResult:
    """A single result from a Tavily search.

    Mirrors one entry of the Tavily ``search()`` response, plus a
    locally computed hash used for content deduplication.
    """

    url: str  # Result page URL as reported by Tavily
    title: str  # Page title as reported by Tavily
    content: str  # Short extracted summary from Tavily
    raw_content: Optional[str]  # Full page text (may be None)
    score: float  # Tavily relevance score (0.0-1.0)
    content_hash: str  # SHA-256 of the best available content
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class FetchResult:
    """Result of fetching and extracting content from a URL.

    On failure, ``success`` is False, ``error`` describes the cause, and
    the text/hash/length fields hold empty-content placeholders.
    """

    url: str  # The URL that was fetched
    text: str  # Extracted clean text
    content_hash: str  # SHA-256 of the fetched content
    content_length: int  # Length of the extracted text, in characters
    success: bool  # True iff the fetch and extraction completed
    error: Optional[str] = None  # Human-readable failure reason, if any
|
||
|
|
|
||
|
|
|
||
|
|
def _sha256(text: str) -> str:
|
||
|
|
"""Compute SHA-256 hash of text content."""
|
||
|
|
return f"sha256:{hashlib.sha256(text.encode('utf-8')).hexdigest()}"
|
||
|
|
|
||
|
|
|
||
|
|
def tavily_search(
    api_key: str,
    query: str,
    max_results: int = 5,
    include_raw_content: bool = True,
) -> list[SearchResult]:
    """Search the web via Tavily API.

    Args:
        api_key: Tavily API key.
        query: Search query string.
        max_results: Maximum number of results to return.
        include_raw_content: Whether to request full page text from Tavily.

    Returns:
        List of SearchResult objects, sorted by relevance score
        (highest first).
    """
    client = TavilyClient(api_key=api_key)
    response = client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
    )

    results = []
    for item in response.get("results", []):
        raw = item.get("raw_content") or ""
        content = item.get("content", "")
        # Hash the best available content (raw if present, else summary).
        # _sha256 is well-defined on "" too, so no special-casing needed.
        hashable = raw if raw else content
        results.append(
            SearchResult(
                url=item.get("url", ""),
                title=item.get("title", ""),
                content=content,
                raw_content=raw if raw else None,
                score=item.get("score", 0.0),
                content_hash=_sha256(hashable),
            )
        )

    # Enforce the documented ordering instead of trusting the API's
    # response order; the sort is stable, so ties keep API order.
    results.sort(key=lambda r: r.score, reverse=True)
    return results
|
||
|
|
|
||
|
|
|
||
|
|
async def fetch_url(
    url: str,
    timeout: float = 15.0,
    max_length: int = 100_000,
) -> FetchResult:
    """Fetch a URL and extract clean text content.

    Used for URLs where Tavily didn't return raw_content, or for
    URLs discovered during research that weren't in the search results.

    Args:
        url: The URL to fetch.
        timeout: Request timeout in seconds.
        max_length: Maximum response body length in characters.

    Returns:
        FetchResult with extracted text and content hash.
    """

    def _failure(error: str) -> FetchResult:
        # Every failure path returns the same empty-content shape;
        # only the error message differs.
        return FetchResult(
            url=url,
            text="",
            content_hash=_sha256(""),
            content_length=0,
            success=False,
            error=error,
        )

    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "Marchwarden/0.1 (research agent)",
                },
            )
            response.raise_for_status()

            # Truncate BEFORE extraction so a pathological page can't
            # blow up the regex pass; hash the clean text so identical
            # articles served with different markup compare equal.
            raw_text = response.text[:max_length]
            clean_text = _extract_text(raw_text)

            return FetchResult(
                url=url,
                text=clean_text,
                content_hash=_sha256(clean_text),
                content_length=len(clean_text),
                success=True,
            )
    except httpx.TimeoutException:
        return _failure(f"Timeout after {timeout}s")
    except httpx.HTTPStatusError as e:
        return _failure(f"HTTP {e.response.status_code}")
    except httpx.HTTPError as e:
        return _failure(str(e))
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_text(html: str) -> str:
|
||
|
|
"""Extract readable text from HTML.
|
||
|
|
|
||
|
|
Simple extraction: strip tags, collapse whitespace. For V1 this is
|
||
|
|
sufficient. If quality is poor, swap in trafilatura or readability-lxml.
|
||
|
|
"""
|
||
|
|
import re
|
||
|
|
|
||
|
|
# Remove script and style blocks
|
||
|
|
text = re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=re.DOTALL)
|
||
|
|
text = re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=re.DOTALL)
|
||
|
|
# Remove HTML tags
|
||
|
|
text = re.sub(r"<[^>]+>", " ", text)
|
||
|
|
# Decode common HTML entities
|
||
|
|
text = text.replace("&", "&")
|
||
|
|
text = text.replace("<", "<")
|
||
|
|
text = text.replace(">", ">")
|
||
|
|
text = text.replace(""", '"')
|
||
|
|
text = text.replace("'", "'")
|
||
|
|
text = text.replace(" ", " ")
|
||
|
|
# Collapse whitespace
|
||
|
|
text = re.sub(r"\s+", " ", text).strip()
|
||
|
|
return text
|