"""Search and fetch tools for the web researcher.
Two tools:
- tavily_search: Web search via Tavily API, returns structured results
- fetch_url: Direct URL fetch with content extraction and hashing
"""
import hashlib
from dataclasses import dataclass
from typing import Optional
import httpx
from tavily import TavilyClient
@dataclass
class SearchResult:
    """A single result from a Tavily search.

    Fields mirror one entry of the Tavily API's ``results`` list, plus a
    locally computed ``content_hash`` for deduplication/change detection.
    """
    url: str  # Result page URL ("" if Tavily omitted it)
    title: str  # Page title ("" if Tavily omitted it)
    content: str # Short extracted summary from Tavily
    raw_content: Optional[str] # Full page text (may be None)
    score: float # Tavily relevance score (0.0-1.0)
    content_hash: str # SHA-256 of the best available content (raw if present, else summary)
@dataclass
class FetchResult:
    """Result of fetching and extracting content from a URL.

    On failure (timeout, HTTP error status, transport error) ``success``
    is False, ``error`` describes the cause, and the content fields hold
    their empty/zero equivalents.
    """
    url: str  # The URL that was requested
    text: str # Extracted clean text ("" on failure)
    content_hash: str # SHA-256 of the fetched content (hash of "" on failure)
    content_length: int  # len(text); 0 on failure
    success: bool  # True iff fetch and extraction completed
    error: Optional[str] = None  # Human-readable failure reason; None on success
def _sha256(text: str) -> str:
"""Compute SHA-256 hash of text content."""
return f"sha256:{hashlib.sha256(text.encode('utf-8')).hexdigest()}"
def tavily_search(
    api_key: str,
    query: str,
    max_results: int = 5,
    include_raw_content: bool = True,
) -> list[SearchResult]:
    """Search the web via Tavily API.

    Args:
        api_key: Tavily API key.
        query: Search query string.
        max_results: Maximum number of results to return.
        include_raw_content: Whether to request full page text from Tavily.

    Returns:
        List of SearchResult objects, sorted by relevance score (descending).
    """
    client = TavilyClient(api_key=api_key)
    response = client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
    )
    results = []
    for item in response.get("results", []):
        # `or ""` guards both a missing key and an explicit None value.
        raw = item.get("raw_content") or ""
        content = item.get("content") or ""
        # Hash the best available content (raw if present, else summary).
        # _sha256("") is the well-defined hash when both are empty.
        hashable = raw or content
        results.append(
            SearchResult(
                url=item.get("url", ""),
                title=item.get("title", ""),
                content=content,
                raw_content=raw or None,
                score=item.get("score", 0.0),
                content_hash=_sha256(hashable),
            )
        )
    # The docstring promises score-descending order; Tavily usually returns
    # results sorted already, but enforce it so callers can rely on it.
    results.sort(key=lambda r: r.score, reverse=True)
    return results
async def fetch_url(
    url: str,
    timeout: float = 15.0,
    max_length: int = 100_000,
) -> FetchResult:
    """Fetch a URL and extract clean text content.

    Used for URLs where Tavily didn't return raw_content, or for
    URLs discovered during research that weren't in the search results.

    Args:
        url: The URL to fetch.
        timeout: Request timeout in seconds.
        max_length: Maximum response body length in characters.

    Returns:
        FetchResult with extracted text and content hash; on any httpx
        error, a failure FetchResult with ``error`` set (never raises).
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "Marchwarden/0.1 (research agent)",
                },
            )
            response.raise_for_status()
            # Truncate before extraction so huge pages stay bounded.
            clean_text = _extract_text(response.text[:max_length])
    # Order matters: TimeoutException and HTTPStatusError are both
    # subclasses of httpx.HTTPError, so the generic catch goes last.
    except httpx.TimeoutException:
        return _fetch_failure(url, f"Timeout after {timeout}s")
    except httpx.HTTPStatusError as e:
        return _fetch_failure(url, f"HTTP {e.response.status_code}")
    except httpx.HTTPError as e:
        return _fetch_failure(url, str(e))
    return FetchResult(
        url=url,
        text=clean_text,
        content_hash=_sha256(clean_text),
        content_length=len(clean_text),
        success=True,
    )
def _fetch_failure(url: str, error: str) -> FetchResult:
    """Build the FetchResult representing a failed fetch of *url*."""
    return FetchResult(
        url=url,
        text="",
        content_hash=_sha256(""),
        content_length=0,
        success=False,
        error=error,
    )
def _extract_text(html: str) -> str:
"""Extract readable text from HTML.
Simple extraction: strip tags, collapse whitespace. For V1 this is
sufficient. If quality is poor, swap in trafilatura or readability-lxml.
"""
import re
# Remove script and style blocks
text = re.sub(r"", " ", html, flags=re.DOTALL)
text = re.sub(r"", " ", text, flags=re.DOTALL)
# Remove HTML tags
text = re.sub(r"<[^>]+>", " ", text)
# Decode common HTML entities
text = text.replace("&", "&")
text = text.replace("<", "<")
text = text.replace(">", ">")
text = text.replace(""", '"')
text = text.replace("'", "'")
text = text.replace(" ", " ")
# Collapse whitespace
text = re.sub(r"\s+", " ", text).strip()
return text