"""Search and fetch tools for the web researcher. Two tools: - tavily_search: Web search via Tavily API, returns structured results - fetch_url: Direct URL fetch with content extraction and hashing """ import hashlib from dataclasses import dataclass from typing import Optional import httpx from tavily import TavilyClient @dataclass class SearchResult: """A single result from a Tavily search.""" url: str title: str content: str # Short extracted summary from Tavily raw_content: Optional[str] # Full page text (may be None) score: float # Tavily relevance score (0.0-1.0) content_hash: str # SHA-256 of the best available content @dataclass class FetchResult: """Result of fetching and extracting content from a URL.""" url: str text: str # Extracted clean text content_hash: str # SHA-256 of the fetched content content_length: int success: bool error: Optional[str] = None def _sha256(text: str) -> str: """Compute SHA-256 hash of text content.""" return f"sha256:{hashlib.sha256(text.encode('utf-8')).hexdigest()}" def tavily_search( api_key: str, query: str, max_results: int = 5, include_raw_content: bool = True, ) -> list[SearchResult]: """Search the web via Tavily API. Args: api_key: Tavily API key. query: Search query string. max_results: Maximum number of results to return. include_raw_content: Whether to request full page text from Tavily. Returns: List of SearchResult objects, sorted by relevance score. """ client = TavilyClient(api_key=api_key) response = client.search( query, max_results=max_results, include_raw_content=include_raw_content, ) results = [] for item in response.get("results", []): raw = item.get("raw_content") or "" content = item.get("content", "") # Hash the best available content (raw if present, else summary) hashable = raw if raw else content results.append( SearchResult( url=item.get("url", ""), title=item.get("title", ""), content=content, raw_content=raw if raw else None, score=item.get("score", 0.0), content_hash=_sha256(hashable) if hashable else _sha256(""), ) ) return results async def fetch_url( url: str, timeout: float = 15.0, max_length: int = 100_000, ) -> FetchResult: """Fetch a URL and extract clean text content. Used for URLs where Tavily didn't return raw_content, or for URLs discovered during research that weren't in the search results. Args: url: The URL to fetch. timeout: Request timeout in seconds. max_length: Maximum response body length in characters. Returns: FetchResult with extracted text and content hash. """ try: async with httpx.AsyncClient( follow_redirects=True, timeout=timeout, ) as client: response = await client.get( url, headers={ "User-Agent": "Marchwarden/0.1 (research agent)", }, ) response.raise_for_status() raw_text = response.text[:max_length] clean_text = _extract_text(raw_text) return FetchResult( url=url, text=clean_text, content_hash=_sha256(clean_text), content_length=len(clean_text), success=True, ) except httpx.TimeoutException: return FetchResult( url=url, text="", content_hash=_sha256(""), content_length=0, success=False, error=f"Timeout after {timeout}s", ) except httpx.HTTPStatusError as e: return FetchResult( url=url, text="", content_hash=_sha256(""), content_length=0, success=False, error=f"HTTP {e.response.status_code}", ) except httpx.HTTPError as e: return FetchResult( url=url, text="", content_hash=_sha256(""), content_length=0, success=False, error=str(e), ) def _extract_text(html: str) -> str: """Extract readable text from HTML. Simple extraction: strip tags, collapse whitespace. For V1 this is sufficient. If quality is poor, swap in trafilatura or readability-lxml. """ import re # Remove script and style blocks text = re.sub(r"]*>.*?", " ", html, flags=re.DOTALL) text = re.sub(r"]*>.*?", " ", text, flags=re.DOTALL) # Remove HTML tags text = re.sub(r"<[^>]+>", " ", text) # Decode common HTML entities text = text.replace("&", "&") text = text.replace("<", "<") text = text.replace(">", ">") text = text.replace(""", '"') text = text.replace("'", "'") text = text.replace(" ", " ") # Collapse whitespace text = re.sub(r"\s+", " ", text).strip() return text