182 lines
5.1 KiB
Python
182 lines
5.1 KiB
Python
|
|
"""Search and fetch tools for the web researcher.
|
||
|
|
|
||
|
|
Two tools:
|
||
|
|
- tavily_search: Web search via Tavily API, returns structured results
|
||
|
|
- fetch_url: Direct URL fetch with content extraction and hashing
|
||
|
|
"""
|
||
|
|
|
||
|
|
import hashlib
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
from tavily import TavilyClient
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class SearchResult:
    """A single result from a Tavily search.

    Mirrors one entry of the Tavily ``search()`` response, plus a
    locally computed hash used for content deduplication.
    """

    url: str  # Result page URL as reported by Tavily
    title: str  # Page title as reported by Tavily
    content: str  # Short extracted summary from Tavily
    raw_content: Optional[str]  # Full page text (may be None)
    score: float  # Tavily relevance score (0.0-1.0)
    content_hash: str  # SHA-256 of the best available content
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class FetchResult:
    """Result of fetching and extracting content from a URL.

    On failure, ``success`` is False, ``error`` describes the cause, and
    the text/hash/length fields hold empty-content placeholders.
    """

    url: str  # The URL that was fetched
    text: str  # Extracted clean text
    content_hash: str  # SHA-256 of the fetched content
    content_length: int  # Length of the extracted text, in characters
    success: bool  # True iff the fetch and extraction completed
    error: Optional[str] = None  # Human-readable failure reason, if any
|
||
|
|
|
||
|
|
|
||
|
|
def _sha256(text: str) -> str:
|
||
|
|
"""Compute SHA-256 hash of text content."""
|
||
|
|
return f"sha256:{hashlib.sha256(text.encode('utf-8')).hexdigest()}"
|
||
|
|
|
||
|
|
|
||
|
|
def tavily_search(
    api_key: str,
    query: str,
    max_results: int = 5,
    include_raw_content: bool = True,
) -> list[SearchResult]:
    """Search the web via Tavily API.

    Args:
        api_key: Tavily API key.
        query: Search query string.
        max_results: Maximum number of results to return.
        include_raw_content: Whether to request full page text from Tavily.

    Returns:
        List of SearchResult objects, sorted by relevance score
        (highest first).
    """
    client = TavilyClient(api_key=api_key)
    response = client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
    )

    results = []
    for item in response.get("results", []):
        raw = item.get("raw_content") or ""
        content = item.get("content", "")
        # Hash the best available content (raw if present, else summary).
        # _sha256 is well-defined on "" too, so no special-casing needed.
        hashable = raw if raw else content
        results.append(
            SearchResult(
                url=item.get("url", ""),
                title=item.get("title", ""),
                content=content,
                raw_content=raw if raw else None,
                score=item.get("score", 0.0),
                content_hash=_sha256(hashable),
            )
        )

    # Enforce the documented ordering instead of trusting the API's
    # response order; the sort is stable, so ties keep API order.
    results.sort(key=lambda r: r.score, reverse=True)
    return results
|
||
|
|
|
||
|
|
|
||
|
|
async def fetch_url(
    url: str,
    timeout: float = 15.0,
    max_length: int = 100_000,
) -> FetchResult:
    """Fetch a URL and extract clean text content.

    Used for URLs where Tavily didn't return raw_content, or for
    URLs discovered during research that weren't in the search results.

    Args:
        url: The URL to fetch.
        timeout: Request timeout in seconds.
        max_length: Maximum response body length in characters.

    Returns:
        FetchResult with extracted text and content hash.
    """

    def _failure(error: str) -> FetchResult:
        # Every failure path returns the same empty-content shape;
        # only the error message differs.
        return FetchResult(
            url=url,
            text="",
            content_hash=_sha256(""),
            content_length=0,
            success=False,
            error=error,
        )

    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "Marchwarden/0.1 (research agent)",
                },
            )
            response.raise_for_status()

            # Truncate BEFORE extraction so a pathological page can't
            # blow up the regex pass; hash the clean text so identical
            # articles served with different markup compare equal.
            raw_text = response.text[:max_length]
            clean_text = _extract_text(raw_text)

            return FetchResult(
                url=url,
                text=clean_text,
                content_hash=_sha256(clean_text),
                content_length=len(clean_text),
                success=True,
            )
    except httpx.TimeoutException:
        return _failure(f"Timeout after {timeout}s")
    except httpx.HTTPStatusError as e:
        return _failure(f"HTTP {e.response.status_code}")
    except httpx.HTTPError as e:
        return _failure(str(e))
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_text(html: str) -> str:
|
||
|
|
"""Extract readable text from HTML.
|
||
|
|
|
||
|
|
Simple extraction: strip tags, collapse whitespace. For V1 this is
|
||
|
|
sufficient. If quality is poor, swap in trafilatura or readability-lxml.
|
||
|
|
"""
|
||
|
|
import re
|
||
|
|
|
||
|
|
# Remove script and style blocks
|
||
|
|
text = re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=re.DOTALL)
|
||
|
|
text = re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=re.DOTALL)
|
||
|
|
# Remove HTML tags
|
||
|
|
text = re.sub(r"<[^>]+>", " ", text)
|
||
|
|
# Decode common HTML entities
|
||
|
|
text = text.replace("&", "&")
|
||
|
|
text = text.replace("<", "<")
|
||
|
|
text = text.replace(">", ">")
|
||
|
|
text = text.replace(""", '"')
|
||
|
|
text = text.replace("'", "'")
|
||
|
|
text = text.replace(" ", " ")
|
||
|
|
# Collapse whitespace
|
||
|
|
text = re.sub(r"\s+", " ", text).strip()
|
||
|
|
return text
|