"""Search and fetch tools for the web researcher.

Two tools:
- tavily_search: Web search via Tavily API, returns structured results
- fetch_url: Direct URL fetch with content extraction and hashing
"""

import hashlib
from dataclasses import dataclass
from typing import Optional

import httpx
from tavily import TavilyClient


@dataclass
class SearchResult:
    """One entry from a Tavily search response.

    Attributes:
        url: Address of the result page.
        title: Title of the result page.
        content: Short extracted summary supplied by Tavily.
        raw_content: Full page text; None when no full text is available.
        score: Tavily relevance score (0.0-1.0).
        content_hash: SHA-256 of the best available content.
    """

    url: str
    title: str
    content: str
    raw_content: Optional[str]
    score: float
    content_hash: str


@dataclass
class FetchResult:
    """Outcome of fetching a URL and extracting its text.

    Attributes:
        url: The URL that was requested.
        text: Extracted clean text.
        content_hash: SHA-256 of the fetched content.
        content_length: Length recorded for the fetched content.
        success: Whether the fetch succeeded.
        error: Description of the failure; defaults to None.
    """

    url: str
    text: str
    content_hash: str
    content_length: int
    success: bool
    error: Optional[str] = None


def _sha256(text: str) -> str:
    """Compute SHA-256 hash of text content."""
    return f"sha256:{hashlib.sha256(text.encode('utf-8')).hexdigest()}"


def tavily_search(
    api_key: str,
    query: str,
    max_results: int = 5,
    include_raw_content: bool = True,
) -> list[SearchResult]:
    """Search the web via Tavily API.

    Args:
        api_key: Tavily API key.
        query: Search query string.
        max_results: Maximum number of results to return.
        include_raw_content: Whether to request full page text from Tavily.

    Returns:
        List of SearchResult objects, sorted by relevance score (highest
        first). Missing or null fields in a result are normalised to empty
        strings (``raw_content`` to None, ``score`` to 0.0).
    """
    client = TavilyClient(api_key=api_key)
    response = client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
    )

    results = []
    # `x.get(key) or default` (rather than `x.get(key, default)`) also covers
    # keys that are present but null, so the dataclass fields keep their
    # declared types.
    for item in response.get("results") or []:
        raw = item.get("raw_content") or ""
        content = item.get("content") or ""
        results.append(
            SearchResult(
                url=item.get("url") or "",
                title=item.get("title") or "",
                content=content,
                raw_content=raw or None,
                score=item.get("score") or 0.0,
                # Hash the best available content (raw if present, else
                # summary); with neither, this is the hash of "".
                content_hash=_sha256(raw or content),
            )
        )

    # Enforce the documented ordering instead of relying on the API's order.
    # list.sort is stable, so equally scored results keep their API order.
    results.sort(key=lambda result: result.score, reverse=True)
    return results


def _fetch_failure(url: str, error: str) -> FetchResult:
    """Build the FetchResult returned for a failed fetch of *url*."""
    return FetchResult(
        url=url,
        text="",
        content_hash=_sha256(""),
        content_length=0,
        success=False,
        error=error,
    )


async def fetch_url(
    url: str,
    timeout: float = 15.0,
    max_length: int = 100_000,
) -> FetchResult:
    """Fetch a URL and extract clean text content.

    Used for URLs where Tavily didn't return raw_content, or for
    URLs discovered during research that weren't in the search results.

    Redirects are followed. Request failures are reported through the
    returned FetchResult (``success=False`` with ``error`` set) rather than
    raised; this covers timeouts, non-2xx statuses, other ``httpx.HTTPError``
    subclasses, and malformed URLs (``httpx.InvalidURL``).

    Args:
        url: The URL to fetch.
        timeout: Request timeout in seconds.
        max_length: Maximum response body length in characters. The decoded
            body is truncated to this length before text extraction.

    Returns:
        FetchResult with extracted text and content hash.
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "Marchwarden/0.1 (research agent)",
                },
            )
            response.raise_for_status()
            raw_text = response.text[:max_length]
    except httpx.TimeoutException:
        return _fetch_failure(url, f"Timeout after {timeout}s")
    except httpx.HTTPStatusError as e:
        return _fetch_failure(url, f"HTTP {e.response.status_code}")
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # InvalidURL is not part of the HTTPError hierarchy, so it has to be
        # named explicitly. Some httpx errors carry an empty message; fall
        # back to the exception class name so `error` is never blank.
        return _fetch_failure(url, str(e) or type(e).__name__)

    # Extraction happens outside the try block so only network-level errors
    # are translated into failure results.
    clean_text = _extract_text(raw_text)
    return FetchResult(
        url=url,
        text=clean_text,
        content_hash=_sha256(clean_text),
        content_length=len(clean_text),
        success=True,
    )


def _extract_text(html: str) -> str:
    """Extract readable text from HTML.

    Simple extraction: strip tags, collapse whitespace. For V1 this is
    sufficient. If quality is poor, swap in trafilatura or readability-lxml.
    """
    import re

    # Remove script and style blocks
    text = re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=re.DOTALL)
    text = re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=re.DOTALL)
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode common HTML entities
    text = text.replace("&amp;", "&")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&quot;", '"')
    text = text.replace("&#39;", "'")
    text = text.replace("&nbsp;", " ")
    # Collapse whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text