marchwarden/tests/test_tools.py
Jeff Smith a5bc93e275 M1.1: Search and fetch tools with tests
- tavily_search(): Tavily API wrapper returning SearchResult dataclasses
  with content hashing (raw_content preferred, falls back to summary)
- fetch_url(): async URL fetch with HTML text extraction, content hashing,
  and graceful error handling (timeout, HTTP errors, connection errors)
- _extract_text(): simple HTML → clean text (strip scripts/styles/tags,
  decode entities, collapse whitespace)
- _sha256(): SHA-256 content hashing with 'sha256:' prefix for traces

18 tests: hashing, HTML extraction, mocked Tavily search, mocked async
fetch (success, timeout, HTTP error, hash consistency).

Refs: archeious/marchwarden#1

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-04-08 14:17:18 -06:00

245 lines
8.6 KiB
Python

"""Tests for web researcher search and fetch tools."""
import hashlib
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from researchers.web.tools import (
FetchResult,
SearchResult,
_extract_text,
_sha256,
fetch_url,
tavily_search,
)
# ---------------------------------------------------------------------------
# _sha256
# ---------------------------------------------------------------------------
class TestSha256:
    """Checks for the ``_sha256`` content-hash helper."""

    def test_consistent_hash(self):
        """Hashing the same text twice yields equal values."""
        first = _sha256("hello")
        second = _sha256("hello")
        assert first == second

    def test_different_input_different_hash(self):
        """Two different inputs do not share a hash."""
        hello_hash = _sha256("hello")
        world_hash = _sha256("world")
        assert hello_hash != world_hash

    def test_format(self):
        """The value is a ``sha256:`` prefix followed by a 64-char digest."""
        tagged = _sha256("test")
        assert tagged.startswith("sha256:")
        # SHA-256 hex digest is 64 chars
        hex_part = tagged.split(":")[1]
        assert len(hex_part) == 64

    def test_matches_hashlib(self):
        """The digest equals hashlib's SHA-256 of the UTF-8 encoded text."""
        text = "marchwarden test"
        reference_digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        assert _sha256(text) == "sha256:" + reference_digest
# ---------------------------------------------------------------------------
# _extract_text
# ---------------------------------------------------------------------------
class TestExtractText:
    """Checks for the ``_extract_text`` HTML-to-text helper."""

    def test_strips_tags(self):
        """Markup around the text is removed."""
        extracted = _extract_text("<p>hello</p>")
        assert extracted == "hello"

    def test_strips_script(self):
        """Script bodies are dropped while paragraph text is kept."""
        extracted = _extract_text("<script>var x=1;</script><p>content</p>")
        assert "var x" not in extracted
        assert "content" in extracted

    def test_strips_style(self):
        """Style bodies are dropped while paragraph text is kept."""
        extracted = _extract_text("<style>.foo{color:red}</style><p>visible</p>")
        assert "color" not in extracted
        assert "visible" in extracted

    def test_collapses_whitespace(self):
        """Text from adjacent paragraphs ends up separated by one space."""
        extracted = _extract_text("<p>hello</p> <p>world</p>")
        assert extracted == "hello world"

    def test_decodes_entities(self):
        """Named and numeric character references are decoded."""
        decoded = _extract_text("&amp; &lt; &gt; &quot; &#39; &nbsp;")
        # The input also contains &nbsp;, but nothing is asserted about it.
        for expected_char in ("&", "<", ">", '"', "'"):
            assert expected_char in decoded

    def test_empty_input(self):
        """An empty string comes back as an empty string."""
        extracted = _extract_text("")
        assert extracted == ""

    def test_plain_text_passthrough(self):
        """Text without any markup is returned unchanged."""
        plain = "just plain text"
        assert _extract_text(plain) == "just plain text"
# ---------------------------------------------------------------------------
# tavily_search
# ---------------------------------------------------------------------------
class TestTavilySearch:
    """Checks for ``tavily_search`` with ``TavilyClient`` patched out."""

    @staticmethod
    def _client_returning(mock_client_class, payload):
        """Install a mock client whose ``search`` call returns ``payload``.

        The patched class is configured to hand back the mock client, which
        is also returned so a test can inspect the calls made on it.
        """
        client = MagicMock()
        mock_client_class.return_value = client
        client.search.return_value = payload
        return client

    @patch("researchers.web.tools.TavilyClient")
    def test_returns_search_results(self, mock_client_class):
        """Response entries become ``SearchResult`` objects with hashes."""
        page_one = {
            "url": "https://example.com/page1",
            "title": "Page One",
            "content": "Short summary of page one.",
            "raw_content": "Full text of page one with lots of detail.",
            "score": 0.95,
        }
        page_two = {
            "url": "https://example.com/page2",
            "title": "Page Two",
            "content": "Short summary of page two.",
            "raw_content": "",
            "score": 0.80,
        }
        self._client_returning(mock_client_class, {"results": [page_one, page_two]})

        results = tavily_search("fake-key", "test query", max_results=2)

        assert len(results) == 2
        first = results[0]
        assert isinstance(first, SearchResult)

        # First result has raw_content
        assert first.url == "https://example.com/page1"
        assert first.title == "Page One"
        assert first.raw_content == "Full text of page one with lots of detail."
        assert first.score == 0.95
        assert first.content_hash.startswith("sha256:")
        # Hash should be of raw_content (best available)
        assert first.content_hash == _sha256(
            "Full text of page one with lots of detail."
        )

        # Second result has empty raw_content -> falls back to content
        second = results[1]
        assert second.raw_content is None
        assert second.content_hash == _sha256("Short summary of page two.")

    @patch("researchers.web.tools.TavilyClient")
    def test_empty_results(self, mock_client_class):
        """An empty ``results`` list in the response gives an empty list."""
        self._client_returning(mock_client_class, {"results": []})

        results = tavily_search("fake-key", "obscure query")

        assert results == []

    @patch("researchers.web.tools.TavilyClient")
    def test_passes_params_to_client(self, mock_client_class):
        """The API key goes to the constructor, search options to ``search``."""
        client = self._client_returning(mock_client_class, {"results": []})

        tavily_search("my-key", "query", max_results=3, include_raw_content=False)

        mock_client_class.assert_called_once_with(api_key="my-key")
        client.search.assert_called_once_with(
            "query", max_results=3, include_raw_content=False
        )
# ---------------------------------------------------------------------------
# fetch_url
# ---------------------------------------------------------------------------
class TestFetchUrl:
    """Checks for ``fetch_url`` with ``httpx.AsyncClient`` patched out."""

    @staticmethod
    def _stub_client(mock_client_class):
        """Wire the patched client class to yield a fresh ``AsyncMock``.

        Entering the patched class's return value as an async context
        manager produces the returned mock; exiting it returns ``False``.
        """
        client = AsyncMock()
        context = mock_client_class.return_value
        context.__aenter__ = AsyncMock(return_value=client)
        context.__aexit__ = AsyncMock(return_value=False)
        return client

    @pytest.mark.asyncio
    async def test_successful_fetch(self):
        """A 200 response with HTML gives a successful ``FetchResult``."""
        ok_response = AsyncMock()
        ok_response.status_code = 200
        ok_response.text = "<html><body><p>Hello world</p></body></html>"
        # Replaced with a plain MagicMock so the call is synchronous.
        ok_response.raise_for_status = MagicMock()

        with patch("researchers.web.tools.httpx.AsyncClient") as mock_client_class:
            client = self._stub_client(mock_client_class)
            client.get.return_value = ok_response
            result = await fetch_url("https://example.com")

        assert isinstance(result, FetchResult)
        assert result.success is True
        assert result.error is None
        assert "Hello world" in result.text
        assert result.content_hash.startswith("sha256:")
        assert result.content_length > 0
        assert result.url == "https://example.com"

    @pytest.mark.asyncio
    async def test_timeout_error(self):
        """A timeout from ``get`` is reported as a failed result."""
        import httpx

        with patch("researchers.web.tools.httpx.AsyncClient") as mock_client_class:
            client = self._stub_client(mock_client_class)
            client.get.side_effect = httpx.TimeoutException("timed out")
            result = await fetch_url("https://slow.example.com", timeout=5.0)

        assert result.success is False
        assert "Timeout" in result.error
        assert result.text == ""

    @pytest.mark.asyncio
    async def test_http_error(self):
        """An ``HTTPStatusError`` for a 404 is reported as a failed result."""
        import httpx

        not_found = MagicMock()
        not_found.status_code = 404
        not_found.raise_for_status.side_effect = httpx.HTTPStatusError(
            "Not Found",
            request=MagicMock(),
            response=not_found,
        )

        with patch("researchers.web.tools.httpx.AsyncClient") as mock_client_class:
            client = self._stub_client(mock_client_class)
            client.get.return_value = not_found
            result = await fetch_url("https://example.com/missing")

        assert result.success is False
        assert "404" in result.error

    @pytest.mark.asyncio
    async def test_content_hash_consistency(self):
        """Identical response bodies hash the same, whatever the URL."""
        shared_response = AsyncMock()
        shared_response.text = "<p>consistent content</p>"
        shared_response.raise_for_status = MagicMock()

        with patch("researchers.web.tools.httpx.AsyncClient") as mock_client_class:
            client = self._stub_client(mock_client_class)
            client.get.return_value = shared_response
            first = await fetch_url("https://example.com/a")
            second = await fetch_url("https://example.com/b")

        assert first.content_hash == second.content_hash