Previously the depth parameter (shallow/balanced/deep) was passed only as a text hint inside the agent's user message, with no mechanical effect on iterations, token budget, or source count. The flag was effectively cosmetic — the LLM was expected to "interpret" it. Add DEPTH_PRESETS table and constraints_for_depth() helper in researchers.web.models: shallow: 2 iters, 5,000 tokens, 5 sources balanced: 5 iters, 20,000 tokens, 10 sources (= historical defaults) deep: 8 iters, 60,000 tokens, 20 sources Wired through the stack: - WebResearcher.research(): when constraints is None, builds from the depth preset instead of bare ResearchConstraints() - MCP server `research` tool: max_iterations and token_budget now default to None; constraints are built via constraints_for_depth with explicit values overriding the preset - CLI `ask` command: --max-iterations and --budget default to None; the CLI only forwards them to the MCP tool when set, so unset flags fall through to the depth preset balanced is unchanged from the historical defaults so existing callers see no behavior difference. Explicit --max-iterations / --budget always win over the preset. Tests cover each preset's values, balanced backward-compat, unknown depth fallback, full override, and partial override. 116/116 tests passing. Live-verified: --depth shallow on a simple question now caps at 2 iterations and stays under budget. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
498 lines
15 KiB
Python
498 lines
15 KiB
Python
"""Tests for the Marchwarden Research Contract v1 models."""
|
|
|
|
import json
import uuid

import pytest
from pydantic import ValidationError

from researchers.web.models import (
    DEPTH_PRESETS,
    Citation,
    ConfidenceFactors,
    CostMetadata,
    DiscoveryEvent,
    Gap,
    GapCategory,
    OpenQuestion,
    ResearchConstraints,
    ResearchResult,
    constraints_for_depth,
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def make_citation(**overrides) -> Citation:
    """Build a fully-populated valid Citation; kwargs override any field."""
    base = {
        "source": "web",
        "locator": "https://example.com/article",
        "title": "Example Article",
        "snippet": "Relevant summary of the content.",
        "raw_excerpt": "Verbatim text copied directly from the source document.",
        "confidence": 0.85,
    }
    return Citation(**{**base, **overrides})
|
|
|
|
|
|
def make_gap(**overrides) -> Gap:
    """Build a valid Gap; kwargs override any default field."""
    base = {
        "topic": "pest management",
        "category": GapCategory.SOURCE_NOT_FOUND,
        "detail": "No pest data found in general web sources.",
    }
    return Gap(**{**base, **overrides})
|
|
|
|
|
|
def make_discovery_event(**overrides) -> DiscoveryEvent:
    """Build a fully-populated DiscoveryEvent; kwargs override any field."""
    base = {
        "type": "related_research",
        "suggested_researcher": "arxiv",
        "query": "soil salinity studies Utah 2024-2026",
        "reason": "Multiple web sources reference USU study data",
        "source_locator": "https://example.com/reference",
    }
    return DiscoveryEvent(**{**base, **overrides})
|
|
|
|
|
|
def make_open_question(**overrides) -> OpenQuestion:
    """Build a fully-populated OpenQuestion; kwargs override any field."""
    base = {
        "question": "What is the optimal irrigation schedule for high-elevation potatoes?",
        "context": "Multiple sources mention irrigation is critical but none specify schedules.",
        "priority": "medium",
        "source_locator": "https://example.com/utah-crops",
    }
    return OpenQuestion(**{**base, **overrides})
|
|
|
|
|
|
def make_confidence_factors(**overrides) -> ConfidenceFactors:
    """Build a valid ConfidenceFactors; kwargs override any field."""
    base = {
        "num_corroborating_sources": 3,
        "source_authority": "high",
        "contradiction_detected": False,
        "query_specificity_match": 0.85,
        "budget_exhausted": False,
        "recency": "current",
    }
    return ConfidenceFactors(**{**base, **overrides})
|
|
|
|
|
|
def make_cost_metadata(**overrides) -> CostMetadata:
    """Build a valid CostMetadata; kwargs override any field."""
    base = {
        "tokens_used": 8452,
        "iterations_run": 3,
        "wall_time_sec": 42.5,
        "budget_exhausted": False,
        "model_id": "claude-sonnet-4-6",
    }
    return CostMetadata(**{**base, **overrides})
|
|
|
|
|
|
def make_research_result(**overrides) -> ResearchResult:
    """Build a complete, valid ResearchResult; kwargs override any field.

    Each call generates a fresh trace_id and fresh nested sub-models.
    """
    base = {
        "answer": "Utah is ideal for cool-season crops at high elevation.",
        "citations": [make_citation()],
        "gaps": [make_gap()],
        "discovery_events": [make_discovery_event()],
        "open_questions": [make_open_question()],
        "confidence": 0.82,
        "confidence_factors": make_confidence_factors(),
        "cost_metadata": make_cost_metadata(),
        "trace_id": str(uuid.uuid4()),
    }
    return ResearchResult(**{**base, **overrides})
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ResearchConstraints
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestResearchConstraints:
    """Defaults, validation bounds, and serialization of ResearchConstraints."""

    def test_defaults(self):
        constraints = ResearchConstraints()
        assert constraints.max_iterations == 5
        assert constraints.token_budget == 20_000
        assert constraints.max_sources == 10
        assert constraints.source_filter is None

    def test_custom_values(self):
        constraints = ResearchConstraints(
            max_iterations=3, token_budget=5000, max_sources=5
        )
        assert constraints.max_iterations == 3
        assert constraints.token_budget == 5000
        assert constraints.max_sources == 5

    def test_invalid_iterations(self):
        # Zero iterations is below the model's minimum bound.
        with pytest.raises(ValidationError):
            ResearchConstraints(max_iterations=0)

    def test_invalid_token_budget(self):
        # A 500-token budget is below the allowed floor.
        with pytest.raises(ValidationError):
            ResearchConstraints(token_budget=500)

    def test_serialization_roundtrip(self):
        original = ResearchConstraints(max_iterations=3, token_budget=10000)
        restored = ResearchConstraints(**original.model_dump())
        assert original == restored
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Citation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestCitation:
    """Required/optional fields and confidence bounds of Citation."""

    def test_full_citation(self):
        citation = make_citation()
        assert citation.source == "web"
        assert citation.raw_excerpt.startswith("Verbatim")
        assert 0.0 <= citation.confidence <= 1.0

    def test_minimal_citation(self):
        citation = Citation(
            source="web",
            locator="https://example.com",
            raw_excerpt="Some text.",
            confidence=0.5,
        )
        # Display-only fields are optional and default to None.
        assert citation.title is None
        assert citation.snippet is None

    def test_confidence_bounds(self):
        # Confidence must stay within [0.0, 1.0].
        for out_of_range in (1.5, -0.1):
            with pytest.raises(ValidationError):
                make_citation(confidence=out_of_range)

    def test_raw_excerpt_required(self):
        with pytest.raises(ValidationError):
            Citation(source="web", locator="https://example.com", confidence=0.5)

    def test_serialization_roundtrip(self):
        citation = make_citation()
        restored = Citation(**citation.model_dump())
        assert citation == restored
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# GapCategory
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestGapCategory:
    """The closed set of gap categories and their string-enum behavior."""

    def test_all_categories_exist(self):
        expected = {
            "source_not_found",
            "access_denied",
            "budget_exhausted",
            "contradictory_sources",
            "scope_exceeded",
        }
        assert {category.value for category in GapCategory} == expected

    def test_string_enum(self):
        # Members compare equal to — and are instances of — plain strings.
        assert GapCategory.SOURCE_NOT_FOUND == "source_not_found"
        assert isinstance(GapCategory.ACCESS_DENIED, str)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gap
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestGap:
    """Gap construction, category acceptance, and serialization."""

    def test_gap_creation(self):
        gap = make_gap()
        assert gap.category == GapCategory.SOURCE_NOT_FOUND
        assert gap.topic == "pest management"

    def test_all_categories_accepted(self):
        for category in GapCategory:
            assert make_gap(category=category).category == category

    def test_serialization_roundtrip(self):
        gap = make_gap()
        restored = Gap(**gap.model_dump())
        assert gap == restored

    def test_json_uses_string_category(self):
        gap = make_gap(category=GapCategory.BUDGET_EXHAUSTED)
        payload = json.loads(gap.model_dump_json())
        # JSON output carries the enum's string value, not its Python name.
        assert payload["category"] == "budget_exhausted"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DiscoveryEvent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDiscoveryEvent:
    """Required vs optional fields and roundtrip of DiscoveryEvent."""

    def test_full_event(self):
        event = make_discovery_event()
        assert event.type == "related_research"
        assert event.suggested_researcher == "arxiv"

    def test_minimal_event(self):
        event = DiscoveryEvent(
            type="contradiction",
            query="conflicting data on topic X",
            reason="Two sources disagree",
        )
        # Researcher suggestion and locator are optional.
        assert event.suggested_researcher is None
        assert event.source_locator is None

    def test_serialization_roundtrip(self):
        event = make_discovery_event()
        restored = DiscoveryEvent(**event.model_dump())
        assert event == restored
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# OpenQuestion
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestOpenQuestion:
    """Required vs optional fields and roundtrip of OpenQuestion."""

    def test_full_question(self):
        question = make_open_question()
        assert "irrigation" in question.question
        assert question.priority == "medium"
        assert question.source_locator is not None

    def test_minimal_question(self):
        question = OpenQuestion(
            question="Is this viable?",
            context="Found conflicting data.",
            priority="low",
        )
        # The locator is optional.
        assert question.source_locator is None

    def test_serialization_roundtrip(self):
        question = make_open_question()
        restored = OpenQuestion(**question.model_dump())
        assert question == restored
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ConfidenceFactors
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestConfidenceFactors:
    """Field values, optional recency, and bounds of ConfidenceFactors."""

    def test_creation(self):
        factors = make_confidence_factors()
        assert factors.num_corroborating_sources == 3
        assert factors.source_authority == "high"
        assert factors.contradiction_detected is False
        assert factors.recency == "current"

    def test_recency_none(self):
        # Recency may be explicitly absent.
        assert make_confidence_factors(recency=None).recency is None

    def test_query_specificity_bounds(self):
        # query_specificity_match must stay within [0.0, 1.0].
        for out_of_range in (1.5, -0.1):
            with pytest.raises(ValidationError):
                make_confidence_factors(query_specificity_match=out_of_range)

    def test_serialization_roundtrip(self):
        factors = make_confidence_factors()
        restored = ConfidenceFactors(**factors.model_dump())
        assert factors == restored
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CostMetadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestCostMetadata:
    """Required model_id and non-negativity constraints of CostMetadata."""

    def test_creation(self):
        cost = make_cost_metadata()
        assert cost.tokens_used == 8452
        assert cost.model_id == "claude-sonnet-4-6"

    def test_model_id_required(self):
        # Omitting model_id must be a validation error.
        with pytest.raises(ValidationError):
            CostMetadata(
                tokens_used=100,
                iterations_run=1,
                wall_time_sec=1.0,
                budget_exhausted=False,
            )

    def test_non_negative_constraints(self):
        with pytest.raises(ValidationError):
            make_cost_metadata(tokens_used=-1)
        with pytest.raises(ValidationError):
            make_cost_metadata(wall_time_sec=-0.5)

    def test_serialization_roundtrip(self):
        cost = make_cost_metadata()
        restored = CostMetadata(**cost.model_dump())
        assert cost == restored
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ResearchResult (full contract)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestResearchResult:
    """End-to-end contract tests for the full ResearchResult payload."""

    def test_full_result(self):
        result = make_research_result()
        assert result.answer.startswith("Utah")
        assert len(result.citations) == 1
        assert len(result.gaps) == 1
        assert len(result.discovery_events) == 1
        assert len(result.open_questions) == 1
        assert 0.0 <= result.confidence <= 1.0
        assert result.cost_metadata.model_id == "claude-sonnet-4-6"

    def test_empty_lists_allowed(self):
        result = make_research_result(
            citations=[], gaps=[], discovery_events=[], open_questions=[]
        )
        # All collection fields may legitimately be empty.
        assert result.citations == []
        assert result.gaps == []
        assert result.discovery_events == []
        assert result.open_questions == []

    def test_confidence_bounds(self):
        with pytest.raises(ValidationError):
            make_research_result(confidence=1.5)

    def test_full_json_roundtrip(self):
        result = make_research_result()
        payload = json.loads(result.model_dump_json())
        assert result == ResearchResult(**payload)

    def test_json_structure(self):
        """Verify the JSON output matches the contract schema."""
        payload = json.loads(make_research_result().model_dump_json())

        # Top-level keys
        assert set(payload.keys()) == {
            "answer",
            "citations",
            "gaps",
            "discovery_events",
            "open_questions",
            "confidence",
            "confidence_factors",
            "cost_metadata",
            "trace_id",
        }

        # OpenQuestion keys
        assert set(payload["open_questions"][0].keys()) == {
            "question",
            "context",
            "priority",
            "source_locator",
        }

        # Citation keys
        assert set(payload["citations"][0].keys()) == {
            "source",
            "locator",
            "title",
            "snippet",
            "raw_excerpt",
            "confidence",
        }

        # Gap keys, with category serialized as its string value
        assert set(payload["gaps"][0].keys()) == {"topic", "category", "detail"}
        assert payload["gaps"][0]["category"] == "source_not_found"

        # CostMetadata includes model_id
        assert "model_id" in payload["cost_metadata"]

        # ConfidenceFactors keys
        assert set(payload["confidence_factors"].keys()) == {
            "num_corroborating_sources",
            "source_authority",
            "contradiction_detected",
            "query_specificity_match",
            "budget_exhausted",
            "recency",
        }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Depth presets (Issue #30)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# NOTE: DEPTH_PRESETS and constraints_for_depth are imported in the top-level
# import block (PEP 8: imports belong at the top of the file, not mid-module).
class TestDepthPresets:
    """Tests for DEPTH_PRESETS / constraints_for_depth (Issue #30).

    The depth flag must have a mechanical effect: each preset pins
    max_iterations / token_budget / max_sources, and explicit keyword
    overrides always win over the preset.
    """

    def test_shallow_preset(self):
        constraints = constraints_for_depth("shallow")
        assert constraints.max_iterations == 2
        assert constraints.token_budget == 5_000
        assert constraints.max_sources == 5

    def test_balanced_preset_matches_historical_defaults(self):
        # Backward compat: balanced must equal the original
        # ResearchConstraints defaults so existing callers see no change.
        constraints = constraints_for_depth("balanced")
        default = ResearchConstraints()
        assert constraints.max_iterations == default.max_iterations == 5
        assert constraints.token_budget == default.token_budget == 20_000
        assert constraints.max_sources == default.max_sources == 10

    def test_deep_preset(self):
        constraints = constraints_for_depth("deep")
        assert constraints.max_iterations == 8
        assert constraints.token_budget == 60_000
        assert constraints.max_sources == 20

    def test_unknown_depth_falls_back_to_balanced(self):
        # Unrecognized depth strings must not crash; they use balanced.
        constraints = constraints_for_depth("nonsense")
        assert constraints.max_iterations == DEPTH_PRESETS["balanced"]["max_iterations"]
        assert constraints.token_budget == DEPTH_PRESETS["balanced"]["token_budget"]

    def test_explicit_overrides_win(self):
        constraints = constraints_for_depth(
            "shallow",
            max_iterations=10,
            token_budget=42_000,
            max_sources=15,
        )
        assert constraints.max_iterations == 10
        assert constraints.token_budget == 42_000
        assert constraints.max_sources == 15

    def test_partial_override(self):
        # Only one field overridden — the others stay at the preset.
        constraints = constraints_for_depth("deep", token_budget=100_000)
        assert constraints.token_budget == 100_000
        assert constraints.max_iterations == 8  # deep preset
        assert constraints.max_sources == 20  # deep preset