M0.3: Implement contract v1 Pydantic models with tests

All Research Contract types as Pydantic models:
- ResearchConstraints (input)
- Citation with raw_excerpt (output)
- GapCategory enum (5 categories)
- Gap with structured category (output)
- DiscoveryEvent (lateral findings)
- ConfidenceFactors (auditable scoring inputs)
- CostMetadata with model_id (resource tracking)
- ResearchResult (top-level contract)

32 tests: validation, bounds checking, serialization roundtrips,
JSON structure verification against contract spec.

Refs: archeious/marchwarden#1

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
Jeff Smith 2026-04-08 14:00:45 -06:00
parent 6a8445ed13
commit 1b0f86399a
3 changed files with 629 additions and 0 deletions

231
researchers/web/models.py Normal file
View file

@@ -0,0 +1,231 @@
"""Marchwarden Research Contract v1 — Pydantic models.
These models define the stable contract between a researcher MCP server
and its caller (PI agent or CLI shim). Changes to required fields or
types require a contract version bump.
"""
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
# ---------------------------------------------------------------------------
# Input types
# ---------------------------------------------------------------------------
class ResearchConstraints(BaseModel):
    """Fine-grained control over researcher behavior."""

    # Hard-bounded (ge/le) so a runaway loop is impossible by construction.
    max_iterations: int = Field(
        5,
        ge=1,
        le=20,
        description="Stop after N iterations, regardless of progress.",
    )
    # Soft limit: the loop is expected to check this; the model only validates it.
    token_budget: int = Field(
        20_000,
        ge=1_000,
        description="Soft limit on total tokens consumed by the research loop.",
    )
    max_sources: int = Field(
        10,
        ge=1,
        description="Maximum number of sources to fetch and extract.",
    )
    # Reserved for V2 per the field description; optional today.
    source_filter: Optional[str] = Field(
        None,
        description="Restrict search to specific domains (V2). E.g. '.gov,.edu'.",
    )
# ---------------------------------------------------------------------------
# Output types — Citation
# ---------------------------------------------------------------------------
class Citation(BaseModel):
    """A single source used by the researcher, with raw evidence."""

    source: str = Field(description="Source type: 'web', 'file', 'database', etc.")
    locator: str = Field(description="URL, file path, row ID, or unique identifier.")
    title: Optional[str] = Field(
        None, description="Human-readable title (for web sources)."
    )
    snippet: Optional[str] = Field(
        None,
        description="Researcher's summary of relevant content (50-200 chars).",
    )
    # Verbatim evidence is required (no default) so every citation carries
    # source text the caller can audit independently of the summary.
    raw_excerpt: str = Field(
        description=(
            "Verbatim text from the source (up to 500 chars). "
            "Bypasses researcher synthesis to prevent the Synthesis Paradox."
        ),
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Researcher's confidence in this source's accuracy.",
    )
# ---------------------------------------------------------------------------
# Output types — Gap
# ---------------------------------------------------------------------------
class GapCategory(str, Enum):
    """Categorized reason a gap exists. Drives PI decision-making."""

    # str mixin keeps members equal to their plain-string values, so they
    # serialize to JSON and compare against raw strings transparently.
    SOURCE_NOT_FOUND = "source_not_found"
    ACCESS_DENIED = "access_denied"
    BUDGET_EXHAUSTED = "budget_exhausted"
    CONTRADICTORY_SOURCES = "contradictory_sources"
    SCOPE_EXCEEDED = "scope_exceeded"
class Gap(BaseModel):
    """An unresolved aspect of the research question."""

    topic: str = Field(description="What aspect wasn't resolved.")
    # Machine-readable category plus free-text detail: the category is for
    # programmatic decisions, the detail is for humans.
    category: GapCategory = Field(description="Structured reason category.")
    detail: str = Field(
        description="Human-readable explanation of why this gap exists."
    )
# ---------------------------------------------------------------------------
# Output types — DiscoveryEvent
# ---------------------------------------------------------------------------
class DiscoveryEvent(BaseModel):
    """A lateral finding relevant to another researcher's domain."""

    type: str = Field(
        description="Event type: 'related_research', 'new_source', 'contradiction'.",
    )
    # Optional routing hint; None means the PI decides where this goes.
    suggested_researcher: Optional[str] = Field(
        None,
        description="Target researcher type: 'arxiv', 'database', 'legal', etc.",
    )
    query: str = Field(description="Suggested query for the target researcher.")
    reason: str = Field(
        description="Why this is relevant to the overall investigation."
    )
    source_locator: Optional[str] = Field(
        None, description="Where the discovery was found (URL, DOI, etc.)."
    )
# ---------------------------------------------------------------------------
# Output types — Confidence
# ---------------------------------------------------------------------------
class ConfidenceFactors(BaseModel):
    """Inputs that fed the confidence score. Enables auditability and future calibration."""

    num_corroborating_sources: int = Field(
        ge=0, description="How many sources agree on the core claims."
    )
    source_authority: str = Field(
        description="'high' (.gov, .edu, peer-reviewed), 'medium' (established orgs), 'low' (blogs, forums).",
    )
    contradiction_detected: bool = Field(
        description="Were conflicting claims found across sources?"
    )
    query_specificity_match: float = Field(
        ge=0.0,
        le=1.0,
        description="How well the results address the actual question (0.0-1.0).",
    )
    budget_exhausted: bool = Field(
        description="True if the researcher hit its iteration or token cap."
    )
    # None when no usable date information was available.
    recency: Optional[str] = Field(
        None,
        description="'current' (< 1yr), 'recent' (1-3yr), 'dated' (> 3yr), None if unknown.",
    )
# ---------------------------------------------------------------------------
# Output types — CostMetadata
# ---------------------------------------------------------------------------
class CostMetadata(BaseModel):
    """Resource usage for a single research call."""

    tokens_used: int = Field(
        ge=0, description="Total tokens consumed (Claude + search API calls)."
    )
    iterations_run: int = Field(
        ge=0, description="Number of inner-loop iterations completed."
    )
    wall_time_sec: float = Field(
        ge=0.0, description="Actual elapsed wall-clock time in seconds."
    )
    budget_exhausted: bool = Field(
        description="True if the researcher hit its iteration or token cap."
    )
    # Required so every result records which model produced it.
    model_id: str = Field(
        description="Model used for the research loop (e.g. 'claude-sonnet-4-6').",
    )
# ---------------------------------------------------------------------------
# Top-level output
# ---------------------------------------------------------------------------
class ResearchResult(BaseModel):
    """Complete result from a single research() call. This is the contract."""

    answer: str = Field(
        description="The synthesized answer. Every claim must trace to a citation.",
    )
    # List fields default to empty via default_factory so instances never
    # share a mutable default.
    citations: list[Citation] = Field(
        default_factory=list, description="Sources used, with raw evidence."
    )
    gaps: list[Gap] = Field(
        default_factory=list,
        description="What couldn't be resolved, categorized by cause.",
    )
    discovery_events: list[DiscoveryEvent] = Field(
        default_factory=list,
        description="Lateral findings for other researchers.",
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Overall confidence in the answer (0.0-1.0).",
    )
    # Required sub-models: a result is incomplete without its audit trail.
    confidence_factors: ConfidenceFactors = Field(
        description="What fed the confidence score.",
    )
    cost_metadata: CostMetadata = Field(
        description="Resource usage for this research call.",
    )
    trace_id: str = Field(description="UUID linking to the JSONL trace log.")

0
tests/__init__.py Normal file
View file

398
tests/test_models.py Normal file
View file

@@ -0,0 +1,398 @@
"""Tests for the Marchwarden Research Contract v1 models."""
import json
import uuid
import pytest
from pydantic import ValidationError
from researchers.web.models import (
Citation,
ConfidenceFactors,
CostMetadata,
DiscoveryEvent,
Gap,
GapCategory,
ResearchConstraints,
ResearchResult,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def make_citation(**overrides) -> Citation:
    """Build a valid Citation; keyword args override individual defaults."""
    base = {
        "source": "web",
        "locator": "https://example.com/article",
        "title": "Example Article",
        "snippet": "Relevant summary of the content.",
        "raw_excerpt": "Verbatim text copied directly from the source document.",
        "confidence": 0.85,
    }
    return Citation(**{**base, **overrides})
def make_gap(**overrides) -> Gap:
    """Build a valid Gap; keyword args override individual defaults."""
    base = {
        "topic": "pest management",
        "category": GapCategory.SOURCE_NOT_FOUND,
        "detail": "No pest data found in general web sources.",
    }
    return Gap(**{**base, **overrides})
def make_discovery_event(**overrides) -> DiscoveryEvent:
    """Build a valid DiscoveryEvent; keyword args override individual defaults."""
    base = {
        "type": "related_research",
        "suggested_researcher": "arxiv",
        "query": "soil salinity studies Utah 2024-2026",
        "reason": "Multiple web sources reference USU study data",
        "source_locator": "https://example.com/reference",
    }
    return DiscoveryEvent(**{**base, **overrides})
def make_confidence_factors(**overrides) -> ConfidenceFactors:
    """Build a valid ConfidenceFactors; keyword args override individual defaults."""
    base = {
        "num_corroborating_sources": 3,
        "source_authority": "high",
        "contradiction_detected": False,
        "query_specificity_match": 0.85,
        "budget_exhausted": False,
        "recency": "current",
    }
    return ConfidenceFactors(**{**base, **overrides})
def make_cost_metadata(**overrides) -> CostMetadata:
    """Build a valid CostMetadata; keyword args override individual defaults."""
    base = {
        "tokens_used": 8452,
        "iterations_run": 3,
        "wall_time_sec": 42.5,
        "budget_exhausted": False,
        "model_id": "claude-sonnet-4-6",
    }
    return CostMetadata(**{**base, **overrides})
def make_research_result(**overrides) -> ResearchResult:
    """Build a fully-populated ResearchResult; keyword args override defaults.

    A fresh trace_id UUID is generated on every call.
    """
    base = {
        "answer": "Utah is ideal for cool-season crops at high elevation.",
        "citations": [make_citation()],
        "gaps": [make_gap()],
        "discovery_events": [make_discovery_event()],
        "confidence": 0.82,
        "confidence_factors": make_confidence_factors(),
        "cost_metadata": make_cost_metadata(),
        "trace_id": str(uuid.uuid4()),
    }
    return ResearchResult(**{**base, **overrides})
# ---------------------------------------------------------------------------
# ResearchConstraints
# ---------------------------------------------------------------------------
class TestResearchConstraints:
    """Defaults, bounds checking, and roundtrip for ResearchConstraints."""

    def test_defaults(self):
        constraints = ResearchConstraints()
        assert (
            constraints.max_iterations,
            constraints.token_budget,
            constraints.max_sources,
            constraints.source_filter,
        ) == (5, 20_000, 10, None)

    def test_custom_values(self):
        constraints = ResearchConstraints(
            max_iterations=3, token_budget=5000, max_sources=5
        )
        assert (
            constraints.max_iterations,
            constraints.token_budget,
            constraints.max_sources,
        ) == (3, 5000, 5)

    def test_invalid_iterations(self):
        # Below the ge=1 floor.
        with pytest.raises(ValidationError):
            ResearchConstraints(max_iterations=0)

    def test_invalid_token_budget(self):
        # Below the ge=1_000 floor.
        with pytest.raises(ValidationError):
            ResearchConstraints(token_budget=500)

    def test_serialization_roundtrip(self):
        original = ResearchConstraints(max_iterations=3, token_budget=10000)
        restored = ResearchConstraints(**original.model_dump())
        assert original == restored
# ---------------------------------------------------------------------------
# Citation
# ---------------------------------------------------------------------------
class TestCitation:
    """Required fields, optional fields, bounds, and roundtrip for Citation."""

    def test_full_citation(self):
        citation = make_citation()
        assert citation.source == "web"
        assert citation.raw_excerpt.startswith("Verbatim")
        assert 0.0 <= citation.confidence <= 1.0

    def test_minimal_citation(self):
        citation = Citation(
            source="web",
            locator="https://example.com",
            raw_excerpt="Some text.",
            confidence=0.5,
        )
        assert citation.title is None
        assert citation.snippet is None

    def test_confidence_bounds(self):
        # Out of range on both sides of the [0.0, 1.0] interval.
        for bad_value in (1.5, -0.1):
            with pytest.raises(ValidationError):
                make_citation(confidence=bad_value)

    def test_raw_excerpt_required(self):
        with pytest.raises(ValidationError):
            Citation(source="web", locator="https://example.com", confidence=0.5)

    def test_serialization_roundtrip(self):
        citation = make_citation()
        assert Citation(**citation.model_dump()) == citation
# ---------------------------------------------------------------------------
# GapCategory
# ---------------------------------------------------------------------------
class TestGapCategory:
    """The enum carries exactly the five contract categories as strings."""

    def test_all_categories_exist(self):
        assert {category.value for category in GapCategory} == {
            "source_not_found",
            "access_denied",
            "budget_exhausted",
            "contradictory_sources",
            "scope_exceeded",
        }

    def test_string_enum(self):
        # str mixin: members compare equal to, and are instances of, str.
        assert GapCategory.SOURCE_NOT_FOUND == "source_not_found"
        assert isinstance(GapCategory.ACCESS_DENIED, str)
# ---------------------------------------------------------------------------
# Gap
# ---------------------------------------------------------------------------
class TestGap:
    """Category handling, roundtrip, and JSON form for Gap."""

    def test_gap_creation(self):
        gap = make_gap()
        assert gap.category == GapCategory.SOURCE_NOT_FOUND
        assert gap.topic == "pest management"

    def test_all_categories_accepted(self):
        for category in GapCategory:
            assert make_gap(category=category).category == category

    def test_serialization_roundtrip(self):
        gap = make_gap()
        assert Gap(**gap.model_dump()) == gap

    def test_json_uses_string_category(self):
        gap = make_gap(category=GapCategory.BUDGET_EXHAUSTED)
        payload = json.loads(gap.model_dump_json())
        assert payload["category"] == "budget_exhausted"
# ---------------------------------------------------------------------------
# DiscoveryEvent
# ---------------------------------------------------------------------------
class TestDiscoveryEvent:
    """Full and minimal construction plus roundtrip for DiscoveryEvent."""

    def test_full_event(self):
        event = make_discovery_event()
        assert event.type == "related_research"
        assert event.suggested_researcher == "arxiv"

    def test_minimal_event(self):
        event = DiscoveryEvent(
            type="contradiction",
            query="conflicting data on topic X",
            reason="Two sources disagree",
        )
        # Optional routing fields default to None.
        assert event.suggested_researcher is None
        assert event.source_locator is None

    def test_serialization_roundtrip(self):
        event = make_discovery_event()
        assert DiscoveryEvent(**event.model_dump()) == event
# ---------------------------------------------------------------------------
# ConfidenceFactors
# ---------------------------------------------------------------------------
class TestConfidenceFactors:
    """Construction, optional recency, bounds, and roundtrip."""

    def test_creation(self):
        factors = make_confidence_factors()
        assert factors.num_corroborating_sources == 3
        assert factors.source_authority == "high"
        assert factors.contradiction_detected is False
        assert factors.recency == "current"

    def test_recency_none(self):
        assert make_confidence_factors(recency=None).recency is None

    def test_query_specificity_bounds(self):
        # Out of range on both sides of the [0.0, 1.0] interval.
        for bad_value in (1.5, -0.1):
            with pytest.raises(ValidationError):
                make_confidence_factors(query_specificity_match=bad_value)

    def test_serialization_roundtrip(self):
        factors = make_confidence_factors()
        assert ConfidenceFactors(**factors.model_dump()) == factors
# ---------------------------------------------------------------------------
# CostMetadata
# ---------------------------------------------------------------------------
class TestCostMetadata:
    """Required model_id, non-negative constraints, and roundtrip."""

    def test_creation(self):
        cost = make_cost_metadata()
        assert cost.tokens_used == 8452
        assert cost.model_id == "claude-sonnet-4-6"

    def test_model_id_required(self):
        # Everything supplied except model_id.
        with pytest.raises(ValidationError):
            CostMetadata(
                tokens_used=100,
                iterations_run=1,
                wall_time_sec=1.0,
                budget_exhausted=False,
            )

    def test_non_negative_constraints(self):
        with pytest.raises(ValidationError):
            make_cost_metadata(tokens_used=-1)
        with pytest.raises(ValidationError):
            make_cost_metadata(wall_time_sec=-0.5)

    def test_serialization_roundtrip(self):
        cost = make_cost_metadata()
        assert CostMetadata(**cost.model_dump()) == cost
# ---------------------------------------------------------------------------
# ResearchResult (full contract)
# ---------------------------------------------------------------------------
class TestResearchResult:
    """Top-level contract: construction, bounds, JSON roundtrip and schema."""

    def test_full_result(self):
        result = make_research_result()
        assert result.answer.startswith("Utah")
        assert (
            len(result.citations),
            len(result.gaps),
            len(result.discovery_events),
        ) == (1, 1, 1)
        assert 0.0 <= result.confidence <= 1.0
        assert result.cost_metadata.model_id == "claude-sonnet-4-6"

    def test_empty_lists_allowed(self):
        result = make_research_result(citations=[], gaps=[], discovery_events=[])
        assert result.citations == []
        assert result.gaps == []
        assert result.discovery_events == []

    def test_confidence_bounds(self):
        with pytest.raises(ValidationError):
            make_research_result(confidence=1.5)

    def test_full_json_roundtrip(self):
        result = make_research_result()
        payload = json.loads(result.model_dump_json())
        assert ResearchResult(**payload) == result

    def test_json_structure(self):
        """Verify the JSON output matches the contract schema."""
        payload = json.loads(make_research_result().model_dump_json())
        # Top-level keys
        assert set(payload) == {
            "answer",
            "citations",
            "gaps",
            "discovery_events",
            "confidence",
            "confidence_factors",
            "cost_metadata",
            "trace_id",
        }
        # Citation keys
        assert set(payload["citations"][0]) == {
            "source",
            "locator",
            "title",
            "snippet",
            "raw_excerpt",
            "confidence",
        }
        # Gap keys, with category serialized as its string value
        assert set(payload["gaps"][0]) == {"topic", "category", "detail"}
        assert payload["gaps"][0]["category"] == "source_not_found"
        # CostMetadata includes model_id
        assert "model_id" in payload["cost_metadata"]
        # ConfidenceFactors keys
        assert set(payload["confidence_factors"]) == {
            "num_corroborating_sources",
            "source_authority",
            "contradiction_detected",
            "query_specificity_match",
            "budget_exhausted",
            "recency",
        }