"""Tests for the arxiv-rag ingest pipeline (M5.1.1).

Strategy: mock the slow / network bits (arxiv API, embedder, chromadb)
and exercise the real pipeline against synthetic PDFs generated with
pymupdf at test time. This keeps the tests deterministic, fast, and
network-free while still exercising the actual extract_sections logic.
"""

from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock

import pytest

from researchers.arxiv import ingest as ingest_mod
from researchers.arxiv.ingest import (
    PaperMetadata,
    Section,
    embed_and_store,
    extract_sections,
    ingest,
)
from researchers.arxiv.store import ArxivStore, PaperRecord, make_chunk_id


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


def _make_synthetic_pdf(path: Path, sections: list[tuple[str, str]]) -> None:
    """Write a small PDF to ``path`` with one page per ``(heading, body)`` pair.

    Every page carries the heading at fontsize 14 followed by the body at
    fontsize 11, one inserted text line per newline-separated line of the
    body.

    pymupdf is already a hard dep of the arxiv extra, so synthesizing a
    fixture PDF inline is cheaper than checking a binary into the repo.

    Args:
        path: Destination file for the generated PDF.
        sections: Ordered ``(heading, body)`` tuples; one page is created
            per tuple.
    """
    import pymupdf

    doc = pymupdf.open()
    try:
        for heading, body in sections:
            page = doc.new_page()
            page.insert_text((50, 80), heading, fontsize=14)
            # Body lines are split on "\n" only (no automatic wrapping) and
            # stacked 16 units apart, starting below the heading.
            for offset, line in enumerate(body.split("\n")):
                page.insert_text((50, 110 + 16 * offset), line, fontsize=11)
        doc.save(str(path))
    finally:
        # Close even when page construction or saving fails so the
        # document handle is not leaked into later tests.
        doc.close()


@pytest.fixture
def store(tmp_path):
    """Provide an ArxivStore whose root is ``<tmp_path>/arxiv-rag``."""
    root_dir = tmp_path / "arxiv-rag"
    return ArxivStore(root=root_dir)


class StubEmbedder:
    """Minimal stand-in for sentence-transformers.SentenceTransformer.

    Records every batch passed to :meth:`encode` in ``calls`` and returns
    deterministic vectors of width ``dim``.
    """

    def __init__(self, dim: int = 4):
        """Create the stub.

        Args:
            dim: Width of every vector returned by :meth:`encode`.

        Raises:
            ValueError: If ``dim`` is smaller than 1 (the first component
                always carries the text length, so at least one is needed).
        """
        if dim < 1:
            raise ValueError(f"dim must be >= 1, got {dim}")
        self.dim = dim
        self.calls: list[list[str]] = []

    def encode(self, texts):
        """Return one ``dim``-wide vector per text and record the batch.

        Args:
            texts: Iterable of strings; it is consumed exactly once, so
                one-shot iterators are supported.

        Returns:
            A list of vectors ``[len(text), 0.0, ...]``. Keying the first
            component off the text length means two sections of different
            length produce two different embeddings.
        """
        # Materialize once: the batch is both recorded and embedded, and a
        # generator would otherwise be exhausted after the first pass.
        batch = list(texts)
        self.calls.append(batch)
        padding = [0.0] * (self.dim - 1)
        return [[float(len(text)), *padding] for text in batch]


class StubChromaCollection:
    """Dict-backed fake offering the upsert/get/delete calls the tests use."""

    def __init__(self):
        # chunk id -> {"document": ..., "embedding": ..., "metadata": ...}
        self.docs: dict[str, dict] = {}

    @staticmethod
    def _matches(entry, where):
        """Tell whether the entry's metadata equals ``where`` on every key."""
        return all(entry["metadata"].get(k) == v for k, v in where.items())

    def upsert(self, ids, documents, embeddings, metadatas):
        """Insert or overwrite one entry per id, pairing inputs positionally."""
        for chunk_id, text, vector, meta in zip(
            ids, documents, embeddings, metadatas
        ):
            self.docs[chunk_id] = dict(
                document=text, embedding=vector, metadata=meta
            )

    def get(self, where=None):
        """Return ``{"ids": [...]}`` for all entries, or those matching ``where``."""
        if where is None:
            return {"ids": list(self.docs)}
        matched = [
            chunk_id
            for chunk_id, entry in self.docs.items()
            if self._matches(entry, where)
        ]
        return {"ids": matched}

    def delete(self, where):
        """Remove every entry whose metadata matches ``where``."""
        # Collect first: the dict must not change size while being iterated.
        doomed = [
            chunk_id
            for chunk_id, entry in self.docs.items()
            if self._matches(entry, where)
        ]
        for chunk_id in doomed:
            del self.docs[chunk_id]


@pytest.fixture
def stub_collection(monkeypatch):
    """Swap ``ArxivStore.collection`` for a property yielding one shared stub."""
    fake = StubChromaCollection()

    def _collection(self):
        # Every ArxivStore instance sees the same in-memory collection.
        return fake

    monkeypatch.setattr(ArxivStore, "collection", property(_collection))
    return fake


# ---------------------------------------------------------------------------
# extract_sections — real pymupdf, synthetic PDFs
# ---------------------------------------------------------------------------


class TestExtractSections:
    """Run extract_sections against PDFs generated at test time."""

    def test_detects_canonical_headings(self, tmp_path):
        """All four headings come back as section titles, with body text kept."""
        pdf_path = tmp_path / "paper.pdf"
        pages = [
            ("Introduction", "We study X. We find Y."),
            ("Methods", "We used Z to evaluate Y."),
            ("Results", "Accuracy was 95%."),
            ("Conclusion", "X works."),
        ]
        _make_synthetic_pdf(pdf_path, pages)

        found = extract_sections(pdf_path)

        lowered_titles = [section.title.lower() for section in found]
        for heading in ("introduction", "methods", "results", "conclusion"):
            assert heading in lowered_titles

        # The body written under a heading must land in that section's text.
        intro = next(
            section for section in found if section.title.lower() == "introduction"
        )
        assert "we study x" in intro.text.lower()

    def test_falls_back_to_whole_paper_when_no_headings(self, tmp_path):
        """An unrecognized heading yields a single "Full Paper" section."""
        pdf_path = tmp_path / "no-headings.pdf"
        pages = [("Some random title nobody recognizes", "Body text body text.")]
        _make_synthetic_pdf(pdf_path, pages)

        found = extract_sections(pdf_path)

        assert len(found) == 1
        only = found[0]
        assert only.title == "Full Paper"
        assert "body text" in only.text.lower()


# ---------------------------------------------------------------------------
# embed_and_store — uses stub collection + stub embedder
# ---------------------------------------------------------------------------


class TestEmbedAndStore:
    """embed_and_store exercised with the in-memory collection and StubEmbedder."""

    def test_writes_chunks_and_returns_count(self, store, stub_collection):
        """Two sections produce two model-scoped chunks and a return value of 2."""
        sections = [
            Section(index=0, title="Intro", text="aaa", page_start=1, page_end=1),
            Section(index=1, title="Methods", text="bbbb", page_start=2, page_end=2),
        ]
        meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        n = embed_and_store(
            arxiv_id="2403.12345",
            sections=sections,
            store=store,
            model_name="stub-model",
            metadata=meta,
            embedder=StubEmbedder(),
        )
        assert n == 2
        assert len(stub_collection.docs) == 2
        # Check that chunk ids are model-scoped
        expected_ids = {make_chunk_id("2403.12345", i, "stub-model") for i in (0, 1)}
        assert set(stub_collection.docs.keys()) == expected_ids
        # Metadata round-trips
        first = next(iter(stub_collection.docs.values()))
        assert first["metadata"]["arxiv_id"] == "2403.12345"
        assert first["metadata"]["embedding_model"] == "stub-model"

    def test_re_embed_replaces_existing_chunks(self, store, stub_collection):
        """Re-embedding with fewer sections leaves only the new section's chunk."""
        meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        sections_v1 = [
            Section(index=0, title="Intro", text="first", page_start=1, page_end=1),
            Section(index=1, title="Methods", text="second", page_start=2, page_end=2),
        ]
        embed_and_store(
            "2403.12345", sections_v1, store, "stub-model", meta,
            embedder=StubEmbedder(),
        )
        assert len(stub_collection.docs) == 2

        # Re-embed with fewer sections — should drop the second.
        sections_v2 = [
            Section(index=0, title="Intro", text="first", page_start=1, page_end=1),
        ]
        embed_and_store(
            "2403.12345", sections_v2, store, "stub-model", meta,
            embedder=StubEmbedder(),
        )
        assert len(stub_collection.docs) == 1
        # A bare count would also pass if the wrong chunk had been dropped,
        # so pin the survivor to the section-0 chunk id.
        assert set(stub_collection.docs) == {
            make_chunk_id("2403.12345", 0, "stub-model")
        }

    def test_empty_sections_is_noop(self, store, stub_collection):
        """No sections means a return value of 0 and nothing written."""
        meta = PaperMetadata(arxiv_id="x", version="", title="")
        n = embed_and_store("x", [], store, "stub-model", meta, embedder=StubEmbedder())
        assert n == 0
        assert stub_collection.docs == {}


# ---------------------------------------------------------------------------
# Top-level ingest() — full pipeline with mocked download
# ---------------------------------------------------------------------------


def _stub_arxiv_search(arxiv_id: str):
    """Build a one-element result list mimicking an arxiv lookup of ``arxiv_id``.

    The single fake paper exposes ``entry_id``, ``title``, ``authors``,
    ``published``, ``primary_category`` and a ``download_pdf`` callable that
    writes a synthetic three-section PDF instead of hitting the network.
    """

    def _download_pdf(dirpath=None, filename=None):
        # Produce a real, parseable PDF at dirpath/filename so the later
        # pipeline stages have an actual file to read.
        destination = Path(dirpath) / filename
        stub_pages = [
            ("Introduction", "Stub paper introduction."),
            ("Methods", "Stub paper methods."),
            ("Results", "Stub paper results."),
        ]
        _make_synthetic_pdf(destination, stub_pages)

    fields = {
        "entry_id": f"http://arxiv.org/abs/{arxiv_id}v1",
        "title": f"Test paper {arxiv_id}",
        "authors": [SimpleNamespace(name=who) for who in ("Alice", "Bob")],
        "published": datetime(2024, 1, 15, tzinfo=timezone.utc),
        "primary_category": "cs.LG",
        "download_pdf": _download_pdf,
    }
    return [SimpleNamespace(**fields)]


class TestIngest:
    """Drive ingest() end to end with a stubbed arxiv lookup and embedder."""

    @staticmethod
    def _ingest_with_stubs(arxiv_id, store, search):
        """Call ingest() with the stub model name and a fresh StubEmbedder."""
        return ingest(
            arxiv_id,
            store=store,
            model_name="stub-model",
            arxiv_search=search,
            embedder=StubEmbedder(),
        )

    def test_end_to_end(self, store, stub_collection):
        """One ingest fills the record, the manifest, the PDF cache and the chunks."""
        paper_id = "2403.12345"
        result = self._ingest_with_stubs(paper_id, store, _stub_arxiv_search)

        # Returned manifest record
        assert isinstance(result, PaperRecord)
        assert result.arxiv_id == paper_id
        assert result.title == "Test paper 2403.12345"
        assert result.authors == ["Alice", "Bob"]
        assert result.year == 2024
        assert result.category == "cs.LG"
        assert result.chunks_indexed >= 1
        assert result.embedding_model == "stub-model"

        # The same record must be readable back from disk
        manifest = store.load_manifest()
        assert paper_id in manifest
        assert manifest[paper_id].chunks_indexed == result.chunks_indexed

        # Downloaded PDF is kept in the store's cache directory
        cached_pdf = store.pdfs_dir / "2403.12345.pdf"
        assert cached_pdf.exists()

        # One stub-collection entry per indexed chunk
        assert len(stub_collection.docs) == result.chunks_indexed

    def test_idempotent_reingest(self, store, stub_collection):
        """Ingesting the same paper twice replaces chunks instead of appending."""
        paper_id = "2403.12345"
        first_run = self._ingest_with_stubs(paper_id, store, _stub_arxiv_search)
        count_after_first = len(stub_collection.docs)

        second_run = self._ingest_with_stubs(paper_id, store, _stub_arxiv_search)

        # Same number of chunks (replace, not append)
        assert len(stub_collection.docs) == count_after_first
        assert second_run.chunks_indexed == first_run.chunks_indexed

    def test_unknown_arxiv_id_raises(self, store):
        """An empty search result surfaces as ValueError mentioning "not found"."""
        with pytest.raises(ValueError, match="not found"):
            self._ingest_with_stubs("9999.99999", store, lambda _id: [])


# ---------------------------------------------------------------------------
# Manifest CRUD via ArxivStore
# ---------------------------------------------------------------------------


class TestManifest:
    """Manifest create/read/delete/list behaviour of ArxivStore."""

    @staticmethod
    def _bare_record(arxiv_id, title, **extra):
        """Build a PaperRecord with empty/None placeholder fields."""
        return PaperRecord(
            arxiv_id=arxiv_id,
            version="",
            title=title,
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
            **extra,
        )

    def test_load_returns_empty_dict_when_missing(self, store):
        """A store with no manifest on disk loads as an empty dict."""
        manifest = store.load_manifest()
        assert manifest == {}

    def test_round_trip(self, store):
        """A record written with upsert_paper is returned by load_manifest."""
        paper_id = "2401.00001"
        store.upsert_paper(
            PaperRecord(
                arxiv_id=paper_id,
                version="v2",
                title="Round trip test",
                authors=["A", "B"],
                year=2024,
                category="cs.AI",
                chunks_indexed=7,
                embedding_model="m",
            )
        )

        manifest = store.load_manifest()

        assert paper_id in manifest
        stored = manifest[paper_id]
        assert stored.title == "Round trip test"
        assert stored.chunks_indexed == 7

    def test_remove_paper(self, store):
        """remove_paper returns True once, then False when nothing is left."""
        paper_id = "2401.00001"
        store.upsert_paper(self._bare_record(paper_id, "t"))

        assert store.remove_paper(paper_id) is True
        assert store.load_manifest() == {}
        assert store.remove_paper(paper_id) is False

    def test_list_sorted_newest_first(self, store):
        """list_papers orders by added_at, most recent first."""
        older = self._bare_record("old", "old", added_at="2020-01-01T00:00:00Z")
        newer = self._bare_record("new", "new", added_at="2026-01-01T00:00:00Z")
        # Insert oldest first so the expected order is not just insertion order.
        for record in (older, newer):
            store.upsert_paper(record)

        ordered_ids = [paper.arxiv_id for paper in store.list_papers()]
        assert ordered_ids == ["new", "old"]


# ---------------------------------------------------------------------------
# CLI smoke (without actually calling chromadb)
# ---------------------------------------------------------------------------


class TestArxivCLI:
    """Smoke tests for the ``arxiv`` CLI group against an empty temp store root."""

    @staticmethod
    def _invoke(tmp_path, monkeypatch, argv):
        """Point DEFAULT_ROOT at a temp dir and run the CLI with ``argv``."""
        # Imported lazily, and before patching, matching the per-test setup.
        from click.testing import CliRunner

        from cli.main import cli

        monkeypatch.setattr(
            "researchers.arxiv.store.DEFAULT_ROOT",
            tmp_path / "arxiv-rag",
        )
        return CliRunner().invoke(cli, argv)

    def test_list_empty(self, tmp_path, monkeypatch):
        """``arxiv list`` on an empty store exits 0 with a friendly message."""
        outcome = self._invoke(tmp_path, monkeypatch, ["arxiv", "list"])
        assert outcome.exit_code == 0, outcome.output
        assert "No papers indexed" in outcome.output

    def test_info_missing(self, tmp_path, monkeypatch):
        """``arxiv info`` for an unknown id exits 1 and reports "Not indexed"."""
        outcome = self._invoke(
            tmp_path, monkeypatch, ["arxiv", "info", "0000.00000"]
        )
        assert outcome.exit_code == 1
        assert "Not indexed" in outcome.output