marchwarden/tests/test_arxiv_ingest.py

416 lines
14 KiB
Python
Raw Permalink Normal View History

feat(arxiv): ingest pipeline (M5.1.1) Closes #38. First sub-milestone of M5.1 (Researcher #2: arxiv-rag). New package researchers/arxiv/ with three modules: - store.py — ArxivStore wraps a persistent chromadb collection at ~/.marchwarden/arxiv-rag/chroma/ plus a papers.json manifest. Chunk ids are deterministic and embedding-model-scoped (per ArxivRagProposal decision 4) so re-ingesting with a different embedder doesn't collide with prior chunks. - ingest.py — three-phase pipeline: download_pdf (arxiv API), extract_sections (pymupdf with heuristic heading detection + whole-paper fallback), and embed_and_store (sentence-transformers, configurable via MARCHWARDEN_ARXIV_EMBED_MODEL). Top-level ingest() chains them and upserts the manifest entry. Re-ingest is idempotent — chunks for the same paper are dropped before re-adding. - CLI subgroup `marchwarden arxiv add|list|info|remove`. Lazy-imports the heavy chromadb / torch deps so non-arxiv commands stay fast. The heavy ML deps (pymupdf, chromadb, sentence-transformers, arxiv) are gated behind an optional `[arxiv]` extra so the base install stays slim for users who only want the web researcher. Tests: 14 added (141 total passing). Real pymupdf against synthetic PDFs generated at test time covers extract_sections; chromadb and the embedder are stubbed via dependency injection so the tests stay fast, deterministic, and network-free. End-to-end ingest() is exercised with a mocked arxiv.Search that produces synthetic PDFs. Out of scope for #38 (covered by later sub-milestones): - Retrieval / search API (#39) - ArxivResearcher agent loop (#40) - MCP server (#41) - ask --researcher arxiv flag (#42) - Cost ledger embedding_calls field (#43) Notes: - pip install pulled in CUDA torch wheel (~2GB nvidia libs); harmless on CPU-only WSL but a future optimization would pin the CPU torch index. - Live smoke against a real arxiv id deferred so we don't block the M3.3 collection runner currently using the venv. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 02:03:42 +00:00
"""Tests for the arxiv-rag ingest pipeline (M5.1.1).
Strategy: mock the slow / network bits (arxiv API, embedder, chromadb)
and exercise the real pipeline against synthetic PDFs generated with
pymupdf at test time. This keeps the tests deterministic, fast, and
network-free while still exercising the actual extract_sections logic.
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
from researchers.arxiv import ingest as ingest_mod
from researchers.arxiv.ingest import (
PaperMetadata,
Section,
embed_and_store,
extract_sections,
ingest,
)
from researchers.arxiv.store import ArxivStore, PaperRecord, make_chunk_id
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_synthetic_pdf(path: Path, sections: list[tuple[str, str]]) -> None:
    """Build a tiny PDF with one section per (heading, body) tuple.
    pymupdf is already a hard dep of the arxiv extra, so synthesizing a
    fixture PDF inline is cheaper than checking a binary into the repo.
    """
    import pymupdf

    doc = pymupdf.open()
    for heading, body in sections:
        page = doc.new_page()
        # Heading rendered larger so the extractor's heuristic can spot it.
        page.insert_text((50, 80), heading, fontsize=14)
        # Wrap body across a few lines for realism: one line every 16pt
        # starting below the heading.
        for line_no, body_line in enumerate(body.split("\n")):
            page.insert_text((50, 110 + 16 * line_no), body_line, fontsize=11)
    doc.save(str(path))
    doc.close()
@pytest.fixture
def store(tmp_path):
    """ArxivStore rooted in a temp directory so tests never touch ~/.marchwarden."""
    root = tmp_path / "arxiv-rag"
    return ArxivStore(root=root)
class StubEmbedder:
    """Minimal stand-in for sentence-transformers.SentenceTransformer.

    Records every batch handed to :meth:`encode` in ``calls`` so tests can
    assert how (and how often) the embedder was invoked.
    """

    def __init__(self, dim: int = 4):
        # Dimensionality of each returned embedding vector.
        self.dim = dim
        # One entry per encode() call; each entry is the batch of texts.
        self.calls: list[list[str]] = []

    def encode(self, texts):
        """Return one deterministic ``dim``-length vector per input text.

        The first component is the text length, so two different-length
        sections produce two different embeddings; remaining components
        are zero-padding.
        """
        batch = list(texts)
        self.calls.append(batch)
        # Bug fix: previously the vectors were hard-coded to length 4,
        # silently ignoring a non-default ``dim``. Default behavior
        # (dim=4) is unchanged.
        return [[float(len(t))] + [0.0] * (self.dim - 1) for t in batch]
class StubChromaCollection:
    """In-memory drop-in for a chromadb collection."""

    def __init__(self):
        # chunk id -> {"document", "embedding", "metadata"}
        self.docs: dict[str, dict] = {}

    def _matching_ids(self, where):
        """Ids whose metadata matches every key/value pair in ``where``."""
        return [
            chunk_id
            for chunk_id, entry in self.docs.items()
            if all(entry["metadata"].get(key) == value for key, value in where.items())
        ]

    def upsert(self, ids, documents, embeddings, metadatas):
        for chunk_id, document, embedding, metadata in zip(
            ids, documents, embeddings, metadatas
        ):
            self.docs[chunk_id] = {
                "document": document,
                "embedding": embedding,
                "metadata": metadata,
            }

    def get(self, where=None):
        if where is None:
            return {"ids": list(self.docs.keys())}
        return {"ids": self._matching_ids(where)}

    def delete(self, where):
        for chunk_id in self._matching_ids(where):
            del self.docs[chunk_id]
@pytest.fixture
def stub_collection(monkeypatch):
    """Replace ArxivStore.collection with an in-memory stub."""
    collection = StubChromaCollection()
    # Shadow the class attribute with a property that always yields the stub,
    # so any ArxivStore instance created during the test uses it.
    monkeypatch.setattr(
        ArxivStore, "collection", property(lambda _self: collection)
    )
    return collection
# ---------------------------------------------------------------------------
# extract_sections — real pymupdf, synthetic PDFs
# ---------------------------------------------------------------------------
class TestExtractSections:
    """Exercise the real pymupdf extraction path against synthetic PDFs."""

    def test_detects_canonical_headings(self, tmp_path):
        pdf_path = tmp_path / "paper.pdf"
        _make_synthetic_pdf(
            pdf_path,
            [
                ("Introduction", "We study X. We find Y."),
                ("Methods", "We used Z to evaluate Y."),
                ("Results", "Accuracy was 95%."),
                ("Conclusion", "X works."),
            ],
        )
        extracted = extract_sections(pdf_path)
        lowered_titles = [section.title.lower() for section in extracted]
        for expected in ("introduction", "methods", "results", "conclusion"):
            assert expected in lowered_titles
        # Body text from each section should be present
        intro = next(
            section for section in extracted
            if section.title.lower() == "introduction"
        )
        assert "we study x" in intro.text.lower()

    def test_falls_back_to_whole_paper_when_no_headings(self, tmp_path):
        pdf_path = tmp_path / "no-headings.pdf"
        _make_synthetic_pdf(
            pdf_path,
            [("Some random title nobody recognizes", "Body text body text.")],
        )
        extracted = extract_sections(pdf_path)
        assert len(extracted) == 1
        assert extracted[0].title == "Full Paper"
        assert "body text" in extracted[0].text.lower()
# ---------------------------------------------------------------------------
# embed_and_store — uses stub collection + stub embedder
# ---------------------------------------------------------------------------
class TestEmbedAndStore:
    """embed_and_store against the stub collection + stub embedder."""

    def test_writes_chunks_and_returns_count(self, store, stub_collection):
        paper_meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        two_sections = [
            Section(index=0, title="Intro", text="aaa", page_start=1, page_end=1),
            Section(index=1, title="Methods", text="bbbb", page_start=2, page_end=2),
        ]
        count = embed_and_store(
            arxiv_id="2403.12345",
            sections=two_sections,
            store=store,
            model_name="stub-model",
            metadata=paper_meta,
            embedder=StubEmbedder(),
        )
        assert count == 2
        assert len(stub_collection.docs) == 2
        # Chunk ids must be scoped to the embedding model.
        assert set(stub_collection.docs.keys()) == {
            make_chunk_id("2403.12345", idx, "stub-model") for idx in (0, 1)
        }
        # Metadata round-trips through the collection.
        sample = next(iter(stub_collection.docs.values()))
        assert sample["metadata"]["arxiv_id"] == "2403.12345"
        assert sample["metadata"]["embedding_model"] == "stub-model"

    def test_re_embed_replaces_existing_chunks(self, store, stub_collection):
        paper_meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        embed_and_store(
            "2403.12345",
            [
                Section(index=0, title="Intro", text="first", page_start=1, page_end=1),
                Section(index=1, title="Methods", text="second", page_start=2, page_end=2),
            ],
            store,
            "stub-model",
            paper_meta,
            embedder=StubEmbedder(),
        )
        assert len(stub_collection.docs) == 2
        # Second pass with fewer sections must replace, not append.
        embed_and_store(
            "2403.12345",
            [Section(index=0, title="Intro", text="first", page_start=1, page_end=1)],
            store,
            "stub-model",
            paper_meta,
            embedder=StubEmbedder(),
        )
        assert len(stub_collection.docs) == 1

    def test_empty_sections_is_noop(self, store, stub_collection):
        empty_meta = PaperMetadata(arxiv_id="x", version="", title="")
        written = embed_and_store(
            "x", [], store, "stub-model", empty_meta, embedder=StubEmbedder()
        )
        assert written == 0
        assert stub_collection.docs == {}
# ---------------------------------------------------------------------------
# Top-level ingest() — full pipeline with mocked download
# ---------------------------------------------------------------------------
def _stub_arxiv_search(arxiv_id: str):
"""Return a fake arxiv.Search result for ``arxiv_id``."""
def _download_pdf(dirpath=None, filename=None):
# Generate a synthetic PDF on the fly so the rest of the
# pipeline has something real to read.
target = Path(dirpath) / filename
_make_synthetic_pdf(
target,
[
("Introduction", "Stub paper introduction."),
("Methods", "Stub paper methods."),
("Results", "Stub paper results."),
],
)
paper = SimpleNamespace(
entry_id=f"http://arxiv.org/abs/{arxiv_id}v1",
title=f"Test paper {arxiv_id}",
authors=[SimpleNamespace(name="Alice"), SimpleNamespace(name="Bob")],
published=datetime(2024, 1, 15, tzinfo=timezone.utc),
primary_category="cs.LG",
download_pdf=_download_pdf,
)
return [paper]
class TestIngest:
    """Full ingest() pipeline with the download step mocked out."""

    def test_end_to_end(self, store, stub_collection):
        record = ingest(
            "2403.12345",
            store=store,
            model_name="stub-model",
            arxiv_search=_stub_arxiv_search,
            embedder=StubEmbedder(),
        )
        # Returned manifest entry carries the paper metadata.
        assert isinstance(record, PaperRecord)
        assert record.arxiv_id == "2403.12345"
        assert record.title == "Test paper 2403.12345"
        assert record.authors == ["Alice", "Bob"]
        assert record.year == 2024
        assert record.category == "cs.LG"
        assert record.chunks_indexed >= 1
        assert record.embedding_model == "stub-model"
        # Manifest persisted to disk.
        manifest = store.load_manifest()
        assert "2403.12345" in manifest
        assert manifest["2403.12345"].chunks_indexed == record.chunks_indexed
        # PDF cached under the store root.
        assert (store.pdfs_dir / "2403.12345.pdf").exists()
        # Chunks landed in the (stubbed) collection.
        assert len(stub_collection.docs) == record.chunks_indexed

    def test_idempotent_reingest(self, store, stub_collection):
        first_record = ingest(
            "2403.12345",
            store=store,
            model_name="stub-model",
            arxiv_search=_stub_arxiv_search,
            embedder=StubEmbedder(),
        )
        baseline_chunk_count = len(stub_collection.docs)
        second_record = ingest(
            "2403.12345",
            store=store,
            model_name="stub-model",
            arxiv_search=_stub_arxiv_search,
            embedder=StubEmbedder(),
        )
        # Same number of chunks (replace, not append)
        assert len(stub_collection.docs) == baseline_chunk_count
        assert second_record.chunks_indexed == first_record.chunks_indexed

    def test_unknown_arxiv_id_raises(self, store):
        with pytest.raises(ValueError, match="not found"):
            ingest(
                "9999.99999",
                store=store,
                model_name="stub-model",
                arxiv_search=lambda _id: [],
                embedder=StubEmbedder(),
            )
# ---------------------------------------------------------------------------
# Manifest CRUD via ArxivStore
# ---------------------------------------------------------------------------
class TestManifest:
    """CRUD on the papers.json manifest via ArxivStore."""

    def test_load_returns_empty_dict_when_missing(self, store):
        assert store.load_manifest() == {}

    def test_round_trip(self, store):
        record = PaperRecord(
            arxiv_id="2401.00001",
            version="v2",
            title="Round trip test",
            authors=["A", "B"],
            year=2024,
            category="cs.AI",
            chunks_indexed=7,
            embedding_model="m",
        )
        store.upsert_paper(record)
        reloaded = store.load_manifest()
        assert "2401.00001" in reloaded
        assert reloaded["2401.00001"].title == "Round trip test"
        assert reloaded["2401.00001"].chunks_indexed == 7

    def test_remove_paper(self, store):
        record = PaperRecord(
            arxiv_id="2401.00001",
            version="",
            title="t",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
        )
        store.upsert_paper(record)
        # First removal succeeds; a repeat is a no-op that reports failure.
        assert store.remove_paper("2401.00001") is True
        assert store.load_manifest() == {}
        assert store.remove_paper("2401.00001") is False

    def test_list_sorted_newest_first(self, store):
        older = PaperRecord(
            arxiv_id="old",
            version="",
            title="old",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
            added_at="2020-01-01T00:00:00Z",
        )
        newer = PaperRecord(
            arxiv_id="new",
            version="",
            title="new",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
            added_at="2026-01-01T00:00:00Z",
        )
        store.upsert_paper(older)
        store.upsert_paper(newer)
        assert [p.arxiv_id for p in store.list_papers()] == ["new", "old"]
# ---------------------------------------------------------------------------
# CLI smoke (without actually calling chromadb)
# ---------------------------------------------------------------------------
class TestArxivCLI:
    """CLI smoke tests that never touch a real chromadb collection."""

    def test_list_empty(self, tmp_path, monkeypatch):
        from click.testing import CliRunner
        from cli.main import cli

        # Point the store at a temp dir so the real ~/.marchwarden is untouched.
        monkeypatch.setattr(
            "researchers.arxiv.store.DEFAULT_ROOT",
            tmp_path / "arxiv-rag",
        )
        outcome = CliRunner().invoke(cli, ["arxiv", "list"])
        assert outcome.exit_code == 0, outcome.output
        assert "No papers indexed" in outcome.output

    def test_info_missing(self, tmp_path, monkeypatch):
        from click.testing import CliRunner
        from cli.main import cli

        monkeypatch.setattr(
            "researchers.arxiv.store.DEFAULT_ROOT",
            tmp_path / "arxiv-rag",
        )
        outcome = CliRunner().invoke(cli, ["arxiv", "info", "0000.00000"])
        assert outcome.exit_code == 1
        assert "Not indexed" in outcome.output