"""Tests for the arxiv-rag ingest pipeline (M5.1.1). Strategy: mock the slow / network bits (arxiv API, embedder, chromadb) and exercise the real pipeline against synthetic PDFs generated with pymupdf at test time. This keeps the tests deterministic, fast, and network-free while still exercising the actual extract_sections logic. """ from __future__ import annotations import json from datetime import datetime, timezone from pathlib import Path from types import SimpleNamespace from unittest.mock import MagicMock import pytest from researchers.arxiv import ingest as ingest_mod from researchers.arxiv.ingest import ( PaperMetadata, Section, embed_and_store, extract_sections, ingest, ) from researchers.arxiv.store import ArxivStore, PaperRecord, make_chunk_id # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- def _make_synthetic_pdf(path: Path, sections: list[tuple[str, str]]) -> None: """Build a tiny PDF with one section per (heading, body) tuple. pymupdf is already a hard dep of the arxiv extra, so synthesizing a fixture PDF inline is cheaper than checking a binary into the repo. """ import pymupdf doc = pymupdf.open() for heading, body in sections: page = doc.new_page() page.insert_text((50, 80), heading, fontsize=14) # Wrap body across a few lines for realism y = 110 for line in body.split("\n"): page.insert_text((50, y), line, fontsize=11) y += 16 doc.save(str(path)) doc.close() @pytest.fixture def store(tmp_path): """ArxivStore rooted in a temp directory.""" return ArxivStore(root=tmp_path / "arxiv-rag") class StubEmbedder: """Minimal stand-in for sentence-transformers.SentenceTransformer.""" def __init__(self, dim: int = 4): self.dim = dim self.calls: list[list[str]] = [] def encode(self, texts): self.calls.append(list(texts)) # Return deterministic vectors keyed off text length so two # different sections produce two different embeddings. return [[float(len(t)), 0.0, 0.0, 0.0] for t in texts] class StubChromaCollection: """In-memory drop-in for a chromadb collection.""" def __init__(self): self.docs: dict[str, dict] = {} def upsert(self, ids, documents, embeddings, metadatas): for i, doc, emb, meta in zip(ids, documents, embeddings, metadatas): self.docs[i] = {"document": doc, "embedding": emb, "metadata": meta} def get(self, where=None): if where is None: ids = list(self.docs.keys()) else: ids = [ i for i, entry in self.docs.items() if all(entry["metadata"].get(k) == v for k, v in where.items()) ] return {"ids": ids} def delete(self, where): to_drop = [ i for i, entry in self.docs.items() if all(entry["metadata"].get(k) == v for k, v in where.items()) ] for i in to_drop: del self.docs[i] @pytest.fixture def stub_collection(monkeypatch): """Replace ArxivStore.collection with an in-memory stub.""" stub = StubChromaCollection() monkeypatch.setattr( ArxivStore, "collection", property(lambda self: stub) ) return stub # --------------------------------------------------------------------------- # extract_sections — real pymupdf, synthetic PDFs # --------------------------------------------------------------------------- class TestExtractSections: def test_detects_canonical_headings(self, tmp_path): pdf = tmp_path / "paper.pdf" _make_synthetic_pdf( pdf, [ ("Introduction", "We study X. We find Y."), ("Methods", "We used Z to evaluate Y."), ("Results", "Accuracy was 95%."), ("Conclusion", "X works."), ], ) sections = extract_sections(pdf) titles = [s.title.lower() for s in sections] assert "introduction" in titles assert "methods" in titles assert "results" in titles assert "conclusion" in titles # Body text from each section should be present intro = next(s for s in sections if s.title.lower() == "introduction") assert "we study x" in intro.text.lower() def test_falls_back_to_whole_paper_when_no_headings(self, tmp_path): pdf = tmp_path / "no-headings.pdf" _make_synthetic_pdf( pdf, [ ("Some random title nobody recognizes", "Body text body text."), ], ) sections = extract_sections(pdf) assert len(sections) == 1 assert sections[0].title == "Full Paper" assert "body text" in sections[0].text.lower() # --------------------------------------------------------------------------- # embed_and_store — uses stub collection + stub embedder # --------------------------------------------------------------------------- class TestEmbedAndStore: def test_writes_chunks_and_returns_count(self, store, stub_collection): sections = [ Section(index=0, title="Intro", text="aaa", page_start=1, page_end=1), Section(index=1, title="Methods", text="bbbb", page_start=2, page_end=2), ] meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test") n = embed_and_store( arxiv_id="2403.12345", sections=sections, store=store, model_name="stub-model", metadata=meta, embedder=StubEmbedder(), ) assert n == 2 assert len(stub_collection.docs) == 2 # Check that chunk ids are model-scoped expected_ids = {make_chunk_id("2403.12345", i, "stub-model") for i in (0, 1)} assert set(stub_collection.docs.keys()) == expected_ids # Metadata round-trips first = next(iter(stub_collection.docs.values())) assert first["metadata"]["arxiv_id"] == "2403.12345" assert first["metadata"]["embedding_model"] == "stub-model" def test_re_embed_replaces_existing_chunks(self, store, stub_collection): meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test") sections_v1 = [ Section(index=0, title="Intro", text="first", page_start=1, page_end=1), Section(index=1, title="Methods", text="second", page_start=2, page_end=2), ] embed_and_store( "2403.12345", sections_v1, store, "stub-model", meta, embedder=StubEmbedder(), ) assert len(stub_collection.docs) == 2 # Re-embed with fewer sections — should drop the second. sections_v2 = [ Section(index=0, title="Intro", text="first", page_start=1, page_end=1), ] embed_and_store( "2403.12345", sections_v2, store, "stub-model", meta, embedder=StubEmbedder(), ) assert len(stub_collection.docs) == 1 def test_empty_sections_is_noop(self, store, stub_collection): meta = PaperMetadata(arxiv_id="x", version="", title="") n = embed_and_store("x", [], store, "stub-model", meta, embedder=StubEmbedder()) assert n == 0 assert stub_collection.docs == {} # --------------------------------------------------------------------------- # Top-level ingest() — full pipeline with mocked download # --------------------------------------------------------------------------- def _stub_arxiv_search(arxiv_id: str): """Return a fake arxiv.Search result for ``arxiv_id``.""" def _download_pdf(dirpath=None, filename=None): # Generate a synthetic PDF on the fly so the rest of the # pipeline has something real to read. target = Path(dirpath) / filename _make_synthetic_pdf( target, [ ("Introduction", "Stub paper introduction."), ("Methods", "Stub paper methods."), ("Results", "Stub paper results."), ], ) paper = SimpleNamespace( entry_id=f"http://arxiv.org/abs/{arxiv_id}v1", title=f"Test paper {arxiv_id}", authors=[SimpleNamespace(name="Alice"), SimpleNamespace(name="Bob")], published=datetime(2024, 1, 15, tzinfo=timezone.utc), primary_category="cs.LG", download_pdf=_download_pdf, ) return [paper] class TestIngest: def test_end_to_end(self, store, stub_collection): record = ingest( "2403.12345", store=store, model_name="stub-model", arxiv_search=_stub_arxiv_search, embedder=StubEmbedder(), ) # Manifest entry assert isinstance(record, PaperRecord) assert record.arxiv_id == "2403.12345" assert record.title == "Test paper 2403.12345" assert record.authors == ["Alice", "Bob"] assert record.year == 2024 assert record.category == "cs.LG" assert record.chunks_indexed >= 1 assert record.embedding_model == "stub-model" # Manifest persisted to disk loaded = store.load_manifest() assert "2403.12345" in loaded assert loaded["2403.12345"].chunks_indexed == record.chunks_indexed # PDF cached assert (store.pdfs_dir / "2403.12345.pdf").exists() # Chunks in stub collection assert len(stub_collection.docs) == record.chunks_indexed def test_idempotent_reingest(self, store, stub_collection): first = ingest( "2403.12345", store=store, model_name="stub-model", arxiv_search=_stub_arxiv_search, embedder=StubEmbedder(), ) chunks_after_first = len(stub_collection.docs) second = ingest( "2403.12345", store=store, model_name="stub-model", arxiv_search=_stub_arxiv_search, embedder=StubEmbedder(), ) # Same number of chunks (replace, not append) assert len(stub_collection.docs) == chunks_after_first assert second.chunks_indexed == first.chunks_indexed def test_unknown_arxiv_id_raises(self, store): with pytest.raises(ValueError, match="not found"): ingest( "9999.99999", store=store, model_name="stub-model", arxiv_search=lambda _id: [], embedder=StubEmbedder(), ) # --------------------------------------------------------------------------- # Manifest CRUD via ArxivStore # --------------------------------------------------------------------------- class TestManifest: def test_load_returns_empty_dict_when_missing(self, store): assert store.load_manifest() == {} def test_round_trip(self, store): rec = PaperRecord( arxiv_id="2401.00001", version="v2", title="Round trip test", authors=["A", "B"], year=2024, category="cs.AI", chunks_indexed=7, embedding_model="m", ) store.upsert_paper(rec) loaded = store.load_manifest() assert "2401.00001" in loaded assert loaded["2401.00001"].title == "Round trip test" assert loaded["2401.00001"].chunks_indexed == 7 def test_remove_paper(self, store): rec = PaperRecord( arxiv_id="2401.00001", version="", title="t", authors=[], year=None, category=None, chunks_indexed=0, embedding_model="m", ) store.upsert_paper(rec) assert store.remove_paper("2401.00001") is True assert store.load_manifest() == {} assert store.remove_paper("2401.00001") is False def test_list_sorted_newest_first(self, store): old = PaperRecord( arxiv_id="old", version="", title="old", authors=[], year=None, category=None, chunks_indexed=0, embedding_model="m", added_at="2020-01-01T00:00:00Z", ) new = PaperRecord( arxiv_id="new", version="", title="new", authors=[], year=None, category=None, chunks_indexed=0, embedding_model="m", added_at="2026-01-01T00:00:00Z", ) store.upsert_paper(old) store.upsert_paper(new) listed = store.list_papers() assert [p.arxiv_id for p in listed] == ["new", "old"] # --------------------------------------------------------------------------- # CLI smoke (without actually calling chromadb) # --------------------------------------------------------------------------- class TestArxivCLI: def test_list_empty(self, tmp_path, monkeypatch): from click.testing import CliRunner from cli.main import cli monkeypatch.setattr( "researchers.arxiv.store.DEFAULT_ROOT", tmp_path / "arxiv-rag", ) runner = CliRunner() result = runner.invoke(cli, ["arxiv", "list"]) assert result.exit_code == 0, result.output assert "No papers indexed" in result.output def test_info_missing(self, tmp_path, monkeypatch): from click.testing import CliRunner from cli.main import cli monkeypatch.setattr( "researchers.arxiv.store.DEFAULT_ROOT", tmp_path / "arxiv-rag", ) runner = CliRunner() result = runner.invoke(cli, ["arxiv", "info", "0000.00000"]) assert result.exit_code == 1 assert "Not indexed" in result.output