marchwarden/tests/test_arxiv_ingest.py

416 lines
14 KiB
Python
Raw Permalink Normal View History

feat(arxiv): ingest pipeline (M5.1.1) Closes #38. First sub-milestone of M5.1 (Researcher #2: arxiv-rag). New package researchers/arxiv/ with three modules: - store.py — ArxivStore wraps a persistent chromadb collection at ~/.marchwarden/arxiv-rag/chroma/ plus a papers.json manifest. Chunk ids are deterministic and embedding-model-scoped (per ArxivRagProposal decision 4) so re-ingesting with a different embedder doesn't collide with prior chunks. - ingest.py — three-phase pipeline: download_pdf (arxiv API), extract_sections (pymupdf with heuristic heading detection + whole-paper fallback), and embed_and_store (sentence-transformers, configurable via MARCHWARDEN_ARXIV_EMBED_MODEL). Top-level ingest() chains them and upserts the manifest entry. Re-ingest is idempotent — chunks for the same paper are dropped before re-adding. - CLI subgroup `marchwarden arxiv add|list|info|remove`. Lazy-imports the heavy chromadb / torch deps so non-arxiv commands stay fast. The heavy ML deps (pymupdf, chromadb, sentence-transformers, arxiv) are gated behind an optional `[arxiv]` extra so the base install stays slim for users who only want the web researcher. Tests: 14 added (141 total passing). Real pymupdf against synthetic PDFs generated at test time covers extract_sections; chromadb and the embedder are stubbed via dependency injection so the tests stay fast, deterministic, and network-free. End-to-end ingest() is exercised with a mocked arxiv.Search that produces synthetic PDFs. Out of scope for #38 (covered by later sub-milestones): - Retrieval / search API (#39) - ArxivResearcher agent loop (#40) - MCP server (#41) - ask --researcher arxiv flag (#42) - Cost ledger embedding_calls field (#43) Notes: - pip install pulled in CUDA torch wheel (~2GB nvidia libs); harmless on CPU-only WSL but a future optimization would pin the CPU torch index. - Live smoke against a real arxiv id deferred so we don't block the M3.3 collection runner currently using the venv. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 02:03:42 +00:00
"""Tests for the arxiv-rag ingest pipeline (M5.1.1).
Strategy: mock the slow / network bits (arxiv API, embedder, chromadb)
and exercise the real pipeline against synthetic PDFs generated with
pymupdf at test time. This keeps the tests deterministic, fast, and
network-free while still exercising the actual extract_sections logic.
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
from researchers.arxiv import ingest as ingest_mod
from researchers.arxiv.ingest import (
PaperMetadata,
Section,
embed_and_store,
extract_sections,
ingest,
)
from researchers.arxiv.store import ArxivStore, PaperRecord, make_chunk_id
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_synthetic_pdf(path: Path, sections: list[tuple[str, str]]) -> None:
    """Build a tiny PDF with one section per (heading, body) tuple.
    pymupdf is already a hard dep of the arxiv extra, so synthesizing a
    fixture PDF inline is cheaper than checking a binary into the repo.
    """
    import pymupdf

    doc = pymupdf.open()
    for heading, body in sections:
        page = doc.new_page()
        # Heading rendered larger so the extractor's heuristic can spot it.
        page.insert_text((50, 80), heading, fontsize=14)
        # Wrap body across a few lines for realism: one line every 16pt
        # starting below the heading.
        for line_no, body_line in enumerate(body.split("\n")):
            page.insert_text((50, 110 + 16 * line_no), body_line, fontsize=11)
    doc.save(str(path))
    doc.close()
@pytest.fixture
def store(tmp_path):
    """ArxivStore rooted in a temp directory so tests never touch ~/.marchwarden."""
    root = tmp_path / "arxiv-rag"
    return ArxivStore(root=root)
class StubEmbedder:
    """Minimal stand-in for sentence-transformers.SentenceTransformer.

    Records every batch handed to :meth:`encode` in ``calls`` so tests can
    assert how (and how often) the embedder was invoked.
    """

    def __init__(self, dim: int = 4):
        # Dimensionality of each returned embedding vector.
        self.dim = dim
        # One entry per encode() call; each entry is the batch of texts.
        self.calls: list[list[str]] = []

    def encode(self, texts):
        """Return one deterministic ``dim``-length vector per input text.

        The first component is the text length, so two different-length
        sections produce two different embeddings; remaining components
        are zero-padding.
        """
        batch = list(texts)
        self.calls.append(batch)
        # Bug fix: previously the vectors were hard-coded to length 4,
        # silently ignoring a non-default ``dim``. Default behavior
        # (dim=4) is unchanged.
        return [[float(len(t))] + [0.0] * (self.dim - 1) for t in batch]
class StubChromaCollection:
    """In-memory drop-in for a chromadb collection."""

    def __init__(self):
        # chunk id -> {"document", "embedding", "metadata"}
        self.docs: dict[str, dict] = {}

    def _matching_ids(self, where):
        """Ids whose metadata matches every key/value pair in ``where``."""
        return [
            chunk_id
            for chunk_id, entry in self.docs.items()
            if all(entry["metadata"].get(key) == value for key, value in where.items())
        ]

    def upsert(self, ids, documents, embeddings, metadatas):
        for chunk_id, document, embedding, metadata in zip(
            ids, documents, embeddings, metadatas
        ):
            self.docs[chunk_id] = {
                "document": document,
                "embedding": embedding,
                "metadata": metadata,
            }

    def get(self, where=None):
        if where is None:
            return {"ids": list(self.docs.keys())}
        return {"ids": self._matching_ids(where)}

    def delete(self, where):
        for chunk_id in self._matching_ids(where):
            del self.docs[chunk_id]
@pytest.fixture
def stub_collection(monkeypatch):
    """Replace ArxivStore.collection with an in-memory stub."""
    collection = StubChromaCollection()
    # Shadow the class attribute with a property that always yields the stub,
    # so any ArxivStore instance created during the test uses it.
    monkeypatch.setattr(
        ArxivStore, "collection", property(lambda _self: collection)
    )
    return collection
# ---------------------------------------------------------------------------
# extract_sections — real pymupdf, synthetic PDFs
# ---------------------------------------------------------------------------
class TestExtractSections:
    """Exercise the real pymupdf extraction path against synthetic PDFs."""

    def test_detects_canonical_headings(self, tmp_path):
        pdf_path = tmp_path / "paper.pdf"
        _make_synthetic_pdf(
            pdf_path,
            [
                ("Introduction", "We study X. We find Y."),
                ("Methods", "We used Z to evaluate Y."),
                ("Results", "Accuracy was 95%."),
                ("Conclusion", "X works."),
            ],
        )
        extracted = extract_sections(pdf_path)
        lowered_titles = [section.title.lower() for section in extracted]
        for expected in ("introduction", "methods", "results", "conclusion"):
            assert expected in lowered_titles
        # Body text from each section should be present
        intro = next(
            section for section in extracted
            if section.title.lower() == "introduction"
        )
        assert "we study x" in intro.text.lower()

    def test_falls_back_to_whole_paper_when_no_headings(self, tmp_path):
        pdf_path = tmp_path / "no-headings.pdf"
        _make_synthetic_pdf(
            pdf_path,
            [("Some random title nobody recognizes", "Body text body text.")],
        )
        extracted = extract_sections(pdf_path)
        assert len(extracted) == 1
        assert extracted[0].title == "Full Paper"
        assert "body text" in extracted[0].text.lower()
# ---------------------------------------------------------------------------
# embed_and_store — uses stub collection + stub embedder
# ---------------------------------------------------------------------------
class TestEmbedAndStore:
    """embed_and_store against the stub collection + stub embedder."""

    def test_writes_chunks_and_returns_count(self, store, stub_collection):
        paper_meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        two_sections = [
            Section(index=0, title="Intro", text="aaa", page_start=1, page_end=1),
            Section(index=1, title="Methods", text="bbbb", page_start=2, page_end=2),
        ]
        count = embed_and_store(
            arxiv_id="2403.12345",
            sections=two_sections,
            store=store,
            model_name="stub-model",
            metadata=paper_meta,
            embedder=StubEmbedder(),
        )
        assert count == 2
        assert len(stub_collection.docs) == 2
        # Chunk ids must be scoped to the embedding model.
        assert set(stub_collection.docs.keys()) == {
            make_chunk_id("2403.12345", idx, "stub-model") for idx in (0, 1)
        }
        # Metadata round-trips through the collection.
        sample = next(iter(stub_collection.docs.values()))
        assert sample["metadata"]["arxiv_id"] == "2403.12345"
        assert sample["metadata"]["embedding_model"] == "stub-model"

    def test_re_embed_replaces_existing_chunks(self, store, stub_collection):
        paper_meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        embed_and_store(
            "2403.12345",
            [
                Section(index=0, title="Intro", text="first", page_start=1, page_end=1),
                Section(index=1, title="Methods", text="second", page_start=2, page_end=2),
            ],
            store,
            "stub-model",
            paper_meta,
            embedder=StubEmbedder(),
        )
        assert len(stub_collection.docs) == 2
        # Second pass with fewer sections must replace, not append.
        embed_and_store(
            "2403.12345",
            [Section(index=0, title="Intro", text="first", page_start=1, page_end=1)],
            store,
            "stub-model",
            paper_meta,
            embedder=StubEmbedder(),
        )
        assert len(stub_collection.docs) == 1

    def test_empty_sections_is_noop(self, store, stub_collection):
        empty_meta = PaperMetadata(arxiv_id="x", version="", title="")
        written = embed_and_store(
            "x", [], store, "stub-model", empty_meta, embedder=StubEmbedder()
        )
        assert written == 0
        assert stub_collection.docs == {}
# ---------------------------------------------------------------------------
# Top-level ingest() — full pipeline with mocked download
# ---------------------------------------------------------------------------
def _stub_arxiv_search(arxiv_id: str):
"""Return a fake arxiv.Search result for ``arxiv_id``."""
def _download_pdf(dirpath=None, filename=None):
# Generate a synthetic PDF on the fly so the rest of the
# pipeline has something real to read.
target = Path(dirpath) / filename
_make_synthetic_pdf(
target,
[
("Introduction", "Stub paper introduction."),
("Methods", "Stub paper methods."),
("Results", "Stub paper results."),
],
)
paper = SimpleNamespace(
entry_id=f"http://arxiv.org/abs/{arxiv_id}v1",
title=f"Test paper {arxiv_id}",
authors=[SimpleNamespace(name="Alice"), SimpleNamespace(name="Bob")],
published=datetime(2024, 1, 15, tzinfo=timezone.utc),
primary_category="cs.LG",
download_pdf=_download_pdf,
)
return [paper]
class TestIngest:
    """Full ingest() pipeline with the download step mocked out."""

    def test_end_to_end(self, store, stub_collection):
        record = ingest(
            "2403.12345",
            store=store,
            model_name="stub-model",
            arxiv_search=_stub_arxiv_search,
            embedder=StubEmbedder(),
        )
        # Returned manifest entry carries the paper metadata.
        assert isinstance(record, PaperRecord)
        assert record.arxiv_id == "2403.12345"
        assert record.title == "Test paper 2403.12345"
        assert record.authors == ["Alice", "Bob"]
        assert record.year == 2024
        assert record.category == "cs.LG"
        assert record.chunks_indexed >= 1
        assert record.embedding_model == "stub-model"
        # Manifest persisted to disk.
        manifest = store.load_manifest()
        assert "2403.12345" in manifest
        assert manifest["2403.12345"].chunks_indexed == record.chunks_indexed
        # PDF cached under the store root.
        assert (store.pdfs_dir / "2403.12345.pdf").exists()
        # Chunks landed in the (stubbed) collection.
        assert len(stub_collection.docs) == record.chunks_indexed

    def test_idempotent_reingest(self, store, stub_collection):
        first_record = ingest(
            "2403.12345",
            store=store,
            model_name="stub-model",
            arxiv_search=_stub_arxiv_search,
            embedder=StubEmbedder(),
        )
        baseline_chunk_count = len(stub_collection.docs)
        second_record = ingest(
            "2403.12345",
            store=store,
            model_name="stub-model",
            arxiv_search=_stub_arxiv_search,
            embedder=StubEmbedder(),
        )
        # Same number of chunks (replace, not append)
        assert len(stub_collection.docs) == baseline_chunk_count
        assert second_record.chunks_indexed == first_record.chunks_indexed

    def test_unknown_arxiv_id_raises(self, store):
        with pytest.raises(ValueError, match="not found"):
            ingest(
                "9999.99999",
                store=store,
                model_name="stub-model",
                arxiv_search=lambda _id: [],
                embedder=StubEmbedder(),
            )
# ---------------------------------------------------------------------------
# Manifest CRUD via ArxivStore
# ---------------------------------------------------------------------------
class TestManifest:
    """CRUD on the papers.json manifest via ArxivStore."""

    def test_load_returns_empty_dict_when_missing(self, store):
        assert store.load_manifest() == {}

    def test_round_trip(self, store):
        record = PaperRecord(
            arxiv_id="2401.00001",
            version="v2",
            title="Round trip test",
            authors=["A", "B"],
            year=2024,
            category="cs.AI",
            chunks_indexed=7,
            embedding_model="m",
        )
        store.upsert_paper(record)
        reloaded = store.load_manifest()
        assert "2401.00001" in reloaded
        assert reloaded["2401.00001"].title == "Round trip test"
        assert reloaded["2401.00001"].chunks_indexed == 7

    def test_remove_paper(self, store):
        record = PaperRecord(
            arxiv_id="2401.00001",
            version="",
            title="t",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
        )
        store.upsert_paper(record)
        # First removal succeeds; a repeat is a no-op that reports failure.
        assert store.remove_paper("2401.00001") is True
        assert store.load_manifest() == {}
        assert store.remove_paper("2401.00001") is False

    def test_list_sorted_newest_first(self, store):
        older = PaperRecord(
            arxiv_id="old",
            version="",
            title="old",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
            added_at="2020-01-01T00:00:00Z",
        )
        newer = PaperRecord(
            arxiv_id="new",
            version="",
            title="new",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
            added_at="2026-01-01T00:00:00Z",
        )
        store.upsert_paper(older)
        store.upsert_paper(newer)
        assert [p.arxiv_id for p in store.list_papers()] == ["new", "old"]
# ---------------------------------------------------------------------------
# CLI smoke (without actually calling chromadb)
# ---------------------------------------------------------------------------
class TestArxivCLI:
    """CLI smoke tests that never touch a real chromadb collection."""

    def test_list_empty(self, tmp_path, monkeypatch):
        from click.testing import CliRunner
        from cli.main import cli

        # Point the store at a temp dir so the real ~/.marchwarden is untouched.
        monkeypatch.setattr(
            "researchers.arxiv.store.DEFAULT_ROOT",
            tmp_path / "arxiv-rag",
        )
        outcome = CliRunner().invoke(cli, ["arxiv", "list"])
        assert outcome.exit_code == 0, outcome.output
        assert "No papers indexed" in outcome.output

    def test_info_missing(self, tmp_path, monkeypatch):
        from click.testing import CliRunner
        from cli.main import cli

        monkeypatch.setattr(
            "researchers.arxiv.store.DEFAULT_ROOT",
            tmp_path / "arxiv-rag",
        )
        outcome = CliRunner().invoke(cli, ["arxiv", "info", "0000.00000"])
        assert outcome.exit_code == 1
        assert "Not indexed" in outcome.output