416 lines
14 KiB
Python
416 lines
14 KiB
Python
|
|
"""Tests for the arxiv-rag ingest pipeline (M5.1.1).
|
||
|
|
|
||
|
|
Strategy: mock the slow / network bits (arxiv API, embedder, chromadb)
|
||
|
|
and exercise the real pipeline against synthetic PDFs generated with
|
||
|
|
pymupdf at test time. This keeps the tests deterministic, fast, and
|
||
|
|
network-free while still exercising the actual extract_sections logic.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from pathlib import Path
|
||
|
|
from types import SimpleNamespace
|
||
|
|
from unittest.mock import MagicMock
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from researchers.arxiv import ingest as ingest_mod
|
||
|
|
from researchers.arxiv.ingest import (
|
||
|
|
PaperMetadata,
|
||
|
|
Section,
|
||
|
|
embed_and_store,
|
||
|
|
extract_sections,
|
||
|
|
ingest,
|
||
|
|
)
|
||
|
|
from researchers.arxiv.store import ArxivStore, PaperRecord, make_chunk_id
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Fixtures
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def _make_synthetic_pdf(path: Path, sections: list[tuple[str, str]]) -> None:
    """Write a small multi-page PDF, one page per (heading, body) tuple.

    pymupdf is already a hard dep of the arxiv extra, so synthesizing a
    fixture PDF inline is cheaper than checking a binary into the repo.
    """
    import pymupdf

    doc = pymupdf.open()
    for heading, body in sections:
        page = doc.new_page()
        page.insert_text((50, 80), heading, fontsize=14)
        # Spread the body over several lines so it reads like real prose.
        for offset, line in enumerate(body.split("\n")):
            page.insert_text((50, 110 + 16 * offset), line, fontsize=11)
    doc.save(str(path))
    doc.close()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def store(tmp_path):
    """Provide an ArxivStore rooted in pytest's per-test temp directory."""
    root = tmp_path / "arxiv-rag"
    return ArxivStore(root=root)
|
||
|
|
|
||
|
|
|
||
|
|
class StubEmbedder:
    """Minimal stand-in for sentence-transformers.SentenceTransformer.

    Records every batch passed to :meth:`encode` in ``calls`` and returns
    deterministic vectors so assertions stay reproducible.
    """

    def __init__(self, dim: int = 4):
        # Dimensionality of the vectors produced by encode().
        self.dim = dim
        # History of every batch encode() was asked to embed.
        self.calls: list[list[str]] = []

    def encode(self, texts) -> list[list[float]]:
        """Return one ``self.dim``-length vector per input text.

        The first component is the text length, so two different sections
        produce two different embeddings; the remaining components are
        zero padding.

        Fix: the original hard-coded 4-dimensional vectors and ignored
        ``self.dim``; the configured dimensionality is now honored (the
        default ``dim=4`` output is unchanged).
        """
        self.calls.append(list(texts))
        return [[float(len(t))] + [0.0] * (self.dim - 1) for t in texts]
|
||
|
|
|
||
|
|
|
||
|
|
class StubChromaCollection:
    """In-memory drop-in for a chromadb collection."""

    def __init__(self):
        # chunk id -> {"document", "embedding", "metadata"}
        self.docs: dict[str, dict] = {}

    def _matching_ids(self, where):
        """Ids whose metadata satisfies every key/value pair in *where*."""
        return [
            chunk_id
            for chunk_id, entry in self.docs.items()
            if all(entry["metadata"].get(key) == val for key, val in where.items())
        ]

    def upsert(self, ids, documents, embeddings, metadatas):
        for chunk_id, document, embedding, metadata in zip(
            ids, documents, embeddings, metadatas
        ):
            self.docs[chunk_id] = {
                "document": document,
                "embedding": embedding,
                "metadata": metadata,
            }

    def get(self, where=None):
        if where is None:
            return {"ids": list(self.docs)}
        return {"ids": self._matching_ids(where)}

    def delete(self, where):
        for chunk_id in self._matching_ids(where):
            del self.docs[chunk_id]
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def stub_collection(monkeypatch):
    """Swap ArxivStore.collection for an in-memory StubChromaCollection."""
    collection = StubChromaCollection()

    def _get_collection(self):
        return collection

    monkeypatch.setattr(ArxivStore, "collection", property(_get_collection))
    return collection
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# extract_sections — real pymupdf, synthetic PDFs
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
class TestExtractSections:
    def test_detects_canonical_headings(self, tmp_path):
        pdf_path = tmp_path / "paper.pdf"
        fixture = [
            ("Introduction", "We study X. We find Y."),
            ("Methods", "We used Z to evaluate Y."),
            ("Results", "Accuracy was 95%."),
            ("Conclusion", "X works."),
        ]
        _make_synthetic_pdf(pdf_path, fixture)

        sections = extract_sections(pdf_path)
        found_titles = {s.title.lower() for s in sections}
        assert {"introduction", "methods", "results", "conclusion"} <= found_titles

        # Body text from each section should be present
        intro = next(s for s in sections if s.title.lower() == "introduction")
        assert "we study x" in intro.text.lower()

    def test_falls_back_to_whole_paper_when_no_headings(self, tmp_path):
        pdf_path = tmp_path / "no-headings.pdf"
        _make_synthetic_pdf(
            pdf_path,
            [("Some random title nobody recognizes", "Body text body text.")],
        )

        sections = extract_sections(pdf_path)
        assert len(sections) == 1
        only = sections[0]
        assert only.title == "Full Paper"
        assert "body text" in only.text.lower()
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# embed_and_store — uses stub collection + stub embedder
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
class TestEmbedAndStore:
    def test_writes_chunks_and_returns_count(self, store, stub_collection):
        meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")
        sections = [
            Section(index=0, title="Intro", text="aaa", page_start=1, page_end=1),
            Section(index=1, title="Methods", text="bbbb", page_start=2, page_end=2),
        ]

        count = embed_and_store(
            arxiv_id="2403.12345",
            sections=sections,
            store=store,
            model_name="stub-model",
            metadata=meta,
            embedder=StubEmbedder(),
        )

        assert count == 2
        assert len(stub_collection.docs) == 2
        # Chunk ids must be scoped to the embedding model.
        assert set(stub_collection.docs.keys()) == {
            make_chunk_id("2403.12345", idx, "stub-model") for idx in (0, 1)
        }
        # Metadata round-trips through the collection.
        entry = next(iter(stub_collection.docs.values()))
        assert entry["metadata"]["arxiv_id"] == "2403.12345"
        assert entry["metadata"]["embedding_model"] == "stub-model"

    def test_re_embed_replaces_existing_chunks(self, store, stub_collection):
        meta = PaperMetadata(arxiv_id="2403.12345", version="v1", title="Test")

        def _run(sections):
            embed_and_store(
                "2403.12345", sections, store, "stub-model", meta,
                embedder=StubEmbedder(),
            )

        _run([
            Section(index=0, title="Intro", text="first", page_start=1, page_end=1),
            Section(index=1, title="Methods", text="second", page_start=2, page_end=2),
        ])
        assert len(stub_collection.docs) == 2

        # Re-embed with fewer sections — should drop the second.
        _run([
            Section(index=0, title="Intro", text="first", page_start=1, page_end=1),
        ])
        assert len(stub_collection.docs) == 1

    def test_empty_sections_is_noop(self, store, stub_collection):
        meta = PaperMetadata(arxiv_id="x", version="", title="")
        result = embed_and_store(
            "x", [], store, "stub-model", meta, embedder=StubEmbedder()
        )
        assert result == 0
        assert stub_collection.docs == {}
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Top-level ingest() — full pipeline with mocked download
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def _stub_arxiv_search(arxiv_id: str):
    """Return a fake arxiv.Search result for ``arxiv_id``."""

    def _download_pdf(dirpath=None, filename=None):
        # Synthesize a real PDF on demand so the downstream extraction
        # code has genuine bytes to parse.
        _make_synthetic_pdf(
            Path(dirpath) / filename,
            [
                ("Introduction", "Stub paper introduction."),
                ("Methods", "Stub paper methods."),
                ("Results", "Stub paper results."),
            ],
        )

    return [
        SimpleNamespace(
            entry_id=f"http://arxiv.org/abs/{arxiv_id}v1",
            title=f"Test paper {arxiv_id}",
            authors=[SimpleNamespace(name="Alice"), SimpleNamespace(name="Bob")],
            published=datetime(2024, 1, 15, tzinfo=timezone.utc),
            primary_category="cs.LG",
            download_pdf=_download_pdf,
        )
    ]
|
||
|
|
|
||
|
|
|
||
|
|
class TestIngest:
    @staticmethod
    def _ingest(store, arxiv_id="2403.12345", search=_stub_arxiv_search):
        """Run ingest() with the stub search/embedder against *store*."""
        return ingest(
            arxiv_id,
            store=store,
            model_name="stub-model",
            arxiv_search=search,
            embedder=StubEmbedder(),
        )

    def test_end_to_end(self, store, stub_collection):
        record = self._ingest(store)

        # Manifest entry
        assert isinstance(record, PaperRecord)
        assert record.arxiv_id == "2403.12345"
        assert record.title == "Test paper 2403.12345"
        assert record.authors == ["Alice", "Bob"]
        assert record.year == 2024
        assert record.category == "cs.LG"
        assert record.chunks_indexed >= 1
        assert record.embedding_model == "stub-model"

        # Manifest persisted to disk
        manifest = store.load_manifest()
        assert "2403.12345" in manifest
        assert manifest["2403.12345"].chunks_indexed == record.chunks_indexed

        # PDF cached
        assert (store.pdfs_dir / "2403.12345.pdf").exists()

        # Chunks in stub collection
        assert len(stub_collection.docs) == record.chunks_indexed

    def test_idempotent_reingest(self, store, stub_collection):
        first = self._ingest(store)
        chunks_after_first = len(stub_collection.docs)

        second = self._ingest(store)
        # Same number of chunks (replace, not append)
        assert len(stub_collection.docs) == chunks_after_first
        assert second.chunks_indexed == first.chunks_indexed

    def test_unknown_arxiv_id_raises(self, store):
        with pytest.raises(ValueError, match="not found"):
            self._ingest(store, arxiv_id="9999.99999", search=lambda _id: [])
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Manifest CRUD via ArxivStore
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
class TestManifest:
    @staticmethod
    def _record(arxiv_id, **overrides):
        """Build a PaperRecord with minimal defaults, overridable per test."""
        fields = dict(
            version="",
            title="t",
            authors=[],
            year=None,
            category=None,
            chunks_indexed=0,
            embedding_model="m",
        )
        fields.update(overrides)
        return PaperRecord(arxiv_id=arxiv_id, **fields)

    def test_load_returns_empty_dict_when_missing(self, store):
        assert store.load_manifest() == {}

    def test_round_trip(self, store):
        store.upsert_paper(
            self._record(
                "2401.00001",
                version="v2",
                title="Round trip test",
                authors=["A", "B"],
                year=2024,
                category="cs.AI",
                chunks_indexed=7,
            )
        )
        manifest = store.load_manifest()
        assert "2401.00001" in manifest
        assert manifest["2401.00001"].title == "Round trip test"
        assert manifest["2401.00001"].chunks_indexed == 7

    def test_remove_paper(self, store):
        store.upsert_paper(self._record("2401.00001"))
        assert store.remove_paper("2401.00001") is True
        assert store.load_manifest() == {}
        assert store.remove_paper("2401.00001") is False

    def test_list_sorted_newest_first(self, store):
        store.upsert_paper(
            self._record("old", title="old", added_at="2020-01-01T00:00:00Z")
        )
        store.upsert_paper(
            self._record("new", title="new", added_at="2026-01-01T00:00:00Z")
        )
        assert [p.arxiv_id for p in store.list_papers()] == ["new", "old"]
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# CLI smoke (without actually calling chromadb)
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
class TestArxivCLI:
    @staticmethod
    def _invoke(tmp_path, monkeypatch, args):
        """Run the CLI with the store root redirected into tmp_path."""
        from click.testing import CliRunner

        from cli.main import cli

        monkeypatch.setattr(
            "researchers.arxiv.store.DEFAULT_ROOT",
            tmp_path / "arxiv-rag",
        )
        return CliRunner().invoke(cli, args)

    def test_list_empty(self, tmp_path, monkeypatch):
        result = self._invoke(tmp_path, monkeypatch, ["arxiv", "list"])
        assert result.exit_code == 0, result.output
        assert "No papers indexed" in result.output

    def test_info_missing(self, tmp_path, monkeypatch):
        result = self._invoke(tmp_path, monkeypatch, ["arxiv", "info", "0000.00000"])
        assert result.exit_code == 1
        assert "Not indexed" in result.output
|