marchwarden/pyproject.toml

74 lines
1.5 KiB
TOML
Raw Permalink Normal View History

# PEP 517/518 build configuration. Project metadata is declared in the
# [project] table (PEP 621), which setuptools only understands from
# 61.0.0 onward, so the minimum version must be at least that.
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
# Core project metadata (PEP 621).
[project]
name = "marchwarden"
# PEP 440 normalizes this development-release spelling to 0.1.0.dev0.
version = "0.1.0-dev"
description = "Agentic research network: specialists at the frontier, PI orchestrator at the center"
readme = "README.md"
requires-python = ">=3.10"
authors = [{ name = "archeious" }]
# Runtime dependencies installed with the base package (sorted by name).
dependencies = [
    "anthropic>=0.30.0",
    "click>=8.0",
    "httpx>=0.24.0",
    "mcp>=0.1.0",
    "pydantic>=2.0",
    "rich>=13.0",
    "structlog>=24.0",
    "tavily-python>=0.3.0",
]
# Optional extras; each key below is installable as `marchwarden[<key>]`.
[project.optional-dependencies]
# Test, formatting, linting and type-checking tools.
dev = [
    "black>=23.0",
    "mypy>=1.0",
    "pytest>=7.0",
    "pytest-asyncio>=0.21",
    "pytest-cov>=4.0",
    "ruff>=0.1.0",
]
# arxiv-rag researcher (M5.1). Heavy ML deps — optional extra so the base
# install stays slim for users who only want the web researcher.
arxiv = [
    "arxiv>=2.1",
    "chromadb>=0.5",
    "pymupdf>=1.24",
    "sentence-transformers>=3.0",
]
# Console entry points: installs a `marchwarden` command that invokes the
# `cli` object in the `cli.main` module.
[project.scripts]
marchwarden = "cli.main:cli"
# Package discovery: only packages whose dotted names match one of these
# glob patterns are included in the built distribution.
[tool.setuptools.packages.find]
include = ["researchers*", "orchestrator*", "cli*", "obs*"]
# pytest collection rules: look only under tests/, and treat files,
# classes and functions matching the patterns below as tests.
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
# coverage.py measurement scope. `source` lists every first-party package
# named in the `include` patterns of [tool.setuptools.packages.find], so
# that all shipped code is measured; test modules are excluded.
[tool.coverage.run]
source = ["researchers", "orchestrator", "cli", "obs"]
omit = ["*/tests/*", "*/test_*.py"]
# Black formatter settings; line length is kept equal to the Ruff setting.
[tool.black]
line-length = 88
target-version = ["py310"]
# Ruff linter settings; line length is kept equal to the Black setting.
# NOTE(review): newer Ruff releases deprecate top-level `select` in favor
# of a [tool.ruff.lint] table — consider migrating once the minimum
# supported Ruff version is confirmed to accept it.
[tool.ruff]
line-length = 88
target-version = "py310"
select = [
    "E",    # pycodestyle errors
    "F",    # Pyflakes
    "W",    # pycodestyle warnings
    "I001", # isort: unsorted imports
]
# mypy type-checking settings, targeting Python 3.10 semantics.
[tool.mypy]
python_version = "3.10"
# Warn when a typed function returns a value of type Any.
warn_return_any = true
# Warn about mypy config sections that match no files.
warn_unused_configs = true
# Unannotated function definitions are still permitted.
disallow_untyped_defs = false