#!/usr/bin/env bash
# scripts/calibration_runner.sh
#
# M3.3 Phase A: run a fixed set of 20 balanced-depth calibration queries.
# Each run writes a trace JSONL and a result.json under ~/.marchwarden/traces/.
#
# This script is stateless, NOT idempotent: it does not track which queries
# have already been run, so every invocation produces 20 NEW traces.
# Don't re-run unless you want fresh data.
#
# Per-query stdout/stderr is captured in
# docs/stress-tests/M3.3-runs/<num>-<category>.log (relative to the repo root).
# A failing query is reported and the run continues with the next one.
#
# Categories (5 each):
#   - factual:             single verifiable answer
#   - comparative:         X vs Y across some dimension
#   - contradiction-prone: contested topics, sources disagree
#   - scope-edge:          niche, proprietary, or expert-only knowledge

set -euo pipefail

# Run from the repository root (the parent of this script's directory) so the
# relative paths below resolve regardless of the caller's working directory.
cd "$(dirname "$0")/.."

PY=".venv/bin/python"
LOG_DIR="docs/stress-tests/M3.3-runs"
mkdir -p "$LOG_DIR"

# Each entry is "category|number|question". The question is the last field, so
# it may contain anything except a newline; category and number must not
# contain '|'.
declare -a QUERIES=(
  # factual
  "factual|01|What is the boiling point of liquid nitrogen at standard atmospheric pressure?"
  "factual|02|When did the James Webb Space Telescope launch?"
  "factual|03|What programming language is the Linux kernel primarily written in?"
  "factual|04|What is the capital of Mongolia?"
  "factual|05|How many amino acids are encoded by the standard genetic code?"

  # comparative
  "comparative|06|Compare the energy density of lithium-ion vs sodium-ion batteries."
  "comparative|07|Compare PostgreSQL and SQLite for embedded analytics workloads."
  "comparative|08|Compare CRISPR-Cas9 and CRISPR-Cas12 for in vivo gene editing."
  "comparative|09|Compare React and Vue for large enterprise frontends in 2026."
  "comparative|10|Compare wind and solar capacity factors in the continental United States."

  # contradiction-prone
  "contradiction|11|Is red wine good for cardiovascular health?"
  "contradiction|12|Does intermittent fasting extend lifespan in humans?"
  "contradiction|13|Are nuclear power plants safe?"
  "contradiction|14|Is dietary cholesterol harmful?"
  "contradiction|15|Does screen time harm child development?"

  # scope-edge
  "scope|16|What proprietary indexing strategies do high-frequency trading firms use for order book reconstruction?"
  "scope|17|What is the actual operational doctrine of Chinese DF-41 ICBM brigades?"
  "scope|18|What internal compensation bands does Goldman Sachs use for VPs in 2026?"
  "scope|19|How does Renaissance Technologies Medallion Fund actually generate alpha?"
  "scope|20|What are the precise materials and tolerances in TSMC's 2nm process?"
)

echo "Running ${#QUERIES[@]} calibration queries at depth=balanced..."
echo "Output dir: $LOG_DIR"
echo

for entry in "${QUERIES[@]}"; do
  # Split "category|number|question"; the last variable receives the remainder.
  IFS='|' read -r category num question <<<"$entry"
  log_file="$LOG_DIR/${num}-${category}.log"
  echo "[$num/$category] $question"

  # The CLI call is the `if` condition, so a non-zero exit is handled by the
  # else branch instead of tripping `set -e`.
  if "$PY" -m cli.main ask "$question" --depth balanced >"$log_file" 2>&1; then
    # Take the last "trace_id: <id>" occurrence in the log. When there is no
    # match, grep exits 1 and, with pipefail + errexit, a bare assignment
    # would abort the whole run; fall back to an empty id instead.
    trace_id=$(grep -oE 'trace_id: [a-f0-9-]+' "$log_file" | tail -1 | awk '{print $2}') \
      || trace_id=""
    echo " -> ${trace_id:-<no trace_id found in $log_file>}"
  else
    echo " !! FAILED — see $log_file"
  fi
done

echo
echo "Done. Result files at ~/.marchwarden/traces/*.result.json"