Spaces:

Shaurya-Noodle
/

Supplymind

Running

App Files Files Community

Shaurya-Noodle committed on May 7

Commit

9f8371c

verified ·

1 Parent(s): 4b96949

Deploy v6.0-genesis from GitHub main

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +17 -17
.gitattributes +15 -0
.gitignore +46 -14
Dockerfile +32 -32
Dockerfile.damocles +4 -4
FINAL_SUBMIT/ALL_250_FEATURES_LIVE_PROOF.md +3 -3
FINAL_SUBMIT/ARCHITECTURE.md +3 -3
FINAL_SUBMIT/BENCHMARK_REPORT.md +3 -3
FINAL_SUBMIT/COLD_OPEN_OPENING_LINES.md +26 -26
FINAL_SUBMIT/DATASET_CARD.md +3 -3
FINAL_SUBMIT/ENV_CARD.md +1 -1
FINAL_SUBMIT/FEATURE_INVENTORY.md +27 -27
FINAL_SUBMIT/FEATURE_INVENTORY_DI.md +7 -7
FINAL_SUBMIT/FEATURE_INVENTORY_JT.md +13 -13
FINAL_SUBMIT/FEATURE_INVENTORY_UBB.md +40 -40
FINAL_SUBMIT/HACKATHON_README.md +2 -2
FINAL_SUBMIT/JUDGE_FAQ_30.md +1 -1
FINAL_SUBMIT/JUDGE_OBJECTION_HANDBOOK.md +1 -1
FINAL_SUBMIT/MASTER_FEATURE_USECASE_MAP_250.md +10 -10
FINAL_SUBMIT/README.md +12 -12
FINAL_SUBMIT/RELIANCE_HORMUZ_DEEP_DIVE.md +1 -1
FINAL_SUBMIT/REPRODUCE.md +2 -2
FINAL_SUBMIT/REPRODUCE_ONE_BASH.sh +3 -3
FINAL_SUBMIT/RL_GUIDE_59POINT_ALIGNMENT.md +2 -2
FINAL_SUBMIT/docker/Dockerfile.api +25 -0
FINAL_SUBMIT/docker/docker-compose.yml +41 -0
FINAL_SUBMIT/receipts/F2_multi_agent_apple_samsung_toyota.json +116 -116
FINAL_SUBMIT/receipts/ONNX_BUNDLE_MANIFEST.json +71 -71
FINAL_SUBMIT/receipts/R2_SHAP_FAIRNESS_CALIBRATION.json +501 -501
FINAL_SUBMIT/receipts/R3_BIGTFT_INTEGRATION.json +51 -51
FINAL_SUBMIT/receipts/R3_PAST_SELF.json +0 -0
FINAL_SUBMIT/receipts/R3_STACKING_V2.json +1187 -1187
FINAL_SUBMIT/receipts/R3_STACKING_V3_POINTLEVEL.json +226 -226
FINAL_SUBMIT/receipts/R3_TIMESFM_QUANTILE.json +129 -129
FINAL_SUBMIT/receipts/R4_DANGEROUS_V2.json +0 -0
FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_ABLATION.json +396 -396
FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_LIVE.json +63 -63
FINAL_SUBMIT/receipts/R4_FRONTIER_PANEL_V2.json +0 -0
FINAL_SUBMIT/receipts/R5_BEIR_MANUAL.json +1022 -1022
FINAL_SUBMIT/receipts/R5_GRANITE.json +0 -0
FINAL_SUBMIT/receipts/R5_GRANITE_HARD.json +0 -0
FINAL_SUBMIT/receipts/R6_ALGO_COMPARISON.json +71 -71
FINAL_SUBMIT/receipts/R6_AQUA_REGIA_V2.json +859 -859
FINAL_SUBMIT/receipts/R6_GETHSEMANE.json +121 -121
FINAL_SUBMIT/receipts/R6_GETHSEMANE_ONNX_EXPORT.json +24 -24
FINAL_SUBMIT/receipts/R6_PROVIDER_V2.json +329 -329
FINAL_SUBMIT/receipts/R6_PROVIDER_v1_F1.json +1755 -1755
FINAL_SUBMIT/receipts/ablation_matrix.json +94 -94
FINAL_SUBMIT/receipts/adversarial_20_attack_gauntlet.json +216 -216
FINAL_SUBMIT/receipts/adversarial_reward_audit.json +131 -131

.dockerignore CHANGED Viewed

@@ -1,17 +1,17 @@
-.git
-__pycache__
-*.pyc
-.pytest_cache
-.mypy_cache
-.ruff_cache
-tests/
-.env
-.env.*
-*.egg-info
-dist/
-build/
-.vscode/
-.idea/
-*.md
-!README.md
-LICENSE

+.git
+__pycache__
+*.pyc
+.pytest_cache
+.mypy_cache
+.ruff_cache
+tests/
+.env
+.env.*
+*.egg-info
+dist/
+build/
+.vscode/
+.idea/
+*.md
+!README.md
+LICENSE

.gitattributes CHANGED Viewed

@@ -68,3 +68,18 @@ ShAuRyA_Phoenix/autoresearch_fixed/experiments/seed1000_candidate/policy.zip fil
 ShAuRyA_Phoenix/autoresearch_fixed/experiments/seed1001_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
 FINAL_SUBMIT/plots/real_reinforce_curve.png filter=lfs diff=lfs merge=lfs -text
 FINAL_SUBMIT/plots/real_reinforce_curve_v2.png filter=lfs diff=lfs merge=lfs -text

 ShAuRyA_Phoenix/autoresearch_fixed/experiments/seed1001_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
 FINAL_SUBMIT/plots/real_reinforce_curve.png filter=lfs diff=lfs merge=lfs -text
 FINAL_SUBMIT/plots/real_reinforce_curve_v2.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/aqua_regia/r6_aqua_regia.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/dangerous/r4_summary.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/dangerous/r4v2_heatmap.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/gethsemane/learning_curves.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/granite/r5_per_query_heatmap.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/hero_result_card.png filter=lfs diff=lfs merge=lfs -text
+versions/v3_arcadia/plots/past_self/r3_summary.png filter=lfs diff=lfs merge=lfs -text
+versions/v4_arcadia_live/features/gcn_attn/gcn_attn_easy_graph.png filter=lfs diff=lfs merge=lfs -text
+versions/v4_arcadia_live/features/gcn_attn/gcn_attn_hard_graph.png filter=lfs diff=lfs merge=lfs -text
+versions/v4_arcadia_live/features/gcn_attn/gcn_attn_medium_graph.png filter=lfs diff=lfs merge=lfs -text
+versions/v4_arcadia_live/scenarios/crisis_library_v2.faiss filter=lfs diff=lfs merge=lfs -text
+versions/v4_arcadia_live/scenarios/crisis_library_v2_emb.npz filter=lfs diff=lfs merge=lfs -text
+versions/v5_phoenix/action_v2/conformal_calibrated.pt filter=lfs diff=lfs merge=lfs -text
+versions/v5_phoenix/autoresearch_fixed/experiments/seed1000_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
+versions/v5_phoenix/autoresearch_fixed/experiments/seed1001_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -12,6 +12,11 @@ env/
 # Tooling state
 .claude/
 # Stray pip version artifacts
 0.*/
@@ -56,21 +61,48 @@ models/
 sota-bundle/
 external_data/
 catboost_info/
-v3_arcadia/tools/
-v3_arcadia/gguf_out/
 # Auto-generated embedding caches + SB3 best/ dirs
-v3_arcadia/checkpoints/granite/corpus_emb_*.npy
-v3_arcadia/checkpoints/gethsemane/best_*/
 # v4 arcadia-live auto-generated state
-ShAuRyA_Supplymind/realtime/events.db
-ShAuRyA_Supplymind/realtime/events.db-journal
-ShAuRyA_Supplymind/realtime/library_embeddings.pkl
-ShAuRyA_Supplymind/realtime/vessel_snapshot_hormuz.json
-ShAuRyA_Supplymind/autoresearch/experiments/
-ShAuRyA_Supplymind/autoresearch/state.json
-ShAuRyA_Supplymind/autoresearch/stop_autoresearch.flag
-ShAuRyA_Supplymind/autoresearch/candidate_train.py.bak
-ShAuRyA_Supplymind/autoresearch/AUTORESEARCH_LAB_NOTEBOOK.md
-ShAuRyA_Supplymind/autoresearch/AUTORESEARCH_REJECTED.md

 # Tooling state
 .claude/
+.agents/
+.source_cache/
+.tmp_pytest/
+.pytest_cache/
+wandb/
 # Stray pip version artifacts
 0.*/
 sota-bundle/
 external_data/
 catboost_info/
+versions/v3_arcadia/tools/
+versions/v3_arcadia/gguf_out/
 # Auto-generated embedding caches + SB3 best/ dirs
+versions/v3_arcadia/checkpoints/granite/corpus_emb_*.npy
+versions/v3_arcadia/checkpoints/gethsemane/best_*/
+# Third-party source checkouts (not our code) — vendored under vendor/
+vendor/
+# Phoenix v5 auto-generated state (keep source code, exclude heavy + auto-gen)
+versions/v5_phoenix/.venv-roll/
+versions/v5_phoenix/.venv/
+versions/v5_phoenix/experiments/dpo_judge_v1/checkpoints/
+versions/v5_phoenix/experiments/dpo_judge_v1/adapter/
+versions/v5_phoenix/roll_integration/dpo_judge/adapter/
+versions/v5_phoenix/**/__pycache__/
+versions/v5_phoenix/**/*.pyc
+versions/v5_phoenix/**/*.log
+versions/v5_phoenix/receipts_v2/*.stdout
 # v4 arcadia-live auto-generated state
+versions/v4_arcadia_live/realtime/events.db
+versions/v4_arcadia_live/realtime/events.db-journal
+versions/v4_arcadia_live/realtime/library_embeddings.pkl
+versions/v4_arcadia_live/realtime/vessel_snapshot_hormuz.json
+versions/v4_arcadia_live/autoresearch/experiments/
+versions/v4_arcadia_live/autoresearch/stop_autoresearch.flag
+versions/v4_arcadia_live/autoresearch/candidate_train.py.bak
+# Lab notebook, rejected log, and state.json ARE committed — they document
+# real autoresearch execution history (provenance for judges).
+# OpenRouter usage audit log (per-call timestamps, no keys)
+.openrouter_usage.jsonl
+# Frontier panel run intermediate caches
+.openrouter_cache/
+lora_stdout.log
+# Pass 8 — large harvest data (regenerable via train.py harvest_trajectories)
+versions/v5_phoenix/experiments/rap_xc_v1/transitions.npz
+versions/v5_phoenix/experiments/rap_xc_v1/transitions_synth.npz
+versions/v5_phoenix/experiments/rap_xc_v1/smoke*.npz
+versions/v5_phoenix/experiments/rap_xc_v1/rapxc_synth.pt
+versions/v5_phoenix/experiments/rap_xc_v1/*.log
+tests/receipts/*.log

Dockerfile CHANGED Viewed

@@ -1,32 +1,32 @@
-# ── Stage 1: Install dependencies ──────────────────────────────────
-FROM python:3.11-slim AS builder
-WORKDIR /build
-COPY requirements.txt .
-RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
-# ── Stage 2: Production image ─────────────────────────────────────
-FROM python:3.11-slim
-# Non-root user for security (UID 1000 is conventional)
-RUN useradd --create-home --uid 1000 appuser
-WORKDIR /app
-# Copy installed packages from builder
-COPY --from=builder /install /usr/local
-# Copy application code
-COPY . .
-# Own the app directory
-RUN chown -R appuser:appuser /app
-USER appuser
-EXPOSE 8000
-HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
-  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
-CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]

+# ── Stage 1: Install dependencies ──────────────────────────────────
+FROM python:3.11-slim AS builder
+WORKDIR /build
+COPY requirements.txt .
+RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
+# ── Stage 2: Production image ─────────────────────────────────────
+FROM python:3.11-slim
+# Non-root user for security (UID 1000 is conventional)
+RUN useradd --create-home --uid 1000 appuser
+WORKDIR /app
+# Copy installed packages from builder
+COPY --from=builder /install /usr/local
+# Copy application code
+COPY . .
+# Own the app directory
+RUN chown -R appuser:appuser /app
+USER appuser
+EXPOSE 8000
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]

Dockerfile.damocles CHANGED Viewed

@@ -1,5 +1,5 @@
 # SupplyMind v3.0-arcadia — Damocles API (FastAPI)
-# Deploys v3_arcadia/90_damocles/app.py with /assess, /forecast, /rag, /rl/act, /health
 #
 # Build:  docker build -f Dockerfile.damocles -t supplymind-damocles:v3.0-arcadia .
 # Run:    docker run -p 8765:8765 supplymind-damocles:v3.0-arcadia
@@ -18,10 +18,10 @@ COPY requirements-damocles.txt .
 RUN pip install --no-cache-dir -r requirements-damocles.txt
 # App code
-COPY v3_arcadia/90_damocles/ /app/v3_arcadia/90_damocles/
-COPY v3_arcadia/checkpoints/granite/corpus_chunks.pkl /app/v3_arcadia/checkpoints/granite/
 # Embeddings loaded lazily from cached .npy (mounted at runtime or fetched via env)
-COPY v3_arcadia/checkpoints/gethsemane/ppo_easy_typhoon_response.onnx /app/v3_arcadia/checkpoints/gethsemane/
 COPY models/mxbai-embed-large/ /app/models/mxbai-embed-large/
 # Healthcheck

 # SupplyMind v3.0-arcadia — Damocles API (FastAPI)
+# Deploys versions/v3_arcadia/90_damocles/app.py with /assess, /forecast, /rag, /rl/act, /health
 #
 # Build:  docker build -f Dockerfile.damocles -t supplymind-damocles:v3.0-arcadia .
 # Run:    docker run -p 8765:8765 supplymind-damocles:v3.0-arcadia
 RUN pip install --no-cache-dir -r requirements-damocles.txt
 # App code
+COPY versions/v3_arcadia/90_damocles/ /app/versions/v3_arcadia/90_damocles/
+COPY versions/v3_arcadia/checkpoints/granite/corpus_chunks.pkl /app/versions/v3_arcadia/checkpoints/granite/
 # Embeddings loaded lazily from cached .npy (mounted at runtime or fetched via env)
+COPY versions/v3_arcadia/checkpoints/gethsemane/ppo_easy_typhoon_response.onnx /app/versions/v3_arcadia/checkpoints/gethsemane/
 COPY models/mxbai-embed-large/ /app/models/mxbai-embed-large/
 # Healthcheck

FINAL_SUBMIT/ALL_250_FEATURES_LIVE_PROOF.md CHANGED Viewed

@@ -25,9 +25,9 @@ Status legend:
 | A7 | 30-step horizon | `server/supply_environment.py` | reset config | ✅ |
 | A8 | $5–15M budget tasks | `data/disruptions.json` | task manifest | ✅ |
 | A9 | TSMC/Samsung coords | `data/companies_real.json` | n_real_nodes=40 | ✅ |
-| A10 | 8-event crisis library v1 | `ShAuRyA_Supplymind/realtime/crisis_library.py` | 8 events indexed | ✅ |
-| A11 | Wordle RLVR mini-env | `ShAuRyA_Phoenix/wordle_env/env.py` | `wordle_real_reinforce_v2_curve.json` | ✅ |
-| A12 | RLVE adaptive curriculum | `ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py` | `rlve_curriculum_smoke.json` | ✅ |
 ## B · Reward engineering (14) — 14/14 ✅

 | A7 | 30-step horizon | `server/supply_environment.py` | reset config | ✅ |
 | A8 | $5–15M budget tasks | `data/disruptions.json` | task manifest | ✅ |
 | A9 | TSMC/Samsung coords | `data/companies_real.json` | n_real_nodes=40 | ✅ |
+| A10 | 8-event crisis library v1 | `versions/v4_arcadia_live/realtime/crisis_library.py` | 8 events indexed | ✅ |
+| A11 | Wordle RLVR mini-env | `versions/v5_phoenix/wordle_env/env.py` | `wordle_real_reinforce_v2_curve.json` | ✅ |
+| A12 | RLVE adaptive curriculum | `versions/v5_phoenix/wordle_env/rlve_curriculum.py` | `rlve_curriculum_smoke.json` | ✅ |
 ## B · Reward engineering (14) — 14/14 ✅

FINAL_SUBMIT/ARCHITECTURE.md CHANGED Viewed

@@ -90,7 +90,7 @@
 Plus 4 base wrappers: `qwen25-14b-local`, `qwen25-coder-local`, `mistral-nemo-local`, `deepseek-r1-local-q4`.
-5 Modelfiles committed at `rl/lora/Modelfile`, `Modelfile.v2-v4`, `ShAuRyA_Supplymind/features/Modelfile.analyst_v5`.
 ### 4. LoRA fine-tuning track
@@ -100,7 +100,7 @@ Qwen-2.5-1.5B → PEFT/LoRA → 4-bit NF4 (bitsandbytes) → TRL → 225 instruc
 Qwen-2.5-3B-Instruct base. 21 preference pairs from R4 ground truth at `dpo_judge/data/preference_pairs.jsonl`. DPO sigmoid loss, β=0.1, LoRA r=8 / α=16, hf strategy (single-GPU 12GB), per_device_train_batch_size=1, gradient_accumulation_steps=4, lr=5e-5, save_adapter_only.
-5 trainers in `ShAuRyA_Phoenix/roll_integration/dpo_judge/`:
 - `train_dpo_trl.py` — TRL standalone (ROLL-free fallback)
 - `train_dpo_roll.py` — ROLL-integrated
 - `train_grpo_env.py` — GRPO multi-turn
@@ -178,7 +178,7 @@ dag_feats (80-d) ──→ DAGEncoder
 ### 14. Live data layer (20 sources)
-`ShAuRyA_Supplymind/realtime/orchestrator_v2.py` fans out to 20 sources via ThreadPoolExecutor with per-source timeouts and graceful failure:
 NewsAPI · GDELT · GDELT-Conflict · GDELT-Humanitarian · USGS earthquakes · NOAA NDBC buoys · NOAA Tides · NASA EONET · NASA FIRMS fires · EIA Brent · EIA WTI · EIA natgas · MarineTraffic AIS · Global Fishing Watch · World Bank commodities · WHO DON · SEC EDGAR · CISA KEV · OFAC sanctions · Wikipedia pageviews · HN tech ticker

 Plus 4 base wrappers: `qwen25-14b-local`, `qwen25-coder-local`, `mistral-nemo-local`, `deepseek-r1-local-q4`.
+5 Modelfiles committed at `rl/lora/Modelfile`, `Modelfile.v2-v4`, `versions/v4_arcadia_live/features/Modelfile.analyst_v5`.
 ### 4. LoRA fine-tuning track
 Qwen-2.5-3B-Instruct base. 21 preference pairs from R4 ground truth at `dpo_judge/data/preference_pairs.jsonl`. DPO sigmoid loss, β=0.1, LoRA r=8 / α=16, hf strategy (single-GPU 12GB), per_device_train_batch_size=1, gradient_accumulation_steps=4, lr=5e-5, save_adapter_only.
+5 trainers in `versions/v5_phoenix/roll_integration/dpo_judge/`:
 - `train_dpo_trl.py` — TRL standalone (ROLL-free fallback)
 - `train_dpo_roll.py` — ROLL-integrated
 - `train_grpo_env.py` — GRPO multi-turn
 ### 14. Live data layer (20 sources)
+`versions/v4_arcadia_live/realtime/orchestrator_v2.py` fans out to 20 sources via ThreadPoolExecutor with per-source timeouts and graceful failure:
 NewsAPI · GDELT · GDELT-Conflict · GDELT-Humanitarian · USGS earthquakes · NOAA NDBC buoys · NOAA Tides · NASA EONET · NASA FIRMS fires · EIA Brent · EIA WTI · EIA natgas · MarineTraffic AIS · Global Fishing Watch · World Bank commodities · WHO DON · SEC EDGAR · CISA KEV · OFAC sanctions · Wikipedia pageviews · HN tech ticker

FINAL_SUBMIT/BENCHMARK_REPORT.md CHANGED Viewed

@@ -69,7 +69,7 @@ Tested on 32k held-out training rows of real harvested transitions. The split-co
 ## 4. RAP-XC training on real harvest
-`ShAuRyA_Phoenix/rap_xc/train.py` → `ShAuRyA_Phoenix/experiments/rap_xc_v1/rapxc.pt`
 | Metric | Result |
 |---|---|
@@ -83,7 +83,7 @@ Tested on 32k held-out training rows of real harvested transitions. The split-co
 ## 5. HetTemporalGAT vs v1 GCN cascade
-`ShAuRyA_Phoenix/gnn_v2/train_hetgat.py` → `ShAuRyA_Phoenix/experiments/hetgat_v1/report.json`
 Task: arrival-time regression on R6 cascade graphs (real semiconductor supply-chain).
@@ -111,7 +111,7 @@ Strong cross-corpus stability — same panel produces near-identical α on indep
 ## 7. Tohoku 2011 Platinum counterfactual replication
-`ShAuRyA_Phoenix/counterfactual_v2/platinum.py` synthetic-control method on real Tohoku 2011 economic data.
 | Metric | Value |
 |---|---|

 ## 4. RAP-XC training on real harvest
+`versions/v5_phoenix/rap_xc/train.py` → `versions/v5_phoenix/experiments/rap_xc_v1/rapxc.pt`
 | Metric | Result |
 |---|---|
 ## 5. HetTemporalGAT vs v1 GCN cascade
+`versions/v5_phoenix/gnn_v2/train_hetgat.py` → `versions/v5_phoenix/experiments/hetgat_v1/report.json`
 Task: arrival-time regression on R6 cascade graphs (real semiconductor supply-chain).
 ## 7. Tohoku 2011 Platinum counterfactual replication
+`versions/v5_phoenix/counterfactual_v2/platinum.py` synthetic-control method on real Tohoku 2011 economic data.
 | Metric | Value |
 |---|---|

FINAL_SUBMIT/COLD_OPEN_OPENING_LINES.md CHANGED Viewed

@@ -1,26 +1,26 @@
-# COLD OPEN -- opening lines for judge pitch (<= 8 sec each)
-## Three variants depending on judge persona
-### A -- Technical depth judge (academic/research)
-> "REINFORCE on Wordle: 100% solve rate, Wilcoxon p=1.87e-34, Cohen's d=3.89, 9.8 seconds on a single CPU thread. Same loop drives a 280-action supply-chain RL env with 1500-event EMDAT RAG corpus and conformal action filter at 0.9001 empirical coverage."
-### B -- Industry pragmatist (engineer/PM)
-> "If Hormuz closes tomorrow, India loses INR X-trillion in 30 days. Watch what one LLM, RL-trained, does about it -- live API calls, real EIA price data, real NASA fire feed, end-to-end in 7 seconds with a sha256 receipt for every claim."
-### C -- Storyteller (DevRel/PM)
-> "Most hackathon entries train on Wordle. We ALSO train on Wordle -- and use the same canonical loop on a real-world supply-chain crisis simulator with 9 live data feeds. One submission, all three hackathon themes, every claim sha256-replayable."
-## Use-case map
-| Persona | Likely panel weight | Use line |
-|---|---|---|
-| Academic/research | 40% (per VICTORY_CALCULUS) | A |
-| Industry/PM | 35% | B |
-| Storyteller/DevRel | 25% | C |
-## Backup ultra-short variants (<= 4 sec)
-- "100% solve, p=1e-34, 9.8 seconds, CPU only."
-- "9 live APIs. 1500 events. 7-second war room."
-- "Three themes. One env. Every claim hashed."

+# COLD OPEN -- opening lines for judge pitch (<= 8 sec each)
+## Three variants depending on judge persona
+### A -- Technical depth judge (academic/research)
+> "REINFORCE on Wordle: 100% solve rate, Wilcoxon p=1.87e-34, Cohen's d=3.89, 9.8 seconds on a single CPU thread. Same loop drives a 280-action supply-chain RL env with 1500-event EMDAT RAG corpus and conformal action filter at 0.9001 empirical coverage."
+### B -- Industry pragmatist (engineer/PM)
+> "If Hormuz closes tomorrow, India loses INR X-trillion in 30 days. Watch what one LLM, RL-trained, does about it -- live API calls, real EIA price data, real NASA fire feed, end-to-end in 7 seconds with a sha256 receipt for every claim."
+### C -- Storyteller (DevRel/PM)
+> "Most hackathon entries train on Wordle. We ALSO train on Wordle -- and use the same canonical loop on a real-world supply-chain crisis simulator with 9 live data feeds. One submission, all three hackathon themes, every claim sha256-replayable."
+## Use-case map
+| Persona | Likely panel weight | Use line |
+|---|---|---|
+| Academic/research | 40% (per VICTORY_CALCULUS) | A |
+| Industry/PM | 35% | B |
+| Storyteller/DevRel | 25% | C |
+## Backup ultra-short variants (<= 4 sec)
+- "100% solve, p=1e-34, 9.8 seconds, CPU only."
+- "9 live APIs. 1500 events. 7-second war room."
+- "Three themes. One env. Every claim hashed."

FINAL_SUBMIT/DATASET_CARD.md CHANGED Viewed

@@ -23,11 +23,11 @@
 ## Static datasets
 | Name | Size | Description | Path |
 |------|------|-------------|------|
-| EMDAT crisis library v2 | ~1500 events | historical disaster impact records | `ShAuRyA_Supplymind/scenarios/` |
-| Hand-curated 8 events | 8 events | Iran/Israel/Hormuz/Red-Sea/Suez/Taiwan/Thailand/Tohoku | `ShAuRyA_Supplymind/realtime/crisis_library.py` |
 | WTI crude time-series | 2,818 windows | DCOILWTICO from FRED | TFT training |
 | Real company nodes | 40 nodes | TSMC/Samsung/Toyota etc with real coords | `data/companies_real.json` |
-| Wordle dictionary | 102 words | 5-letter common words (tier-0 baseline) | `ShAuRyA_Phoenix/wordle_env/env.py` |
 | Wordle tier 1+ | +200/+150/+80 words | RLVE expansion tiers | `rlve_curriculum.py` |
 | RAG corpus | 6,483 chunks | wiki_crisis 564 + sec_10k 5790 + policy 129 | `R5_GRANITE.json` |
 | Conformal calibration NLLs | 5,696 (v2) / 16,000 (v3) | nonconformity scores | `conformal_*.json` |

 ## Static datasets
 | Name | Size | Description | Path |
 |------|------|-------------|------|
+| EMDAT crisis library v2 | ~1500 events | historical disaster impact records | `versions/v4_arcadia_live/scenarios/` |
+| Hand-curated 8 events | 8 events | Iran/Israel/Hormuz/Red-Sea/Suez/Taiwan/Thailand/Tohoku | `versions/v4_arcadia_live/realtime/crisis_library.py` |
 | WTI crude time-series | 2,818 windows | DCOILWTICO from FRED | TFT training |
 | Real company nodes | 40 nodes | TSMC/Samsung/Toyota etc with real coords | `data/companies_real.json` |
+| Wordle dictionary | 102 words | 5-letter common words (tier-0 baseline) | `versions/v5_phoenix/wordle_env/env.py` |
 | Wordle tier 1+ | +200/+150/+80 words | RLVE expansion tiers | `rlve_curriculum.py` |
 | RAG corpus | 6,483 chunks | wiki_crisis 564 + sec_10k 5790 + policy 129 | `R5_GRANITE.json` |
 | Conformal calibration NLLs | 5,696 (v2) / 16,000 (v3) | nonconformity scores | `conformal_*.json` |

FINAL_SUBMIT/ENV_CARD.md CHANGED Viewed

@@ -45,7 +45,7 @@
 - **hard_cascading_crisis** — 40 nodes, 60 days, $15M budget, cascading
 ## Wordle Companion Environment
-- **Class**: `ShAuRyA_Phoenix.wordle_env.env`
 - **Type**: Canonical RLVR mini-env
 - **Action space**: `Discrete(102)` (102-word baseline) or restricted by curriculum tier
 - **State**: 188-dim (rich encoding per `final_real_reinforce_wordle_v2.py`)

 - **hard_cascading_crisis** — 40 nodes, 60 days, $15M budget, cascading
 ## Wordle Companion Environment
+- **Class**: `versions.v5_phoenix.wordle_env.env`
 - **Type**: Canonical RLVR mini-env
 - **Action space**: `Discrete(102)` (102-word baseline) or restricted by curriculum tier
 - **State**: 188-dim (rich encoding per `final_real_reinforce_wordle_v2.py`)

FINAL_SUBMIT/FEATURE_INVENTORY.md CHANGED Viewed

@@ -8,9 +8,9 @@ Verification: every bullet point in the project plan mapped to file:line.
 | Component | Previous | Now wired in |
 |---|---|---|
-| Chronos-Bolt-base | PARTIAL (verify only) | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:53-71` |
-| TimesFM-2 | PARTIAL (verify only) | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:74-99` |
-| TabPFN-v2 regressor | PARTIAL (verify only) | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:101-145` |
 Closed Brent backtest gap from 6/8 to **8/8 within ±30%** (median rel err 3.3%).
 See `tests/receipts/ensemble_brent_validation.json`.
@@ -22,12 +22,12 @@ See `tests/receipts/ensemble_brent_validation.json`.
 | Bullet | Status | Path(s) | Note |
 |---|---|---|---|
 | supplymind-analyst:v1 | MISSING | — | only v2-v5 retained; v1 superseded |
-| supplymind-analyst:v2-v5 | PRESENT | `rl/lora/Modelfile.v2:1-20`, `Modelfile.v3:1-20`, `Modelfile.v4:1-20`, `ShAuRyA_Supplymind/features/Modelfile.analyst_v5:1-20` | 4 versions |
-| qwen25-14b-local Modelfile | PRESENT | `v3_arcadia/00_emergence/qwen25-14b.Modelfile:1-19` | Q4_K_M |
-| qwen25-coder-local Modelfile | PRESENT | `v3_arcadia/00_emergence/qwen25-coder-14b.Modelfile:1-19` | JSON-mode |
-| mistral-nemo-local Modelfile | PRESENT | `v3_arcadia/00_emergence/mistral-nemo.Modelfile:1-18` | num_ctx 32768 |
 | deepseek-r1-local-q4 Modelfile | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | Q4_K_M reference |
-| 5 Modelfile files (rl/lora/*) | PRESENT | `rl/lora/Modelfile, .v2, .v3, .v4` + `ShAuRyA_Supplymind/features/Modelfile.analyst_v5` | All 5 present |
 ## A.2 Modelfile Crafting
@@ -54,7 +54,7 @@ See `tests/receipts/ensemble_brent_validation.json`.
 | Bullet | Status | Path(s) | Note |
 |---|---|---|---|
-| `dpo_judge/*` directory | PRESENT | `ShAuRyA_Phoenix/roll_integration/dpo_judge/` | 6 files |
 | `prepare_preference_data.py` | PRESENT | `dpo_judge/prepare_preference_data.py:1-50+` | DPO pair builder |
 | `train_dpo_trl.py` | PRESENT | `dpo_judge/train_dpo_trl.py:1-50+` | TRL trainer |
 | `train_dpo_roll.py` | PRESENT | `dpo_judge/train_dpo_roll.py:1-30+` | ROLL-integrated |
@@ -81,7 +81,7 @@ See `tests/receipts/ensemble_brent_validation.json`.
 |---|---|---|---|
 | Q4_K_M references | PRESENT | `mistral-nemo.Modelfile:1`, `qwen25-14b.Modelfile:1`, `qwen25-coder-14b.Modelfile:1` | all 3 specify q4km |
 | `OLLAMA_MAX_LOADED_MODELS=1` | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | VRAM discipline |
-| `convert_bge_to_safetensors.py` | PRESENT | `v3_arcadia/00_emergence/convert_bge_to_safetensors.py:1-45` | CVE-2025-32434 workaround |
 | 2GB safetensors output | PRESENT | `models/bge-m3/model.safetensors` | 2.2GB verified |
 ## B. 13 Foundation Models
@@ -106,14 +106,14 @@ See `tests/receipts/ensemble_brent_validation.json`.
 | Script | Status | Path |
 |---|---|---|
-| `verify_qwen14b.py` | PRESENT | `v3_arcadia/00_emergence/verify_qwen14b.py` |
-| `verify_mistral_nemo.py` | PRESENT | `v3_arcadia/00_emergence/verify_mistral_nemo.py` |
-| `verify_qwen_coder.py` | PRESENT | `v3_arcadia/00_emergence/verify_qwen_coder.py` |
-| `verify_qwen_vl.py` | PRESENT | `v3_arcadia/00_emergence/verify_qwen_vl.py` |
-| `verify_tabpfn.py` | PRESENT | `v3_arcadia/00_emergence/verify_tabpfn.py` |
-| `verify_timesfm.py` | PRESENT | `v3_arcadia/00_emergence/verify_timesfm.py` |
-| `verify_embedders_chronos.py` | PRESENT | `v3_arcadia/00_emergence/verify_embedders_chronos.py` |
-| `r1_qwen_vl_downstream.py` | PRESENT | `v3_arcadia/00_emergence/r1_qwen_vl_downstream.py` |
 ## C.1 Game-Engine Tasks & Action Space
@@ -203,19 +203,19 @@ See `tests/receipts/ensemble_brent_validation.json`.
 | Component | Path | Purpose |
 |---|---|---|
-| Hormuz War Room orchestrator | `ShAuRyA_Supplymind/realtime/hormuz_war_room_router.py` | `/demo/hormuz-war-room` POST + UI route |
-| India 7-sector exposure | `ShAuRyA_Supplymind/scenarios/india_industry_exposure.py` | 7 cited sectors + deterministic scorer |
-| Gulf 7-sector exposure | `ShAuRyA_Supplymind/scenarios/gulf_industry_exposure.py` | 7 cited sectors + bypass-credit scorer |
-| Hormuz chokepoint graph | `ShAuRyA_Supplymind/scenarios/hormuz_chokepoint_graph.py` | 14 nodes + 18 edges + 5 IEA facts |
-| OpenRouter 6-judge cross-check | `ShAuRyA_Supplymind/realtime/openrouter_war_room_panel.py` | gpt-oss-120b, gemma, glm, minimax, nemotron, gemma-26b |
 | War-Room dashboard HTML | `server/static/hormuz_war_room.html` | dark-mode 6-panel UI |
 | War-Room validation harness | `scripts/validate_war_room.py` | 8-event historical backtest |
-| Ensemble Brent forecaster | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py` | Chronos+TimesFM+TabPFN, 8/8 ±30% |
 | Ensemble Brent validator | `scripts/validate_ensemble_brent.py` | 8-event closed-form backtest |
 | Master demo HTML | `server/static/master.html` | 9-card live integration page |
-| RAP-XC weights | `ShAuRyA_Phoenix/experiments/rap_xc_v1/rapxc.pt` | 3.14M params, BC 5.62→0.23 |
-| Conformal weights | `ShAuRyA_Phoenix/action_v2/conformal_calibrated.pt` | α=0.1, coverage 0.9001 |
-| HetGAT report | `ShAuRyA_Phoenix/experiments/hetgat_v1/report.json` | +7.77/+12.15/+10.03% |
 ## API Keys (every key reaches a UI element)

 | Component | Previous | Now wired in |
 |---|---|---|
+| Chronos-Bolt-base | PARTIAL (verify only) | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:53-71` |
+| TimesFM-2 | PARTIAL (verify only) | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:74-99` |
+| TabPFN-v2 regressor | PARTIAL (verify only) | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:101-145` |
 Closed Brent backtest gap from 6/8 to **8/8 within ±30%** (median rel err 3.3%).
 See `tests/receipts/ensemble_brent_validation.json`.
 | Bullet | Status | Path(s) | Note |
 |---|---|---|---|
 | supplymind-analyst:v1 | MISSING | — | only v2-v5 retained; v1 superseded |
+| supplymind-analyst:v2-v5 | PRESENT | `rl/lora/Modelfile.v2:1-20`, `Modelfile.v3:1-20`, `Modelfile.v4:1-20`, `versions/v4_arcadia_live/features/Modelfile.analyst_v5:1-20` | 4 versions |
+| qwen25-14b-local Modelfile | PRESENT | `versions/v3_arcadia/00_emergence/qwen25-14b.Modelfile:1-19` | Q4_K_M |
+| qwen25-coder-local Modelfile | PRESENT | `versions/v3_arcadia/00_emergence/qwen25-coder-14b.Modelfile:1-19` | JSON-mode |
+| mistral-nemo-local Modelfile | PRESENT | `versions/v3_arcadia/00_emergence/mistral-nemo.Modelfile:1-18` | num_ctx 32768 |
 | deepseek-r1-local-q4 Modelfile | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | Q4_K_M reference |
+| 5 Modelfile files (rl/lora/*) | PRESENT | `rl/lora/Modelfile, .v2, .v3, .v4` + `versions/v4_arcadia_live/features/Modelfile.analyst_v5` | All 5 present |
 ## A.2 Modelfile Crafting
 | Bullet | Status | Path(s) | Note |
 |---|---|---|---|
+| `dpo_judge/*` directory | PRESENT | `versions/v5_phoenix/roll_integration/dpo_judge/` | 6 files |
 | `prepare_preference_data.py` | PRESENT | `dpo_judge/prepare_preference_data.py:1-50+` | DPO pair builder |
 | `train_dpo_trl.py` | PRESENT | `dpo_judge/train_dpo_trl.py:1-50+` | TRL trainer |
 | `train_dpo_roll.py` | PRESENT | `dpo_judge/train_dpo_roll.py:1-30+` | ROLL-integrated |
 |---|---|---|---|
 | Q4_K_M references | PRESENT | `mistral-nemo.Modelfile:1`, `qwen25-14b.Modelfile:1`, `qwen25-coder-14b.Modelfile:1` | all 3 specify q4km |
 | `OLLAMA_MAX_LOADED_MODELS=1` | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | VRAM discipline |
+| `convert_bge_to_safetensors.py` | PRESENT | `versions/v3_arcadia/00_emergence/convert_bge_to_safetensors.py:1-45` | CVE-2025-32434 workaround |
 | 2GB safetensors output | PRESENT | `models/bge-m3/model.safetensors` | 2.2GB verified |
 ## B. 13 Foundation Models
 | Script | Status | Path |
 |---|---|---|
+| `verify_qwen14b.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_qwen14b.py` |
+| `verify_mistral_nemo.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_mistral_nemo.py` |
+| `verify_qwen_coder.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_qwen_coder.py` |
+| `verify_qwen_vl.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_qwen_vl.py` |
+| `verify_tabpfn.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_tabpfn.py` |
+| `verify_timesfm.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_timesfm.py` |
+| `verify_embedders_chronos.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_embedders_chronos.py` |
+| `r1_qwen_vl_downstream.py` | PRESENT | `versions/v3_arcadia/00_emergence/r1_qwen_vl_downstream.py` |
 ## C.1 Game-Engine Tasks & Action Space
 | Component | Path | Purpose |
 |---|---|---|
+| Hormuz War Room orchestrator | `versions/v4_arcadia_live/realtime/hormuz_war_room_router.py` | `/demo/hormuz-war-room` POST + UI route |
+| India 7-sector exposure | `versions/v4_arcadia_live/scenarios/india_industry_exposure.py` | 7 cited sectors + deterministic scorer |
+| Gulf 7-sector exposure | `versions/v4_arcadia_live/scenarios/gulf_industry_exposure.py` | 7 cited sectors + bypass-credit scorer |
+| Hormuz chokepoint graph | `versions/v4_arcadia_live/scenarios/hormuz_chokepoint_graph.py` | 14 nodes + 18 edges + 5 IEA facts |
+| OpenRouter 6-judge cross-check | `versions/v4_arcadia_live/realtime/openrouter_war_room_panel.py` | gpt-oss-120b, gemma, glm, minimax, nemotron, gemma-26b |
 | War-Room dashboard HTML | `server/static/hormuz_war_room.html` | dark-mode 6-panel UI |
 | War-Room validation harness | `scripts/validate_war_room.py` | 8-event historical backtest |
+| Ensemble Brent forecaster | `versions/v5_phoenix/forecast_v2/ensemble_brent.py` | Chronos+TimesFM+TabPFN, 8/8 ±30% |
 | Ensemble Brent validator | `scripts/validate_ensemble_brent.py` | 8-event closed-form backtest |
 | Master demo HTML | `server/static/master.html` | 9-card live integration page |
+| RAP-XC weights | `versions/v5_phoenix/experiments/rap_xc_v1/rapxc.pt` | 3.14M params, BC 5.62→0.23 |
+| Conformal weights | `versions/v5_phoenix/action_v2/conformal_calibrated.pt` | α=0.1, coverage 0.9001 |
+| HetGAT report | `versions/v5_phoenix/experiments/hetgat_v1/report.json` | +7.77/+12.15/+10.03% |
 ## API Keys (every key reaches a UI element)

FINAL_SUBMIT/FEATURE_INVENTORY_DI.md CHANGED Viewed

@@ -2,7 +2,7 @@
 Bullet-by-bullet status across the 6 sections D, E, F, G, H, I (~140 bullets). Each row links to a file or a JSON receipt that proves the claim.
-**Note:** receipts named `R*_*.json` are mirrored from `v3_arcadia/results/` to `FINAL_SUBMIT/receipts/`.
 ---
@@ -86,7 +86,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
 |---|---|---|---|
 | 44 | Custom TFT 513K params on 3-target FRED | ✅ | `tft_real_metrics.json: params, test_mae_p50: {DCOILWTICO, PCOPPUSDM, PPICMM}` |
 | 45 | Custom TFT 90K params on real WTI MAE $7.83/bbl | ✅ | `tft_v2_metrics.json: params, test_mae_p50.DCOILWTICO` |
-| 46 | Chronos-Bolt 14-step quantile [0.1, 0.5, 0.9] | ✅ | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:53-71` (pass-10), `R3_TIMESFM_QUANTILE.json` |
 | 47 | TimesFM-2 + synthesized quantile via residual regression | ✅ | `forecast_v2/ensemble_brent.py:74-99`, `R3_TIMESFM_QUANTILE.json` |
 | 48 | Prophet weekly+yearly | ✅ | `R3_PAST_SELF.json` ensemble row (Prophet seasonality) |
 | 49 | ARIMA(5,1,0) classical baseline | ✅ | `R3_PAST_SELF.json` ensemble row |
@@ -98,7 +98,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
 | 55 | 8 FRED targets (WTI, copper, EUR/USD, JPY/USD, CNY/USD, KRW/USD, TWD/USD, PPICMM) | ✅ | `tft_v2_metrics.json: targets` (7 targets confirmed in train_tft_real.py: DCOILWTICO/PCOPPUSDM/DEXTAUS/DEXKOUS/DEXJPUS/DEXUSEU/DEXCHUS); 8th = PPICMM in tft_real_metrics |
 | 56 | 3 horizons (7, 14, 28 days) | ⚠️ | `train_tft_real.py:39 HORIZON=14`; 7 and 28-day variants in `R3_PAST_SELF` rolling_backtest fields |
 | 57 | PICP@80/90/95% calibration | ✅ | `R3_PAST_SELF.json: per_target_horizon.picp_*` |
-| 58 | Per-horizon split-conformal (Foygel Barber 2022) | ✅ | `ShAuRyA_Phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
 | 59 | TimesFM-CP residual quantile regression | ✅ | `R3_TIMESFM_QUANTILE.json` + `R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
 | 60 | Heteroscedastic Ridge widths | ✅ | `R3_PAST_SELF.json: ridge_widths` |
 | 61 | 2,883 business days (2015-2026) | ✅ | `tft_v2_metrics.json: train_size`, `rl/data/fred_cache.json` |
@@ -114,7 +114,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
 |---|---|---|---|
 | 63 | MC Dropout 50 forward passes | ✅ | `rl/uncertainty.py:1-50, n_passes=50`, `mc_dropout_v2.json` |
 | 64 | Epistemic σ correlates with accuracy (Q1=99.76%, Q4=55.92%) | ✅ | `mc_dropout_v2.json: reliability_full[bins]` |
-| 65 | Conformal RL on Q-values (3 alpha 0.05/0.05/0.1) | ✅ | `ShAuRyA_Supplymind/features/conformal_rl.py:1-50` + `ShAuRyA_Phoenix/action_v2/conformal.py` |
 | 66 | Confidence-damped projection | ✅ | `rl/uncertainty.py: confidence_damping` + `crisis_library.py: damp_on_weak_match` |
 | 67 | Beta-severity + Lognormal-duration MC | ✅ | `rl/surrogate/fast_monte_carlo.py: scenarios` |
 | 68 | Numba JIT MC hotloop (10-50× speedup) | ✅ | `rl/surrogate/fast_monte_carlo.py: @numba.jit` |
@@ -147,7 +147,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
 | 86 | 26 BEIR Wikipedia subset | ✅ | `R5_BEIR_MANUAL.json` |
 | 87 | ChromaDB persistent at rl/rag/chroma_db/ | ✅ | dir present |
 | 88 | Ollama nomic-embed-text (768d) | ✅ | `rl/rag/indexer.py:29-30 EMBEDDING_MODEL=nomic-embed-text` |
-| 89 | mxbai-embed-large for crisis library | ✅ | `ShAuRyA_Supplymind/scenarios/library_v2_search.py` (pass-6) |
 | 90 | Corpus SHA-256 hash caching | ⚠️ | grep finds `corpus_hash` references in some scripts; not in indexer.py directly |
 | 91 | min_score=0.60 | ✅ | `rl/rag/indexer.py:31 MIN_SCORE=0.60` |
 | 92 | chunk_words=256, overlap=32, min=30 | ⚠️ | `indexer.py:32-33 chunk_words=300` (slightly different); overlap+min not in source |
@@ -207,8 +207,8 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
 | 128 | 50 cached explanations | ✅ | cache implementation present |
 | 129 | 3-4s per explanation on RTX 4080 | ✅ | latency profiled |
 | 130 | Explainer stress test 50/50 pass | ✅ | `explainer_stress_v2.json: n_test=50, passed=50, pass_rate=1.0` (exact) |
-| 131 | GCN edge attention PNG heatmaps | ✅ | `ShAuRyA_Supplymind/features/gcn_attention_viz.py` |
-| 132 | Provenance 5-tier trust classifier (regulatory/academic/reference/industry/uncertain) | ✅ | `ShAuRyA_Supplymind/features/rag_provenance.py:39-49` (5 tiers) |
 **Section I total: 13 ✅ + 1 ⚠️ = 14/14 = 100%**

 Bullet-by-bullet status across the 6 sections D, E, F, G, H, I (~140 bullets). Each row links to a file or a JSON receipt that proves the claim.
+**Note:** receipts named `R*_*.json` are mirrored from `versions/v3_arcadia/results/` to `FINAL_SUBMIT/receipts/`.
 ---
 |---|---|---|---|
 | 44 | Custom TFT 513K params on 3-target FRED | ✅ | `tft_real_metrics.json: params, test_mae_p50: {DCOILWTICO, PCOPPUSDM, PPICMM}` |
 | 45 | Custom TFT 90K params on real WTI MAE $7.83/bbl | ✅ | `tft_v2_metrics.json: params, test_mae_p50.DCOILWTICO` |
+| 46 | Chronos-Bolt 14-step quantile [0.1, 0.5, 0.9] | ✅ | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:53-71` (pass-10), `R3_TIMESFM_QUANTILE.json` |
 | 47 | TimesFM-2 + synthesized quantile via residual regression | ✅ | `forecast_v2/ensemble_brent.py:74-99`, `R3_TIMESFM_QUANTILE.json` |
 | 48 | Prophet weekly+yearly | ✅ | `R3_PAST_SELF.json` ensemble row (Prophet seasonality) |
 | 49 | ARIMA(5,1,0) classical baseline | ✅ | `R3_PAST_SELF.json` ensemble row |
 | 55 | 8 FRED targets (WTI, copper, EUR/USD, JPY/USD, CNY/USD, KRW/USD, TWD/USD, PPICMM) | ✅ | `tft_v2_metrics.json: targets` (7 targets confirmed in train_tft_real.py: DCOILWTICO/PCOPPUSDM/DEXTAUS/DEXKOUS/DEXJPUS/DEXUSEU/DEXCHUS); 8th = PPICMM in tft_real_metrics |
 | 56 | 3 horizons (7, 14, 28 days) | ⚠️ | `train_tft_real.py:39 HORIZON=14`; 7 and 28-day variants in `R3_PAST_SELF` rolling_backtest fields |
 | 57 | PICP@80/90/95% calibration | ✅ | `R3_PAST_SELF.json: per_target_horizon.picp_*` |
+| 58 | Per-horizon split-conformal (Foygel Barber 2022) | ✅ | `versions/v5_phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
 | 59 | TimesFM-CP residual quantile regression | ✅ | `R3_TIMESFM_QUANTILE.json` + `R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
 | 60 | Heteroscedastic Ridge widths | ✅ | `R3_PAST_SELF.json: ridge_widths` |
 | 61 | 2,883 business days (2015-2026) | ✅ | `tft_v2_metrics.json: train_size`, `rl/data/fred_cache.json` |
 |---|---|---|---|
 | 63 | MC Dropout 50 forward passes | ✅ | `rl/uncertainty.py:1-50, n_passes=50`, `mc_dropout_v2.json` |
 | 64 | Epistemic σ correlates with accuracy (Q1=99.76%, Q4=55.92%) | ✅ | `mc_dropout_v2.json: reliability_full[bins]` |
+| 65 | Conformal RL on Q-values (3 alpha 0.05/0.05/0.1) | ✅ | `versions/v4_arcadia_live/features/conformal_rl.py:1-50` + `versions/v5_phoenix/action_v2/conformal.py` |
 | 66 | Confidence-damped projection | ✅ | `rl/uncertainty.py: confidence_damping` + `crisis_library.py: damp_on_weak_match` |
 | 67 | Beta-severity + Lognormal-duration MC | ✅ | `rl/surrogate/fast_monte_carlo.py: scenarios` |
 | 68 | Numba JIT MC hotloop (10-50× speedup) | ✅ | `rl/surrogate/fast_monte_carlo.py: @numba.jit` |
 | 86 | 26 BEIR Wikipedia subset | ✅ | `R5_BEIR_MANUAL.json` |
 | 87 | ChromaDB persistent at rl/rag/chroma_db/ | ✅ | dir present |
 | 88 | Ollama nomic-embed-text (768d) | ✅ | `rl/rag/indexer.py:29-30 EMBEDDING_MODEL=nomic-embed-text` |
+| 89 | mxbai-embed-large for crisis library | ✅ | `versions/v4_arcadia_live/scenarios/library_v2_search.py` (pass-6) |
 | 90 | Corpus SHA-256 hash caching | ⚠️ | grep finds `corpus_hash` references in some scripts; not in indexer.py directly |
 | 91 | min_score=0.60 | ✅ | `rl/rag/indexer.py:31 MIN_SCORE=0.60` |
 | 92 | chunk_words=256, overlap=32, min=30 | ⚠️ | `indexer.py:32-33 chunk_words=300` (slightly different); overlap+min not in source |
 | 128 | 50 cached explanations | ✅ | cache implementation present |
 | 129 | 3-4s per explanation on RTX 4080 | ✅ | latency profiled |
 | 130 | Explainer stress test 50/50 pass | ✅ | `explainer_stress_v2.json: n_test=50, passed=50, pass_rate=1.0` (exact) |
+| 131 | GCN edge attention PNG heatmaps | ✅ | `versions/v4_arcadia_live/features/gcn_attention_viz.py` |
+| 132 | Provenance 5-tier trust classifier (regulatory/academic/reference/industry/uncertain) | ✅ | `versions/v4_arcadia_live/features/rag_provenance.py:39-49` (5 tiers) |
 **Section I total: 13 ✅ + 1 ⚠️ = 14/14 = 100%**

FINAL_SUBMIT/FEATURE_INVENTORY_JT.md CHANGED Viewed

@@ -48,7 +48,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 |---|---|---|---|
 | 21 | NSGA2 via pymoo | ✅ | `rl/pareto/*` (pymoo import) |
 | 22 | 3 objectives (cost, resilience_loss, carbon) | ✅ | `pareto_results.json: objective_names` exact |
-| 23 | Carbon factors per IMO/EPA/ICAO | ✅ | `ShAuRyA_Supplymind/features/pareto_carbon.py` constants |
 | 24 | Air 0.82 / Sea 0.013 / Sea express 0.026 / Rail 0.028 / Road 0.096 kg CO2/tonne-km | ✅ | constants in source |
 | 25 | 20 mitigation plans tested | ⚠️ | `pareto_results.json: n_policies=5` (smaller run); 20-plan run may be older or in `pareto_frontier_v2.json` |
 | 26 | 11 Pareto-frontier plans (55%) | ⚠️ | current receipts 2/5 and 3/5; 11/20 from older run |
@@ -73,11 +73,11 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | 36 | GPU MC: 1 state → 100K with noise linspace(0.01-0.3) | ✅ | `rl/surrogate/gpu_monte_carlo.py` |
 | 37 | 80ms for 100K scenarios | ✅ | profiled in module |
 | 38 | p5/p50/p95/p99/cvar_10 outputs | ✅ | gpu_monte_carlo.py |
-| 39 | Counterfactual digital twin (100 rollouts MC) | ✅ | `ShAuRyA_Phoenix/counterfactual_twin/twin.py` |
 | 40 | REVENUE_AT_RISK_USD: easy $200M / med $320M / hard $400M | ✅ | constants in twin.py |
 | 41 | Severity multiplier 0.5 + 1.0 × clamp(severity, 0, 1) | ✅ | twin.py formula |
 | 42 | TwinReport dataclass (median, p95, savings, CI95, savings_pct) | ✅ | twin.py |
-| 43 | Receipt: $178.68M saved (48%) at sev=0.85, brent=$123, n=30 | ✅ | `ShAuRyA_Phoenix/receipts_v2/V5_Twin_savings_gt_zero.receipt.yaml` |
 **M: 13/13 = 100%**
@@ -87,7 +87,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
-| 44 | NewsAPI (5 keyword queries, 7-day, 100 req/day) | ✅ | `ShAuRyA_Supplymind/realtime/sources/newsapi.py` |
 | 45 | GDELT 2.0 Doc API (15-min refresh, tone severity) | ✅ | `sources/gdelt.py` |
 | 46 | USGS M4.5+ in last 24h, 6 region boxes | ✅ | `sources/usgs.py` |
 | 47 | FRED Brent DCOILBRENTEU daily spot | ✅ | `sources/fred_brent.py` |
@@ -96,7 +96,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | 50 | FRED severity max(\|DoD\|/5%, \|WoW\|/10%) capped 1.0 | ✅ | `fred_brent.py: compute_severity` |
 | 51 | GDELT tone-derived severity | ✅ | `gdelt.py: tone_to_severity` |
 | 52 | USGS magnitude-based severity | ✅ | `usgs.py: magnitude_to_severity` |
-| 53 | SQLite events.db with full schema | ✅ | `ShAuRyA_Supplymind/realtime/store.py: DB_PATH` |
 | 54 | 4 indices (source+hash, ts, region, type) | ✅ | `store.py: CREATE INDEX` × 4 |
 | 55 | SHA-256 dedup hash 16 chars | ✅ | `store.py: hashlib.sha256(...).hexdigest()[:16]` |
 | 56 | 24-hour dedup window | ✅ | `store.py: DEDUP_WINDOW_S = 86400` |
@@ -112,7 +112,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
-| 60 | 8 hand-curated real events (2022-2026) | ✅ | `ShAuRyA_Supplymind/scenarios/iran_israel_hormuz_2024_2026.json: 8 events` exact |
 | 61 | 3-4 citations per event (Reuters/BBC/CNBC/FRED/IDF/DoD/UNCTAD/Lloyd's) | ✅ | each event.citations[] in JSON has 3-4 entries with publisher field |
 | 62 | Curation policy ≥3 citations | ✅ | grep `citations` in v1 library |
 | 63 | mxbai embedding mode | ✅ | `realtime/crisis_library.py` SentenceTransformer |
@@ -172,7 +172,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | 110 | CatBoost (1500 iters, depth 8, GPU) | ✅ | DataCo training receipt |
 | 111 | TabPFN-v2 classifier (zero-shot) | ✅ | `tabpfn_verify.json + tabpfn_risk_judge.py` |
 | 112 | TabPFN-v2 regressor | ✅ | wired in pass-10 ensemble |
-| 113 | TabPFN bagging | ✅ | `v3_arcadia/10_caramel/r2_tabpfn_bagging.py` + `R2_BENEFIT_FIX.json` |
 | 114 | Stacking with Ridge meta-learner | ✅ | `R3_STACKING_V2.json` |
 | 115 | 5-fold CV | ✅ | rolling-fold in stacking |
 | 116 | OOF predictions | ✅ | `R3_STACKING_V2.json: oof_predictions` |
@@ -194,7 +194,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
-| 126 | Political risk GBR R²=0.994, MAE=0.0095 on 214 countries | ✅ | `ShAuRyA_Supplymind/features/political_risk.py + receipts/F12_*.json` |
 | 127 | Political risk LSTM | ✅ | alternate model in same module |
 | 128 | Dependency MLP acc=97.45% on 144K | ✅ | `features/dependency_mlp.py + F11_*.json` |
 | 129 | Financial impact Ridge R²=0.736, MAE=$26.04 | ✅ | `features/financial_ridge.py + F8_*.json` |
@@ -241,7 +241,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | 152 | R6_MaskingAblation_easy_lift = 26.768% | ✅ | `R6_GETHSEMANE_MASKING_ABLATION.json` |
 | 153 | R6_GCN_easy_MAE_vs_MLP = 48.025% | ✅ | `R6_PROVIDER_V2.json: easy.improvement_vs_mlp_pct = 48.025` exact |
 | 154 | R6_AquaRegia_WTI_dev95 = 0.0238 | ✅ | `R6_AQUA_REGIA_V2.json` |
-| 155 | R3_TimesFM_CP_WTI_dev95 = 0.050 | ✅ | `ShAuRyA_Phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
 | 156 | V4_SPOF_V2_F1 = 1.0 | ✅ | F23 receipt |
 | 157 | V4_STACKING_V2_lift_vs_WV = 0.0045 | ✅ | `R3_STACKING_V2.json` |
 | 158 | V4_Live_Brent_202604 = $123.28 | ✅ | live FRED fetch on 2026-04-21 |
@@ -253,15 +253,15 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
 | 164 | V5_Arena_baseline_leaderboard = 6 baselines | ✅ | `R6_ALGO_COMPARISON.json` per_algorithm has 4 + 2 implicit = 6 |
 | 165 | V5_Twin_savings_gt_zero = $178,684,200 | ✅ | twin receipt |
 | 166 | V5_DPO_JUDGE_preference_pairs_built = 21 | ✅ | `dpo_judge/data/preference_pairs.jsonl: 21 lines` exact |
-| 167 | V5_Skill_pack_shipped = 4 files | ✅ | `ShAuRyA_Phoenix/supplymind_skills/*` 4+ skills |
 | 168 | V5_Phoenix_tests_green = 15 passed | ✅ | phoenix smoke = 15 |
-| 169 | SHA-256 stdout tracking | ✅ | `ShAuRyA_Phoenix/receipts_v2/framework.py` |
 | 170 | Hardware capture (CUDA detection) | ✅ | framework.py |
 | 171 | Runtime tracking | ✅ | framework.py |
 | 172 | 5 comparators (==, >=, <=, in_range, regex) | ✅ | framework.py |
-| 173 | Tamper-evident SHA-256 + INDEX.json + INDEX.md auto-generated | ✅ | `ShAuRyA_Phoenix/receipts_v2/INDEX.{json,md}` |
 | 174 | Tiny YAML parser (no PyYAML dep) | ✅ | framework.py |
-| 175 | 271-line framework.py | ✅ | `wc -l ShAuRyA_Phoenix/receipts_v2/framework.py` |
 **T: 28/28 = 100%**

 |---|---|---|---|
 | 21 | NSGA2 via pymoo | ✅ | `rl/pareto/*` (pymoo import) |
 | 22 | 3 objectives (cost, resilience_loss, carbon) | ✅ | `pareto_results.json: objective_names` exact |
+| 23 | Carbon factors per IMO/EPA/ICAO | ✅ | `versions/v4_arcadia_live/features/pareto_carbon.py` constants |
 | 24 | Air 0.82 / Sea 0.013 / Sea express 0.026 / Rail 0.028 / Road 0.096 kg CO2/tonne-km | ✅ | constants in source |
 | 25 | 20 mitigation plans tested | ⚠️ | `pareto_results.json: n_policies=5` (smaller run); 20-plan run may be older or in `pareto_frontier_v2.json` |
 | 26 | 11 Pareto-frontier plans (55%) | ⚠️ | current receipts 2/5 and 3/5; 11/20 from older run |
 | 36 | GPU MC: 1 state → 100K with noise linspace(0.01-0.3) | ✅ | `rl/surrogate/gpu_monte_carlo.py` |
 | 37 | 80ms for 100K scenarios | ✅ | profiled in module |
 | 38 | p5/p50/p95/p99/cvar_10 outputs | ✅ | gpu_monte_carlo.py |
+| 39 | Counterfactual digital twin (100 rollouts MC) | ✅ | `versions/v5_phoenix/counterfactual_twin/twin.py` |
 | 40 | REVENUE_AT_RISK_USD: easy $200M / med $320M / hard $400M | ✅ | constants in twin.py |
 | 41 | Severity multiplier 0.5 + 1.0 × clamp(severity, 0, 1) | ✅ | twin.py formula |
 | 42 | TwinReport dataclass (median, p95, savings, CI95, savings_pct) | ✅ | twin.py |
+| 43 | Receipt: $178.68M saved (48%) at sev=0.85, brent=$123, n=30 | ✅ | `versions/v5_phoenix/receipts_v2/V5_Twin_savings_gt_zero.receipt.yaml` |
 **M: 13/13 = 100%**
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
+| 44 | NewsAPI (5 keyword queries, 7-day, 100 req/day) | ✅ | `versions/v4_arcadia_live/realtime/sources/newsapi.py` |
 | 45 | GDELT 2.0 Doc API (15-min refresh, tone severity) | ✅ | `sources/gdelt.py` |
 | 46 | USGS M4.5+ in last 24h, 6 region boxes | ✅ | `sources/usgs.py` |
 | 47 | FRED Brent DCOILBRENTEU daily spot | ✅ | `sources/fred_brent.py` |
 | 50 | FRED severity max(\|DoD\|/5%, \|WoW\|/10%) capped 1.0 | ✅ | `fred_brent.py: compute_severity` |
 | 51 | GDELT tone-derived severity | ✅ | `gdelt.py: tone_to_severity` |
 | 52 | USGS magnitude-based severity | ✅ | `usgs.py: magnitude_to_severity` |
+| 53 | SQLite events.db with full schema | ✅ | `versions/v4_arcadia_live/realtime/store.py: DB_PATH` |
 | 54 | 4 indices (source+hash, ts, region, type) | ✅ | `store.py: CREATE INDEX` × 4 |
 | 55 | SHA-256 dedup hash 16 chars | ✅ | `store.py: hashlib.sha256(...).hexdigest()[:16]` |
 | 56 | 24-hour dedup window | ✅ | `store.py: DEDUP_WINDOW_S = 86400` |
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
+| 60 | 8 hand-curated real events (2022-2026) | ✅ | `versions/v4_arcadia_live/scenarios/iran_israel_hormuz_2024_2026.json: 8 events` exact |
 | 61 | 3-4 citations per event (Reuters/BBC/CNBC/FRED/IDF/DoD/UNCTAD/Lloyd's) | ✅ | each event.citations[] in JSON has 3-4 entries with publisher field |
 | 62 | Curation policy ≥3 citations | ✅ | grep `citations` in v1 library |
 | 63 | mxbai embedding mode | ✅ | `realtime/crisis_library.py` SentenceTransformer |
 | 110 | CatBoost (1500 iters, depth 8, GPU) | ✅ | DataCo training receipt |
 | 111 | TabPFN-v2 classifier (zero-shot) | ✅ | `tabpfn_verify.json + tabpfn_risk_judge.py` |
 | 112 | TabPFN-v2 regressor | ✅ | wired in pass-10 ensemble |
+| 113 | TabPFN bagging | ✅ | `versions/v3_arcadia/10_caramel/r2_tabpfn_bagging.py` + `R2_BENEFIT_FIX.json` |
 | 114 | Stacking with Ridge meta-learner | ✅ | `R3_STACKING_V2.json` |
 | 115 | 5-fold CV | ✅ | rolling-fold in stacking |
 | 116 | OOF predictions | ✅ | `R3_STACKING_V2.json: oof_predictions` |
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
+| 126 | Political risk GBR R²=0.994, MAE=0.0095 on 214 countries | ✅ | `versions/v4_arcadia_live/features/political_risk.py + receipts/F12_*.json` |
 | 127 | Political risk LSTM | ✅ | alternate model in same module |
 | 128 | Dependency MLP acc=97.45% on 144K | ✅ | `features/dependency_mlp.py + F11_*.json` |
 | 129 | Financial impact Ridge R²=0.736, MAE=$26.04 | ✅ | `features/financial_ridge.py + F8_*.json` |
 | 152 | R6_MaskingAblation_easy_lift = 26.768% | ✅ | `R6_GETHSEMANE_MASKING_ABLATION.json` |
 | 153 | R6_GCN_easy_MAE_vs_MLP = 48.025% | ✅ | `R6_PROVIDER_V2.json: easy.improvement_vs_mlp_pct = 48.025` exact |
 | 154 | R6_AquaRegia_WTI_dev95 = 0.0238 | ✅ | `R6_AQUA_REGIA_V2.json` |
+| 155 | R3_TimesFM_CP_WTI_dev95 = 0.050 | ✅ | `versions/v5_phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
 | 156 | V4_SPOF_V2_F1 = 1.0 | ✅ | F23 receipt |
 | 157 | V4_STACKING_V2_lift_vs_WV = 0.0045 | ✅ | `R3_STACKING_V2.json` |
 | 158 | V4_Live_Brent_202604 = $123.28 | ✅ | live FRED fetch on 2026-04-21 |
 | 164 | V5_Arena_baseline_leaderboard = 6 baselines | ✅ | `R6_ALGO_COMPARISON.json` per_algorithm has 4 + 2 implicit = 6 |
 | 165 | V5_Twin_savings_gt_zero = $178,684,200 | ✅ | twin receipt |
 | 166 | V5_DPO_JUDGE_preference_pairs_built = 21 | ✅ | `dpo_judge/data/preference_pairs.jsonl: 21 lines` exact |
+| 167 | V5_Skill_pack_shipped = 4 files | ✅ | `versions/v5_phoenix/supplymind_skills/*` 4+ skills |
 | 168 | V5_Phoenix_tests_green = 15 passed | ✅ | phoenix smoke = 15 |
+| 169 | SHA-256 stdout tracking | ✅ | `versions/v5_phoenix/receipts_v2/framework.py` |
 | 170 | Hardware capture (CUDA detection) | ✅ | framework.py |
 | 171 | Runtime tracking | ✅ | framework.py |
 | 172 | 5 comparators (==, >=, <=, in_range, regex) | ✅ | framework.py |
+| 173 | Tamper-evident SHA-256 + INDEX.json + INDEX.md auto-generated | ✅ | `versions/v5_phoenix/receipts_v2/INDEX.{json,md}` |
 | 174 | Tiny YAML parser (no PyYAML dep) | ✅ | framework.py |
+| 175 | 271-line framework.py | ✅ | `wc -l versions/v5_phoenix/receipts_v2/framework.py` |
 **T: 28/28 = 100%**

FINAL_SUBMIT/FEATURE_INVENTORY_UBB.md CHANGED Viewed

@@ -8,7 +8,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
-| 1 | Karpathy-pattern overnight loop | ✅ | `ShAuRyA_Phoenix/autoresearch_fixed/orchestrator.py` |
 | 2 | LLM hypothesis generation (Qwen-14B local or Claude) | ✅ | `hypothesis_engine.py` |
 | 3 | Mutable `candidate_train.py` with safe-to-modify markers | ✅ | `autoresearch_fixed/candidate_train.py` |
 | 4 | Frozen `program.md` (immutable) | ✅ | `autoresearch_fixed/program.md` |
@@ -42,11 +42,11 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
-| 26 | Counterfactual digital twin 100 rollouts MC | ✅ | `ShAuRyA_Phoenix/counterfactual_twin/twin.py` |
 | 27 | Arena leaderboard 6 baselines pre-seeded | ✅ | `arena/leaderboard.json: n_baselines=6` exact |
 | 28 | MaskablePPO #1 mean=2.209 CI95=[2.178,2.239] | ✅ | `arena/leaderboard.json: rows[0] = MaskablePPO-v3 (ours), overall_reward_mean=2.209, overall_ci95=[2.178,2.239]` EXACT |
 | 29 | runner.py with TaskResult + ArenaResult dataclasses | ✅ | `arena/runner.py` |
-| 30 | 3 Claude Code skills (benchmark-runner, autoresearch-experiment, live-demo-orchestrator) | ✅ | `ShAuRyA_Phoenix/supplymind_skills/` 3 dirs |
 | 31 | plugin.json v1.0.0 manifest | ✅ | `supplymind_skills/plugin.json` |
 | 32 | Replay cache 8 events frozen | ✅ | `realtime_v5/replay_cache_latest.json: n_events=8` exact |
 | 33 | replay_cache_latest.json + timestamped snapshot | ✅ | dir contains both |
@@ -56,7 +56,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | 37 | DPO 21 pairs Qwen-2.5-3B LoRA r=8 | ✅ | `dpo_judge/data/preference_pairs.jsonl` 21 lines |
 | 38 | TRL fallback for ROLL fragility | ✅ | `dpo_judge/train_dpo_trl.py` |
 | 39 | Two upstream PRs ready (Meta OpenEnv + Alibaba ROLL) | ✅ | `docs/PHOENIX_PUSH_REPORT.md` |
-| 40 | build_pr_branch.sh | ✅ | `ShAuRyA_Phoenix/build_pr_branch.sh` |
 | 41 | Phoenix isolation (v3+v4 untouched) + copy-before-edit + .venv-roll/ | ✅ | docs note |
 | 42 | phoenix_app.py mounts /arena /twin /replay + /phoenix/status | ✅ | `phoenix_app.py` + server/app.py mount |
@@ -74,7 +74,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | 46 | Non-root appuser UID 1000 | ✅ | Dockerfile RUN useradd |
 | 47 | HEALTHCHECK curl /health every 30s | ✅ | Dockerfile HEALTHCHECK |
 | 48 | uvicorn server.app:app entry | ✅ | Dockerfile CMD |
-| 49 | HF Space at huggingface.co/spaces/Shaurya-Noodle/Supplymind | ✅ | `DEPLOY_HF_SPACE.md` |
 | 50 | ONNX <5e-5 roundtrip 4 models | ✅ | `onnx_roundtrip.json` (BC 3.05e-5, CQL 5.22e-8, IQL 3.05e-5, TD3+BC 1.53e-5) all <5e-5 |
 | 51 | .gitignore excludes 159GB models/ | ✅ | `.gitignore` line `models/` |
 | 52 | <2GB container size | ✅ | DEPLOY_HF_SPACE notes |
@@ -153,7 +153,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | 107 | 15+ disruption taxonomy | ✅ | `server/data/disruptions.json` |
 | 108 | 15 leading indicators with correlations | ✅ | `rl/leading_indicators.py` |
 | 109 | FRED state[400:407] features | ✅ | `rl/state_builder.py` slice |
-| 110 | 40+ industry citations DATA_SOURCES.md | ✅ | `DATA_SOURCES.md` |
 **Y: 21 ✅ + 2 ⚠️ = 23/23 = 100%**
@@ -164,42 +164,42 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | # | Doc | Status | Path |
 |---|---|---|---|
 | 111 | README.md (40KB) | ✅ | repo root |
-| 112 | SUPPLYMIND_BLUEPRINT.md (81KB) | ✅ | repo root |
-| 113 | ALIENWARE_KICKOFF.md (53KB) | ✅ | repo root |
-| 114 | AUDIT_PLAN.md (22KB) | ✅ | repo root |
 | 115 | MODEL_CARD.md (19KB) | ✅ | repo root |
-| 116 | PYTORCH_STORY.md | ✅ | repo root |
-| 117 | BENCHMARKS_VS_PUBLIC.md | ✅ | repo root |
-| 118 | DATA_SOURCES.md | ✅ | repo root |
-| 119 | EXTERNAL_CREDIBILITY.md | ✅ | repo root |
-| 120 | JUDGES.md | ✅ | repo root |
-| 121 | FINAL_DEMO.md | ✅ | repo root |
-| 122 | DEMO_SCRIPT.md | ✅ | repo root |
-| 123 | DEPLOY_HF_SPACE.md | ✅ | repo root |
-| 124 | EXECUTIVE_SUMMARY.md | ✅ | repo root |
-| 125 | RESULTS.md | ✅ | repo root |
 | 126 | CLONE_AND_STUDY.md | ✅ | docs/ |
 | 127 | FINAL_AUDIT_REPORT.md | ✅ | docs/ |
 | 128 | MULTI_TURN_GRPO_ROADMAP.md | ✅ | docs/ |
 | 129 | LIVE_DEMO_HORMUZ.md | ✅ | demo/ or root |
-| 130 | PREPRINT.md | ✅ | ShAuRyA_Supplymind/docs/ |
-| 131 | PREPRINT_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
 | 132 | PITCH_DECK.md | ✅ | demo/ |
-| 133 | PITCH_DECK_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
 | 134 | DEMO_VIDEO_SCRIPT.md | ✅ | demo/ |
-| 135 | DEMO_VIDEO_SCRIPT_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
-| 136 | JUDGES_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
 | 137 | CHECKLIST.md | ✅ | demo/ |
 | 138 | LANDING_PAGE.md | ✅ | demo/ |
 | 139 | EXTERNAL_OUTREACH.md | ✅ | demo/ |
 | 140 | SECRETS_ROTATION.md | ✅ | docs/ |
-| 141 | PHOENIX_PLAN_V5.md | ✅ | ShAuRyA_Supplymind/docs/ |
-| 142 | PHOENIX_COMPLETION_AUDIT.md | ✅ | ShAuRyA_Phoenix/docs/ |
-| 143 | PHOENIX_PUSH_REPORT.md | ✅ | ShAuRyA_Phoenix/docs/ |
 | 144 | HF_DEPLOY_V4.md | ✅ | docs/ |
 | 145 | R4_RUBRIC_CHALLENGE.md | ✅ | challenges/ |
 | 146 | FAILURE_TABLE.md | ✅ | repo root |
-| 147 | 12 Sleep Token album-track stages (00_emergence → 95_arcadia) | ✅ | `v3_arcadia/` 12 dirs verified exact |
 | 148 | Notebook 01_environment_quickstart | ✅ | `notebooks/01_environment_quickstart.ipynb` |
 | 149 | Notebook 02_training_your_own_agent | ✅ | `notebooks/02_*.ipynb` |
 | 150 | Notebook 03_reproducing_benchmarks | ✅ | same |
@@ -216,16 +216,16 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
-| 155 | Hero result card 10-number 2×5 grid | ✅ | `make_hero_card.py` + `v3_arcadia/plots/hero_*.png` |
 | 156 | make_hero_card.py | ✅ | repo |
-| 157 | Caramel reliability calibration curves | ✅ | `v3_arcadia/plots/r2_caramel_*` |
-| 158 | R4 dangerous 7 plots | ✅ | `v3_arcadia/plots/r4_dangerous_*.png` |
-| 159 | R5 granite 5 plots | ✅ | `v3_arcadia/plots/r5_granite_*.png` |
-| 160 | R6 gethsemane 3 plots | ✅ | `v3_arcadia/plots/r6_gethsemane_*.png` |
-| 161 | R3 past-self 2 plots | ✅ | `v3_arcadia/plots/r3_past_self_*.png` |
-| 162 | R6 provider network graph | ✅ | `v3_arcadia/plots/r6_provider_graph.png` |
-| 163 | R6 euclidian bootstrap CI bands | ✅ | `v3_arcadia/plots/r6_euclidian_*.png` |
-| 164 | R6 aqua-regia coverage plot | ✅ | `v3_arcadia/plots/r6_aqua_regia_coverage.png` |
 | 165 | GCN attention heatmaps 3 graphs | ✅ | `rl/gnn/attention.py` outputs PNG |
 | 166 | Streamlit dashboard 12 panels | ✅ | `dashboard/streamlit_app.py` |
 | 167 | Pareto 3D scatter Plotly | ✅ | `rl/pareto/visualize.py` |
@@ -246,7 +246,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | 173 | Two-pass DeepSeek extraction (free CoT → Qwen JSON parse) | ✅ | `R4_DANGEROUS_V2.json: extractor field` 100% parse rate |
 | 174 | Phoenix isolation guarantee 3 layers | ✅ | `PHOENIX_COMPLETION_AUDIT.md` |
 | 175 | Copy-before-edit discipline | ✅ | `PHOENIX_PUSH_REPORT.md` |
-| 176 | Tiny YAML parser (no PyYAML) | ✅ | `ShAuRyA_Phoenix/receipts_v2/framework.py` |
 | 177 | _corpus_hash SHA-256 embedding cache invalidation | ✅ | `crisis_library.py: corpus_hash` |
 | 178 | Token-bucket OpenRouter limiter | ✅ | `openrouter_client.py: per_minute=18` |
 | 179 | .openrouter_cache/ API caching | ✅ | dir exists |
@@ -263,7 +263,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | 190 | Honest fallback labeling | ✅ | `data_source_flags.live_pipeline = "deterministic_rubric_fallback"` |
 | 191 | judge_source field | ✅ | `_call_ollama_judge: judge_source = ollama:<model>` |
 | 192 | Scenario JSON ingestion_note | ✅ | crisis library schema |
-| 193 | 4-minute judge path designed | ✅ | `JUDGES.md` |
 | 194 | 30-second receipt verification target | ✅ | `framework.py` design |
 | 195 | Sleep Token thesis "Even in Arcadia, disruptions happen" | ✅ | tagline in docs |
@@ -301,7 +301,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
 | 6 baselines pre-seeded | `n_baselines=6` ✅ EXACT |
 | Replay cache 8 events | `replay_cache_latest.json: n_events=8` ✅ EXACT |
 | Phoenix INDEX 20 receipts | `INDEX.json: list[20]` ✅ EXACT |
-| 12 Sleep Token stages | `v3_arcadia/` 12 dirs ✅ EXACT |
 | 125 .md docs | `find *.md` 125 ✅ |
 | 4 ONNX <5e-5 | onnx_roundtrip ✅ |
 | Token-bucket 18 req/min, 950 req/day | `openrouter_client.py` ✅ EXACT |

 | # | Bullet | Status | Evidence |
 |---|---|---|---|
+| 1 | Karpathy-pattern overnight loop | ✅ | `versions/v5_phoenix/autoresearch_fixed/orchestrator.py` |
 | 2 | LLM hypothesis generation (Qwen-14B local or Claude) | ✅ | `hypothesis_engine.py` |
 | 3 | Mutable `candidate_train.py` with safe-to-modify markers | ✅ | `autoresearch_fixed/candidate_train.py` |
 | 4 | Frozen `program.md` (immutable) | ✅ | `autoresearch_fixed/program.md` |
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
+| 26 | Counterfactual digital twin 100 rollouts MC | ✅ | `versions/v5_phoenix/counterfactual_twin/twin.py` |
 | 27 | Arena leaderboard 6 baselines pre-seeded | ✅ | `arena/leaderboard.json: n_baselines=6` exact |
 | 28 | MaskablePPO #1 mean=2.209 CI95=[2.178,2.239] | ✅ | `arena/leaderboard.json: rows[0] = MaskablePPO-v3 (ours), overall_reward_mean=2.209, overall_ci95=[2.178,2.239]` EXACT |
 | 29 | runner.py with TaskResult + ArenaResult dataclasses | ✅ | `arena/runner.py` |
+| 30 | 3 Claude Code skills (benchmark-runner, autoresearch-experiment, live-demo-orchestrator) | ✅ | `versions/v5_phoenix/supplymind_skills/` 3 dirs |
 | 31 | plugin.json v1.0.0 manifest | ✅ | `supplymind_skills/plugin.json` |
 | 32 | Replay cache 8 events frozen | ✅ | `realtime_v5/replay_cache_latest.json: n_events=8` exact |
 | 33 | replay_cache_latest.json + timestamped snapshot | ✅ | dir contains both |
 | 37 | DPO 21 pairs Qwen-2.5-3B LoRA r=8 | ✅ | `dpo_judge/data/preference_pairs.jsonl` 21 lines |
 | 38 | TRL fallback for ROLL fragility | ✅ | `dpo_judge/train_dpo_trl.py` |
 | 39 | Two upstream PRs ready (Meta OpenEnv + Alibaba ROLL) | ✅ | `docs/PHOENIX_PUSH_REPORT.md` |
+| 40 | build_pr_branch.sh | ✅ | `versions/v5_phoenix/build_pr_branch.sh` |
 | 41 | Phoenix isolation (v3+v4 untouched) + copy-before-edit + .venv-roll/ | ✅ | docs note |
 | 42 | phoenix_app.py mounts /arena /twin /replay + /phoenix/status | ✅ | `phoenix_app.py` + server/app.py mount |
 | 46 | Non-root appuser UID 1000 | ✅ | Dockerfile RUN useradd |
 | 47 | HEALTHCHECK curl /health every 30s | ✅ | Dockerfile HEALTHCHECK |
 | 48 | uvicorn server.app:app entry | ✅ | Dockerfile CMD |
+| 49 | HF Space at huggingface.co/spaces/Shaurya-Noodle/Supplymind | ✅ | `docs/v3/DEPLOY_HF_SPACE.md` |
 | 50 | ONNX <5e-5 roundtrip 4 models | ✅ | `onnx_roundtrip.json` (BC 3.05e-5, CQL 5.22e-8, IQL 3.05e-5, TD3+BC 1.53e-5) all <5e-5 |
 | 51 | .gitignore excludes 159GB models/ | ✅ | `.gitignore` line `models/` |
 | 52 | <2GB container size | ✅ | DEPLOY_HF_SPACE notes |
 | 107 | 15+ disruption taxonomy | ✅ | `server/data/disruptions.json` |
 | 108 | 15 leading indicators with correlations | ✅ | `rl/leading_indicators.py` |
 | 109 | FRED state[400:407] features | ✅ | `rl/state_builder.py` slice |
+| 110 | 40+ industry citations in DATA_SOURCES.md | ✅ | `docs/core/DATA_SOURCES.md` |
 **Y: 21 ✅ + 2 ⚠️ = 23/23 = 100%**
 | # | Doc | Status | Path |
 |---|---|---|---|
 | 111 | README.md (40KB) | ✅ | repo root |
+| 112 | SUPPLYMIND_BLUEPRINT.md (81KB) | ✅ | docs/core/ |
+| 113 | ALIENWARE_KICKOFF.md (53KB) | ✅ | docs/dev_log/ |
+| 114 | AUDIT_PLAN.md (22KB) | ✅ | docs/v4/ |
 | 115 | MODEL_CARD.md (19KB) | ✅ | repo root |
+| 116 | PYTORCH_STORY.md | ✅ | docs/v3/ |
+| 117 | BENCHMARKS_VS_PUBLIC.md | ✅ | docs/v3/ |
+| 118 | DATA_SOURCES.md | ✅ | docs/core/ |
+| 119 | EXTERNAL_CREDIBILITY.md | ✅ | docs/core/ |
+| 120 | JUDGES.md | ✅ | docs/v4/ |
+| 121 | FINAL_DEMO.md | ✅ | docs/v3/ |
+| 122 | DEMO_SCRIPT.md | ✅ | docs/v3/ |
+| 123 | DEPLOY_HF_SPACE.md | ✅ | docs/v3/ |
+| 124 | EXECUTIVE_SUMMARY.md | ✅ | docs/v3/ |
+| 125 | RESULTS.md | ✅ | docs/v3/ |
 | 126 | CLONE_AND_STUDY.md | ✅ | docs/ |
 | 127 | FINAL_AUDIT_REPORT.md | ✅ | docs/ |
 | 128 | MULTI_TURN_GRPO_ROADMAP.md | ✅ | docs/ |
 | 129 | LIVE_DEMO_HORMUZ.md | ✅ | demo/ or root |
+| 130 | PREPRINT.md | ✅ | versions/v4_arcadia_live/docs/ |
+| 131 | PREPRINT_V5.md | ✅ | versions/v5_phoenix/docs/ |
 | 132 | PITCH_DECK.md | ✅ | demo/ |
+| 133 | PITCH_DECK_V5.md | ✅ | versions/v5_phoenix/docs/ |
 | 134 | DEMO_VIDEO_SCRIPT.md | ✅ | demo/ |
+| 135 | DEMO_VIDEO_SCRIPT_V5.md | ✅ | versions/v5_phoenix/docs/ |
+| 136 | JUDGES_V5.md | ✅ | versions/v5_phoenix/docs/ |
 | 137 | CHECKLIST.md | ✅ | demo/ |
 | 138 | LANDING_PAGE.md | ✅ | demo/ |
 | 139 | EXTERNAL_OUTREACH.md | ✅ | demo/ |
 | 140 | SECRETS_ROTATION.md | ✅ | docs/ |
+| 141 | PHOENIX_PLAN_V5.md | ✅ | versions/v4_arcadia_live/docs/ |
+| 142 | PHOENIX_COMPLETION_AUDIT.md | ✅ | versions/v5_phoenix/docs/ |
+| 143 | PHOENIX_PUSH_REPORT.md | ✅ | versions/v5_phoenix/docs/ |
 | 144 | HF_DEPLOY_V4.md | ✅ | docs/ |
 | 145 | R4_RUBRIC_CHALLENGE.md | ✅ | challenges/ |
 | 146 | FAILURE_TABLE.md | ✅ | repo root |
+| 147 | 12 Sleep Token album-track stages (00_emergence → 95_arcadia) | ✅ | `versions/v3_arcadia/` 12 dirs verified exact |
 | 148 | Notebook 01_environment_quickstart | ✅ | `notebooks/01_environment_quickstart.ipynb` |
 | 149 | Notebook 02_training_your_own_agent | ✅ | `notebooks/02_*.ipynb` |
 | 150 | Notebook 03_reproducing_benchmarks | ✅ | same |
 | # | Bullet | Status | Evidence |
 |---|---|---|---|
+| 155 | Hero result card 10-number 2×5 grid | ✅ | `make_hero_card.py` + `versions/v3_arcadia/plots/hero_*.png` |
 | 156 | make_hero_card.py | ✅ | repo |
+| 157 | Caramel reliability calibration curves | ✅ | `versions/v3_arcadia/plots/r2_caramel_*` |
+| 158 | R4 dangerous 7 plots | ✅ | `versions/v3_arcadia/plots/r4_dangerous_*.png` |
+| 159 | R5 granite 5 plots | ✅ | `versions/v3_arcadia/plots/r5_granite_*.png` |
+| 160 | R6 gethsemane 3 plots | ✅ | `versions/v3_arcadia/plots/r6_gethsemane_*.png` |
+| 161 | R3 past-self 2 plots | ✅ | `versions/v3_arcadia/plots/r3_past_self_*.png` |
+| 162 | R6 provider network graph | ✅ | `versions/v3_arcadia/plots/r6_provider_graph.png` |
+| 163 | R6 euclidian bootstrap CI bands | ✅ | `versions/v3_arcadia/plots/r6_euclidian_*.png` |
+| 164 | R6 aqua-regia coverage plot | ✅ | `versions/v3_arcadia/plots/r6_aqua_regia_coverage.png` |
 | 165 | GCN attention heatmaps 3 graphs | ✅ | `rl/gnn/attention.py` outputs PNG |
 | 166 | Streamlit dashboard 12 panels | ✅ | `dashboard/streamlit_app.py` |
 | 167 | Pareto 3D scatter Plotly | ✅ | `rl/pareto/visualize.py` |
 | 173 | Two-pass DeepSeek extraction (free CoT → Qwen JSON parse) | ✅ | `R4_DANGEROUS_V2.json: extractor field` 100% parse rate |
 | 174 | Phoenix isolation guarantee 3 layers | ✅ | `PHOENIX_COMPLETION_AUDIT.md` |
 | 175 | Copy-before-edit discipline | ✅ | `PHOENIX_PUSH_REPORT.md` |
+| 176 | Tiny YAML parser (no PyYAML) | ✅ | `versions/v5_phoenix/receipts_v2/framework.py` |
 | 177 | _corpus_hash SHA-256 embedding cache invalidation | ✅ | `crisis_library.py: corpus_hash` |
 | 178 | Token-bucket OpenRouter limiter | ✅ | `openrouter_client.py: per_minute=18` |
 | 179 | .openrouter_cache/ API caching | ✅ | dir exists |
 | 190 | Honest fallback labeling | ✅ | `data_source_flags.live_pipeline = "deterministic_rubric_fallback"` |
 | 191 | judge_source field | ✅ | `_call_ollama_judge: judge_source = ollama:<model>` |
 | 192 | Scenario JSON ingestion_note | ✅ | crisis library schema |
+| 193 | 4-minute judge path designed | ✅ | `docs/v4/JUDGES.md` |
 | 194 | 30-second receipt verification target | ✅ | `framework.py` design |
 | 195 | Sleep Token thesis "Even in Arcadia, disruptions happen" | ✅ | tagline in docs |
 | 6 baselines pre-seeded | `n_baselines=6` ✅ EXACT |
 | Replay cache 8 events | `replay_cache_latest.json: n_events=8` ✅ EXACT |
 | Phoenix INDEX 20 receipts | `INDEX.json: list[20]` ✅ EXACT |
+| 12 Sleep Token stages | `versions/v3_arcadia/` 12 dirs ✅ EXACT |
 | 125 .md docs | `find *.md` 125 ✅ |
 | 4 ONNX <5e-5 | onnx_roundtrip ✅ |
 | Token-bucket 18 req/min, 950 req/day | `openrouter_client.py` ✅ EXACT |

FINAL_SUBMIT/HACKATHON_README.md CHANGED Viewed

@@ -260,7 +260,7 @@ python scripts/generate_hackathon_plots.py             # all 7 plots
 ## 4.5 · RLVE adaptive curriculum + RLVR dual-verifier (per RL guide §22-23 + §31-33)
 ### RLVE adaptive curriculum controller
-File: [`ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py`](../ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py)
 Per RL guide §22-23 (procedural verifiable environments — beyond static RLVR):
 - **Tier 0** = 100 most-common 5-letter words (baseline)
@@ -274,7 +274,7 @@ Per RL guide §22-23 (procedural verifiable environments — beyond static RLVR)
 Smoke (200 episodes, synthetic policy): 4 tier shifts captured. Receipt: [`rlve_curriculum_smoke.json`](receipts/rlve_curriculum_smoke.json).
 ### RLVR dual-verifier framework
-File: [`ShAuRyA_Phoenix/wordle_env/dual_verifier.py`](../ShAuRyA_Phoenix/wordle_env/dual_verifier.py)
 Per RL guide §31-33 (rule-based verifiers brittle, model-based exploitable):
 - **Rule layer**: word ∈ dict, format valid, exact green/yellow scoring

 ## 4.5 · RLVE adaptive curriculum + RLVR dual-verifier (per RL guide §22-23 + §31-33)
 ### RLVE adaptive curriculum controller
+File: [`versions/v5_phoenix/wordle_env/rlve_curriculum.py`](../versions/v5_phoenix/wordle_env/rlve_curriculum.py)
 Per RL guide §22-23 (procedural verifiable environments — beyond static RLVR):
 - **Tier 0** = 100 most-common 5-letter words (baseline)
 Smoke (200 episodes, synthetic policy): 4 tier shifts captured. Receipt: [`rlve_curriculum_smoke.json`](receipts/rlve_curriculum_smoke.json).
 ### RLVR dual-verifier framework
+File: [`versions/v5_phoenix/wordle_env/dual_verifier.py`](../versions/v5_phoenix/wordle_env/dual_verifier.py)
 Per RL guide §31-33 (rule-based verifiers brittle, model-based exploitable):
 - **Rule layer**: word ∈ dict, format valid, exact green/yellow scoring

FINAL_SUBMIT/JUDGE_FAQ_30.md CHANGED Viewed

@@ -69,7 +69,7 @@ Global Fishing Watch — vessel positions feed into Hormuz/Red Sea route-disrupt
 Compatibility with PEFT 0.19 + Unsloth current pin. `requirements.txt` locks the stack.
 ### 23. "Reward function code?"
-`server/engine/rewards.py` (SupplyMind 7-component) + `ShAuRyA_Phoenix/wordle_env/env.py` (Wordle 6-component). Both verifiable.
 ### 24. "Forecasting baselines?"
 TFT (513,534 steps), TFT-v2, BigTFT (90,602), TimesFM zero-shot, Granite, Stacking-v3, Brent ensemble. NOAA 60.07% accuracy. Receipts each.

 Compatibility with PEFT 0.19 + Unsloth current pin. `requirements.txt` locks the stack.
 ### 23. "Reward function code?"
+`server/engine/rewards.py` (SupplyMind 7-component) + `versions/v5_phoenix/wordle_env/env.py` (Wordle 6-component). Both verifiable.
 ### 24. "Forecasting baselines?"
 TFT (513,534 steps), TFT-v2, BigTFT (90,602), TimesFM zero-shot, Granite, Stacking-v3, Brent ensemble. NOAA 60.07% accuracy. Receipts each.

FINAL_SUBMIT/JUDGE_OBJECTION_HANDBOOK.md CHANGED Viewed

@@ -26,7 +26,7 @@ Format: **Q** = the objection · **A** = the rebuttal · **Receipt** = the on-di
 **Q5**. "Why supply chain over a research-paper-novel domain?"
 **A**. Picked deliberately: (1) supply-chain has crisp economic verifiers (Brent prices, agency-published loss bands), (2) it has rich partial observability (20 live data sources), (3) it's professionally relevant (Theme 3 explicit fit). And it's underexplored in the OpenEnv community — most submissions are grid worlds or web tasks.
-**Receipt**: `DATA_SOURCES.md` lists 20 sources with their epistemic role.
 ---

 **Q5**. "Why supply chain over a research-paper-novel domain?"
 **A**. Picked deliberately: (1) supply-chain has crisp economic verifiers (Brent prices, agency-published loss bands), (2) it has rich partial observability (20 live data sources), (3) it's professionally relevant (Theme 3 explicit fit). And it's underexplored in the OpenEnv community — most submissions are grid worlds or web tasks.
+**Receipt**: `docs/core/DATA_SOURCES.md` lists 20 sources with their epistemic role.
 ---

FINAL_SUBMIT/MASTER_FEATURE_USECASE_MAP_250.md CHANGED Viewed

@@ -18,22 +18,22 @@ Sections A through BB + RL/RLVR/RLVE knowledge alignment.
 | A7 | 30-step episode horizon | `server/supply_environment.py` | bounded RL episode | reset config |
 | A8 | $5M-$15M budget tasks | `data/disruptions.json` | sparse-reward shaping | task manifest |
 | A9 | Real-world coordinates (TSMC, Samsung) | `data/companies_real.json` | Theme #3 Professional Tasks | n_real_nodes=40 |
-| A10 | 8 v1 events crisis library | `ShAuRyA_Supplymind/realtime/crisis_library.py` | RAG analog retrieval | 8 events indexed |
-| A11 | Wordle RLVR mini-env | `ShAuRyA_Phoenix/wordle_env/env.py` | canonical hackathon flow | `wordle_real_reinforce_curve.json` |
-| A12 | RLVE adaptive curriculum | `ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py` | §22-23 Procaccia-style | `rlve_curriculum_smoke.json` (4 tier shifts) |
 ## B. REWARD ENGINEERING — 14 features
 | # | Feature | File | Use case | Receipt |
 |---|---------|------|----------|---------|
 | B1 | 7-component shaped reward | `server/engine/rewards.py` | RL guide §7 multi-component | rewards module |
-| B2 | Format gate | `ShAuRyA_Phoenix/wordle_env/env.py` | reject malformed actions | adv-20 attacks 1-9 blocked |
-| B3 | Dictionary gate | `ShAuRyA_Phoenix/wordle_env/env.py` | reject non-dict words | adv-20 attack #10 blocked |
-| B4 | Timeout penalty | `ShAuRyA_Phoenix/wordle_env/env.py` | RL guide §15 timeout monitor | -0.2 if 6 guesses fail |
-| B5 | Solve bonus + step-count bonus | `ShAuRyA_Phoenix/wordle_env/env.py` | richer signal | ablation_matrix.json |
 | B6 | Green credit | env.py | per-letter success | ablation: -0.459 if removed |
 | B7 | Yellow credit | env.py | partial info credit | ablation: small drop if removed |
 | B8 | Process supervision (line-level) | `scripts/final_validation_bundle.py:process_supervision` | RL guide §9 Lightman 2023 | `process_supervision.json` (var amp 2735×) |
-| B9 | Dual-verifier composite | `ShAuRyA_Phoenix/wordle_env/dual_verifier.py` | rule × (0.5 + 0.5×model) | `dual_verifier_smoke.json` |
 | B10 | Disagreement alarm | `dual_verifier.py:DISAGREEMENT_THRESHOLD` | §43 anti-hacking monitoring | rolling alarm 0.30 |
 | B11 | Ablation receipts (5 components) | `final_validation_bundle.py` | leave-one-out analysis | `ablation_matrix.json` |
 | B12 | Variance reduction baseline | `final_real_reinforce_wordle.py` | Williams 1992 REINFORCE | running_baseline EMA |
@@ -98,8 +98,8 @@ Receipt: `adversarial_20_attack_gauntlet.json` (sha 082a3c57…)
 ## G. RAG / RETRIEVAL — 8 features
 | # | Feature | File | Use case | Receipt |
 |---|---------|------|----------|---------|
-| G1 | FAISS index | `ShAuRyA_Supplymind/realtime/store.py` | top-K retrieval | store.query_recent |
-| G2 | BGE-rerank | `ShAuRyA_Supplymind/realtime/rerank.py` | quality boost | falls back gracefully on Win |
 | G3 | Crisis library 8 events | `realtime/crisis_library.py` | analog retrieval | RAG against Iran/Hormuz/Suez |
 | G4 | NewsAPI live ingest | `realtime/news_ingest.py` | recent events | event store |
 | G5 | GDELT integration | `realtime/gdelt.py` | global events | event store |

 | A7 | 30-step episode horizon | `server/supply_environment.py` | bounded RL episode | reset config |
 | A8 | $5M-$15M budget tasks | `data/disruptions.json` | sparse-reward shaping | task manifest |
 | A9 | Real-world coordinates (TSMC, Samsung) | `data/companies_real.json` | Theme #3 Professional Tasks | n_real_nodes=40 |
+| A10 | 8 v1 events crisis library | `versions/v4_arcadia_live/realtime/crisis_library.py` | RAG analog retrieval | 8 events indexed |
+| A11 | Wordle RLVR mini-env | `versions/v5_phoenix/wordle_env/env.py` | canonical hackathon flow | `wordle_real_reinforce_curve.json` |
+| A12 | RLVE adaptive curriculum | `versions/v5_phoenix/wordle_env/rlve_curriculum.py` | §22-23 Procaccia-style | `rlve_curriculum_smoke.json` (4 tier shifts) |
 ## B. REWARD ENGINEERING — 14 features
 | # | Feature | File | Use case | Receipt |
 |---|---------|------|----------|---------|
 | B1 | 7-component shaped reward | `server/engine/rewards.py` | RL guide §7 multi-component | rewards module |
+| B2 | Format gate | `versions/v5_phoenix/wordle_env/env.py` | reject malformed actions | adv-20 attacks 1-9 blocked |
+| B3 | Dictionary gate | `versions/v5_phoenix/wordle_env/env.py` | reject non-dict words | adv-20 attack #10 blocked |
+| B4 | Timeout penalty | `versions/v5_phoenix/wordle_env/env.py` | RL guide §15 timeout monitor | -0.2 if 6 guesses fail |
+| B5 | Solve bonus + step-count bonus | `versions/v5_phoenix/wordle_env/env.py` | richer signal | ablation_matrix.json |
 | B6 | Green credit | env.py | per-letter success | ablation: -0.459 if removed |
 | B7 | Yellow credit | env.py | partial info credit | ablation: small drop if removed |
 | B8 | Process supervision (line-level) | `scripts/final_validation_bundle.py:process_supervision` | RL guide §9 Lightman 2023 | `process_supervision.json` (var amp 2735×) |
+| B9 | Dual-verifier composite | `versions/v5_phoenix/wordle_env/dual_verifier.py` | rule × (0.5 + 0.5×model) | `dual_verifier_smoke.json` |
 | B10 | Disagreement alarm | `dual_verifier.py:DISAGREEMENT_THRESHOLD` | §43 anti-hacking monitoring | rolling alarm 0.30 |
 | B11 | Ablation receipts (5 components) | `final_validation_bundle.py` | leave-one-out analysis | `ablation_matrix.json` |
 | B12 | Variance reduction baseline | `final_real_reinforce_wordle.py` | Williams 1992 REINFORCE | running_baseline EMA |
 ## G. RAG / RETRIEVAL — 8 features
 | # | Feature | File | Use case | Receipt |
 |---|---------|------|----------|---------|
+| G1 | FAISS index | `versions/v4_arcadia_live/realtime/store.py` | top-K retrieval | store.query_recent |
+| G2 | BGE-rerank | `versions/v4_arcadia_live/realtime/rerank.py` | quality boost | falls back gracefully on Win |
 | G3 | Crisis library 8 events | `realtime/crisis_library.py` | analog retrieval | RAG against Iran/Hormuz/Suez |
 | G4 | NewsAPI live ingest | `realtime/news_ingest.py` | recent events | event store |
 | G5 | GDELT integration | `realtime/gdelt.py` | global events | event store |

FINAL_SUBMIT/README.md CHANGED Viewed

@@ -37,13 +37,13 @@ http://127.0.0.1:8000/demo/master
 | Conformal action coverage | **0.9001** | `tests/receipts/conformal_calibration.json` |
 | Cross-corpus α (frontier 6, v2 EMDAT) | **0.5436** | `tests/receipts/cross_corpus_alpha.json` |
 | 12-frontier panel α (R4 corpus) | **0.5669** | `tests/receipts/panel_agreement_R4.json` |
-| HetGAT vs v1 GCN MAE | **+7.77 / +12.15 / +10.03 %** | `ShAuRyA_Phoenix/experiments/hetgat_v1/report.json` |
-| RAP-XC training loss | BC **5.62 → 0.23** | `ShAuRyA_Phoenix/experiments/rap_xc_v1/rapxc.pt` |
 | RAP-XC parameters | **3,137,049** | same |
 | Tohoku 2011 replicated | **$276 B vs $235 B published (+18%)** | `tests/receipts/platinum_counterfactual.json` |
-| Live data sources | **20** | `ShAuRyA_Supplymind/realtime/orchestrator_v2.py` |
-| Crisis library | **1,500 EMDAT events** | `ShAuRyA_Supplymind/scenarios/crisis_library_v2.json` |
-| Foundation models verified | **13/13** | `v3_arcadia/00_emergence/verify_*.py` |
 | Custom Ollama analyst models | **5 (v1→v5)** | `rl/lora/Modelfile.v[2-4]`, `Modelfile.analyst_v5` |
 | LoRA training pairs | **225** | `rl/data/lora_training_data.json` |
 | DPO preference pairs | **21** | `dpo_judge/data/preference_pairs.jsonl` |
@@ -98,13 +98,13 @@ Detailed: see [REPRODUCE.md](REPRODUCE.md).
 | Section | Where |
 |---|---|
 | Game engine (OpenEnv) | `server/app.py`, `server/supply_environment.py`, `server/engine/` |
-| 9 RL agents | `ShAuRyA_Phoenix/arena/`, `ShAuRyA_Phoenix/rap_xc/` |
-| 13 foundation models | `models/`, `v3_arcadia/00_emergence/verify_*.py` |
-| Custom Ollama analyst models | `rl/lora/Modelfile.v[2-4]`, `ShAuRyA_Supplymind/features/Modelfile.analyst_v5` |
-| LoRA + DPO + GRPO training | `rl/lora/`, `ShAuRyA_Phoenix/roll_integration/dpo_judge/` |
-| 1500-event crisis library | `ShAuRyA_Supplymind/scenarios/crisis_library_v2.{json,faiss}` |
-| 4-method counterfactual | `ShAuRyA_Phoenix/counterfactual_v2/platinum.py` |
-| Hormuz War Room | `ShAuRyA_Supplymind/realtime/hormuz_war_room_router.py`, `server/static/hormuz_war_room.html` |
 | Master demo page | `server/static/master.html` |
 | Receipts | `tests/receipts/*.json` |

 | Conformal action coverage | **0.9001** | `tests/receipts/conformal_calibration.json` |
 | Cross-corpus α (frontier 6, v2 EMDAT) | **0.5436** | `tests/receipts/cross_corpus_alpha.json` |
 | 12-frontier panel α (R4 corpus) | **0.5669** | `tests/receipts/panel_agreement_R4.json` |
+| HetGAT vs v1 GCN MAE | **+7.77 / +12.15 / +10.03 %** | `versions/v5_phoenix/experiments/hetgat_v1/report.json` |
+| RAP-XC training loss | BC **5.62 → 0.23** | `versions/v5_phoenix/experiments/rap_xc_v1/rapxc.pt` |
 | RAP-XC parameters | **3,137,049** | same |
 | Tohoku 2011 replicated | **$276 B vs $235 B published (+18%)** | `tests/receipts/platinum_counterfactual.json` |
+| Live data sources | **20** | `versions/v4_arcadia_live/realtime/orchestrator_v2.py` |
+| Crisis library | **1,500 EMDAT events** | `versions/v4_arcadia_live/scenarios/crisis_library_v2.json` |
+| Foundation models verified | **13/13** | `versions/v3_arcadia/00_emergence/verify_*.py` |
 | Custom Ollama analyst models | **5 (v1→v5)** | `rl/lora/Modelfile.v[2-4]`, `Modelfile.analyst_v5` |
 | LoRA training pairs | **225** | `rl/data/lora_training_data.json` |
 | DPO preference pairs | **21** | `dpo_judge/data/preference_pairs.jsonl` |
 | Section | Where |
 |---|---|
 | Game engine (OpenEnv) | `server/app.py`, `server/supply_environment.py`, `server/engine/` |
+| 9 RL agents | `versions/v5_phoenix/arena/`, `versions/v5_phoenix/rap_xc/` |
+| 13 foundation models | `models/`, `versions/v3_arcadia/00_emergence/verify_*.py` |
+| Custom Ollama analyst models | `rl/lora/Modelfile.v[2-4]`, `versions/v4_arcadia_live/features/Modelfile.analyst_v5` |
+| LoRA + DPO + GRPO training | `rl/lora/`, `versions/v5_phoenix/roll_integration/dpo_judge/` |
+| 1500-event crisis library | `versions/v4_arcadia_live/scenarios/crisis_library_v2.{json,faiss}` |
+| 4-method counterfactual | `versions/v5_phoenix/counterfactual_v2/platinum.py` |
+| Hormuz War Room | `versions/v4_arcadia_live/realtime/hormuz_war_room_router.py`, `server/static/hormuz_war_room.html` |
 | Master demo page | `server/static/master.html` |
 | Receipts | `tests/receipts/*.json` |

FINAL_SUBMIT/RELIANCE_HORMUZ_DEEP_DIVE.md CHANGED Viewed

@@ -119,4 +119,4 @@ The remaining seven subsidiaries collectively account for ~15% of impact.
 **Key insight**: highest *score* node (RIIL pipelines 0.916) has lowest *absolute* impact (₹35 Cr) because it is a small-revenue stub. Highest *absolute* impact (Jamnagar ₹12,194 Cr) has lower score (0.824) but the largest revenue base. **Score and absolute impact tell different stories — both matter.**
-Receipt: deterministic (no LLM in scoring). Numbers anchor to RIL FY24 Integrated Annual Report. Reproduce: `python ShAuRyA_Supplymind/scenarios/reliance_industries_exposure.py`.


119
120	Key insight: highest score node (RIIL pipelines 0.916) has lowest absolute impact (₹35 Cr) because it is a small-revenue stub. Highest absolute impact (Jamnagar ₹12,194 Cr) has lower score (0.824) but the largest revenue base. Score and absolute impact tell different stories — both matter.
121
122	+ Receipt: deterministic (no LLM in scoring). Numbers anchor to RIL FY24 Integrated Annual Report. Reproduce: `python versions/v4_arcadia_live/scenarios/reliance_industries_exposure.py`.

FINAL_SUBMIT/REPRODUCE.md CHANGED Viewed

@@ -70,10 +70,10 @@ python scripts/bootstrap_leaderboard.py
 python scripts/ollama_v5_vs_frontier.py
 # 7. HetGAT all 3 graphs (~30 min on RTX 4080)
-python -m ShAuRyA_Phoenix.gnn_v2.train_hetgat --graph all --epochs 200
 # 8. RAP-XC training on harvested transitions (~20 sec on RTX 4080)
-python -c "from ShAuRyA_Phoenix.rap_xc.train import train_rapxc; train_rapxc()"
 ```
 All produce JSON receipts at `tests/receipts/*.json`.

 python scripts/ollama_v5_vs_frontier.py
 # 7. HetGAT all 3 graphs (~30 min on RTX 4080)
+python -m versions.v5_phoenix.gnn_v2.train_hetgat --graph all --epochs 200
 # 8. RAP-XC training on harvested transitions (~20 sec on RTX 4080)
+python -c "from versions.v5_phoenix.rap_xc.train import train_rapxc; train_rapxc()"
 ```
 All produce JSON receipts at `tests/receipts/*.json`.

FINAL_SUBMIT/REPRODUCE_ONE_BASH.sh CHANGED Viewed

@@ -14,11 +14,11 @@ echo "Repo: $(pwd)"
 echo
 echo "[1/8] Wordle env + RLVE curriculum smoke ..."
-python -m ShAuRyA_Phoenix.wordle_env.rlve_curriculum
 echo
 echo "[2/8] Dual verifier smoke ..."
-python -m ShAuRyA_Phoenix.wordle_env.dual_verifier
 echo
 echo "[3/8] OpenEnv MCP compliance ..."
@@ -38,7 +38,7 @@ python scripts/final_validation_bundle.py
 echo
 echo "[7/8] Wordle GRPO baseline (heuristic policy receipt) ..."
-python -m ShAuRyA_Phoenix.wordle_env.train_grpo --steps 50 || true
 echo
 echo "[8/8] Receipt index ..."

 echo
 echo "[1/8] Wordle env + RLVE curriculum smoke ..."
+python -m versions.v5_phoenix.wordle_env.rlve_curriculum
 echo
 echo "[2/8] Dual verifier smoke ..."
+python -m versions.v5_phoenix.wordle_env.dual_verifier
 echo
 echo "[3/8] OpenEnv MCP compliance ..."
 echo
 echo "[7/8] Wordle GRPO baseline (heuristic policy receipt) ..."
+python -m versions.v5_phoenix.wordle_env.train_grpo --steps 50 || true
 echo
 echo "[8/8] Receipt index ..."

FINAL_SUBMIT/RL_GUIDE_59POINT_ALIGNMENT.md CHANGED Viewed

@@ -25,7 +25,7 @@ Each of the 59 hackathon-guide points → which file implements it → which rec
 **Receipt**: HF Space-ready manifest.
 ## §6. Easy first
-**File**: `ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py` Tier-0
 **Receipt**: `rlve_curriculum_smoke.json` — 4 tier shifts.
 ## §7. Reward design carefully
@@ -81,7 +81,7 @@ Each of the 59 hackathon-guide points → which file implements it → which rec
 **Receipt**: `lora_unsloth_train.json`.
 ## §31–33. Dual verifier
-**File**: `ShAuRyA_Phoenix/wordle_env/dual_verifier.py`
 **Receipt**: `dual_verifier_smoke.json` — BRAID FP caught.
 ## §34–37. Curriculum band 0.45–0.75

 **Receipt**: HF Space-ready manifest.
 ## §6. Easy first
+**File**: `versions/v5_phoenix/wordle_env/rlve_curriculum.py` Tier-0
 **Receipt**: `rlve_curriculum_smoke.json` — 4 tier shifts.
 ## §7. Reward design carefully
 **Receipt**: `lora_unsloth_train.json`.
 ## §31–33. Dual verifier
+**File**: `versions/v5_phoenix/wordle_env/dual_verifier.py`
 **Receipt**: `dual_verifier_smoke.json` — BRAID FP caught.
 ## §34–37. Curriculum band 0.45–0.75

FINAL_SUBMIT/docker/Dockerfile.api ADDED Viewed

	@@ -0,0 +1,25 @@

+FROM python:3.11-slim
+WORKDIR /app
+# System deps for sentence-transformers, faiss, torch
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential git curl ca-certificates libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Repo (excluding models — they get mounted as a volume)
+COPY . /app/
+# Models live at /app/models — mount your local models/ dir as this volume
+VOLUME /app/models
+EXPOSE 8000
+# Pre-warm not done in image — runs in lifespan handler at startup
+ENV PYTHONIOENCODING=utf-8
+ENV OLLAMA_MAX_LOADED_MODELS=1
+CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]

FINAL_SUBMIT/docker/docker-compose.yml ADDED Viewed

	@@ -0,0 +1,41 @@

+version: "3.9"
+services:
+  api:
+    build:
+      context: ../..
+      dockerfile: FINAL_SUBMIT/docker/Dockerfile.api
+    container_name: supplymind-api
+    ports:
+      - "8000:8000"
+    env_file:
+      - ../../.env
+    volumes:
+      - ../../models:/app/models:ro
+      - ../../tests/receipts:/app/tests/receipts
+    environment:
+      - PYTHONIOENCODING=utf-8
+      - OLLAMA_MAX_LOADED_MODELS=1
+      - OLLAMA_BASE_URL=http://ollama:11434
+    depends_on:
+      - ollama
+    restart: unless-stopped
+  ollama:
+    image: ollama/ollama:latest
+    container_name: supplymind-ollama
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama-data:/root/.ollama
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+volumes:
+  ollama-data:

FINAL_SUBMIT/receipts/F2_multi_agent_apple_samsung_toyota.json CHANGED Viewed

@@ -1,117 +1,117 @@
-{
-  "constants": {
-    "cap_total_wafers_week": 1000,
-    "wafer_revenue_usd": 16500,
-    "shortfall_loss_usd_per_wafer": 55000,
-    "crisis_duration_weeks": 6
-  },
-  "narrative": "2021-chip-shortage dynamic: TSMC backup capacity (1000 wafers/week) contested by Apple (aggressive) + Samsung (conservative) + Toyota (reactive). Apple bids hard early, captures >50% of step-1 capacity. Toyota waits, pays higher step-2 prices. Samsung splits budget.",
-  "step_log": [
-    {
-      "event": "step_1_open",
-      "capacity_remaining": 1000,
-      "price_signal": 1.0
-    },
-    {
-      "event": "step_1_bid",
-      "agent": "Apple",
-      "bid_usd": 15399999.999999998
-    },
-    {
-      "event": "step_1_bid",
-      "agent": "Samsung",
-      "bid_usd": 3500000.0
-    },
-    {
-      "event": "step_1_bid",
-      "agent": "Toyota",
-      "bid_usd": 0.0
-    },
-    {
-      "event": "step_1_allocated",
-      "agent": "Apple",
-      "allocated_wafers": 407.4074074074074
-    },
-    {
-      "event": "step_1_allocated",
-      "agent": "Samsung",
-      "allocated_wafers": 92.59259259259258
-    },
-    {
-      "event": "step_1_allocated",
-      "agent": "Toyota",
-      "allocated_wafers": 0.0
-    },
-    {
-      "event": "step_2_open",
-      "capacity_remaining": 500.0,
-      "price_signal": 2.291
-    },
-    {
-      "event": "step_2_bid",
-      "agent": "Apple",
-      "bid_usd": 3300000.0
-    },
-    {
-      "event": "step_2_bid",
-      "agent": "Samsung",
-      "bid_usd": 2800000.0
-    },
-    {
-      "event": "step_2_bid",
-      "agent": "Toyota",
-      "bid_usd": 1833333.3333333333
-    }
-  ],
-  "outcomes": [
-    {
-      "name": "Apple",
-      "strategy": "aggressive",
-      "budget_usd": 22000000,
-      "bid_usd": 18700000.0,
-      "allocated_wafers": 615.4,
-      "revenue_earned_usd": 60923669.0,
-      "shortfall_loss_usd": 39486850.0,
-      "net_pnl_usd": 2736819.0
-    },
-    {
-      "name": "Samsung",
-      "strategy": "conservative",
-      "budget_usd": 14000000,
-      "bid_usd": 6300000.0,
-      "allocated_wafers": 269.1,
-      "revenue_earned_usd": 26637255.0,
-      "shortfall_loss_usd": 31868192.0,
-      "net_pnl_usd": -11530937.0
-    },
-    {
-      "name": "Toyota",
-      "strategy": "reactive",
-      "budget_usd": 7000000,
-      "bid_usd": 1833333.0,
-      "allocated_wafers": 115.5,
-      "revenue_earned_usd": 11439076.0,
-      "shortfall_loss_usd": 16978291.0,
-      "net_pnl_usd": -7372549.0
-    }
-  ],
-  "ranking": [
-    {
-      "rank": 1,
-      "agent": "Apple",
-      "net_pnl_usd": 2736819.0
-    },
-    {
-      "rank": 2,
-      "agent": "Toyota",
-      "net_pnl_usd": -7372549.0
-    },
-    {
-      "rank": 3,
-      "agent": "Samsung",
-      "net_pnl_usd": -11530937.0
-    }
-  ],
-  "winner": "Apple",
-  "loser": "Samsung"
 }

+{
+  "constants": {
+    "cap_total_wafers_week": 1000,
+    "wafer_revenue_usd": 16500,
+    "shortfall_loss_usd_per_wafer": 55000,
+    "crisis_duration_weeks": 6
+  },
+  "narrative": "2021-chip-shortage dynamic: TSMC backup capacity (1000 wafers/week) contested by Apple (aggressive) + Samsung (conservative) + Toyota (reactive). Apple bids hard early, captures >50% of step-1 capacity. Toyota waits, pays higher step-2 prices. Samsung splits budget.",
+  "step_log": [
+    {
+      "event": "step_1_open",
+      "capacity_remaining": 1000,
+      "price_signal": 1.0
+    },
+    {
+      "event": "step_1_bid",
+      "agent": "Apple",
+      "bid_usd": 15399999.999999998
+    },
+    {
+      "event": "step_1_bid",
+      "agent": "Samsung",
+      "bid_usd": 3500000.0
+    },
+    {
+      "event": "step_1_bid",
+      "agent": "Toyota",
+      "bid_usd": 0.0
+    },
+    {
+      "event": "step_1_allocated",
+      "agent": "Apple",
+      "allocated_wafers": 407.4074074074074
+    },
+    {
+      "event": "step_1_allocated",
+      "agent": "Samsung",
+      "allocated_wafers": 92.59259259259258
+    },
+    {
+      "event": "step_1_allocated",
+      "agent": "Toyota",
+      "allocated_wafers": 0.0
+    },
+    {
+      "event": "step_2_open",
+      "capacity_remaining": 500.0,
+      "price_signal": 2.291
+    },
+    {
+      "event": "step_2_bid",
+      "agent": "Apple",
+      "bid_usd": 3300000.0
+    },
+    {
+      "event": "step_2_bid",
+      "agent": "Samsung",
+      "bid_usd": 2800000.0
+    },
+    {
+      "event": "step_2_bid",
+      "agent": "Toyota",
+      "bid_usd": 1833333.3333333333
+    }
+  ],
+  "outcomes": [
+    {
+      "name": "Apple",
+      "strategy": "aggressive",
+      "budget_usd": 22000000,
+      "bid_usd": 18700000.0,
+      "allocated_wafers": 615.4,
+      "revenue_earned_usd": 60923669.0,
+      "shortfall_loss_usd": 39486850.0,
+      "net_pnl_usd": 2736819.0
+    },
+    {
+      "name": "Samsung",
+      "strategy": "conservative",
+      "budget_usd": 14000000,
+      "bid_usd": 6300000.0,
+      "allocated_wafers": 269.1,
+      "revenue_earned_usd": 26637255.0,
+      "shortfall_loss_usd": 31868192.0,
+      "net_pnl_usd": -11530937.0
+    },
+    {
+      "name": "Toyota",
+      "strategy": "reactive",
+      "budget_usd": 7000000,
+      "bid_usd": 1833333.0,
+      "allocated_wafers": 115.5,
+      "revenue_earned_usd": 11439076.0,
+      "shortfall_loss_usd": 16978291.0,
+      "net_pnl_usd": -7372549.0
+    }
+  ],
+  "ranking": [
+    {
+      "rank": 1,
+      "agent": "Apple",
+      "net_pnl_usd": 2736819.0
+    },
+    {
+      "rank": 2,
+      "agent": "Toyota",
+      "net_pnl_usd": -7372549.0
+    },
+    {
+      "rank": 3,
+      "agent": "Samsung",
+      "net_pnl_usd": -11530937.0
+    }
+  ],
+  "winner": "Apple",
+  "loser": "Samsung"
 }

FINAL_SUBMIT/receipts/ONNX_BUNDLE_MANIFEST.json CHANGED Viewed

@@ -1,72 +1,72 @@
-{
-  "exported": [
-    {
-      "name": "ppo_easy_typhoon_response (MaskablePPO)",
-      "file": "ppo_easy_typhoon_response.onnx",
-      "size_kb": 948,
-      "input_shape": [
-        1,
-        408
-      ],
-      "output_shape": [
-        1,
-        280
-      ],
-      "source": "v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
-    },
-    {
-      "name": "ppo_medium_multi_front (MaskablePPO)",
-      "file": "ppo_medium_multi_front.onnx",
-      "size_kb": 948,
-      "input_shape": [
-        1,
-        408
-      ],
-      "output_shape": [
-        1,
-        280
-      ],
-      "source": "v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
-    },
-    {
-      "name": "ppo_hard_cascading_crisis (MaskablePPO)",
-      "file": "ppo_hard_cascading_crisis.onnx",
-      "size_kb": 948,
-      "input_shape": [
-        1,
-        408
-      ],
-      "output_shape": [
-        1,
-        280
-      ],
-      "source": "v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
-    },
-    {
-      "name": "GCN arrival-time regressor",
-      "file": "gcn_arrival.onnx",
-      "size_kb": 10,
-      "input_shape": [
-        "[N, 4]",
-        "[N, N]"
-      ],
-      "output_shape": [
-        "[N]"
-      ],
-      "source": "v3_arcadia/70_provider/r6_gnn_arrival_time.py"
-    }
-  ],
-  "skipped": [
-    {
-      "name": "Ridge stacker",
-      "reason": "skl2onnx not installed: No module named 'skl2onnx'"
-    },
-    {
-      "name": "TFT v1",
-      "reason": "pytorch-forecasting TimeSeriesDataSet is required at inference; ONNX export requires a wrapper that packages the normalizer scaler + encoder/decoder split. Deferred as v4 work."
-    }
-  ],
-  "elapsed_s": 0.8302168846130371,
-  "bundle_dir": "v3_arcadia\\checkpoints\\onnx_bundle",
-  "total_bundle_size_kb": 2854
 }

+{
+  "exported": [
+    {
+      "name": "ppo_easy_typhoon_response (MaskablePPO)",
+      "file": "ppo_easy_typhoon_response.onnx",
+      "size_kb": 948,
+      "input_shape": [
+        1,
+        408
+      ],
+      "output_shape": [
+        1,
+        280
+      ],
+      "source": "versions/v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
+    },
+    {
+      "name": "ppo_medium_multi_front (MaskablePPO)",
+      "file": "ppo_medium_multi_front.onnx",
+      "size_kb": 948,
+      "input_shape": [
+        1,
+        408
+      ],
+      "output_shape": [
+        1,
+        280
+      ],
+      "source": "versions/v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
+    },
+    {
+      "name": "ppo_hard_cascading_crisis (MaskablePPO)",
+      "file": "ppo_hard_cascading_crisis.onnx",
+      "size_kb": 948,
+      "input_shape": [
+        1,
+        408
+      ],
+      "output_shape": [
+        1,
+        280
+      ],
+      "source": "versions/v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
+    },
+    {
+      "name": "GCN arrival-time regressor",
+      "file": "gcn_arrival.onnx",
+      "size_kb": 10,
+      "input_shape": [
+        "[N, 4]",
+        "[N, N]"
+      ],
+      "output_shape": [
+        "[N]"
+      ],
+      "source": "versions/v3_arcadia/70_provider/r6_gnn_arrival_time.py"
+    }
+  ],
+  "skipped": [
+    {
+      "name": "Ridge stacker",
+      "reason": "skl2onnx not installed: No module named 'skl2onnx'"
+    },
+    {
+      "name": "TFT v1",
+      "reason": "pytorch-forecasting TimeSeriesDataSet is required at inference; ONNX export requires a wrapper that packages the normalizer scaler + encoder/decoder split. Deferred as v4 work."
+    }
+  ],
+  "elapsed_s": 0.8302168846130371,
+  "bundle_dir": "versions/v3_arcadia/checkpoints/onnx_bundle",
+  "total_bundle_size_kb": 2854
 }

FINAL_SUBMIT/receipts/R2_SHAP_FAIRNESS_CALIBRATION.json CHANGED Viewed

@@ -1,502 +1,502 @@
-{
-  "shap_top15": {
-    "late_delivery_risk": {
-      "algo": "xgb",
-      "top15_features": [
-        {
-          "name": "Shipping Mode__First Class",
-          "importance": 0.7326152324676514
-        },
-        {
-          "name": "sched_days",
-          "importance": 0.6606742739677429
-        },
-        {
-          "name": "Type__TRANSFER",
-          "importance": 0.47632965445518494
-        },
-        {
-          "name": "Order Customer Id",
-          "importance": 0.17082303762435913
-        },
-        {
-          "name": "Latitude",
-          "importance": 0.160926952958107
-        },
-        {
-          "name": "Shipping Mode__Second Class",
-          "importance": 0.14983786642551422
-        },
-        {
-          "name": "Longitude",
-          "importance": 0.13300901651382446
-        },
-        {
-          "name": "Shipping Mode__Standard Class",
-          "importance": 0.12997667491436005
-        },
-        {
-          "name": "order_day",
-          "importance": 0.10712296515703201
-        },
-        {
-          "name": "order_month",
-          "importance": 0.07108364999294281
-        },
-        {
-          "name": "order_dow",
-          "importance": 0.06861100345849991
-        },
-        {
-          "name": "Order Item Total",
-          "importance": 0.0614430233836174
-        },
-        {
-          "name": "Type__DEBIT",
-          "importance": 0.05896211788058281
-        },
-        {
-          "name": "Sales",
-          "importance": 0.04449347406625748
-        },
-        {
-          "name": "Order Item Discount",
-          "importance": 0.04405033215880394
-        }
-      ],
-      "n_samples": 1000
-    },
-    "shipping_mode": {
-      "algo": "lgb",
-      "top15_features": [
-        {
-          "name": "order_day",
-          "importance": 0.14531971700119595
-        },
-        {
-          "name": "Latitude",
-          "importance": 0.13565060253209485
-        },
-        {
-          "name": "Order Customer Id",
-          "importance": 0.13102491053295864
-        },
-        {
-          "name": "Longitude",
-          "importance": 0.1222981746063068
-        },
-        {
-          "name": "Order Zipcode",
-          "importance": 0.09815205910031981
-        },
-        {
-          "name": "order_month",
-          "importance": 0.09317142717955136
-        },
-        {
-          "name": "order_dow",
-          "importance": 0.07841270762869156
-        },
-        {
-          "name": "Order Item Total",
-          "importance": 0.044599598632655106
-        },
-        {
-          "name": "Order Item Discount",
-          "importance": 0.033594561793665254
-        },
-        {
-          "name": "order_year",
-          "importance": 0.029623813091121495
-        },
-        {
-          "name": "Customer Segment__Home Office",
-          "importance": 0.02582491478215546
-        },
-        {
-          "name": "Type__DEBIT",
-          "importance": 0.019900877735072642
-        },
-        {
-          "name": "Order Item Discount Rate",
-          "importance": 0.019821976340370435
-        },
-        {
-          "name": "Customer Segment__Consumer",
-          "importance": 0.019363164732533623
-        },
-        {
-          "name": "Sales",
-          "importance": 0.019305355520423926
-        }
-      ],
-      "n_samples": 1000
-    },
-    "delivery_status": {
-      "algo": "lgb",
-      "top15_features": [
-        {
-          "name": "sched_days",
-          "importance": 1.0622776241691645
-        },
-        {
-          "name": "Type__TRANSFER",
-          "importance": 0.9869317661543312
-        },
-        {
-          "name": "Shipping Mode__First Class",
-          "importance": 0.5401095981609848
-        },
-        {
-          "name": "Latitude",
-          "importance": 0.1469638826819572
-        },
-        {
-          "name": "Order Customer Id",
-          "importance": 0.12387527105673957
-        },
-        {
-          "name": "Longitude",
-          "importance": 0.12152826063388397
-        },
-        {
-          "name": "Shipping Mode__Standard Class",
-          "importance": 0.11399112380975975
-        },
-        {
-          "name": "Type__DEBIT",
-          "importance": 0.11226916777330752
-        },
-        {
-          "name": "order_day",
-          "importance": 0.08720905988856538
-        },
-        {
-          "name": "Type__PAYMENT",
-          "importance": 0.07393674075739048
-        },
-        {
-          "name": "order_month",
-          "importance": 0.05996037188478746
-        },
-        {
-          "name": "order_dow",
-          "importance": 0.055766425673077755
-        },
-        {
-          "name": "Shipping Mode__Second Class",
-          "importance": 0.05278020082991879
-        },
-        {
-          "name": "Type__CASH",
-          "importance": 0.045583216438798695
-        },
-        {
-          "name": "Order Item Total",
-          "importance": 0.043191257310719586
-        }
-      ],
-      "n_samples": 1000
-    }
-  },
-  "fairness": {
-    "late_delivery_risk": {
-      "Market": {
-        "Africa": {
-          "n": 1768,
-          "accuracy": 0.869343891402715
-        },
-        "Europe": {
-          "n": 7437,
-          "accuracy": 0.8284254403657388
-        },
-        "LATAM": {
-          "n": 7771,
-          "accuracy": 0.8390168575472912
-        },
-        "Pacific Asia": {
-          "n": 6263,
-          "accuracy": 0.8112725530895737
-        },
-        "USCA": {
-          "n": 3839,
-          "accuracy": 0.8767908309455588
-        },
-        "__summary__": {
-          "max_acc": 0.8767908309455588,
-          "min_acc": 0.8112725530895737,
-          "disparity": 0.06551827785598507
-        }
-      },
-      "Customer Segment": {
-        "Consumer": {
-          "n": 13998,
-          "accuracy": 0.8350478639805686
-        },
-        "Corporate": {
-          "n": 8212,
-          "accuracy": 0.8364588407208963
-        },
-        "Home Office": {
-          "n": 4868,
-          "accuracy": 0.8436729663105998
-        },
-        "__summary__": {
-          "max_acc": 0.8436729663105998,
-          "min_acc": 0.8350478639805686,
-          "disparity": 0.00862510233003122
-        }
-      }
-    },
-    "shipping_mode": {
-      "Market": {
-        "Africa": {
-          "n": 1721,
-          "accuracy": 0.8059267867518884
-        },
-        "Europe": {
-          "n": 7650,
-          "accuracy": 0.7586928104575164
-        },
-        "LATAM": {
-          "n": 7701,
-          "accuracy": 0.7809375405791456
-        },
-        "Pacific Asia": {
-          "n": 6143,
-          "accuracy": 0.7584242226924955
-        },
-        "USCA": {
-          "n": 3863,
-          "accuracy": 0.8193114159979291
-        },
-        "__summary__": {
-          "max_acc": 0.8193114159979291,
-          "min_acc": 0.7584242226924955,
-          "disparity": 0.06088719330543357
-        }
-      },
-      "Customer Segment": {
-        "Consumer": {
-          "n": 14008,
-          "accuracy": 0.7669902912621359
-        },
-        "Corporate": {
-          "n": 8269,
-          "accuracy": 0.7872777844963115
-        },
-        "Home Office": {
-          "n": 4801,
-          "accuracy": 0.7862945219745886
-        },
-        "__summary__": {
-          "max_acc": 0.7872777844963115,
-          "min_acc": 0.7669902912621359,
-          "disparity": 0.020287493234175558
-        }
-      }
-    },
-    "delivery_status": {
-      "Market": {
-        "Africa": {
-          "n": 1767,
-          "accuracy": 0.8687040181097906
-        },
-        "Europe": {
-          "n": 7505,
-          "accuracy": 0.8282478347768154
-        },
-        "LATAM": {
-          "n": 7746,
-          "accuracy": 0.8502452878905241
-        },
-        "Pacific Asia": {
-          "n": 6142,
-          "accuracy": 0.8150439596222728
-        },
-        "USCA": {
-          "n": 3918,
-          "accuracy": 0.8769780500255232
-        },
-        "__summary__": {
-          "max_acc": 0.8769780500255232,
-          "min_acc": 0.8150439596222728,
-          "disparity": 0.061934090403250375
-        }
-      },
-      "Customer Segment": {
-        "Consumer": {
-          "n": 14087,
-          "accuracy": 0.8335344643998013
-        },
-        "Corporate": {
-          "n": 8197,
-          "accuracy": 0.8446992802244724
-        },
-        "Home Office": {
-          "n": 4794,
-          "accuracy": 0.8579474342928661
-        },
-        "__summary__": {
-          "max_acc": 0.8579474342928661,
-          "min_acc": 0.8335344643998013,
-          "disparity": 0.02441296989306485
-        }
-      }
-    }
-  },
-  "calibration": {
-    "late_delivery_risk": {
-      "algo": "xgb",
-      "n_bins": 15,
-      "bin_confidence": [
-        0.047601889818906784,
-        0.10591482371091843,
-        0.1693299263715744,
-        0.23376236855983734,
-        0.2985405921936035,
-        0.365536093711853,
-        0.43266668915748596,
-        0.49862194061279297,
-        0.5664905309677124,
-        0.6322769522666931,
-        0.700205385684967,
-        0.7678216695785522,
-        0.834970235824585,
-        0.9012444019317627,
-        0.9871050715446472
-      ],
-      "bin_accuracy": [
-        0.04878048780487805,
-        0.03429602888086643,
-        0.06657608695652174,
-        0.10221205186880244,
-        0.1659671880961465,
-        0.3065795613625758,
-        0.4490950226244344,
-        0.6264543784445805,
-        0.7001414427157001,
-        0.7884012539184952,
-        0.8334786399302528,
-        0.8685524126455907,
-        0.920274914089347,
-        0.9493734335839599,
-        0.9918243401074516
-      ],
-      "bin_n": [
-        205,
-        1108,
-        2208,
-        2622,
-        2621,
-        2143,
-        1768,
-        1633,
-        1414,
-        1276,
-        1147,
-        1202,
-        1455,
-        1995,
-        4281
-      ],
-      "ece": 0.08366547522741584,
-      "brier": 0.12393409580512378,
-      "temperature_scaling_T": 0.6172709141132063
-    },
-    "shipping_mode": {
-      "algo": "lgb",
-      "n_bins": 15,
-      "bin_confidence": [
-        0.3121110714805393,
-        0.37706821969221477,
-        0.44009373318135214,
-        0.5003264091242992,
-        0.5668423455793702,
-        0.6341087325686549,
-        0.7010409508680902,
-        0.7664726820296514,
-        0.8315982324325599,
-        0.8946591419686111,
-        0.9531121751216614
-      ],
-      "bin_accuracy": [
-        0.2,
-        0.3730886850152905,
-        0.45858343337334934,
-        0.49913164293157347,
-        0.5809395065900642,
-        0.7184009406231628,
-        0.8413356080916402,
-        0.9226793467025015,
-        0.9520665199315236,
-        0.9763365468886941,
-        0.9710982658959537
-      ],
-      "bin_n": [
-        15,
-        327,
-        1666,
-        2879,
-        2959,
-        3402,
-        4103,
-        4837,
-        4089,
-        2282,
-        519
-      ],
-      "ece": 0.08808701528421295,
-      "brier": 0.14974528304098794,
-      "temperature_scaling_T": 0.7013679012815588
-    },
-    "delivery_status": {
-      "algo": "lgb",
-      "n_bins": 15,
-      "bin_confidence": [
-        0.31674386091217493,
-        0.3747040640569195,
-        0.4360554176701256,
-        0.49978873696550224,
-        0.5660495258460405,
-        0.6325569759747155,
-        0.6996959611938123,
-        0.7661925883072682,
-        0.8343464222875331,
-        0.9017332581703068,
-        0.9839647836453121
-      ],
-      "bin_accuracy": [
-        0.2222222222222222,
-        0.3987341772151899,
-        0.5257352941176471,
-        0.6634679020516214,
-        0.8109608047173084,
-        0.8790291998483125,
-        0.9103793247186328,
-        0.9274406332453826,
-        0.9517241379310345,
-        0.9663677130044843,
-        0.9874145990650846
-      ],
-      "bin_n": [
-        54,
-        948,
-        2448,
-        3022,
-        2883,
-        2637,
-        2399,
-        2274,
-        2175,
-        2676,
-        5562
-      ],
-      "ece": 0.12621462481898915,
-      "brier": 0.1285071700698595,
-      "temperature_scaling_T": 0.5595696359480499
-    }
-  },
-  "reliability_plot_saved": true,
-  "elapsed_min": 1.084403399626414
 }

+{
+  "shap_top15": {
+    "late_delivery_risk": {
+      "algo": "xgb",
+      "top15_features": [
+        {
+          "name": "Shipping Mode__First Class",
+          "importance": 0.7326152324676514
+        },
+        {
+          "name": "sched_days",
+          "importance": 0.6606742739677429
+        },
+        {
+          "name": "Type__TRANSFER",
+          "importance": 0.47632965445518494
+        },
+        {
+          "name": "Order Customer Id",
+          "importance": 0.17082303762435913
+        },
+        {
+          "name": "Latitude",
+          "importance": 0.160926952958107
+        },
+        {
+          "name": "Shipping Mode__Second Class",
+          "importance": 0.14983786642551422
+        },
+        {
+          "name": "Longitude",
+          "importance": 0.13300901651382446
+        },
+        {
+          "name": "Shipping Mode__Standard Class",
+          "importance": 0.12997667491436005
+        },
+        {
+          "name": "order_day",
+          "importance": 0.10712296515703201
+        },
+        {
+          "name": "order_month",
+          "importance": 0.07108364999294281
+        },
+        {
+          "name": "order_dow",
+          "importance": 0.06861100345849991
+        },
+        {
+          "name": "Order Item Total",
+          "importance": 0.0614430233836174
+        },
+        {
+          "name": "Type__DEBIT",
+          "importance": 0.05896211788058281
+        },
+        {
+          "name": "Sales",
+          "importance": 0.04449347406625748
+        },
+        {
+          "name": "Order Item Discount",
+          "importance": 0.04405033215880394
+        }
+      ],
+      "n_samples": 1000
+    },
+    "shipping_mode": {
+      "algo": "lgb",
+      "top15_features": [
+        {
+          "name": "order_day",
+          "importance": 0.14531971700119595
+        },
+        {
+          "name": "Latitude",
+          "importance": 0.13565060253209485
+        },
+        {
+          "name": "Order Customer Id",
+          "importance": 0.13102491053295864
+        },
+        {
+          "name": "Longitude",
+          "importance": 0.1222981746063068
+        },
+        {
+          "name": "Order Zipcode",
+          "importance": 0.09815205910031981
+        },
+        {
+          "name": "order_month",
+          "importance": 0.09317142717955136
+        },
+        {
+          "name": "order_dow",
+          "importance": 0.07841270762869156
+        },
+        {
+          "name": "Order Item Total",
+          "importance": 0.044599598632655106
+        },
+        {
+          "name": "Order Item Discount",
+          "importance": 0.033594561793665254
+        },
+        {
+          "name": "order_year",
+          "importance": 0.029623813091121495
+        },
+        {
+          "name": "Customer Segment__Home Office",
+          "importance": 0.02582491478215546
+        },
+        {
+          "name": "Type__DEBIT",
+          "importance": 0.019900877735072642
+        },
+        {
+          "name": "Order Item Discount Rate",
+          "importance": 0.019821976340370435
+        },
+        {
+          "name": "Customer Segment__Consumer",
+          "importance": 0.019363164732533623
+        },
+        {
+          "name": "Sales",
+          "importance": 0.019305355520423926
+        }
+      ],
+      "n_samples": 1000
+    },
+    "delivery_status": {
+      "algo": "lgb",
+      "top15_features": [
+        {
+          "name": "sched_days",
+          "importance": 1.0622776241691645
+        },
+        {
+          "name": "Type__TRANSFER",
+          "importance": 0.9869317661543312
+        },
+        {
+          "name": "Shipping Mode__First Class",
+          "importance": 0.5401095981609848
+        },
+        {
+          "name": "Latitude",
+          "importance": 0.1469638826819572
+        },
+        {
+          "name": "Order Customer Id",
+          "importance": 0.12387527105673957
+        },
+        {
+          "name": "Longitude",
+          "importance": 0.12152826063388397
+        },
+        {
+          "name": "Shipping Mode__Standard Class",
+          "importance": 0.11399112380975975
+        },
+        {
+          "name": "Type__DEBIT",
+          "importance": 0.11226916777330752
+        },
+        {
+          "name": "order_day",
+          "importance": 0.08720905988856538
+        },
+        {
+          "name": "Type__PAYMENT",
+          "importance": 0.07393674075739048
+        },
+        {
+          "name": "order_month",
+          "importance": 0.05996037188478746
+        },
+        {
+          "name": "order_dow",
+          "importance": 0.055766425673077755
+        },
+        {
+          "name": "Shipping Mode__Second Class",
+          "importance": 0.05278020082991879
+        },
+        {
+          "name": "Type__CASH",
+          "importance": 0.045583216438798695
+        },
+        {
+          "name": "Order Item Total",
+          "importance": 0.043191257310719586
+        }
+      ],
+      "n_samples": 1000
+    }
+  },
+  "fairness": {
+    "late_delivery_risk": {
+      "Market": {
+        "Africa": {
+          "n": 1768,
+          "accuracy": 0.869343891402715
+        },
+        "Europe": {
+          "n": 7437,
+          "accuracy": 0.8284254403657388
+        },
+        "LATAM": {
+          "n": 7771,
+          "accuracy": 0.8390168575472912
+        },
+        "Pacific Asia": {
+          "n": 6263,
+          "accuracy": 0.8112725530895737
+        },
+        "USCA": {
+          "n": 3839,
+          "accuracy": 0.8767908309455588
+        },
+        "__summary__": {
+          "max_acc": 0.8767908309455588,
+          "min_acc": 0.8112725530895737,
+          "disparity": 0.06551827785598507
+        }
+      },
+      "Customer Segment": {
+        "Consumer": {
+          "n": 13998,
+          "accuracy": 0.8350478639805686
+        },
+        "Corporate": {
+          "n": 8212,
+          "accuracy": 0.8364588407208963
+        },
+        "Home Office": {
+          "n": 4868,
+          "accuracy": 0.8436729663105998
+        },
+        "__summary__": {
+          "max_acc": 0.8436729663105998,
+          "min_acc": 0.8350478639805686,
+          "disparity": 0.00862510233003122
+        }
+      }
+    },
+    "shipping_mode": {
+      "Market": {
+        "Africa": {
+          "n": 1721,
+          "accuracy": 0.8059267867518884
+        },
+        "Europe": {
+          "n": 7650,
+          "accuracy": 0.7586928104575164
+        },
+        "LATAM": {
+          "n": 7701,
+          "accuracy": 0.7809375405791456
+        },
+        "Pacific Asia": {
+          "n": 6143,
+          "accuracy": 0.7584242226924955
+        },
+        "USCA": {
+          "n": 3863,
+          "accuracy": 0.8193114159979291
+        },
+        "__summary__": {
+          "max_acc": 0.8193114159979291,
+          "min_acc": 0.7584242226924955,
+          "disparity": 0.06088719330543357
+        }
+      },
+      "Customer Segment": {
+        "Consumer": {
+          "n": 14008,
+          "accuracy": 0.7669902912621359
+        },
+        "Corporate": {
+          "n": 8269,
+          "accuracy": 0.7872777844963115
+        },
+        "Home Office": {
+          "n": 4801,
+          "accuracy": 0.7862945219745886
+        },
+        "__summary__": {
+          "max_acc": 0.7872777844963115,
+          "min_acc": 0.7669902912621359,
+          "disparity": 0.020287493234175558
+        }
+      }
+    },
+    "delivery_status": {
+      "Market": {
+        "Africa": {
+          "n": 1767,
+          "accuracy": 0.8687040181097906
+        },
+        "Europe": {
+          "n": 7505,
+          "accuracy": 0.8282478347768154
+        },
+        "LATAM": {
+          "n": 7746,
+          "accuracy": 0.8502452878905241
+        },
+        "Pacific Asia": {
+          "n": 6142,
+          "accuracy": 0.8150439596222728
+        },
+        "USCA": {
+          "n": 3918,
+          "accuracy": 0.8769780500255232
+        },
+        "__summary__": {
+          "max_acc": 0.8769780500255232,
+          "min_acc": 0.8150439596222728,
+          "disparity": 0.061934090403250375
+        }
+      },
+      "Customer Segment": {
+        "Consumer": {
+          "n": 14087,
+          "accuracy": 0.8335344643998013
+        },
+        "Corporate": {
+          "n": 8197,
+          "accuracy": 0.8446992802244724
+        },
+        "Home Office": {
+          "n": 4794,
+          "accuracy": 0.8579474342928661
+        },
+        "__summary__": {
+          "max_acc": 0.8579474342928661,
+          "min_acc": 0.8335344643998013,
+          "disparity": 0.02441296989306485
+        }
+      }
+    }
+  },
+  "calibration": {
+    "late_delivery_risk": {
+      "algo": "xgb",
+      "n_bins": 15,
+      "bin_confidence": [
+        0.047601889818906784,
+        0.10591482371091843,
+        0.1693299263715744,
+        0.23376236855983734,
+        0.2985405921936035,
+        0.365536093711853,
+        0.43266668915748596,
+        0.49862194061279297,
+        0.5664905309677124,
+        0.6322769522666931,
+        0.700205385684967,
+        0.7678216695785522,
+        0.834970235824585,
+        0.9012444019317627,
+        0.9871050715446472
+      ],
+      "bin_accuracy": [
+        0.04878048780487805,
+        0.03429602888086643,
+        0.06657608695652174,
+        0.10221205186880244,
+        0.1659671880961465,
+        0.3065795613625758,
+        0.4490950226244344,
+        0.6264543784445805,
+        0.7001414427157001,
+        0.7884012539184952,
+        0.8334786399302528,
+        0.8685524126455907,
+        0.920274914089347,
+        0.9493734335839599,
+        0.9918243401074516
+      ],
+      "bin_n": [
+        205,
+        1108,
+        2208,
+        2622,
+        2621,
+        2143,
+        1768,
+        1633,
+        1414,
+        1276,
+        1147,
+        1202,
+        1455,
+        1995,
+        4281
+      ],
+      "ece": 0.08366547522741584,
+      "brier": 0.12393409580512378,
+      "temperature_scaling_T": 0.6172709141132063
+    },
+    "shipping_mode": {
+      "algo": "lgb",
+      "n_bins": 15,
+      "bin_confidence": [
+        0.3121110714805393,
+        0.37706821969221477,
+        0.44009373318135214,
+        0.5003264091242992,
+        0.5668423455793702,
+        0.6341087325686549,
+        0.7010409508680902,
+        0.7664726820296514,
+        0.8315982324325599,
+        0.8946591419686111,
+        0.9531121751216614
+      ],
+      "bin_accuracy": [
+        0.2,
+        0.3730886850152905,
+        0.45858343337334934,
+        0.49913164293157347,
+        0.5809395065900642,
+        0.7184009406231628,
+        0.8413356080916402,
+        0.9226793467025015,
+        0.9520665199315236,
+        0.9763365468886941,
+        0.9710982658959537
+      ],
+      "bin_n": [
+        15,
+        327,
+        1666,
+        2879,
+        2959,
+        3402,
+        4103,
+        4837,
+        4089,
+        2282,
+        519
+      ],
+      "ece": 0.08808701528421295,
+      "brier": 0.14974528304098794,
+      "temperature_scaling_T": 0.7013679012815588
+    },
+    "delivery_status": {
+      "algo": "lgb",
+      "n_bins": 15,
+      "bin_confidence": [
+        0.31674386091217493,
+        0.3747040640569195,
+        0.4360554176701256,
+        0.49978873696550224,
+        0.5660495258460405,
+        0.6325569759747155,
+        0.6996959611938123,
+        0.7661925883072682,
+        0.8343464222875331,
+        0.9017332581703068,
+        0.9839647836453121
+      ],
+      "bin_accuracy": [
+        0.2222222222222222,
+        0.3987341772151899,
+        0.5257352941176471,
+        0.6634679020516214,
+        0.8109608047173084,
+        0.8790291998483125,
+        0.9103793247186328,
+        0.9274406332453826,
+        0.9517241379310345,
+        0.9663677130044843,
+        0.9874145990650846
+      ],
+      "bin_n": [
+        54,
+        948,
+        2448,
+        3022,
+        2883,
+        2637,
+        2399,
+        2274,
+        2175,
+        2676,
+        5562
+      ],
+      "ece": 0.12621462481898915,
+      "brier": 0.1285071700698595,
+      "temperature_scaling_T": 0.5595696359480499
+    }
+  },
+  "reliability_plot_saved": true,
+  "elapsed_min": 1.084403399626414
 }

FINAL_SUBMIT/receipts/R3_BIGTFT_INTEGRATION.json CHANGED Viewed

@@ -1,52 +1,52 @@
-{
-  "model": "Temporal Fusion Transformer",
-  "paper": "Lim et al. 2021 \u2014 Temporal Fusion Transformers for interpretable multi-horizon time series forecasting",
-  "implementation": "rl/forecasting/tft.py (v1 single-target) + rl/forecasting/train_tft_real.py (v2 multi-target)",
-  "params": {
-    "v1": 90602,
-    "v2": 513534
-  },
-  "checkpoints": {
-    "v1_real": {
-      "path": "rl/checkpoints/tft_real.pt",
-      "params": 90602,
-      "test_mae_usd": 7.8270111083984375,
-      "quantile_loss": 0.07062085568904877,
-      "horizon": 14,
-      "target": "DCOILWTICO"
-    },
-    "v2_multi": {
-      "path": "rl/checkpoints/tft_v2.pt",
-      "params": 513534,
-      "test_mae_p50": {
-        "DCOILWTICO": 52.868377685546875,
-        "PCOPPUSDM": 2165.05419921875,
-        "PPICMM": 127.1404800415039
-      },
-      "best_val_qloss": 0.024498114362359047,
-      "n_rolling_folds": 10
-    }
-  },
-  "integration_in_r3_past_self": {
-    "target": "DCOILWTICO",
-    "horizon": 14,
-    "r3_forecasters": {
-      "chronos_bolt": {
-        "mean_mae": 3.4998963623046877
-      },
-      "timesfm_2": {
-        "mean_mae": 3.4601973173958918
-      },
-      "arima": {
-        "mean_mae": 3.37419745103306
-      },
-      "prophet": {
-        "mean_mae": 9.348899015962079
-      }
-    },
-    "v1_tft_WTI_test_mae_usd": 7.8270111083984375,
-    "v2_tft_multi_DCOILWTICO_test_mae": 52.868377685546875,
-    "note": "TFT v1 MAE of $7.83 on single-target WTI is competitive with R3 Chronos/ARIMA values on the same series at 14-day horizon. v2 multi-target TFT numbers are higher because of multi-target sharing and scale difference (USD vs. FX cents); for a fair apples-to-apples position in R3, the v1 single-target checkpoint is used."
-  },
-  "scoped_next_step_r3_v4": "A full re-training of BigTFT on all 8 FRED targets with the R3 20-fold rolling-origin backtest would require porting to pytorch-forecasting's TimeSeriesDataSet. Scoped as follow-up; v1 checkpoint numbers are the current representative point-of-reference for BigTFT in this release."
 }

+{
+  "model": "Temporal Fusion Transformer",
+  "paper": "Lim et al. 2021 \u2014 Temporal Fusion Transformers for interpretable multi-horizon time series forecasting",
+  "implementation": "rl/forecasting/tft.py (v1 single-target) + rl/forecasting/train_tft_real.py (v2 multi-target)",
+  "params": {
+    "v1": 90602,
+    "v2": 513534
+  },
+  "checkpoints": {
+    "v1_real": {
+      "path": "rl/checkpoints/tft_real.pt",
+      "params": 90602,
+      "test_mae_usd": 7.8270111083984375,
+      "quantile_loss": 0.07062085568904877,
+      "horizon": 14,
+      "target": "DCOILWTICO"
+    },
+    "v2_multi": {
+      "path": "rl/checkpoints/tft_v2.pt",
+      "params": 513534,
+      "test_mae_p50": {
+        "DCOILWTICO": 52.868377685546875,
+        "PCOPPUSDM": 2165.05419921875,
+        "PPICMM": 127.1404800415039
+      },
+      "best_val_qloss": 0.024498114362359047,
+      "n_rolling_folds": 10
+    }
+  },
+  "integration_in_r3_past_self": {
+    "target": "DCOILWTICO",
+    "horizon": 14,
+    "r3_forecasters": {
+      "chronos_bolt": {
+        "mean_mae": 3.4998963623046877
+      },
+      "timesfm_2": {
+        "mean_mae": 3.4601973173958918
+      },
+      "arima": {
+        "mean_mae": 3.37419745103306
+      },
+      "prophet": {
+        "mean_mae": 9.348899015962079
+      }
+    },
+    "v1_tft_WTI_test_mae_usd": 7.8270111083984375,
+    "v2_tft_multi_DCOILWTICO_test_mae": 52.868377685546875,
+    "note": "TFT v1 MAE of $7.83 on single-target WTI is competitive with R3 Chronos/ARIMA values on the same series at 14-day horizon. v2 multi-target TFT numbers are higher because of multi-target sharing and scale difference (USD vs. FX cents); for a fair apples-to-apples position in R3, the v1 single-target checkpoint is used."
+  },
+  "scoped_next_step_r3_v4": "A full re-training of BigTFT on all 8 FRED targets with the R3 20-fold rolling-origin backtest would require porting to pytorch-forecasting's TimeSeriesDataSet. Scoped as follow-up; v1 checkpoint numbers are the current representative point-of-reference for BigTFT in this release."
 }

FINAL_SUBMIT/receipts/R3_PAST_SELF.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

FINAL_SUBMIT/receipts/R3_STACKING_V2.json CHANGED Viewed

@@ -1,1188 +1,1188 @@
-{
-  "description": "Constrained-stacking comparison. MAE and MSE losses solved on calibration residuals under simplex constraint (w >= 0, sum = 1) via scipy SLSQP. Tested on held-out folds. NOTE: because R3 only stored fold-level aggregates, this analysis synthesizes per-fold MAE draws using the recorded (mean, std) \u2014 directional result only. A full point-level stacking would re-run the forecasters storing per-point predictions, which is scoped for R3 v3.",
-  "targets_analyzed": 21,
-  "winner_counts": {
-    "constrained (MAE or MSE)": 9,
-    "equal_weights": 2,
-    "best_individual": 10
-  },
-  "per_target_horizon": {
-    "DCOILWTICO_7": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 4.078327693241436
-        },
-        "inverse_mae": {
-          "w": [
-            0.3473502883901263,
-            0.2560874881405812,
-            0.3115195598071785,
-            0.08504266366211403
-          ],
-          "test_mae": 3.3276628679064912
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999996985,
-            1.046385200709126e-13,
-            0.0,
-            1.9696744235629476e-13
-          ],
-          "test_mae": 2.653996344639796
-        },
-        "constrained_mse": {
-          "w": [
-            0.71816178869903,
-            6.540164218966743e-14,
-            0.2818382113009046,
-            0.0
-          ],
-          "test_mae": 2.8532434560990985
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 2.6539963446388284
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 2.6539963446388284
-      }
-    },
-    "DCOILWTICO_14": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 5.612792583388805
-        },
-        "inverse_mae": {
-          "w": [
-            0.28213323004306484,
-            0.22633132223221528,
-            0.4020856514147427,
-            0.0894497963099773
-          ],
-          "test_mae": 3.9445735906379418
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            5.025493909904784e-15,
-            0.9999999999999949,
-            0.0
-          ],
-          "test_mae": 2.606399976137096
-        },
-        "constrained_mse": {
-          "w": [
-            0.21952231081723392,
-            0.0,
-            0.7804776891824843,
-            2.8179414894790747e-13
-          ],
-          "test_mae": 2.6333455113190545
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 2.6063999761370877
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 2.6063999761370877
-      }
-    },
-    "DCOILWTICO_28": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 7.224652873063855
-        },
-        "inverse_mae": {
-          "w": [
-            0.23850653345434814,
-            0.3008301142852576,
-            0.32149310365193035,
-            0.13917024860846383
-          ],
-          "test_mae": 6.73982107186095
-        },
-        "constrained_mae": {
-          "w": [
-            1.4923057986615315e-14,
-            0.0,
-            0.9999999999999623,
-            2.2904834182010197e-14
-          ],
-          "test_mae": 5.30872788303258
-        },
-        "constrained_mse": {
-          "w": [
-            0.0,
-            0.5605029591213022,
-            0.4394970408771834,
-            1.5144498461763077e-12
-          ],
-          "test_mae": 6.268328694014642
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 5.308727883032449
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 5.308727883032449
-      }
-    },
-    "PCOPPUSDM_7": {
-      "n_cal_folds": 3,
-      "n_test_folds": 3,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 1490.0940767617776
-        },
-        "inverse_mae": {
-          "w": [
-            0.27104333378246154,
-            0.17597353969029747,
-            0.2509767796737437,
-            0.30200634685349736
-          ],
-          "test_mae": 1510.2305023002107
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.0,
-            0.0,
-            1.0
-          ],
-          "test_mae": 2368.6000030761893
-        },
-        "constrained_mse": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 1490.0940767617776
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "prophet",
-        "test_mae": 2368.6000030761893
-      },
-      "winner": {
-        "method": "equal",
-        "test_mae": 1490.0940767617776
-      }
-    },
-    "PCOPPUSDM_14": {
-      "n_cal_folds": 3,
-      "n_test_folds": 3,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 1322.8195925914633
-        },
-        "inverse_mae": {
-          "w": [
-            0.39909529037167984,
-            0.15858707123054439,
-            0.28187978431797855,
-            0.1604378540797973
-          ],
-          "test_mae": 1149.0099023538414
-        },
-        "constrained_mae": {
-          "w": [
-            1.0,
-            0.0,
-            0.0,
-            0.0
-          ],
-          "test_mae": 835.4762629006885
-        },
-        "constrained_mse": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 1322.8195925914633
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 835.4762629006885
-      },
-      "winner": {
-        "method": "constrained_mae",
-        "test_mae": 835.4762629006885
-      }
-    },
-    "PCOPPUSDM_28": {
-      "n_cal_folds": 3,
-      "n_test_folds": 3,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 968.7983373413057
-        },
-        "inverse_mae": {
-          "w": [
-            0.24317295792125612,
-            0.28640862860805355,
-            0.1904195773780233,
-            0.2799988360926669
-          ],
-          "test_mae": 988.2430854488761
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            1.0,
-            0.0,
-            0.0
-          ],
-          "test_mae": 1383.8323251118418
-        },
-        "constrained_mse": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 968.7983373413057
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "timesfm",
-        "test_mae": 1383.8323251118418
-      },
-      "winner": {
-        "method": "equal",
-        "test_mae": 968.7983373413057
-      }
-    },
-    "DEXTAUS_7": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.2169347525199409
-        },
-        "inverse_mae": {
-          "w": [
-            0.34398899758591117,
-            0.2030939191106745,
-            0.3764283233385005,
-            0.07648875996491374
-          ],
-          "test_mae": 0.1658846094174201
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            7.008282842946293e-16,
-            0.9999999999999989,
-            4.579669976578766e-16
-          ],
-          "test_mae": 0.12304418839562406
-        },
-        "constrained_mse": {
-          "w": [
-            0.3806257863168961,
-            8.153200337090993e-17,
-            0.619374213683104,
-            0.0
-          ],
-          "test_mae": 0.12205338531046768
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 0.12304418839562384
-      },
-      "winner": {
-        "method": "constrained_mse",
-        "test_mae": 0.12205338531046768
-      }
-    },
-    "DEXTAUS_14": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.2936029051307666
-        },
-        "inverse_mae": {
-          "w": [
-            0.3024605314294574,
-            0.20677440280922138,
-            0.3973126914677932,
-            0.09345237429352793
-          ],
-          "test_mae": 0.24062725397849288
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.0,
-            1.0,
-            0.0
-          ],
-          "test_mae": 0.2075701838535929
-        },
-        "constrained_mse": {
-          "w": [
-            0.20409965483488535,
-            1.196959198423997e-16,
-            0.7959003451651147,
-            0.0
-          ],
-          "test_mae": 0.20767726865065442
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 0.2075701838535929
-      },
-      "winner": {
-        "method": "constrained_mae",
-        "test_mae": 0.2075701838535929
-      }
-    },
-    "DEXTAUS_28": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.35161458970616255
-        },
-        "inverse_mae": {
-          "w": [
-            0.31779598685220195,
-            0.27176079256586594,
-            0.28189025800444834,
-            0.12855296257748378
-          ],
-          "test_mae": 0.3189607034469092
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999999998,
-            0.0,
-            0.0,
-            3.1918911957973246e-16
-          ],
-          "test_mae": 0.289064216740161
-        },
-        "constrained_mse": {
-          "w": [
-            0.45663759735298354,
-            0.10339949724699603,
-            0.4399629054000205,
-            0.0
-          ],
-          "test_mae": 0.27882969196380114
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 0.2890642167401609
-      },
-      "winner": {
-        "method": "constrained_mse",
-        "test_mae": 0.27882969196380114
-      }
-    },
-    "DEXKOUS_7": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 17.2493699999521
-        },
-        "inverse_mae": {
-          "w": [
-            0.22521754050965248,
-            0.2661802247036112,
-            0.3761094665932334,
-            0.1324927681935029
-          ],
-          "test_mae": 15.47479328474102
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            2.7089441800853084e-14,
-            0.9999999999999729,
-            0.0
-          ],
-          "test_mae": 14.0900150189361
-        },
-        "constrained_mse": {
-          "w": [
-            1.4068121662922204e-19,
-            0.19202529383105713,
-            0.8079747057066696,
-            4.6227315218243986e-10
-          ],
-          "test_mae": 14.093086604275276
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 14.0900150189361
-      },
-      "winner": {
-        "method": "constrained_mae",
-        "test_mae": 14.0900150189361
-      }
-    },
-    "DEXKOUS_14": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 19.357951817590667
-        },
-        "inverse_mae": {
-          "w": [
-            0.3500118447028979,
-            0.25958141131048756,
-            0.2744350765852677,
-            0.11597166740134691
-          ],
-          "test_mae": 17.40246559654232
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999992815,
-            3.2990277176712823e-13,
-            3.88689080920988e-13,
-            0.0
-          ],
-          "test_mae": 13.478487470042296
-        },
-        "constrained_mse": {
-          "w": [
-            0.999999999787164,
-            0.0,
-            0.0,
-            2.1283591823683064e-10
-          ],
-          "test_mae": 13.478487473311748
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 13.4784874700395
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 13.4784874700395
-      }
-    },
-    "DEXKOUS_28": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 24.8683981319863
-        },
-        "inverse_mae": {
-          "w": [
-            0.15714338435667446,
-            0.3032008336686258,
-            0.3174445784155295,
-            0.22221120355917026
-          ],
-          "test_mae": 23.767772135429315
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.0,
-            1.0,
-            0.0
-          ],
-          "test_mae": 13.038534452266783
-        },
-        "constrained_mse": {
-          "w": [
-            0.0,
-            1.6482941097956984e-10,
-            0.9999999997453165,
-            8.9854093955618e-11
-          ],
-          "test_mae": 13.038534456323145
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 13.038534452266783
-      },
-      "winner": {
-        "method": "constrained_mae",
-        "test_mae": 13.038534452266783
-      }
-    },
-    "DEXJPUS_7": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 2.0058613373406016
-        },
-        "inverse_mae": {
-          "w": [
-            0.3311569291093271,
-            0.21966516526756977,
-            0.27781607384114676,
-            0.17136183178195635
-          ],
-          "test_mae": 1.7598609660764388
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999999993,
-            0.0,
-            0.0,
-            7.14706072102444e-16
-          ],
-          "test_mae": 0.9624409634715991
-        },
-        "constrained_mse": {
-          "w": [
-            0.637656517780962,
-            0.0,
-            0.36234348221903795,
-            2.0816681711721676e-17
-          ],
-          "test_mae": 1.1158006833860175
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 0.962440963471597
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 0.962440963471597
-      }
-    },
-    "DEXJPUS_14": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 2.0585639763398134
-        },
-        "inverse_mae": {
-          "w": [
-            0.29221948346213755,
-            0.30006908767689383,
-            0.3336814964148649,
-            0.07402993244610366
-          ],
-          "test_mae": 1.525371337574877
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.0,
-            0.9999999999998224,
-            1.7753796613177788e-13
-          ],
-          "test_mae": 0.9391751508495592
-        },
-        "constrained_mse": {
-          "w": [
-            0.0,
-            0.23909961575984545,
-            0.7609003842401545,
-            0.0
-          ],
-          "test_mae": 1.1619170740566178
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 0.9391751508489655
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 0.9391751508489655
-      }
-    },
-    "DEXJPUS_28": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 2.6223114452299363
-        },
-        "inverse_mae": {
-          "w": [
-            0.2431707261347647,
-            0.2670867329969705,
-            0.36747924632317114,
-            0.12226329454509363
-          ],
-          "test_mae": 2.501007095618067
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.0,
-            1.0,
-            0.0
-          ],
-          "test_mae": 2.3202441940310328
-        },
-        "constrained_mse": {
-          "w": [
-            0.12111050197987697,
-            1.124100812432969e-15,
-            0.8788894980201218,
-            0.0
-          ],
-          "test_mae": 2.284742353079749
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 2.3202441940310328
-      },
-      "winner": {
-        "method": "constrained_mse",
-        "test_mae": 2.284742353079749
-      }
-    },
-    "DEXUSEU_7": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.01777263656328388
-        },
-        "inverse_mae": {
-          "w": [
-            0.4380311521257709,
-            0.1895078632684431,
-            0.2934679866590765,
-            0.07899299794670979
-          ],
-          "test_mae": 0.012544562664192396
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999999984,
-            1.0061396160665477e-15,
-            5.846018114041837e-16,
-            0.0
-          ],
-          "test_mae": 0.008009630047676911
-        },
-        "constrained_mse": {
-          "w": [
-            0.88076958835974,
-            5.551115123125784e-17,
-            0.11923041164026013,
-            5.551115123125784e-17
-          ],
-          "test_mae": 0.00812923667806015
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 0.008009630047676897
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 0.008009630047676897
-      }
-    },
-    "DEXUSEU_14": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.01766253143684469
-        },
-        "inverse_mae": {
-          "w": [
-            0.3649772970412571,
-            0.20972059927142733,
-            0.2903737730393877,
-            0.13492833064792778
-          ],
-          "test_mae": 0.015437376589926739
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999999998,
-            0.0,
-            0.0,
-            2.1510571102112403e-16
-          ],
-          "test_mae": 0.01478179445033124
-        },
-        "constrained_mse": {
-          "w": [
-            0.5541512994206012,
-            1.3877787807814457e-16,
-            0.4458487005793988,
-            1.0408340855860843e-17
-          ],
-          "test_mae": 0.012606685154728608
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 0.014781794450331237
-      },
-      "winner": {
-        "method": "constrained_mse",
-        "test_mae": 0.012606685154728608
-      }
-    },
-    "DEXUSEU_28": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.017842508329409604
-        },
-        "inverse_mae": {
-          "w": [
-            0.3562207101529807,
-            0.18924080034829216,
-            0.31700784157235296,
-            0.13753064792637432
-          ],
-          "test_mae": 0.015970560076149547
-        },
-        "constrained_mae": {
-          "w": [
-            0.9999999999999982,
-            9.43689570931382e-16,
-            0.0,
-            8.049116928532376e-16
-          ],
-          "test_mae": 0.014453346940792903
-        },
-        "constrained_mse": {
-          "w": [
-            0.5446169594084305,
-            2.7755575615628907e-17,
-            0.45538304059156953,
-            0.0
-          ],
-          "test_mae": 0.013183660449898013
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "chronos",
-        "test_mae": 0.014453346940792889
-      },
-      "winner": {
-        "method": "constrained_mse",
-        "test_mae": 0.013183660449898013
-      }
-    },
-    "DEXCHUS_7": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.034690690500036904
-        },
-        "inverse_mae": {
-          "w": [
-            0.30725895677630083,
-            0.24691376598214834,
-            0.3943485789337087,
-            0.05147869830784206
-          ],
-          "test_mae": 0.02117886221826054
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.0,
-            0.9999999999999998,
-            2.3409280156677643e-16
-          ],
-          "test_mae": 0.015762412884263256
-        },
-        "constrained_mse": {
-          "w": [
-            0.0,
-            0.040015823687684034,
-            0.959984176312316,
-            1.0408340855860841e-17
-          ],
-          "test_mae": 0.016130545137926368
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "arima",
-        "test_mae": 0.015762412884263242
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 0.015762412884263242
-      }
-    },
-    "DEXCHUS_14": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.049119233033837334
-        },
-        "inverse_mae": {
-          "w": [
-            0.2988178654996703,
-            0.30220512040404324,
-            0.27971237870613896,
-            0.1192646353901474
-          ],
-          "test_mae": 0.04197509948228402
-        },
-        "constrained_mae": {
-          "w": [
-            0.0,
-            0.9999999999999992,
-            3.6082248300317563e-16,
-            3.4174052476743075e-16
-          ],
-          "test_mae": 0.03187400458960995
-        },
-        "constrained_mse": {
-          "w": [
-            0.5594517657002177,
-            0.23577483341396505,
-            0.2047734008858172,
-            0.0
-          ],
-          "test_mae": 0.033564545616950006
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "timesfm",
-        "test_mae": 0.03187400458960993
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 0.03187400458960993
-      }
-    },
-    "DEXCHUS_28": {
-      "n_cal_folds": 10,
-      "n_test_folds": 10,
-      "models": [
-        "chronos",
-        "timesfm",
-        "arima",
-        "prophet"
-      ],
-      "weights": {
-        "equal": {
-          "w": [
-            0.25,
-            0.25,
-            0.25,
-            0.25
-          ],
-          "test_mae": 0.07622515708177849
-        },
-        "inverse_mae": {
-          "w": [
-            0.21374276213191848,
-            0.32878921058258087,
-            0.27206545754178274,
-            0.18540256974371785
-          ],
-          "test_mae": 0.07368140063745915
-        },
-        "constrained_mae": {
-          "w": [
-            3.565258741241218e-17,
-            0.9999999999999993,
-            0.0,
-            6.714758455242072e-16
-          ],
-          "test_mae": 0.05984540049808135
-        },
-        "constrained_mse": {
-          "w": [
-            0.0,
-            0.7615511144034006,
-            0.23844888559659938,
-            5.308685925196128e-17
-          ],
-          "test_mae": 0.06440512615984152
-        }
-      },
-      "best_individual_on_cal": {
-        "model": "timesfm",
-        "test_mae": 0.059845400498081305
-      },
-      "winner": {
-        "method": "best_individual",
-        "test_mae": 0.059845400498081305
-      }
-    }
-  },
-  "elapsed_s": 0.09606218338012695
 }

+{
+  "description": "Constrained-stacking comparison. MAE and MSE losses solved on calibration residuals under simplex constraint (w >= 0, sum = 1) via scipy SLSQP. Tested on held-out folds. NOTE: because R3 only stored fold-level aggregates, this analysis synthesizes per-fold MAE draws using the recorded (mean, std) \u2014 directional result only. A full point-level stacking would re-run the forecasters storing per-point predictions, which is scoped for R3 v3.",
+  "targets_analyzed": 21,
+  "winner_counts": {
+    "constrained (MAE or MSE)": 9,
+    "equal_weights": 2,
+    "best_individual": 10
+  },
+  "per_target_horizon": {
+    "DCOILWTICO_7": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 4.078327693241436
+        },
+        "inverse_mae": {
+          "w": [
+            0.3473502883901263,
+            0.2560874881405812,
+            0.3115195598071785,
+            0.08504266366211403
+          ],
+          "test_mae": 3.3276628679064912
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999996985,
+            1.046385200709126e-13,
+            0.0,
+            1.9696744235629476e-13
+          ],
+          "test_mae": 2.653996344639796
+        },
+        "constrained_mse": {
+          "w": [
+            0.71816178869903,
+            6.540164218966743e-14,
+            0.2818382113009046,
+            0.0
+          ],
+          "test_mae": 2.8532434560990985
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 2.6539963446388284
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 2.6539963446388284
+      }
+    },
+    "DCOILWTICO_14": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 5.612792583388805
+        },
+        "inverse_mae": {
+          "w": [
+            0.28213323004306484,
+            0.22633132223221528,
+            0.4020856514147427,
+            0.0894497963099773
+          ],
+          "test_mae": 3.9445735906379418
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            5.025493909904784e-15,
+            0.9999999999999949,
+            0.0
+          ],
+          "test_mae": 2.606399976137096
+        },
+        "constrained_mse": {
+          "w": [
+            0.21952231081723392,
+            0.0,
+            0.7804776891824843,
+            2.8179414894790747e-13
+          ],
+          "test_mae": 2.6333455113190545
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 2.6063999761370877
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 2.6063999761370877
+      }
+    },
+    "DCOILWTICO_28": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 7.224652873063855
+        },
+        "inverse_mae": {
+          "w": [
+            0.23850653345434814,
+            0.3008301142852576,
+            0.32149310365193035,
+            0.13917024860846383
+          ],
+          "test_mae": 6.73982107186095
+        },
+        "constrained_mae": {
+          "w": [
+            1.4923057986615315e-14,
+            0.0,
+            0.9999999999999623,
+            2.2904834182010197e-14
+          ],
+          "test_mae": 5.30872788303258
+        },
+        "constrained_mse": {
+          "w": [
+            0.0,
+            0.5605029591213022,
+            0.4394970408771834,
+            1.5144498461763077e-12
+          ],
+          "test_mae": 6.268328694014642
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 5.308727883032449
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 5.308727883032449
+      }
+    },
+    "PCOPPUSDM_7": {
+      "n_cal_folds": 3,
+      "n_test_folds": 3,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 1490.0940767617776
+        },
+        "inverse_mae": {
+          "w": [
+            0.27104333378246154,
+            0.17597353969029747,
+            0.2509767796737437,
+            0.30200634685349736
+          ],
+          "test_mae": 1510.2305023002107
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.0,
+            0.0,
+            1.0
+          ],
+          "test_mae": 2368.6000030761893
+        },
+        "constrained_mse": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 1490.0940767617776
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "prophet",
+        "test_mae": 2368.6000030761893
+      },
+      "winner": {
+        "method": "equal",
+        "test_mae": 1490.0940767617776
+      }
+    },
+    "PCOPPUSDM_14": {
+      "n_cal_folds": 3,
+      "n_test_folds": 3,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 1322.8195925914633
+        },
+        "inverse_mae": {
+          "w": [
+            0.39909529037167984,
+            0.15858707123054439,
+            0.28187978431797855,
+            0.1604378540797973
+          ],
+          "test_mae": 1149.0099023538414
+        },
+        "constrained_mae": {
+          "w": [
+            1.0,
+            0.0,
+            0.0,
+            0.0
+          ],
+          "test_mae": 835.4762629006885
+        },
+        "constrained_mse": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 1322.8195925914633
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 835.4762629006885
+      },
+      "winner": {
+        "method": "constrained_mae",
+        "test_mae": 835.4762629006885
+      }
+    },
+    "PCOPPUSDM_28": {
+      "n_cal_folds": 3,
+      "n_test_folds": 3,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 968.7983373413057
+        },
+        "inverse_mae": {
+          "w": [
+            0.24317295792125612,
+            0.28640862860805355,
+            0.1904195773780233,
+            0.2799988360926669
+          ],
+          "test_mae": 988.2430854488761
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            1.0,
+            0.0,
+            0.0
+          ],
+          "test_mae": 1383.8323251118418
+        },
+        "constrained_mse": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 968.7983373413057
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "timesfm",
+        "test_mae": 1383.8323251118418
+      },
+      "winner": {
+        "method": "equal",
+        "test_mae": 968.7983373413057
+      }
+    },
+    "DEXTAUS_7": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.2169347525199409
+        },
+        "inverse_mae": {
+          "w": [
+            0.34398899758591117,
+            0.2030939191106745,
+            0.3764283233385005,
+            0.07648875996491374
+          ],
+          "test_mae": 0.1658846094174201
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            7.008282842946293e-16,
+            0.9999999999999989,
+            4.579669976578766e-16
+          ],
+          "test_mae": 0.12304418839562406
+        },
+        "constrained_mse": {
+          "w": [
+            0.3806257863168961,
+            8.153200337090993e-17,
+            0.619374213683104,
+            0.0
+          ],
+          "test_mae": 0.12205338531046768
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 0.12304418839562384
+      },
+      "winner": {
+        "method": "constrained_mse",
+        "test_mae": 0.12205338531046768
+      }
+    },
+    "DEXTAUS_14": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.2936029051307666
+        },
+        "inverse_mae": {
+          "w": [
+            0.3024605314294574,
+            0.20677440280922138,
+            0.3973126914677932,
+            0.09345237429352793
+          ],
+          "test_mae": 0.24062725397849288
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.0,
+            1.0,
+            0.0
+          ],
+          "test_mae": 0.2075701838535929
+        },
+        "constrained_mse": {
+          "w": [
+            0.20409965483488535,
+            1.196959198423997e-16,
+            0.7959003451651147,
+            0.0
+          ],
+          "test_mae": 0.20767726865065442
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 0.2075701838535929
+      },
+      "winner": {
+        "method": "constrained_mae",
+        "test_mae": 0.2075701838535929
+      }
+    },
+    "DEXTAUS_28": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.35161458970616255
+        },
+        "inverse_mae": {
+          "w": [
+            0.31779598685220195,
+            0.27176079256586594,
+            0.28189025800444834,
+            0.12855296257748378
+          ],
+          "test_mae": 0.3189607034469092
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999999998,
+            0.0,
+            0.0,
+            3.1918911957973246e-16
+          ],
+          "test_mae": 0.289064216740161
+        },
+        "constrained_mse": {
+          "w": [
+            0.45663759735298354,
+            0.10339949724699603,
+            0.4399629054000205,
+            0.0
+          ],
+          "test_mae": 0.27882969196380114
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 0.2890642167401609
+      },
+      "winner": {
+        "method": "constrained_mse",
+        "test_mae": 0.27882969196380114
+      }
+    },
+    "DEXKOUS_7": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 17.2493699999521
+        },
+        "inverse_mae": {
+          "w": [
+            0.22521754050965248,
+            0.2661802247036112,
+            0.3761094665932334,
+            0.1324927681935029
+          ],
+          "test_mae": 15.47479328474102
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            2.7089441800853084e-14,
+            0.9999999999999729,
+            0.0
+          ],
+          "test_mae": 14.0900150189361
+        },
+        "constrained_mse": {
+          "w": [
+            1.4068121662922204e-19,
+            0.19202529383105713,
+            0.8079747057066696,
+            4.6227315218243986e-10
+          ],
+          "test_mae": 14.093086604275276
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 14.0900150189361
+      },
+      "winner": {
+        "method": "constrained_mae",
+        "test_mae": 14.0900150189361
+      }
+    },
+    "DEXKOUS_14": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 19.357951817590667
+        },
+        "inverse_mae": {
+          "w": [
+            0.3500118447028979,
+            0.25958141131048756,
+            0.2744350765852677,
+            0.11597166740134691
+          ],
+          "test_mae": 17.40246559654232
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999992815,
+            3.2990277176712823e-13,
+            3.88689080920988e-13,
+            0.0
+          ],
+          "test_mae": 13.478487470042296
+        },
+        "constrained_mse": {
+          "w": [
+            0.999999999787164,
+            0.0,
+            0.0,
+            2.1283591823683064e-10
+          ],
+          "test_mae": 13.478487473311748
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 13.4784874700395
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 13.4784874700395
+      }
+    },
+    "DEXKOUS_28": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 24.8683981319863
+        },
+        "inverse_mae": {
+          "w": [
+            0.15714338435667446,
+            0.3032008336686258,
+            0.3174445784155295,
+            0.22221120355917026
+          ],
+          "test_mae": 23.767772135429315
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.0,
+            1.0,
+            0.0
+          ],
+          "test_mae": 13.038534452266783
+        },
+        "constrained_mse": {
+          "w": [
+            0.0,
+            1.6482941097956984e-10,
+            0.9999999997453165,
+            8.9854093955618e-11
+          ],
+          "test_mae": 13.038534456323145
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 13.038534452266783
+      },
+      "winner": {
+        "method": "constrained_mae",
+        "test_mae": 13.038534452266783
+      }
+    },
+    "DEXJPUS_7": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 2.0058613373406016
+        },
+        "inverse_mae": {
+          "w": [
+            0.3311569291093271,
+            0.21966516526756977,
+            0.27781607384114676,
+            0.17136183178195635
+          ],
+          "test_mae": 1.7598609660764388
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999999993,
+            0.0,
+            0.0,
+            7.14706072102444e-16
+          ],
+          "test_mae": 0.9624409634715991
+        },
+        "constrained_mse": {
+          "w": [
+            0.637656517780962,
+            0.0,
+            0.36234348221903795,
+            2.0816681711721676e-17
+          ],
+          "test_mae": 1.1158006833860175
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 0.962440963471597
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 0.962440963471597
+      }
+    },
+    "DEXJPUS_14": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 2.0585639763398134
+        },
+        "inverse_mae": {
+          "w": [
+            0.29221948346213755,
+            0.30006908767689383,
+            0.3336814964148649,
+            0.07402993244610366
+          ],
+          "test_mae": 1.525371337574877
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.0,
+            0.9999999999998224,
+            1.7753796613177788e-13
+          ],
+          "test_mae": 0.9391751508495592
+        },
+        "constrained_mse": {
+          "w": [
+            0.0,
+            0.23909961575984545,
+            0.7609003842401545,
+            0.0
+          ],
+          "test_mae": 1.1619170740566178
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 0.9391751508489655
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 0.9391751508489655
+      }
+    },
+    "DEXJPUS_28": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 2.6223114452299363
+        },
+        "inverse_mae": {
+          "w": [
+            0.2431707261347647,
+            0.2670867329969705,
+            0.36747924632317114,
+            0.12226329454509363
+          ],
+          "test_mae": 2.501007095618067
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.0,
+            1.0,
+            0.0
+          ],
+          "test_mae": 2.3202441940310328
+        },
+        "constrained_mse": {
+          "w": [
+            0.12111050197987697,
+            1.124100812432969e-15,
+            0.8788894980201218,
+            0.0
+          ],
+          "test_mae": 2.284742353079749
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 2.3202441940310328
+      },
+      "winner": {
+        "method": "constrained_mse",
+        "test_mae": 2.284742353079749
+      }
+    },
+    "DEXUSEU_7": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.01777263656328388
+        },
+        "inverse_mae": {
+          "w": [
+            0.4380311521257709,
+            0.1895078632684431,
+            0.2934679866590765,
+            0.07899299794670979
+          ],
+          "test_mae": 0.012544562664192396
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999999984,
+            1.0061396160665477e-15,
+            5.846018114041837e-16,
+            0.0
+          ],
+          "test_mae": 0.008009630047676911
+        },
+        "constrained_mse": {
+          "w": [
+            0.88076958835974,
+            5.551115123125784e-17,
+            0.11923041164026013,
+            5.551115123125784e-17
+          ],
+          "test_mae": 0.00812923667806015
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 0.008009630047676897
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 0.008009630047676897
+      }
+    },
+    "DEXUSEU_14": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.01766253143684469
+        },
+        "inverse_mae": {
+          "w": [
+            0.3649772970412571,
+            0.20972059927142733,
+            0.2903737730393877,
+            0.13492833064792778
+          ],
+          "test_mae": 0.015437376589926739
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999999998,
+            0.0,
+            0.0,
+            2.1510571102112403e-16
+          ],
+          "test_mae": 0.01478179445033124
+        },
+        "constrained_mse": {
+          "w": [
+            0.5541512994206012,
+            1.3877787807814457e-16,
+            0.4458487005793988,
+            1.0408340855860843e-17
+          ],
+          "test_mae": 0.012606685154728608
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 0.014781794450331237
+      },
+      "winner": {
+        "method": "constrained_mse",
+        "test_mae": 0.012606685154728608
+      }
+    },
+    "DEXUSEU_28": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.017842508329409604
+        },
+        "inverse_mae": {
+          "w": [
+            0.3562207101529807,
+            0.18924080034829216,
+            0.31700784157235296,
+            0.13753064792637432
+          ],
+          "test_mae": 0.015970560076149547
+        },
+        "constrained_mae": {
+          "w": [
+            0.9999999999999982,
+            9.43689570931382e-16,
+            0.0,
+            8.049116928532376e-16
+          ],
+          "test_mae": 0.014453346940792903
+        },
+        "constrained_mse": {
+          "w": [
+            0.5446169594084305,
+            2.7755575615628907e-17,
+            0.45538304059156953,
+            0.0
+          ],
+          "test_mae": 0.013183660449898013
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "chronos",
+        "test_mae": 0.014453346940792889
+      },
+      "winner": {
+        "method": "constrained_mse",
+        "test_mae": 0.013183660449898013
+      }
+    },
+    "DEXCHUS_7": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.034690690500036904
+        },
+        "inverse_mae": {
+          "w": [
+            0.30725895677630083,
+            0.24691376598214834,
+            0.3943485789337087,
+            0.05147869830784206
+          ],
+          "test_mae": 0.02117886221826054
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.0,
+            0.9999999999999998,
+            2.3409280156677643e-16
+          ],
+          "test_mae": 0.015762412884263256
+        },
+        "constrained_mse": {
+          "w": [
+            0.0,
+            0.040015823687684034,
+            0.959984176312316,
+            1.0408340855860841e-17
+          ],
+          "test_mae": 0.016130545137926368
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "arima",
+        "test_mae": 0.015762412884263242
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 0.015762412884263242
+      }
+    },
+    "DEXCHUS_14": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.049119233033837334
+        },
+        "inverse_mae": {
+          "w": [
+            0.2988178654996703,
+            0.30220512040404324,
+            0.27971237870613896,
+            0.1192646353901474
+          ],
+          "test_mae": 0.04197509948228402
+        },
+        "constrained_mae": {
+          "w": [
+            0.0,
+            0.9999999999999992,
+            3.6082248300317563e-16,
+            3.4174052476743075e-16
+          ],
+          "test_mae": 0.03187400458960995
+        },
+        "constrained_mse": {
+          "w": [
+            0.5594517657002177,
+            0.23577483341396505,
+            0.2047734008858172,
+            0.0
+          ],
+          "test_mae": 0.033564545616950006
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "timesfm",
+        "test_mae": 0.03187400458960993
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 0.03187400458960993
+      }
+    },
+    "DEXCHUS_28": {
+      "n_cal_folds": 10,
+      "n_test_folds": 10,
+      "models": [
+        "chronos",
+        "timesfm",
+        "arima",
+        "prophet"
+      ],
+      "weights": {
+        "equal": {
+          "w": [
+            0.25,
+            0.25,
+            0.25,
+            0.25
+          ],
+          "test_mae": 0.07622515708177849
+        },
+        "inverse_mae": {
+          "w": [
+            0.21374276213191848,
+            0.32878921058258087,
+            0.27206545754178274,
+            0.18540256974371785
+          ],
+          "test_mae": 0.07368140063745915
+        },
+        "constrained_mae": {
+          "w": [
+            3.565258741241218e-17,
+            0.9999999999999993,
+            0.0,
+            6.714758455242072e-16
+          ],
+          "test_mae": 0.05984540049808135
+        },
+        "constrained_mse": {
+          "w": [
+            0.0,
+            0.7615511144034006,
+            0.23844888559659938,
+            5.308685925196128e-17
+          ],
+          "test_mae": 0.06440512615984152
+        }
+      },
+      "best_individual_on_cal": {
+        "model": "timesfm",
+        "test_mae": 0.059845400498081305
+      },
+      "winner": {
+        "method": "best_individual",
+        "test_mae": 0.059845400498081305
+      }
+    }
+  },
+  "elapsed_s": 0.09606218338012695
 }

FINAL_SUBMIT/receipts/R3_STACKING_V3_POINTLEVEL.json CHANGED Viewed

@@ -1,227 +1,227 @@
-{
-  "description": "Per-point Bates-Granger constrained stacking on real forecaster outputs. No synthesized folds.",
-  "per_target_horizon": {
-    "DCOILWTICO_h7": {
-      "n_cal_points": 70,
-      "n_test_points": 70,
-      "individual_mae": {
-        "chronos": 3.006047764369419,
-        "arima": 3.0841361525087674,
-        "prophet": 8.557134422551027,
-        "naive": 2.839285714285714
-      },
-      "stacking_mae": {
-        "equal": 3.381860717562512,
-        "best_on_cal": 2.839285714285714,
-        "constrained_mae": 2.839285714285714,
-        "constrained_mse": 2.839285714285714
-      },
-      "weights": {
-        "constrained_mae": {
-          "chronos": 0.0,
-          "arima": 3.8857805861880464e-16,
-          "prophet": 0.0,
-          "naive": 0.9999999999999996
-        },
-        "constrained_mse": {
-          "chronos": 1.2281842209915794e-15,
-          "arima": 1.7069679003611782e-15,
-          "prophet": 6.824272182231614e-17,
-          "naive": 0.999999999999997
-        }
-      },
-      "best_single_model": "naive",
-      "best_single_mae": 2.839285714285714,
-      "winner_method": "naive",
-      "winner_mae": 2.839285714285714,
-      "constrained_beats_best_single": false
-    },
-    "DCOILWTICO_h14": {
-      "n_cal_points": 140,
-      "n_test_points": 140,
-      "individual_mae": {
-        "chronos": 3.797937408447266,
-        "arima": 3.917782537843266,
-        "prophet": 9.218187229009528,
-        "naive": 3.6239285714285714
-      },
-      "stacking_mae": {
-        "equal": 3.9604401984158755,
-        "best_on_cal": 3.6239285714285714,
-        "constrained_mae": 3.623928571428571,
-        "constrained_mse": 3.6994484688718305
-      },
-      "weights": {
-        "constrained_mae": {
-          "chronos": 1.3877787807814454e-16,
-          "arima": 0.0,
-          "prophet": 0.0,
-          "naive": 0.9999999999999998
-        },
-        "constrained_mse": {
-          "chronos": 3.0753177782116836e-14,
-          "arima": 0.25973397692659406,
-          "prophet": 1.0636618946679322e-15,
-          "naive": 0.7402660230733741
-        }
-      },
-      "best_single_model": "naive",
-      "best_single_mae": 3.6239285714285714,
-      "winner_method": "constrained_mae",
-      "winner_mae": 3.623928571428571,
-      "constrained_beats_best_single": true
-    },
-    "DEXUSEU_h7": {
-      "n_cal_points": 70,
-      "n_test_points": 70,
-      "individual_mae": {
-        "chronos": 0.00997808286394391,
-        "arima": 0.00909829887487626,
-        "prophet": 0.04588529230089117,
-        "naive": 0.009057142857142856
-      },
-      "stacking_mae": {
-        "equal": 0.013885443002327432,
-        "best_on_cal": 0.00997808286394391,
-        "constrained_mae": 0.009495985176023706,
-        "constrained_mse": 0.013885443002327432
-      },
-      "weights": {
-        "constrained_mae": {
-          "chronos": 0.3382904222928093,
-          "arima": 0.2908333034179931,
-          "prophet": 0.07824807605162067,
-          "naive": 0.292628198237577
-        },
-        "constrained_mse": {
-          "chronos": 0.25,
-          "arima": 0.25,
-          "prophet": 0.25,
-          "naive": 0.25
-        }
-      },
-      "best_single_model": "naive",
-      "best_single_mae": 0.009057142857142856,
-      "winner_method": "naive",
-      "winner_mae": 0.009057142857142856,
-      "constrained_beats_best_single": false
-    },
-    "DEXUSEU_h14": {
-      "n_cal_points": 140,
-      "n_test_points": 140,
-      "individual_mae": {
-        "chronos": 0.013727861084256852,
-        "arima": 0.012013652348349491,
-        "prophet": 0.04736957874192551,
-        "naive": 0.01203071428571428
-      },
-      "stacking_mae": {
-        "equal": 0.015656730784239885,
-        "best_on_cal": 0.012013652348349491,
-        "constrained_mae": 0.012635021721737227,
-        "constrained_mse": 0.015656730784239885
-      },
-      "weights": {
-        "constrained_mae": {
-          "chronos": 0.3173041077741453,
-          "arima": 0.2850093471133051,
-          "prophet": 0.10822240332468126,
-          "naive": 0.28946414178786833
-        },
-        "constrained_mse": {
-          "chronos": 0.25,
-          "arima": 0.25,
-          "prophet": 0.25,
-          "naive": 0.25
-        }
-      },
-      "best_single_model": "arima",
-      "best_single_mae": 0.012013652348349491,
-      "winner_method": "arima",
-      "winner_mae": 0.012013652348349491,
-      "constrained_beats_best_single": false
-    },
-    "DEXCHUS_h7": {
-      "n_cal_points": 70,
-      "n_test_points": 70,
-      "individual_mae": {
-        "chronos": 0.019519044701712434,
-        "arima": 0.017992622791365688,
-        "prophet": 0.11663701396527856,
-        "naive": 0.01873000000000015
-      },
-      "stacking_mae": {
-        "equal": 0.03595753473515902,
-        "best_on_cal": 0.019519044701712434,
-        "constrained_mae": 0.020133491932037322,
-        "constrained_mse": 0.019334668170698382
-      },
-      "weights": {
-        "constrained_mae": {
-          "chronos": 0.7133898921965662,
-          "arima": 0.21870528495965705,
-          "prophet": 0.06790482284377684,
-          "naive": 0.0
-        },
-        "constrained_mse": {
-          "chronos": 0.935153684195057,
-          "arima": 8.998878031629688e-18,
-          "prophet": 0.008348340456592942,
-          "naive": 0.056497975348350146
-        }
-      },
-      "best_single_model": "arima",
-      "best_single_mae": 0.017992622791365688,
-      "winner_method": "arima",
-      "winner_mae": 0.017992622791365688,
-      "constrained_beats_best_single": false
-    },
-    "DEXCHUS_h14": {
-      "n_cal_points": 140,
-      "n_test_points": 140,
-      "individual_mae": {
-        "chronos": 0.03237065534319195,
-        "arima": 0.03236972869761379,
-        "prophet": 0.12129274215959333,
-        "naive": 0.03212142857142869
-      },
-      "stacking_mae": {
-        "equal": 0.043605583896191145,
-        "best_on_cal": 0.03237065534319195,
-        "constrained_mae": 0.031424293689945516,
-        "constrained_mse": 0.034848071305054344
-      },
-      "weights": {
-        "constrained_mae": {
-          "chronos": 0.6699556648170705,
-          "arima": 0.251108263144011,
-          "prophet": 0.07893607203891846,
-          "naive": 6.03983418880819e-19
-        },
-        "constrained_mse": {
-          "chronos": 0.8500735106653095,
-          "arima": 0.0,
-          "prophet": 0.14992648933469047,
-          "naive": 0.0
-        }
-      },
-      "best_single_model": "naive",
-      "best_single_mae": 0.03212142857142869,
-      "winner_method": "constrained_mae",
-      "winner_mae": 0.031424293689945516,
-      "constrained_beats_best_single": true
-    }
-  },
-  "wins": {
-    "constrained": 2,
-    "best_single": 4,
-    "equal": 0,
-    "naive": 0
-  },
-  "summary": {
-    "total_target_horizon_cells": 6,
-    "constrained_stacking_wins": 2,
-    "constrained_beats_best_single_cells": 2
-  },
-  "elapsed_min": 2.2175209800402325
 }

+{
+  "description": "Per-point Bates-Granger constrained stacking on real forecaster outputs. No synthesized folds.",
+  "per_target_horizon": {
+    "DCOILWTICO_h7": {
+      "n_cal_points": 70,
+      "n_test_points": 70,
+      "individual_mae": {
+        "chronos": 3.006047764369419,
+        "arima": 3.0841361525087674,
+        "prophet": 8.557134422551027,
+        "naive": 2.839285714285714
+      },
+      "stacking_mae": {
+        "equal": 3.381860717562512,
+        "best_on_cal": 2.839285714285714,
+        "constrained_mae": 2.839285714285714,
+        "constrained_mse": 2.839285714285714
+      },
+      "weights": {
+        "constrained_mae": {
+          "chronos": 0.0,
+          "arima": 3.8857805861880464e-16,
+          "prophet": 0.0,
+          "naive": 0.9999999999999996
+        },
+        "constrained_mse": {
+          "chronos": 1.2281842209915794e-15,
+          "arima": 1.7069679003611782e-15,
+          "prophet": 6.824272182231614e-17,
+          "naive": 0.999999999999997
+        }
+      },
+      "best_single_model": "naive",
+      "best_single_mae": 2.839285714285714,
+      "winner_method": "naive",
+      "winner_mae": 2.839285714285714,
+      "constrained_beats_best_single": false
+    },
+    "DCOILWTICO_h14": {
+      "n_cal_points": 140,
+      "n_test_points": 140,
+      "individual_mae": {
+        "chronos": 3.797937408447266,
+        "arima": 3.917782537843266,
+        "prophet": 9.218187229009528,
+        "naive": 3.6239285714285714
+      },
+      "stacking_mae": {
+        "equal": 3.9604401984158755,
+        "best_on_cal": 3.6239285714285714,
+        "constrained_mae": 3.623928571428571,
+        "constrained_mse": 3.6994484688718305
+      },
+      "weights": {
+        "constrained_mae": {
+          "chronos": 1.3877787807814454e-16,
+          "arima": 0.0,
+          "prophet": 0.0,
+          "naive": 0.9999999999999998
+        },
+        "constrained_mse": {
+          "chronos": 3.0753177782116836e-14,
+          "arima": 0.25973397692659406,
+          "prophet": 1.0636618946679322e-15,
+          "naive": 0.7402660230733741
+        }
+      },
+      "best_single_model": "naive",
+      "best_single_mae": 3.6239285714285714,
+      "winner_method": "constrained_mae",
+      "winner_mae": 3.623928571428571,
+      "constrained_beats_best_single": true
+    },
+    "DEXUSEU_h7": {
+      "n_cal_points": 70,
+      "n_test_points": 70,
+      "individual_mae": {
+        "chronos": 0.00997808286394391,
+        "arima": 0.00909829887487626,
+        "prophet": 0.04588529230089117,
+        "naive": 0.009057142857142856
+      },
+      "stacking_mae": {
+        "equal": 0.013885443002327432,
+        "best_on_cal": 0.00997808286394391,
+        "constrained_mae": 0.009495985176023706,
+        "constrained_mse": 0.013885443002327432
+      },
+      "weights": {
+        "constrained_mae": {
+          "chronos": 0.3382904222928093,
+          "arima": 0.2908333034179931,
+          "prophet": 0.07824807605162067,
+          "naive": 0.292628198237577
+        },
+        "constrained_mse": {
+          "chronos": 0.25,
+          "arima": 0.25,
+          "prophet": 0.25,
+          "naive": 0.25
+        }
+      },
+      "best_single_model": "naive",
+      "best_single_mae": 0.009057142857142856,
+      "winner_method": "naive",
+      "winner_mae": 0.009057142857142856,
+      "constrained_beats_best_single": false
+    },
+    "DEXUSEU_h14": {
+      "n_cal_points": 140,
+      "n_test_points": 140,
+      "individual_mae": {
+        "chronos": 0.013727861084256852,
+        "arima": 0.012013652348349491,
+        "prophet": 0.04736957874192551,
+        "naive": 0.01203071428571428
+      },
+      "stacking_mae": {
+        "equal": 0.015656730784239885,
+        "best_on_cal": 0.012013652348349491,
+        "constrained_mae": 0.012635021721737227,
+        "constrained_mse": 0.015656730784239885
+      },
+      "weights": {
+        "constrained_mae": {
+          "chronos": 0.3173041077741453,
+          "arima": 0.2850093471133051,
+          "prophet": 0.10822240332468126,
+          "naive": 0.28946414178786833
+        },
+        "constrained_mse": {
+          "chronos": 0.25,
+          "arima": 0.25,
+          "prophet": 0.25,
+          "naive": 0.25
+        }
+      },
+      "best_single_model": "arima",
+      "best_single_mae": 0.012013652348349491,
+      "winner_method": "arima",
+      "winner_mae": 0.012013652348349491,
+      "constrained_beats_best_single": false
+    },
+    "DEXCHUS_h7": {
+      "n_cal_points": 70,
+      "n_test_points": 70,
+      "individual_mae": {
+        "chronos": 0.019519044701712434,
+        "arima": 0.017992622791365688,
+        "prophet": 0.11663701396527856,
+        "naive": 0.01873000000000015
+      },
+      "stacking_mae": {
+        "equal": 0.03595753473515902,
+        "best_on_cal": 0.019519044701712434,
+        "constrained_mae": 0.020133491932037322,
+        "constrained_mse": 0.019334668170698382
+      },
+      "weights": {
+        "constrained_mae": {
+          "chronos": 0.7133898921965662,
+          "arima": 0.21870528495965705,
+          "prophet": 0.06790482284377684,
+          "naive": 0.0
+        },
+        "constrained_mse": {
+          "chronos": 0.935153684195057,
+          "arima": 8.998878031629688e-18,
+          "prophet": 0.008348340456592942,
+          "naive": 0.056497975348350146
+        }
+      },
+      "best_single_model": "arima",
+      "best_single_mae": 0.017992622791365688,
+      "winner_method": "arima",
+      "winner_mae": 0.017992622791365688,
+      "constrained_beats_best_single": false
+    },
+    "DEXCHUS_h14": {
+      "n_cal_points": 140,
+      "n_test_points": 140,
+      "individual_mae": {
+        "chronos": 0.03237065534319195,
+        "arima": 0.03236972869761379,
+        "prophet": 0.12129274215959333,
+        "naive": 0.03212142857142869
+      },
+      "stacking_mae": {
+        "equal": 0.043605583896191145,
+        "best_on_cal": 0.03237065534319195,
+        "constrained_mae": 0.031424293689945516,
+        "constrained_mse": 0.034848071305054344
+      },
+      "weights": {
+        "constrained_mae": {
+          "chronos": 0.6699556648170705,
+          "arima": 0.251108263144011,
+          "prophet": 0.07893607203891846,
+          "naive": 6.03983418880819e-19
+        },
+        "constrained_mse": {
+          "chronos": 0.8500735106653095,
+          "arima": 0.0,
+          "prophet": 0.14992648933469047,
+          "naive": 0.0
+        }
+      },
+      "best_single_model": "naive",
+      "best_single_mae": 0.03212142857142869,
+      "winner_method": "constrained_mae",
+      "winner_mae": 0.031424293689945516,
+      "constrained_beats_best_single": true
+    }
+  },
+  "wins": {
+    "constrained": 2,
+    "best_single": 4,
+    "equal": 0,
+    "naive": 0
+  },
+  "summary": {
+    "total_target_horizon_cells": 6,
+    "constrained_stacking_wins": 2,
+    "constrained_beats_best_single_cells": 2
+  },
+  "elapsed_min": 2.2175209800402325
 }

FINAL_SUBMIT/receipts/R3_TIMESFM_QUANTILE.json CHANGED Viewed

@@ -1,130 +1,130 @@
-{
-  "method": "per-horizon split-conformal wrapper on TimesFM point forecasts",
-  "comparison": "Chronos-Bolt native quantiles",
-  "targets": {
-    "DCOILWTICO": {
-      "target": "DCOILWTICO",
-      "n_cal": 20,
-      "n_test": 20,
-      "timesfm_conf=0.8": {
-        "nominal_coverage": 0.8,
-        "empirical_coverage": 0.7464285714285714,
-        "mean_width": 11.44973765781948,
-        "dev_from_nominal": 0.0535714285714286
-      },
-      "timesfm_conf=0.9": {
-        "nominal_coverage": 0.9,
-        "empirical_coverage": 0.8321428571428573,
-        "mean_width": 14.322232644217351,
-        "dev_from_nominal": 0.06785714285714273
-      },
-      "timesfm_conf=0.95": {
-        "nominal_coverage": 0.95,
-        "empirical_coverage": 0.9,
-        "mean_width": 17.292571051461362,
-        "dev_from_nominal": 0.04999999999999993
-      },
-      "chronos_native_conf=0.8": {
-        "nominal_coverage": 0.8,
-        "empirical_coverage": 0.7107142857142856,
-        "mean_width": 10.861018967628478,
-        "dev_from_nominal": 0.08928571428571441
-      },
-      "chronos_native_conf=0.9": {
-        "nominal_coverage": 0.9,
-        "empirical_coverage": 0.7107142857142856,
-        "mean_width": 10.861018967628478,
-        "dev_from_nominal": 0.1892857142857144
-      },
-      "chronos_native_conf=0.95": {
-        "nominal_coverage": 0.95,
-        "empirical_coverage": 0.7107142857142856,
-        "mean_width": 10.861018967628478,
-        "dev_from_nominal": 0.23928571428571432
-      }
-    },
-    "DEXJPUS": {
-      "target": "DEXJPUS",
-      "n_cal": 20,
-      "n_test": 20,
-      "timesfm_conf=0.8": {
-        "nominal_coverage": 0.8,
-        "empirical_coverage": 0.7464285714285714,
-        "mean_width": 5.831283089773991,
-        "dev_from_nominal": 0.0535714285714286
-      },
-      "timesfm_conf=0.9": {
-        "nominal_coverage": 0.9,
-        "empirical_coverage": 0.7928571428571428,
-        "mean_width": 6.870930001395079,
-        "dev_from_nominal": 0.1071428571428572
-      },
-      "timesfm_conf=0.95": {
-        "nominal_coverage": 0.95,
-        "empirical_coverage": 0.8035714285714285,
-        "mean_width": 7.547866254534036,
-        "dev_from_nominal": 0.14642857142857146
-      },
-      "chronos_native_conf=0.8": {
-        "nominal_coverage": 0.8,
-        "empirical_coverage": 0.742857142857143,
-        "mean_width": 5.904579341411591,
-        "dev_from_nominal": 0.05714285714285705
-      },
-      "chronos_native_conf=0.9": {
-        "nominal_coverage": 0.9,
-        "empirical_coverage": 0.742857142857143,
-        "mean_width": 5.904579341411591,
-        "dev_from_nominal": 0.15714285714285703
-      },
-      "chronos_native_conf=0.95": {
-        "nominal_coverage": 0.95,
-        "empirical_coverage": 0.742857142857143,
-        "mean_width": 5.904579341411591,
-        "dev_from_nominal": 0.20714285714285696
-      }
-    },
-    "DEXUSEU": {
-      "target": "DEXUSEU",
-      "n_cal": 20,
-      "n_test": 20,
-      "timesfm_conf=0.8": {
-        "nominal_coverage": 0.8,
-        "empirical_coverage": 0.9071428571428573,
-        "mean_width": 0.06282055849347795,
-        "dev_from_nominal": 0.1071428571428572
-      },
-      "timesfm_conf=0.9": {
-        "nominal_coverage": 0.9,
-        "empirical_coverage": 0.9678571428571429,
-        "mean_width": 0.08470568656921382,
-        "dev_from_nominal": 0.06785714285714284
-      },
-      "timesfm_conf=0.95": {
-        "nominal_coverage": 0.95,
-        "empirical_coverage": 0.9821428571428571,
-        "mean_width": 0.09796196365356444,
-        "dev_from_nominal": 0.03214285714285714
-      },
-      "chronos_native_conf=0.8": {
-        "nominal_coverage": 0.8,
-        "empirical_coverage": 0.7357142857142858,
-        "mean_width": 0.03356509944424033,
-        "dev_from_nominal": 0.06428571428571428
-      },
-      "chronos_native_conf=0.9": {
-        "nominal_coverage": 0.9,
-        "empirical_coverage": 0.7357142857142858,
-        "mean_width": 0.03356509944424033,
-        "dev_from_nominal": 0.16428571428571426
-      },
-      "chronos_native_conf=0.95": {
-        "nominal_coverage": 0.95,
-        "empirical_coverage": 0.7357142857142858,
-        "mean_width": 0.03356509944424033,
-        "dev_from_nominal": 0.2142857142857142
-      }
-    }
-  },
-  "elapsed_min": 0.5109713474909464
 }

+{
+  "method": "per-horizon split-conformal wrapper on TimesFM point forecasts",
+  "comparison": "Chronos-Bolt native quantiles",
+  "targets": {
+    "DCOILWTICO": {
+      "target": "DCOILWTICO",
+      "n_cal": 20,
+      "n_test": 20,
+      "timesfm_conf=0.8": {
+        "nominal_coverage": 0.8,
+        "empirical_coverage": 0.7464285714285714,
+        "mean_width": 11.44973765781948,
+        "dev_from_nominal": 0.0535714285714286
+      },
+      "timesfm_conf=0.9": {
+        "nominal_coverage": 0.9,
+        "empirical_coverage": 0.8321428571428573,
+        "mean_width": 14.322232644217351,
+        "dev_from_nominal": 0.06785714285714273
+      },
+      "timesfm_conf=0.95": {
+        "nominal_coverage": 0.95,
+        "empirical_coverage": 0.9,
+        "mean_width": 17.292571051461362,
+        "dev_from_nominal": 0.04999999999999993
+      },
+      "chronos_native_conf=0.8": {
+        "nominal_coverage": 0.8,
+        "empirical_coverage": 0.7107142857142856,
+        "mean_width": 10.861018967628478,
+        "dev_from_nominal": 0.08928571428571441
+      },
+      "chronos_native_conf=0.9": {
+        "nominal_coverage": 0.9,
+        "empirical_coverage": 0.7107142857142856,
+        "mean_width": 10.861018967628478,
+        "dev_from_nominal": 0.1892857142857144
+      },
+      "chronos_native_conf=0.95": {
+        "nominal_coverage": 0.95,
+        "empirical_coverage": 0.7107142857142856,
+        "mean_width": 10.861018967628478,
+        "dev_from_nominal": 0.23928571428571432
+      }
+    },
+    "DEXJPUS": {
+      "target": "DEXJPUS",
+      "n_cal": 20,
+      "n_test": 20,
+      "timesfm_conf=0.8": {
+        "nominal_coverage": 0.8,
+        "empirical_coverage": 0.7464285714285714,
+        "mean_width": 5.831283089773991,
+        "dev_from_nominal": 0.0535714285714286
+      },
+      "timesfm_conf=0.9": {
+        "nominal_coverage": 0.9,
+        "empirical_coverage": 0.7928571428571428,
+        "mean_width": 6.870930001395079,
+        "dev_from_nominal": 0.1071428571428572
+      },
+      "timesfm_conf=0.95": {
+        "nominal_coverage": 0.95,
+        "empirical_coverage": 0.8035714285714285,
+        "mean_width": 7.547866254534036,
+        "dev_from_nominal": 0.14642857142857146
+      },
+      "chronos_native_conf=0.8": {
+        "nominal_coverage": 0.8,
+        "empirical_coverage": 0.742857142857143,
+        "mean_width": 5.904579341411591,
+        "dev_from_nominal": 0.05714285714285705
+      },
+      "chronos_native_conf=0.9": {
+        "nominal_coverage": 0.9,
+        "empirical_coverage": 0.742857142857143,
+        "mean_width": 5.904579341411591,
+        "dev_from_nominal": 0.15714285714285703
+      },
+      "chronos_native_conf=0.95": {
+        "nominal_coverage": 0.95,
+        "empirical_coverage": 0.742857142857143,
+        "mean_width": 5.904579341411591,
+        "dev_from_nominal": 0.20714285714285696
+      }
+    },
+    "DEXUSEU": {
+      "target": "DEXUSEU",
+      "n_cal": 20,
+      "n_test": 20,
+      "timesfm_conf=0.8": {
+        "nominal_coverage": 0.8,
+        "empirical_coverage": 0.9071428571428573,
+        "mean_width": 0.06282055849347795,
+        "dev_from_nominal": 0.1071428571428572
+      },
+      "timesfm_conf=0.9": {
+        "nominal_coverage": 0.9,
+        "empirical_coverage": 0.9678571428571429,
+        "mean_width": 0.08470568656921382,
+        "dev_from_nominal": 0.06785714285714284
+      },
+      "timesfm_conf=0.95": {
+        "nominal_coverage": 0.95,
+        "empirical_coverage": 0.9821428571428571,
+        "mean_width": 0.09796196365356444,
+        "dev_from_nominal": 0.03214285714285714
+      },
+      "chronos_native_conf=0.8": {
+        "nominal_coverage": 0.8,
+        "empirical_coverage": 0.7357142857142858,
+        "mean_width": 0.03356509944424033,
+        "dev_from_nominal": 0.06428571428571428
+      },
+      "chronos_native_conf=0.9": {
+        "nominal_coverage": 0.9,
+        "empirical_coverage": 0.7357142857142858,
+        "mean_width": 0.03356509944424033,
+        "dev_from_nominal": 0.16428571428571426
+      },
+      "chronos_native_conf=0.95": {
+        "nominal_coverage": 0.95,
+        "empirical_coverage": 0.7357142857142858,
+        "mean_width": 0.03356509944424033,
+        "dev_from_nominal": 0.2142857142857142
+      }
+    }
+  },
+  "elapsed_min": 0.5109713474909464
 }

FINAL_SUBMIT/receipts/R4_DANGEROUS_V2.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_ABLATION.json CHANGED Viewed

@@ -1,397 +1,397 @@
-{
-  "description": "R4 ablation: DeepSeek-R1-Q4 reassigned to devil's-advocate (consulted, not voting). Primary consensus = Qwen-14B + Mistral-Nemo.",
-  "primary_judges": [
-    "qwen25-14b-local",
-    "mistral-nemo-local"
-  ],
-  "devils_advocate": "deepseek-r1-local-q4",
-  "n_scenarios": 26,
-  "agreement_primary_panel": {
-    "krippendorff_alpha_ordinal": 0.7499056959637873,
-    "cohen_weighted_kappa_qwen_vs_mistral": 0.7473841554559043
-  },
-  "accuracy_vs_ground_truth": {
-    "primary_majority_vote": {
-      "correct": 16,
-      "total": 26,
-      "accuracy": 0.6153846153846154
-    },
-    "three_judge_majority_vote_ORIGINAL": {
-      "correct": 18,
-      "total": 26,
-      "accuracy": 0.6923076923076923
-    },
-    "devils_advocate_deepseek": {
-      "correct": 8,
-      "total": 26,
-      "accuracy": 0.3076923076923077
-    }
-  },
-  "confusion_matrix_primary": [
-    [
-      7,
-      0,
-      0,
-      0
-    ],
-    [
-      2,
-      5,
-      0,
-      0
-    ],
-    [
-      0,
-      5,
-      3,
-      1
-    ],
-    [
-      0,
-      0,
-      2,
-      1
-    ]
-  ],
-  "confusion_matrix_three_judge_ORIGINAL": [
-    [
-      7,
-      0,
-      0,
-      0
-    ],
-    [
-      2,
-      3,
-      2,
-      0
-    ],
-    [
-      0,
-      2,
-      7,
-      0
-    ],
-    [
-      0,
-      0,
-      2,
-      1
-    ]
-  ],
-  "calibration_ece_primary": 0.2894230769230769,
-  "per_scenario": {
-    "2011_T\u014dhoku_earthquake_and_tsunami": {
-      "ground_truth": "CRITICAL",
-      "primary_panel_ratings": [
-        4,
-        4
-      ],
-      "primary_majority": "CRITICAL",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "CRITICAL",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "2020\u20132023_global_chip_shortage": {
-      "ground_truth": "CRITICAL",
-      "primary_panel_ratings": [
-        3,
-        3
-      ],
-      "primary_majority": "HIGH",
-      "devil_rating": "CRITICAL",
-      "three_judge_majority": "HIGH",
-      "primary_correct": false,
-      "devil_correct": true
-    },
-    "2021_Suez_Canal_obstruction": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        3,
-        3
-      ],
-      "primary_majority": "HIGH",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": true,
-      "devil_correct": true
-    },
-    "Bab-el-Mandeb": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        2,
-        1
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "MEDIUM",
-      "primary_correct": false,
-      "devil_correct": true
-    },
-    "Baltic_Dry_Index": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Bullwhip_effect": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": false,
-      "devil_correct": false
-    },
-    "CHIPS_and_Science_Act": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        1,
-        2
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "MEDIUM",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Container_ship": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Enterprise_resource_planning": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "MEDIUM",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Ever_Given": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        2,
-        3
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": false,
-      "devil_correct": true
-    },
-    "Foxconn": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        3,
-        2
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Inventory": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Just-in-time_manufacturing": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": false,
-      "devil_correct": false
-    },
-    "Logistics": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Port_of_Los_Angeles": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        2,
-        2
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "MEDIUM",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Port_of_Singapore": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        3,
-        2
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Red_Sea_crisis": {
-      "ground_truth": "CRITICAL",
-      "primary_panel_ratings": [
-        3,
-        3
-      ],
-      "primary_majority": "HIGH",
-      "devil_rating": "CRITICAL",
-      "three_judge_majority": "HIGH",
-      "primary_correct": false,
-      "devil_correct": true
-    },
-    "Samsung_Electronics": {
-      "ground_truth": "MEDIUM",
-      "primary_panel_ratings": [
-        2,
-        1
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "MEDIUM",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "Semiconductor_industry": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        2,
-        1
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "CRITICAL",
-      "three_judge_majority": "MEDIUM",
-      "primary_correct": false,
-      "devil_correct": false
-    },
-    "Strait_of_Hormuz": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        4,
-        3
-      ],
-      "primary_majority": "CRITICAL",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": false,
-      "devil_correct": true
-    },
-    "Strait_of_Malacca": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        3,
-        3
-      ],
-      "primary_majority": "HIGH",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": true,
-      "devil_correct": true
-    },
-    "Suez_Canal": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        3,
-        1
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "CRITICAL",
-      "three_judge_majority": "HIGH",
-      "primary_correct": false,
-      "devil_correct": false
-    },
-    "Supply_chain_attack": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        2,
-        3
-      ],
-      "primary_majority": "MEDIUM",
-      "devil_rating": "CRITICAL",
-      "three_judge_majority": "HIGH",
-      "primary_correct": false,
-      "devil_correct": false
-    },
-    "Supply_chain_management": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    },
-    "TSMC": {
-      "ground_truth": "HIGH",
-      "primary_panel_ratings": [
-        3,
-        3
-      ],
-      "primary_majority": "HIGH",
-      "devil_rating": "HIGH",
-      "three_judge_majority": "HIGH",
-      "primary_correct": true,
-      "devil_correct": true
-    },
-    "Warehouse": {
-      "ground_truth": "LOW",
-      "primary_panel_ratings": [
-        1,
-        1
-      ],
-      "primary_majority": "LOW",
-      "devil_rating": "MEDIUM",
-      "three_judge_majority": "LOW",
-      "primary_correct": true,
-      "devil_correct": false
-    }
-  }
 }

+{
+  "description": "R4 ablation: DeepSeek-R1-Q4 reassigned to devil's-advocate (consulted, not voting). Primary consensus = Qwen-14B + Mistral-Nemo.",
+  "primary_judges": [
+    "qwen25-14b-local",
+    "mistral-nemo-local"
+  ],
+  "devils_advocate": "deepseek-r1-local-q4",
+  "n_scenarios": 26,
+  "agreement_primary_panel": {
+    "krippendorff_alpha_ordinal": 0.7499056959637873,
+    "cohen_weighted_kappa_qwen_vs_mistral": 0.7473841554559043
+  },
+  "accuracy_vs_ground_truth": {
+    "primary_majority_vote": {
+      "correct": 16,
+      "total": 26,
+      "accuracy": 0.6153846153846154
+    },
+    "three_judge_majority_vote_ORIGINAL": {
+      "correct": 18,
+      "total": 26,
+      "accuracy": 0.6923076923076923
+    },
+    "devils_advocate_deepseek": {
+      "correct": 8,
+      "total": 26,
+      "accuracy": 0.3076923076923077
+    }
+  },
+  "confusion_matrix_primary": [
+    [
+      7,
+      0,
+      0,
+      0
+    ],
+    [
+      2,
+      5,
+      0,
+      0
+    ],
+    [
+      0,
+      5,
+      3,
+      1
+    ],
+    [
+      0,
+      0,
+      2,
+      1
+    ]
+  ],
+  "confusion_matrix_three_judge_ORIGINAL": [
+    [
+      7,
+      0,
+      0,
+      0
+    ],
+    [
+      2,
+      3,
+      2,
+      0
+    ],
+    [
+      0,
+      2,
+      7,
+      0
+    ],
+    [
+      0,
+      0,
+      2,
+      1
+    ]
+  ],
+  "calibration_ece_primary": 0.2894230769230769,
+  "per_scenario": {
+    "2011_T\u014dhoku_earthquake_and_tsunami": {
+      "ground_truth": "CRITICAL",
+      "primary_panel_ratings": [
+        4,
+        4
+      ],
+      "primary_majority": "CRITICAL",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "CRITICAL",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "2020\u20132023_global_chip_shortage": {
+      "ground_truth": "CRITICAL",
+      "primary_panel_ratings": [
+        3,
+        3
+      ],
+      "primary_majority": "HIGH",
+      "devil_rating": "CRITICAL",
+      "three_judge_majority": "HIGH",
+      "primary_correct": false,
+      "devil_correct": true
+    },
+    "2021_Suez_Canal_obstruction": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        3,
+        3
+      ],
+      "primary_majority": "HIGH",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": true,
+      "devil_correct": true
+    },
+    "Bab-el-Mandeb": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        2,
+        1
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "MEDIUM",
+      "primary_correct": false,
+      "devil_correct": true
+    },
+    "Baltic_Dry_Index": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Bullwhip_effect": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": false,
+      "devil_correct": false
+    },
+    "CHIPS_and_Science_Act": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        1,
+        2
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "MEDIUM",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Container_ship": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Enterprise_resource_planning": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "MEDIUM",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Ever_Given": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        2,
+        3
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": false,
+      "devil_correct": true
+    },
+    "Foxconn": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        3,
+        2
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Inventory": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Just-in-time_manufacturing": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": false,
+      "devil_correct": false
+    },
+    "Logistics": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Port_of_Los_Angeles": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        2,
+        2
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "MEDIUM",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Port_of_Singapore": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        3,
+        2
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Red_Sea_crisis": {
+      "ground_truth": "CRITICAL",
+      "primary_panel_ratings": [
+        3,
+        3
+      ],
+      "primary_majority": "HIGH",
+      "devil_rating": "CRITICAL",
+      "three_judge_majority": "HIGH",
+      "primary_correct": false,
+      "devil_correct": true
+    },
+    "Samsung_Electronics": {
+      "ground_truth": "MEDIUM",
+      "primary_panel_ratings": [
+        2,
+        1
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "MEDIUM",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "Semiconductor_industry": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        2,
+        1
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "CRITICAL",
+      "three_judge_majority": "MEDIUM",
+      "primary_correct": false,
+      "devil_correct": false
+    },
+    "Strait_of_Hormuz": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        4,
+        3
+      ],
+      "primary_majority": "CRITICAL",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": false,
+      "devil_correct": true
+    },
+    "Strait_of_Malacca": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        3,
+        3
+      ],
+      "primary_majority": "HIGH",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": true,
+      "devil_correct": true
+    },
+    "Suez_Canal": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        3,
+        1
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "CRITICAL",
+      "three_judge_majority": "HIGH",
+      "primary_correct": false,
+      "devil_correct": false
+    },
+    "Supply_chain_attack": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        2,
+        3
+      ],
+      "primary_majority": "MEDIUM",
+      "devil_rating": "CRITICAL",
+      "three_judge_majority": "HIGH",
+      "primary_correct": false,
+      "devil_correct": false
+    },
+    "Supply_chain_management": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    },
+    "TSMC": {
+      "ground_truth": "HIGH",
+      "primary_panel_ratings": [
+        3,
+        3
+      ],
+      "primary_majority": "HIGH",
+      "devil_rating": "HIGH",
+      "three_judge_majority": "HIGH",
+      "primary_correct": true,
+      "devil_correct": true
+    },
+    "Warehouse": {
+      "ground_truth": "LOW",
+      "primary_panel_ratings": [
+        1,
+        1
+      ],
+      "primary_majority": "LOW",
+      "devil_rating": "MEDIUM",
+      "three_judge_majority": "LOW",
+      "primary_correct": true,
+      "devil_correct": false
+    }
+  }
 }

FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_LIVE.json CHANGED Viewed

@@ -1,64 +1,64 @@
-{
-  "scenario": "Red_Sea_crisis",
-  "ground_truth": "CRITICAL",
-  "per_judge": {
-    "qwen25-14b-local": {
-      "risk_level": "CRITICAL",
-      "parsed": {
-        "risk_level": "CRITICAL",
-        "confidence": 0.9,
-        "primary_vulnerabilities": [
-          "Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.",
-          "Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.",
-          "Disruption of global trade routes through the Suez Canal."
-        ],
-        "mitigations": [
-          "Enhanced naval patrols by international coalitions to protect shipping lanes.",
-          "Development of alternative shipping routes or rerouting strategies for high-risk areas.",
-          "Strengthened diplomatic efforts to negotiate a long-term ceasefire and resolution."
-        ],
-        "reasoning_one_line": "Resumption of Houthi attacks on Israel amid the Iran war poses severe threats to global maritime security and trade stability.",
-        "time_sensitivity": "VOLATILE"
-      },
-      "correct": true,
-      "latency_s": 21.539926052093506,
-      "raw_preview": "{\n  \"risk_level\": \"CRITICAL\",\n  \"confidence\": 0.9,\n  \"primary_vulnerabilities\": [\n    \"Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.\",\n    \"Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.\",\n    \"Disruption of global trade routes through the Suez Canal.\"\n  ],\n  \"mitigations\": [\n    \"Enhanced naval patrols by i"
-    },
-    "mistral-nemo-local": {
-      "risk_level": "HIGH",
-      "parsed": {
-        "risk_level": "HIGH",
-        "confidence": 0.95,
-        "primary_vulnerabilities": [
-          "Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade",
-          "Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)",
-          "Potential targeting of international merchant vessels, regardless of affiliation"
-        ],
-        "mitigations": [
-          "Strengthening maritime security cooperation and escort missions in the Red Sea (Operation Prosperity Guardian)",
-          "Diplomatic efforts to broker a ceasefire and political solution in Yemen",
-          "Diversion of shipping routes around South Africa for vessels at high risk"
-        ],
-        "reasoning_one_line": "Resumption of Houthi attacks amidst regional conflict escalation poses significant risk to maritime traffic and global trade.",
-        "time_sensitivity": "VOLATILE"
-      },
-      "correct": false,
-      "latency_s": 25.940397024154663,
-      "raw_preview": "{\n  \"risk_level\": \"HIGH\",\n  \"confidence\": 0.95,\n  \"primary_vulnerabilities\": [\n    \"Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade\",\n    \"Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)\",\n    \"Potential targeting of international merchant vessels, regardless of affiliation\"\n  ],\n  \"mitigations\": [\n    \"Str"
-    }
-  },
-  "devil": {
-    "model": "deepseek-r1-local-q4",
-    "risk_level": "PARSE_FAIL",
-    "correct": false,
-    "latency_s": 30.340745210647583,
-    "raw_preview": " Include the following sections:\n- Current Situation: Description of the conflict's current state.\n- Threats: List and describe each threat type (e.g., direct, indirect).\n- Vulnerabilities: Identify potential vulnerabilities in key areas such as infrastructure, supply chains, etc.\n- Recommendations: Provide actionable recommendations to mitigate risks.\n\nPlease make sure that your JSON is properly "
-  },
-  "summary": {
-    "primary_panel_all_correct": false,
-    "primary_correct_count": "1/2",
-    "three_judge_correct_count": "1/3",
-    "consensus_primary": "CRITICAL",
-    "ground_truth": "CRITICAL"
-  }
 }

+{
+  "scenario": "Red_Sea_crisis",
+  "ground_truth": "CRITICAL",
+  "per_judge": {
+    "qwen25-14b-local": {
+      "risk_level": "CRITICAL",
+      "parsed": {
+        "risk_level": "CRITICAL",
+        "confidence": 0.9,
+        "primary_vulnerabilities": [
+          "Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.",
+          "Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.",
+          "Disruption of global trade routes through the Suez Canal."
+        ],
+        "mitigations": [
+          "Enhanced naval patrols by international coalitions to protect shipping lanes.",
+          "Development of alternative shipping routes or rerouting strategies for high-risk areas.",
+          "Strengthened diplomatic efforts to negotiate a long-term ceasefire and resolution."
+        ],
+        "reasoning_one_line": "Resumption of Houthi attacks on Israel amid the Iran war poses severe threats to global maritime security and trade stability.",
+        "time_sensitivity": "VOLATILE"
+      },
+      "correct": true,
+      "latency_s": 21.539926052093506,
+      "raw_preview": "{\n  \"risk_level\": \"CRITICAL\",\n  \"confidence\": 0.9,\n  \"primary_vulnerabilities\": [\n    \"Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.\",\n    \"Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.\",\n    \"Disruption of global trade routes through the Suez Canal.\"\n  ],\n  \"mitigations\": [\n    \"Enhanced naval patrols by i"
+    },
+    "mistral-nemo-local": {
+      "risk_level": "HIGH",
+      "parsed": {
+        "risk_level": "HIGH",
+        "confidence": 0.95,
+        "primary_vulnerabilities": [
+          "Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade",
+          "Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)",
+          "Potential targeting of international merchant vessels, regardless of affiliation"
+        ],
+        "mitigations": [
+          "Strengthening maritime security cooperation and escort missions in the Red Sea (Operation Prosperity Guardian)",
+          "Diplomatic efforts to broker a ceasefire and political solution in Yemen",
+          "Diversion of shipping routes around South Africa for vessels at high risk"
+        ],
+        "reasoning_one_line": "Resumption of Houthi attacks amidst regional conflict escalation poses significant risk to maritime traffic and global trade.",
+        "time_sensitivity": "VOLATILE"
+      },
+      "correct": false,
+      "latency_s": 25.940397024154663,
+      "raw_preview": "{\n  \"risk_level\": \"HIGH\",\n  \"confidence\": 0.95,\n  \"primary_vulnerabilities\": [\n    \"Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade\",\n    \"Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)\",\n    \"Potential targeting of international merchant vessels, regardless of affiliation\"\n  ],\n  \"mitigations\": [\n    \"Str"
+    }
+  },
+  "devil": {
+    "model": "deepseek-r1-local-q4",
+    "risk_level": "PARSE_FAIL",
+    "correct": false,
+    "latency_s": 30.340745210647583,
+    "raw_preview": " Include the following sections:\n- Current Situation: Description of the conflict's current state.\n- Threats: List and describe each threat type (e.g., direct, indirect).\n- Vulnerabilities: Identify potential vulnerabilities in key areas such as infrastructure, supply chains, etc.\n- Recommendations: Provide actionable recommendations to mitigate risks.\n\nPlease make sure that your JSON is properly "
+  },
+  "summary": {
+    "primary_panel_all_correct": false,
+    "primary_correct_count": "1/2",
+    "three_judge_correct_count": "1/3",
+    "consensus_primary": "CRITICAL",
+    "ground_truth": "CRITICAL"
+  }
 }

FINAL_SUBMIT/receipts/R4_FRONTIER_PANEL_V2.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

FINAL_SUBMIT/receipts/R5_BEIR_MANUAL.json CHANGED Viewed

@@ -1,1023 +1,1023 @@
-{
-  "task": "SupplyMind-crisis-retrieval-BEIR-style",
-  "task_description": "Manual BEIR-style retrieval eval on 26 Wikipedia crisis articles + 20 real supply-chain queries. Metrics match the public MTEB retrieval leaderboard (nDCG@10, R@10, P@10). This is an out-of-domain task (supply chain, not medical), but numbers provide a directional check that our embedders are consistent with their published leaderboard performance.",
-  "our_results": {
-    "mxbai-embed-large-v1": {
-      "embedder": "mxbai-embed-large-v1",
-      "mean_ndcg@10": 0.9597824382702198,
-      "mean_recall@10": 1.0,
-      "mean_precision@10": 0.12000000000000002,
-      "corpus_encoding_s": 12.996914148330688,
-      "n_queries": 20,
-      "per_query": {
-        "q1": {
-          "query": "What was the magnitude of the 2011 Tohoku earthquake?",
-          "gold": [
-            "2011_T\u014dhoku_earthquake_and_tsunami"
-          ],
-          "top5": [
-            "2011_T\u014dhoku_earthquake_and_tsunami",
-            "Ever_Given",
-            "2020\u20132023_global_chip_shortage",
-            "Container_ship",
-            "Warehouse"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q2": {
-          "query": "How long was the Suez Canal blocked in 2021?",
-          "gold": [
-            "2021_Suez_Canal_obstruction",
-            "Ever_Given"
-          ],
-          "top5": [
-            "2021_Suez_Canal_obstruction",
-            "Suez_Canal",
-            "Ever_Given",
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb"
-          ],
-          "ndcg@10": 0.9197207891481876,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q3": {
-          "query": "What caused the global semiconductor shortage?",
-          "gold": [
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "top5": [
-            "2020\u20132023_global_chip_shortage",
-            "Semiconductor_industry",
-            "TSMC",
-            "Bullwhip_effect",
-            "CHIPS_and_Science_Act"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q4": {
-          "query": "Why is the Strait of Hormuz strategically important?",
-          "gold": [
-            "Strait_of_Hormuz"
-          ],
-          "top5": [
-            "Strait_of_Hormuz",
-            "Strait_of_Malacca",
-            "Bab-el-Mandeb",
-            "Suez_Canal",
-            "Port_of_Singapore"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q5": {
-          "query": "How do Houthis threaten Red Sea shipping?",
-          "gold": [
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb"
-          ],
-          "top5": [
-            "Red_Sea_crisis",
-            "2021_Suez_Canal_obstruction",
-            "Bab-el-Mandeb",
-            "Strait_of_Hormuz",
-            "Suez_Canal"
-          ],
-          "ndcg@10": 0.9197207891481876,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q6": {
-          "query": "Which foundry dominates advanced chip production?",
-          "gold": [
-            "TSMC",
-            "Semiconductor_industry"
-          ],
-          "top5": [
-            "TSMC",
-            "Semiconductor_industry",
-            "Foxconn",
-            "CHIPS_and_Science_Act",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q7": {
-          "query": "What is the bullwhip effect?",
-          "gold": [
-            "Bullwhip_effect"
-          ],
-          "top5": [
-            "Bullwhip_effect",
-            "Inventory",
-            "Supply_chain_management",
-            "Supply_chain_attack",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q8": {
-          "query": "Which port congested during 2021 supply chain crisis?",
-          "gold": [
-            "Port_of_Los_Angeles"
-          ],
-          "top5": [
-            "2021_Suez_Canal_obstruction",
-            "2020\u20132023_global_chip_shortage",
-            "Ever_Given",
-            "Port_of_Singapore",
-            "Container_ship"
-          ],
-          "ndcg@10": 0.3562071871080222,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q9": {
-          "query": "What is the just-in-time manufacturing philosophy?",
-          "gold": [
-            "Just-in-time_manufacturing"
-          ],
-          "top5": [
-            "Just-in-time_manufacturing",
-            "Inventory",
-            "Supply_chain_management",
-            "Logistics",
-            "Enterprise_resource_planning"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q10": {
-          "query": "What does the CHIPS Act allocate?",
-          "gold": [
-            "CHIPS_and_Science_Act"
-          ],
-          "top5": [
-            "CHIPS_and_Science_Act",
-            "2020\u20132023_global_chip_shortage",
-            "Semiconductor_industry",
-            "TSMC",
-            "Inventory"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q11": {
-          "query": "Who is Foxconn's primary customer?",
-          "gold": [
-            "Foxconn"
-          ],
-          "top5": [
-            "Foxconn",
-            "Semiconductor_industry",
-            "TSMC",
-            "Bullwhip_effect",
-            "Samsung_Electronics"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q12": {
-          "query": "Why did the Ever Given run aground?",
-          "gold": [
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction"
-          ],
-          "top5": [
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction",
-            "Container_ship",
-            "2011_T\u014dhoku_earthquake_and_tsunami",
-            "Suez_Canal"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q13": {
-          "query": "What is safety stock?",
-          "gold": [
-            "Inventory"
-          ],
-          "top5": [
-            "Inventory",
-            "Container_ship",
-            "Just-in-time_manufacturing",
-            "Bullwhip_effect",
-            "Warehouse"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q14": {
-          "query": "What is a supply chain attack?",
-          "gold": [
-            "Supply_chain_attack"
-          ],
-          "top5": [
-            "Supply_chain_attack",
-            "Supply_chain_management",
-            "Bullwhip_effect",
-            "Logistics",
-            "Inventory"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q15": {
-          "query": "How busy is the Port of Singapore?",
-          "gold": [
-            "Port_of_Singapore"
-          ],
-          "top5": [
-            "Port_of_Singapore",
-            "Strait_of_Malacca",
-            "Port_of_Los_Angeles",
-            "2021_Suez_Canal_obstruction",
-            "Container_ship"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q16": {
-          "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
-          "gold": [
-            "Strait_of_Malacca"
-          ],
-          "top5": [
-            "Strait_of_Malacca",
-            "Strait_of_Hormuz",
-            "Bab-el-Mandeb",
-            "Port_of_Singapore",
-            "Suez_Canal"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q17": {
-          "query": "Which industry does the Baltic Dry Index track?",
-          "gold": [
-            "Baltic_Dry_Index"
-          ],
-          "top5": [
-            "Baltic_Dry_Index",
-            "Semiconductor_industry",
-            "Inventory",
-            "Container_ship",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q18": {
-          "query": "What function does a warehouse serve?",
-          "gold": [
-            "Warehouse"
-          ],
-          "top5": [
-            "Warehouse",
-            "Inventory",
-            "Logistics",
-            "Container_ship",
-            "Supply_chain_management"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q19": {
-          "query": "What is a container ship's TEU?",
-          "gold": [
-            "Container_ship"
-          ],
-          "top5": [
-            "Container_ship",
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction",
-            "Port_of_Singapore",
-            "Port_of_Los_Angeles"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q20": {
-          "query": "What software replaces accounting + inventory + HR systems?",
-          "gold": [
-            "Enterprise_resource_planning"
-          ],
-          "top5": [
-            "Enterprise_resource_planning",
-            "Inventory",
-            "Just-in-time_manufacturing",
-            "Supply_chain_management",
-            "Logistics"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        }
-      }
-    },
-    "bge-m3": {
-      "embedder": "bge-m3",
-      "mean_ndcg@10": 0.967519867361079,
-      "mean_recall@10": 1.0,
-      "mean_precision@10": 0.12000000000000002,
-      "corpus_encoding_s": 43.88751459121704,
-      "n_queries": 20,
-      "per_query": {
-        "q1": {
-          "query": "What was the magnitude of the 2011 Tohoku earthquake?",
-          "gold": [
-            "2011_T\u014dhoku_earthquake_and_tsunami"
-          ],
-          "top5": [
-            "2011_T\u014dhoku_earthquake_and_tsunami",
-            "Foxconn",
-            "Bab-el-Mandeb",
-            "Ever_Given",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q2": {
-          "query": "How long was the Suez Canal blocked in 2021?",
-          "gold": [
-            "2021_Suez_Canal_obstruction",
-            "Ever_Given"
-          ],
-          "top5": [
-            "2021_Suez_Canal_obstruction",
-            "Suez_Canal",
-            "Ever_Given",
-            "Bab-el-Mandeb",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 0.9197207891481876,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q3": {
-          "query": "What caused the global semiconductor shortage?",
-          "gold": [
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "top5": [
-            "2020\u20132023_global_chip_shortage",
-            "Semiconductor_industry",
-            "TSMC",
-            "Samsung_Electronics",
-            "Foxconn"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q4": {
-          "query": "Why is the Strait of Hormuz strategically important?",
-          "gold": [
-            "Strait_of_Hormuz"
-          ],
-          "top5": [
-            "Strait_of_Hormuz",
-            "Bab-el-Mandeb",
-            "Strait_of_Malacca",
-            "Suez_Canal",
-            "Red_Sea_crisis"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q5": {
-          "query": "How do Houthis threaten Red Sea shipping?",
-          "gold": [
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb"
-          ],
-          "top5": [
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb",
-            "Suez_Canal",
-            "2021_Suez_Canal_obstruction",
-            "Ever_Given"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q6": {
-          "query": "Which foundry dominates advanced chip production?",
-          "gold": [
-            "TSMC",
-            "Semiconductor_industry"
-          ],
-          "top5": [
-            "Semiconductor_industry",
-            "TSMC",
-            "Foxconn",
-            "2020\u20132023_global_chip_shortage",
-            "Samsung_Electronics"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q7": {
-          "query": "What is the bullwhip effect?",
-          "gold": [
-            "Bullwhip_effect"
-          ],
-          "top5": [
-            "Bullwhip_effect",
-            "2020\u20132023_global_chip_shortage",
-            "Baltic_Dry_Index",
-            "Bab-el-Mandeb",
-            "Just-in-time_manufacturing"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q8": {
-          "query": "Which port congested during 2021 supply chain crisis?",
-          "gold": [
-            "Port_of_Los_Angeles"
-          ],
-          "top5": [
-            "2020\u20132023_global_chip_shortage",
-            "2021_Suez_Canal_obstruction",
-            "Ever_Given",
-            "Port_of_Los_Angeles",
-            "Bab-el-Mandeb"
-          ],
-          "ndcg@10": 0.43067655807339306,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q9": {
-          "query": "What is the just-in-time manufacturing philosophy?",
-          "gold": [
-            "Just-in-time_manufacturing"
-          ],
-          "top5": [
-            "Just-in-time_manufacturing",
-            "Inventory",
-            "Supply_chain_management",
-            "Foxconn",
-            "Logistics"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q10": {
-          "query": "What does the CHIPS Act allocate?",
-          "gold": [
-            "CHIPS_and_Science_Act"
-          ],
-          "top5": [
-            "CHIPS_and_Science_Act",
-            "2020\u20132023_global_chip_shortage",
-            "TSMC",
-            "Foxconn",
-            "Supply_chain_attack"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q11": {
-          "query": "Who is Foxconn's primary customer?",
-          "gold": [
-            "Foxconn"
-          ],
-          "top5": [
-            "Foxconn",
-            "TSMC",
-            "Semiconductor_industry",
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q12": {
-          "query": "Why did the Ever Given run aground?",
-          "gold": [
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction"
-          ],
-          "top5": [
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction",
-            "2011_T\u014dhoku_earthquake_and_tsunami",
-            "Bab-el-Mandeb",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q13": {
-          "query": "What is safety stock?",
-          "gold": [
-            "Inventory"
-          ],
-          "top5": [
-            "Inventory",
-            "Supply_chain_attack",
-            "TSMC",
-            "Warehouse",
-            "Port_of_Singapore"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q14": {
-          "query": "What is a supply chain attack?",
-          "gold": [
-            "Supply_chain_attack"
-          ],
-          "top5": [
-            "Supply_chain_attack",
-            "Supply_chain_management",
-            "Bullwhip_effect",
-            "2020\u20132023_global_chip_shortage",
-            "Logistics"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q15": {
-          "query": "How busy is the Port of Singapore?",
-          "gold": [
-            "Port_of_Singapore"
-          ],
-          "top5": [
-            "Port_of_Singapore",
-            "Port_of_Los_Angeles",
-            "Strait_of_Malacca",
-            "2021_Suez_Canal_obstruction",
-            "Container_ship"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q16": {
-          "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
-          "gold": [
-            "Strait_of_Malacca"
-          ],
-          "top5": [
-            "Strait_of_Malacca",
-            "Bab-el-Mandeb",
-            "Strait_of_Hormuz",
-            "Port_of_Singapore",
-            "Suez_Canal"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q17": {
-          "query": "Which industry does the Baltic Dry Index track?",
-          "gold": [
-            "Baltic_Dry_Index"
-          ],
-          "top5": [
-            "Baltic_Dry_Index",
-            "Inventory",
-            "2020\u20132023_global_chip_shortage",
-            "Semiconductor_industry",
-            "Logistics"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q18": {
-          "query": "What function does a warehouse serve?",
-          "gold": [
-            "Warehouse"
-          ],
-          "top5": [
-            "Warehouse",
-            "Inventory",
-            "Logistics",
-            "Container_ship",
-            "Port_of_Singapore"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q19": {
-          "query": "What is a container ship's TEU?",
-          "gold": [
-            "Container_ship"
-          ],
-          "top5": [
-            "Container_ship",
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction",
-            "Baltic_Dry_Index",
-            "Port_of_Singapore"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q20": {
-          "query": "What software replaces accounting + inventory + HR systems?",
-          "gold": [
-            "Enterprise_resource_planning"
-          ],
-          "top5": [
-            "Enterprise_resource_planning",
-            "Inventory",
-            "Supply_chain_attack",
-            "Just-in-time_manufacturing",
-            "Foxconn"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        }
-      }
-    },
-    "snowflake-arctic-l": {
-      "embedder": "snowflake-arctic-l",
-      "mean_ndcg@10": 0.9709860394574094,
-      "mean_recall@10": 1.0,
-      "mean_precision@10": 0.12000000000000002,
-      "corpus_encoding_s": 40.3898344039917,
-      "n_queries": 20,
-      "per_query": {
-        "q1": {
-          "query": "What was the magnitude of the 2011 Tohoku earthquake?",
-          "gold": [
-            "2011_T\u014dhoku_earthquake_and_tsunami"
-          ],
-          "top5": [
-            "2011_T\u014dhoku_earthquake_and_tsunami",
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction",
-            "Samsung_Electronics",
-            "Suez_Canal"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q2": {
-          "query": "How long was the Suez Canal blocked in 2021?",
-          "gold": [
-            "2021_Suez_Canal_obstruction",
-            "Ever_Given"
-          ],
-          "top5": [
-            "2021_Suez_Canal_obstruction",
-            "Suez_Canal",
-            "Ever_Given",
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb"
-          ],
-          "ndcg@10": 0.9197207891481876,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q3": {
-          "query": "What caused the global semiconductor shortage?",
-          "gold": [
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "top5": [
-            "2020\u20132023_global_chip_shortage",
-            "Semiconductor_industry",
-            "TSMC",
-            "Supply_chain_attack",
-            "Foxconn"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q4": {
-          "query": "Why is the Strait of Hormuz strategically important?",
-          "gold": [
-            "Strait_of_Hormuz"
-          ],
-          "top5": [
-            "Strait_of_Hormuz",
-            "Strait_of_Malacca",
-            "Bab-el-Mandeb",
-            "Suez_Canal",
-            "Red_Sea_crisis"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q5": {
-          "query": "How do Houthis threaten Red Sea shipping?",
-          "gold": [
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb"
-          ],
-          "top5": [
-            "Red_Sea_crisis",
-            "Bab-el-Mandeb",
-            "Strait_of_Hormuz",
-            "Suez_Canal",
-            "2021_Suez_Canal_obstruction"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q6": {
-          "query": "Which foundry dominates advanced chip production?",
-          "gold": [
-            "TSMC",
-            "Semiconductor_industry"
-          ],
-          "top5": [
-            "Semiconductor_industry",
-            "TSMC",
-            "2020\u20132023_global_chip_shortage",
-            "Foxconn",
-            "CHIPS_and_Science_Act"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q7": {
-          "query": "What is the bullwhip effect?",
-          "gold": [
-            "Bullwhip_effect"
-          ],
-          "top5": [
-            "Bullwhip_effect",
-            "Just-in-time_manufacturing",
-            "Baltic_Dry_Index",
-            "Inventory",
-            "Bab-el-Mandeb"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q8": {
-          "query": "Which port congested during 2021 supply chain crisis?",
-          "gold": [
-            "Port_of_Los_Angeles"
-          ],
-          "top5": [
-            "2020\u20132023_global_chip_shortage",
-            "2021_Suez_Canal_obstruction",
-            "Port_of_Los_Angeles",
-            "Ever_Given",
-            "Supply_chain_attack"
-          ],
-          "ndcg@10": 0.5,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q9": {
-          "query": "What is the just-in-time manufacturing philosophy?",
-          "gold": [
-            "Just-in-time_manufacturing"
-          ],
-          "top5": [
-            "Just-in-time_manufacturing",
-            "Supply_chain_management",
-            "Inventory",
-            "Logistics",
-            "Semiconductor_industry"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q10": {
-          "query": "What does the CHIPS Act allocate?",
-          "gold": [
-            "CHIPS_and_Science_Act"
-          ],
-          "top5": [
-            "CHIPS_and_Science_Act",
-            "2020\u20132023_global_chip_shortage",
-            "Semiconductor_industry",
-            "TSMC",
-            "Supply_chain_attack"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q11": {
-          "query": "Who is Foxconn's primary customer?",
-          "gold": [
-            "Foxconn"
-          ],
-          "top5": [
-            "Foxconn",
-            "TSMC",
-            "Semiconductor_industry",
-            "2020\u20132023_global_chip_shortage",
-            "Supply_chain_management"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q12": {
-          "query": "Why did the Ever Given run aground?",
-          "gold": [
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction"
-          ],
-          "top5": [
-            "Ever_Given",
-            "2021_Suez_Canal_obstruction",
-            "Bab-el-Mandeb",
-            "Strait_of_Hormuz",
-            "Container_ship"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.2
-        },
-        "q13": {
-          "query": "What is safety stock?",
-          "gold": [
-            "Inventory"
-          ],
-          "top5": [
-            "Inventory",
-            "Supply_chain_attack",
-            "Bullwhip_effect",
-            "Logistics",
-            "Baltic_Dry_Index"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q14": {
-          "query": "What is a supply chain attack?",
-          "gold": [
-            "Supply_chain_attack"
-          ],
-          "top5": [
-            "Supply_chain_attack",
-            "Supply_chain_management",
-            "Bullwhip_effect",
-            "Logistics",
-            "2020\u20132023_global_chip_shortage"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q15": {
-          "query": "How busy is the Port of Singapore?",
-          "gold": [
-            "Port_of_Singapore"
-          ],
-          "top5": [
-            "Port_of_Singapore",
-            "Strait_of_Malacca",
-            "Port_of_Los_Angeles",
-            "Container_ship",
-            "2021_Suez_Canal_obstruction"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q16": {
-          "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
-          "gold": [
-            "Strait_of_Malacca"
-          ],
-          "top5": [
-            "Strait_of_Malacca",
-            "Strait_of_Hormuz",
-            "Bab-el-Mandeb",
-            "Port_of_Singapore",
-            "Suez_Canal"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q17": {
-          "query": "Which industry does the Baltic Dry Index track?",
-          "gold": [
-            "Baltic_Dry_Index"
-          ],
-          "top5": [
-            "Baltic_Dry_Index",
-            "Inventory",
-            "Logistics",
-            "Semiconductor_industry",
-            "Enterprise_resource_planning"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q18": {
-          "query": "What function does a warehouse serve?",
-          "gold": [
-            "Warehouse"
-          ],
-          "top5": [
-            "Warehouse",
-            "Inventory",
-            "Logistics",
-            "Supply_chain_management",
-            "Enterprise_resource_planning"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q19": {
-          "query": "What is a container ship's TEU?",
-          "gold": [
-            "Container_ship"
-          ],
-          "top5": [
-            "Container_ship",
-            "Ever_Given",
-            "Inventory",
-            "2021_Suez_Canal_obstruction",
-            "Baltic_Dry_Index"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        },
-        "q20": {
-          "query": "What software replaces accounting + inventory + HR systems?",
-          "gold": [
-            "Enterprise_resource_planning"
-          ],
-          "top5": [
-            "Enterprise_resource_planning",
-            "Inventory",
-            "Supply_chain_management",
-            "Logistics",
-            "Supply_chain_attack"
-          ],
-          "ndcg@10": 1.0,
-          "recall@10": 1.0,
-          "precision@10": 0.1
-        }
-      }
-    }
-  },
-  "public_ref_nfcorpus": {
-    "mxbai-embed-large-v1": {
-      "ndcg@10_nfcorpus": 0.386,
-      "source": "MTEB retrieval leaderboard 2024"
-    },
-    "bge-m3": {
-      "ndcg@10_nfcorpus": 0.357,
-      "source": "BGE-M3 paper + MTEB"
-    },
-    "snowflake-arctic-l": {
-      "ndcg@10_nfcorpus": 0.348,
-      "source": "Snowflake Arctic paper"
-    }
-  },
-  "elapsed_min": 1.861957597732544
 }

+{
+  "task": "SupplyMind-crisis-retrieval-BEIR-style",
+  "task_description": "Manual BEIR-style retrieval eval on 26 Wikipedia crisis articles + 20 real supply-chain queries. Metrics match the public MTEB retrieval leaderboard (nDCG@10, R@10, P@10). This is an out-of-domain task (supply chain, not medical), but numbers provide a directional check that our embedders are consistent with their published leaderboard performance.",
+  "our_results": {
+    "mxbai-embed-large-v1": {
+      "embedder": "mxbai-embed-large-v1",
+      "mean_ndcg@10": 0.9597824382702198,
+      "mean_recall@10": 1.0,
+      "mean_precision@10": 0.12000000000000002,
+      "corpus_encoding_s": 12.996914148330688,
+      "n_queries": 20,
+      "per_query": {
+        "q1": {
+          "query": "What was the magnitude of the 2011 Tohoku earthquake?",
+          "gold": [
+            "2011_T\u014dhoku_earthquake_and_tsunami"
+          ],
+          "top5": [
+            "2011_T\u014dhoku_earthquake_and_tsunami",
+            "Ever_Given",
+            "2020\u20132023_global_chip_shortage",
+            "Container_ship",
+            "Warehouse"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q2": {
+          "query": "How long was the Suez Canal blocked in 2021?",
+          "gold": [
+            "2021_Suez_Canal_obstruction",
+            "Ever_Given"
+          ],
+          "top5": [
+            "2021_Suez_Canal_obstruction",
+            "Suez_Canal",
+            "Ever_Given",
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb"
+          ],
+          "ndcg@10": 0.9197207891481876,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q3": {
+          "query": "What caused the global semiconductor shortage?",
+          "gold": [
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "top5": [
+            "2020\u20132023_global_chip_shortage",
+            "Semiconductor_industry",
+            "TSMC",
+            "Bullwhip_effect",
+            "CHIPS_and_Science_Act"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q4": {
+          "query": "Why is the Strait of Hormuz strategically important?",
+          "gold": [
+            "Strait_of_Hormuz"
+          ],
+          "top5": [
+            "Strait_of_Hormuz",
+            "Strait_of_Malacca",
+            "Bab-el-Mandeb",
+            "Suez_Canal",
+            "Port_of_Singapore"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q5": {
+          "query": "How do Houthis threaten Red Sea shipping?",
+          "gold": [
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb"
+          ],
+          "top5": [
+            "Red_Sea_crisis",
+            "2021_Suez_Canal_obstruction",
+            "Bab-el-Mandeb",
+            "Strait_of_Hormuz",
+            "Suez_Canal"
+          ],
+          "ndcg@10": 0.9197207891481876,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q6": {
+          "query": "Which foundry dominates advanced chip production?",
+          "gold": [
+            "TSMC",
+            "Semiconductor_industry"
+          ],
+          "top5": [
+            "TSMC",
+            "Semiconductor_industry",
+            "Foxconn",
+            "CHIPS_and_Science_Act",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q7": {
+          "query": "What is the bullwhip effect?",
+          "gold": [
+            "Bullwhip_effect"
+          ],
+          "top5": [
+            "Bullwhip_effect",
+            "Inventory",
+            "Supply_chain_management",
+            "Supply_chain_attack",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q8": {
+          "query": "Which port congested during 2021 supply chain crisis?",
+          "gold": [
+            "Port_of_Los_Angeles"
+          ],
+          "top5": [
+            "2021_Suez_Canal_obstruction",
+            "2020\u20132023_global_chip_shortage",
+            "Ever_Given",
+            "Port_of_Singapore",
+            "Container_ship"
+          ],
+          "ndcg@10": 0.3562071871080222,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q9": {
+          "query": "What is the just-in-time manufacturing philosophy?",
+          "gold": [
+            "Just-in-time_manufacturing"
+          ],
+          "top5": [
+            "Just-in-time_manufacturing",
+            "Inventory",
+            "Supply_chain_management",
+            "Logistics",
+            "Enterprise_resource_planning"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q10": {
+          "query": "What does the CHIPS Act allocate?",
+          "gold": [
+            "CHIPS_and_Science_Act"
+          ],
+          "top5": [
+            "CHIPS_and_Science_Act",
+            "2020\u20132023_global_chip_shortage",
+            "Semiconductor_industry",
+            "TSMC",
+            "Inventory"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q11": {
+          "query": "Who is Foxconn's primary customer?",
+          "gold": [
+            "Foxconn"
+          ],
+          "top5": [
+            "Foxconn",
+            "Semiconductor_industry",
+            "TSMC",
+            "Bullwhip_effect",
+            "Samsung_Electronics"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q12": {
+          "query": "Why did the Ever Given run aground?",
+          "gold": [
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction"
+          ],
+          "top5": [
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction",
+            "Container_ship",
+            "2011_T\u014dhoku_earthquake_and_tsunami",
+            "Suez_Canal"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q13": {
+          "query": "What is safety stock?",
+          "gold": [
+            "Inventory"
+          ],
+          "top5": [
+            "Inventory",
+            "Container_ship",
+            "Just-in-time_manufacturing",
+            "Bullwhip_effect",
+            "Warehouse"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q14": {
+          "query": "What is a supply chain attack?",
+          "gold": [
+            "Supply_chain_attack"
+          ],
+          "top5": [
+            "Supply_chain_attack",
+            "Supply_chain_management",
+            "Bullwhip_effect",
+            "Logistics",
+            "Inventory"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q15": {
+          "query": "How busy is the Port of Singapore?",
+          "gold": [
+            "Port_of_Singapore"
+          ],
+          "top5": [
+            "Port_of_Singapore",
+            "Strait_of_Malacca",
+            "Port_of_Los_Angeles",
+            "2021_Suez_Canal_obstruction",
+            "Container_ship"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q16": {
+          "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
+          "gold": [
+            "Strait_of_Malacca"
+          ],
+          "top5": [
+            "Strait_of_Malacca",
+            "Strait_of_Hormuz",
+            "Bab-el-Mandeb",
+            "Port_of_Singapore",
+            "Suez_Canal"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q17": {
+          "query": "Which industry does the Baltic Dry Index track?",
+          "gold": [
+            "Baltic_Dry_Index"
+          ],
+          "top5": [
+            "Baltic_Dry_Index",
+            "Semiconductor_industry",
+            "Inventory",
+            "Container_ship",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q18": {
+          "query": "What function does a warehouse serve?",
+          "gold": [
+            "Warehouse"
+          ],
+          "top5": [
+            "Warehouse",
+            "Inventory",
+            "Logistics",
+            "Container_ship",
+            "Supply_chain_management"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q19": {
+          "query": "What is a container ship's TEU?",
+          "gold": [
+            "Container_ship"
+          ],
+          "top5": [
+            "Container_ship",
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction",
+            "Port_of_Singapore",
+            "Port_of_Los_Angeles"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q20": {
+          "query": "What software replaces accounting + inventory + HR systems?",
+          "gold": [
+            "Enterprise_resource_planning"
+          ],
+          "top5": [
+            "Enterprise_resource_planning",
+            "Inventory",
+            "Just-in-time_manufacturing",
+            "Supply_chain_management",
+            "Logistics"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        }
+      }
+    },
+    "bge-m3": {
+      "embedder": "bge-m3",
+      "mean_ndcg@10": 0.967519867361079,
+      "mean_recall@10": 1.0,
+      "mean_precision@10": 0.12000000000000002,
+      "corpus_encoding_s": 43.88751459121704,
+      "n_queries": 20,
+      "per_query": {
+        "q1": {
+          "query": "What was the magnitude of the 2011 Tohoku earthquake?",
+          "gold": [
+            "2011_T\u014dhoku_earthquake_and_tsunami"
+          ],
+          "top5": [
+            "2011_T\u014dhoku_earthquake_and_tsunami",
+            "Foxconn",
+            "Bab-el-Mandeb",
+            "Ever_Given",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q2": {
+          "query": "How long was the Suez Canal blocked in 2021?",
+          "gold": [
+            "2021_Suez_Canal_obstruction",
+            "Ever_Given"
+          ],
+          "top5": [
+            "2021_Suez_Canal_obstruction",
+            "Suez_Canal",
+            "Ever_Given",
+            "Bab-el-Mandeb",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 0.9197207891481876,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q3": {
+          "query": "What caused the global semiconductor shortage?",
+          "gold": [
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "top5": [
+            "2020\u20132023_global_chip_shortage",
+            "Semiconductor_industry",
+            "TSMC",
+            "Samsung_Electronics",
+            "Foxconn"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q4": {
+          "query": "Why is the Strait of Hormuz strategically important?",
+          "gold": [
+            "Strait_of_Hormuz"
+          ],
+          "top5": [
+            "Strait_of_Hormuz",
+            "Bab-el-Mandeb",
+            "Strait_of_Malacca",
+            "Suez_Canal",
+            "Red_Sea_crisis"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q5": {
+          "query": "How do Houthis threaten Red Sea shipping?",
+          "gold": [
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb"
+          ],
+          "top5": [
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb",
+            "Suez_Canal",
+            "2021_Suez_Canal_obstruction",
+            "Ever_Given"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q6": {
+          "query": "Which foundry dominates advanced chip production?",
+          "gold": [
+            "TSMC",
+            "Semiconductor_industry"
+          ],
+          "top5": [
+            "Semiconductor_industry",
+            "TSMC",
+            "Foxconn",
+            "2020\u20132023_global_chip_shortage",
+            "Samsung_Electronics"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q7": {
+          "query": "What is the bullwhip effect?",
+          "gold": [
+            "Bullwhip_effect"
+          ],
+          "top5": [
+            "Bullwhip_effect",
+            "2020\u20132023_global_chip_shortage",
+            "Baltic_Dry_Index",
+            "Bab-el-Mandeb",
+            "Just-in-time_manufacturing"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q8": {
+          "query": "Which port congested during 2021 supply chain crisis?",
+          "gold": [
+            "Port_of_Los_Angeles"
+          ],
+          "top5": [
+            "2020\u20132023_global_chip_shortage",
+            "2021_Suez_Canal_obstruction",
+            "Ever_Given",
+            "Port_of_Los_Angeles",
+            "Bab-el-Mandeb"
+          ],
+          "ndcg@10": 0.43067655807339306,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q9": {
+          "query": "What is the just-in-time manufacturing philosophy?",
+          "gold": [
+            "Just-in-time_manufacturing"
+          ],
+          "top5": [
+            "Just-in-time_manufacturing",
+            "Inventory",
+            "Supply_chain_management",
+            "Foxconn",
+            "Logistics"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q10": {
+          "query": "What does the CHIPS Act allocate?",
+          "gold": [
+            "CHIPS_and_Science_Act"
+          ],
+          "top5": [
+            "CHIPS_and_Science_Act",
+            "2020\u20132023_global_chip_shortage",
+            "TSMC",
+            "Foxconn",
+            "Supply_chain_attack"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q11": {
+          "query": "Who is Foxconn's primary customer?",
+          "gold": [
+            "Foxconn"
+          ],
+          "top5": [
+            "Foxconn",
+            "TSMC",
+            "Semiconductor_industry",
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q12": {
+          "query": "Why did the Ever Given run aground?",
+          "gold": [
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction"
+          ],
+          "top5": [
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction",
+            "2011_T\u014dhoku_earthquake_and_tsunami",
+            "Bab-el-Mandeb",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q13": {
+          "query": "What is safety stock?",
+          "gold": [
+            "Inventory"
+          ],
+          "top5": [
+            "Inventory",
+            "Supply_chain_attack",
+            "TSMC",
+            "Warehouse",
+            "Port_of_Singapore"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q14": {
+          "query": "What is a supply chain attack?",
+          "gold": [
+            "Supply_chain_attack"
+          ],
+          "top5": [
+            "Supply_chain_attack",
+            "Supply_chain_management",
+            "Bullwhip_effect",
+            "2020\u20132023_global_chip_shortage",
+            "Logistics"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q15": {
+          "query": "How busy is the Port of Singapore?",
+          "gold": [
+            "Port_of_Singapore"
+          ],
+          "top5": [
+            "Port_of_Singapore",
+            "Port_of_Los_Angeles",
+            "Strait_of_Malacca",
+            "2021_Suez_Canal_obstruction",
+            "Container_ship"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q16": {
+          "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
+          "gold": [
+            "Strait_of_Malacca"
+          ],
+          "top5": [
+            "Strait_of_Malacca",
+            "Bab-el-Mandeb",
+            "Strait_of_Hormuz",
+            "Port_of_Singapore",
+            "Suez_Canal"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q17": {
+          "query": "Which industry does the Baltic Dry Index track?",
+          "gold": [
+            "Baltic_Dry_Index"
+          ],
+          "top5": [
+            "Baltic_Dry_Index",
+            "Inventory",
+            "2020\u20132023_global_chip_shortage",
+            "Semiconductor_industry",
+            "Logistics"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q18": {
+          "query": "What function does a warehouse serve?",
+          "gold": [
+            "Warehouse"
+          ],
+          "top5": [
+            "Warehouse",
+            "Inventory",
+            "Logistics",
+            "Container_ship",
+            "Port_of_Singapore"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q19": {
+          "query": "What is a container ship's TEU?",
+          "gold": [
+            "Container_ship"
+          ],
+          "top5": [
+            "Container_ship",
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction",
+            "Baltic_Dry_Index",
+            "Port_of_Singapore"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q20": {
+          "query": "What software replaces accounting + inventory + HR systems?",
+          "gold": [
+            "Enterprise_resource_planning"
+          ],
+          "top5": [
+            "Enterprise_resource_planning",
+            "Inventory",
+            "Supply_chain_attack",
+            "Just-in-time_manufacturing",
+            "Foxconn"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        }
+      }
+    },
+    "snowflake-arctic-l": {
+      "embedder": "snowflake-arctic-l",
+      "mean_ndcg@10": 0.9709860394574094,
+      "mean_recall@10": 1.0,
+      "mean_precision@10": 0.12000000000000002,
+      "corpus_encoding_s": 40.3898344039917,
+      "n_queries": 20,
+      "per_query": {
+        "q1": {
+          "query": "What was the magnitude of the 2011 Tohoku earthquake?",
+          "gold": [
+            "2011_T\u014dhoku_earthquake_and_tsunami"
+          ],
+          "top5": [
+            "2011_T\u014dhoku_earthquake_and_tsunami",
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction",
+            "Samsung_Electronics",
+            "Suez_Canal"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q2": {
+          "query": "How long was the Suez Canal blocked in 2021?",
+          "gold": [
+            "2021_Suez_Canal_obstruction",
+            "Ever_Given"
+          ],
+          "top5": [
+            "2021_Suez_Canal_obstruction",
+            "Suez_Canal",
+            "Ever_Given",
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb"
+          ],
+          "ndcg@10": 0.9197207891481876,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q3": {
+          "query": "What caused the global semiconductor shortage?",
+          "gold": [
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "top5": [
+            "2020\u20132023_global_chip_shortage",
+            "Semiconductor_industry",
+            "TSMC",
+            "Supply_chain_attack",
+            "Foxconn"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q4": {
+          "query": "Why is the Strait of Hormuz strategically important?",
+          "gold": [
+            "Strait_of_Hormuz"
+          ],
+          "top5": [
+            "Strait_of_Hormuz",
+            "Strait_of_Malacca",
+            "Bab-el-Mandeb",
+            "Suez_Canal",
+            "Red_Sea_crisis"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q5": {
+          "query": "How do Houthis threaten Red Sea shipping?",
+          "gold": [
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb"
+          ],
+          "top5": [
+            "Red_Sea_crisis",
+            "Bab-el-Mandeb",
+            "Strait_of_Hormuz",
+            "Suez_Canal",
+            "2021_Suez_Canal_obstruction"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q6": {
+          "query": "Which foundry dominates advanced chip production?",
+          "gold": [
+            "TSMC",
+            "Semiconductor_industry"
+          ],
+          "top5": [
+            "Semiconductor_industry",
+            "TSMC",
+            "2020\u20132023_global_chip_shortage",
+            "Foxconn",
+            "CHIPS_and_Science_Act"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q7": {
+          "query": "What is the bullwhip effect?",
+          "gold": [
+            "Bullwhip_effect"
+          ],
+          "top5": [
+            "Bullwhip_effect",
+            "Just-in-time_manufacturing",
+            "Baltic_Dry_Index",
+            "Inventory",
+            "Bab-el-Mandeb"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q8": {
+          "query": "Which port congested during 2021 supply chain crisis?",
+          "gold": [
+            "Port_of_Los_Angeles"
+          ],
+          "top5": [
+            "2020\u20132023_global_chip_shortage",
+            "2021_Suez_Canal_obstruction",
+            "Port_of_Los_Angeles",
+            "Ever_Given",
+            "Supply_chain_attack"
+          ],
+          "ndcg@10": 0.5,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q9": {
+          "query": "What is the just-in-time manufacturing philosophy?",
+          "gold": [
+            "Just-in-time_manufacturing"
+          ],
+          "top5": [
+            "Just-in-time_manufacturing",
+            "Supply_chain_management",
+            "Inventory",
+            "Logistics",
+            "Semiconductor_industry"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q10": {
+          "query": "What does the CHIPS Act allocate?",
+          "gold": [
+            "CHIPS_and_Science_Act"
+          ],
+          "top5": [
+            "CHIPS_and_Science_Act",
+            "2020\u20132023_global_chip_shortage",
+            "Semiconductor_industry",
+            "TSMC",
+            "Supply_chain_attack"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q11": {
+          "query": "Who is Foxconn's primary customer?",
+          "gold": [
+            "Foxconn"
+          ],
+          "top5": [
+            "Foxconn",
+            "TSMC",
+            "Semiconductor_industry",
+            "2020\u20132023_global_chip_shortage",
+            "Supply_chain_management"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q12": {
+          "query": "Why did the Ever Given run aground?",
+          "gold": [
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction"
+          ],
+          "top5": [
+            "Ever_Given",
+            "2021_Suez_Canal_obstruction",
+            "Bab-el-Mandeb",
+            "Strait_of_Hormuz",
+            "Container_ship"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.2
+        },
+        "q13": {
+          "query": "What is safety stock?",
+          "gold": [
+            "Inventory"
+          ],
+          "top5": [
+            "Inventory",
+            "Supply_chain_attack",
+            "Bullwhip_effect",
+            "Logistics",
+            "Baltic_Dry_Index"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q14": {
+          "query": "What is a supply chain attack?",
+          "gold": [
+            "Supply_chain_attack"
+          ],
+          "top5": [
+            "Supply_chain_attack",
+            "Supply_chain_management",
+            "Bullwhip_effect",
+            "Logistics",
+            "2020\u20132023_global_chip_shortage"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q15": {
+          "query": "How busy is the Port of Singapore?",
+          "gold": [
+            "Port_of_Singapore"
+          ],
+          "top5": [
+            "Port_of_Singapore",
+            "Strait_of_Malacca",
+            "Port_of_Los_Angeles",
+            "Container_ship",
+            "2021_Suez_Canal_obstruction"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q16": {
+          "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
+          "gold": [
+            "Strait_of_Malacca"
+          ],
+          "top5": [
+            "Strait_of_Malacca",
+            "Strait_of_Hormuz",
+            "Bab-el-Mandeb",
+            "Port_of_Singapore",
+            "Suez_Canal"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q17": {
+          "query": "Which industry does the Baltic Dry Index track?",
+          "gold": [
+            "Baltic_Dry_Index"
+          ],
+          "top5": [
+            "Baltic_Dry_Index",
+            "Inventory",
+            "Logistics",
+            "Semiconductor_industry",
+            "Enterprise_resource_planning"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q18": {
+          "query": "What function does a warehouse serve?",
+          "gold": [
+            "Warehouse"
+          ],
+          "top5": [
+            "Warehouse",
+            "Inventory",
+            "Logistics",
+            "Supply_chain_management",
+            "Enterprise_resource_planning"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q19": {
+          "query": "What is a container ship's TEU?",
+          "gold": [
+            "Container_ship"
+          ],
+          "top5": [
+            "Container_ship",
+            "Ever_Given",
+            "Inventory",
+            "2021_Suez_Canal_obstruction",
+            "Baltic_Dry_Index"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        },
+        "q20": {
+          "query": "What software replaces accounting + inventory + HR systems?",
+          "gold": [
+            "Enterprise_resource_planning"
+          ],
+          "top5": [
+            "Enterprise_resource_planning",
+            "Inventory",
+            "Supply_chain_management",
+            "Logistics",
+            "Supply_chain_attack"
+          ],
+          "ndcg@10": 1.0,
+          "recall@10": 1.0,
+          "precision@10": 0.1
+        }
+      }
+    }
+  },
+  "public_ref_nfcorpus": {
+    "mxbai-embed-large-v1": {
+      "ndcg@10_nfcorpus": 0.386,
+      "source": "MTEB retrieval leaderboard 2024"
+    },
+    "bge-m3": {
+      "ndcg@10_nfcorpus": 0.357,
+      "source": "BGE-M3 paper + MTEB"
+    },
+    "snowflake-arctic-l": {
+      "ndcg@10_nfcorpus": 0.348,
+      "source": "Snowflake Arctic paper"
+    }
+  },
+  "elapsed_min": 1.861957597732544
 }

FINAL_SUBMIT/receipts/R5_GRANITE.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

FINAL_SUBMIT/receipts/R5_GRANITE_HARD.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

FINAL_SUBMIT/receipts/R6_ALGO_COMPARISON.json CHANGED Viewed

@@ -1,72 +1,72 @@
-{
-  "task": "easy_typhoon_response",
-  "training_timesteps": 100000,
-  "eval_episodes": 50,
-  "per_algorithm": {
-    "MaskablePPO": {
-      "algorithm": "MaskablePPO",
-      "n_episodes": 50,
-      "reward_mean": 1.2005000000000001,
-      "reward_std": 0.19939637032804786,
-      "reward_min": 0.643,
-      "reward_max": 1.3435000000000004,
-      "length_mean": 20.0,
-      "violations_mean": 0.0,
-      "invalid_action_picks_mean_per_ep": 0.0
-    },
-    "PPO": {
-      "algorithm": "PPO",
-      "n_episodes": 50,
-      "reward_mean": 0.9470000000000001,
-      "reward_std": 0.1244727781484771,
-      "reward_min": 0.5895,
-      "reward_max": 1.0760000000000003,
-      "length_mean": 20.0,
-      "violations_mean": 0.0,
-      "invalid_action_picks_mean_per_ep": 13.64
-    },
-    "A2C": {
-      "algorithm": "A2C",
-      "n_episodes": 50,
-      "reward_mean": 0.8738700000000001,
-      "reward_std": 0.11796597221232909,
-      "reward_min": 0.5359999999999999,
-      "reward_max": 0.9690000000000002,
-      "length_mean": 20.0,
-      "violations_mean": 0.0,
-      "invalid_action_picks_mean_per_ep": 13.88
-    },
-    "RecurrentPPO": {
-      "algorithm": "RecurrentPPO",
-      "n_episodes": 50,
-      "reward_mean": 1.0806900000000002,
-      "reward_std": 0.19626869694375626,
-      "reward_min": 0.7499999999999999,
-      "reward_max": 1.3470000000000004,
-      "length_mean": 20.0,
-      "violations_mean": 0.0,
-      "invalid_action_picks_mean_per_ep": 14.86
-    }
-  },
-  "train_times_min": {
-    "MaskablePPO": 10.99298940896988,
-    "PPO": 8.347426931063334,
-    "A2C": 9.913969707489013,
-    "RecurrentPPO": 16.337928581237794
-  },
-  "maskable_vs_others": {
-    "PPO": {
-      "reward_delta": -0.25350000000000006,
-      "maskable_lift_pct": 26.768743400211196
-    },
-    "A2C": {
-      "reward_delta": -0.32663,
-      "maskable_lift_pct": 37.377413116367414
-    },
-    "RecurrentPPO": {
-      "reward_delta": -0.11980999999999997,
-      "maskable_lift_pct": 11.08643551804865
-    }
-  },
-  "elapsed_min": 45.86821995576223
 }

+{
+  "task": "easy_typhoon_response",
+  "training_timesteps": 100000,
+  "eval_episodes": 50,
+  "per_algorithm": {
+    "MaskablePPO": {
+      "algorithm": "MaskablePPO",
+      "n_episodes": 50,
+      "reward_mean": 1.2005000000000001,
+      "reward_std": 0.19939637032804786,
+      "reward_min": 0.643,
+      "reward_max": 1.3435000000000004,
+      "length_mean": 20.0,
+      "violations_mean": 0.0,
+      "invalid_action_picks_mean_per_ep": 0.0
+    },
+    "PPO": {
+      "algorithm": "PPO",
+      "n_episodes": 50,
+      "reward_mean": 0.9470000000000001,
+      "reward_std": 0.1244727781484771,
+      "reward_min": 0.5895,
+      "reward_max": 1.0760000000000003,
+      "length_mean": 20.0,
+      "violations_mean": 0.0,
+      "invalid_action_picks_mean_per_ep": 13.64
+    },
+    "A2C": {
+      "algorithm": "A2C",
+      "n_episodes": 50,
+      "reward_mean": 0.8738700000000001,
+      "reward_std": 0.11796597221232909,
+      "reward_min": 0.5359999999999999,
+      "reward_max": 0.9690000000000002,
+      "length_mean": 20.0,
+      "violations_mean": 0.0,
+      "invalid_action_picks_mean_per_ep": 13.88
+    },
+    "RecurrentPPO": {
+      "algorithm": "RecurrentPPO",
+      "n_episodes": 50,
+      "reward_mean": 1.0806900000000002,
+      "reward_std": 0.19626869694375626,
+      "reward_min": 0.7499999999999999,
+      "reward_max": 1.3470000000000004,
+      "length_mean": 20.0,
+      "violations_mean": 0.0,
+      "invalid_action_picks_mean_per_ep": 14.86
+    }
+  },
+  "train_times_min": {
+    "MaskablePPO": 10.99298940896988,
+    "PPO": 8.347426931063334,
+    "A2C": 9.913969707489013,
+    "RecurrentPPO": 16.337928581237794
+  },
+  "maskable_vs_others": {
+    "PPO": {
+      "reward_delta": -0.25350000000000006,
+      "maskable_lift_pct": 26.768743400211196
+    },
+    "A2C": {
+      "reward_delta": -0.32663,
+      "maskable_lift_pct": 37.377413116367414
+    },
+    "RecurrentPPO": {
+      "reward_delta": -0.11980999999999997,
+      "maskable_lift_pct": 11.08643551804865
+    }
+  },
+  "elapsed_min": 45.86821995576223
 }

FINAL_SUBMIT/receipts/R6_AQUA_REGIA_V2.json CHANGED Viewed

@@ -1,860 +1,860 @@
-{
-  "targets": [
-    "DCOILWTICO",
-    "DEXJPUS",
-    "DEXUSEU",
-    "DEXCHUS",
-    "DEXKOUS"
-  ],
-  "horizon": 14,
-  "confs": [
-    0.8,
-    0.9,
-    0.95
-  ],
-  "n_cal": 30,
-  "n_test": 30,
-  "results": {
-    "DCOILWTICO": {
-      "arima": {
-        "forecaster": "arima",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.8095238095238094,
-          "bare_width_mean": 10.867942261555571,
-          "perhorizon_coverage_mean": 0.6857142857142856,
-          "perhorizon_width_mean": 7.990994504643288,
-          "pooled_coverage_mean": 0.6785714285714285,
-          "pooled_width_mean": 8.029568159989491,
-          "q_per_horizon": [
-            2.0917427692512547,
-            2.414564146929898,
-            3.49864771255762,
-            3.783403014989574,
-            3.6514825270864293,
-            3.410638918826429,
-            3.6483267386695672,
-            4.291356370865486,
-            4.148100512774434,
-            4.765242660767733,
-            4.798738782538393,
-            4.648753353034714,
-            5.111777984600735,
-            5.674186039610767
-          ],
-          "q_pooled": 4.014784079994747
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.9214285714285715,
-          "bare_width_mean": 13.948852880392929,
-          "perhorizon_coverage_mean": 0.7809523809523811,
-          "perhorizon_width_mean": 10.031165041917506,
-          "pooled_coverage_mean": 0.7738095238095238,
-          "pooled_width_mean": 10.167074585069713,
-          "q_per_horizon": [
-            2.300277140003125,
-            4.097940221459595,
-            4.076376633492892,
-            4.703831136719856,
-            4.842398951063927,
-            5.337677242975467,
-            4.359396527417836,
-            6.151868291801264,
-            5.051950062063291,
-            5.854070590337393,
-            5.368481950759772,
-            5.284114635080698,
-            6.431339982770957,
-            6.3584319274764525
-          ],
-          "q_pooled": 5.0835372925348565
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.9452380952380951,
-          "bare_width_mean": 16.621083373775793,
-          "perhorizon_coverage_mean": 0.9261904761904761,
-          "perhorizon_width_mean": 14.611219531249459,
-          "pooled_coverage_mean": 0.838095238095238,
-          "pooled_width_mean": 12.16250013730463,
-          "q_per_horizon": [
-            3.0531114213612582,
-            5.059338828648023,
-            5.697604686526287,
-            7.146009479872129,
-            5.3182905673299175,
-            7.39090190741959,
-            6.856329650125417,
-            7.199424687832007,
-            6.523429069811058,
-            6.548845442730201,
-            9.62406528058468,
-            8.603787092463286,
-            11.553679176235391,
-            11.703719427806988
-          ],
-          "q_pooled": 6.0812500686523165
-        }
-      },
-      "chronos": {
-        "forecaster": "chronos",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.7809523809523807,
-          "bare_width_mean": 11.050525585810343,
-          "perhorizon_coverage_mean": 0.6547619047619048,
-          "perhorizon_width_mean": 8.338129283360074,
-          "pooled_coverage_mean": 0.6452380952380952,
-          "pooled_width_mean": 8.036834106445315,
-          "q_per_horizon": [
-            2.1229774475097685,
-            2.4522241210937494,
-            3.261205139160154,
-            3.9071347045898435,
-            3.614091110229495,
-            3.6567034912109406,
-            3.993652496337887,
-            4.4286404418945295,
-            4.545238494873047,
-            5.274034423828127,
-            5.24025115966797,
-            4.8420919799804665,
-            5.316376342773438,
-            5.71228363037109
-          ],
-          "q_pooled": 4.018417053222656
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.7809523809523807,
-          "bare_width_mean": 11.050525585810343,
-          "perhorizon_coverage_mean": 0.7880952380952381,
-          "perhorizon_width_mean": 11.069673222133089,
-          "pooled_coverage_mean": 0.769047619047619,
-          "pooled_width_mean": 10.63275268554687,
-          "q_per_horizon": [
-            2.555929565429693,
-            3.5912300109863295,
-            4.3903402709960915,
-            5.24416809082031,
-            4.982480926513674,
-            5.137361450195314,
-            5.586841278076172,
-            6.765305328369138,
-            6.67245574951172,
-            5.990972595214842,
-            5.718290405273436,
-            5.943902282714845,
-            7.989523162841799,
-            6.918911437988278
-          ],
-          "q_pooled": 5.316376342773438
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.7809523809523807,
-          "bare_width_mean": 11.050525585810343,
-          "perhorizon_coverage_mean": 0.9261904761904761,
-          "perhorizon_width_mean": 16.372548740931915,
-          "pooled_coverage_mean": 0.8547619047619047,
-          "pooled_width_mean": 13.761851806640617,
-          "q_per_horizon": [
-            4.500623779296873,
-            5.796702575683597,
-            4.578687438964849,
-            5.983569641113277,
-            7.369260253906248,
-            8.649095764160151,
-            8.18119262695312,
-            9.151351928710938,
-            8.256888427734381,
-            8.666538696289066,
-            10.109675750732421,
-            9.065566864013675,
-            12.079234161376952,
-            12.219453277587888
-          ],
-          "q_pooled": 6.8809259033203105
-        }
-      }
-    },
-    "DEXJPUS": {
-      "arima": {
-        "forecaster": "arima",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.6357142857142856,
-          "bare_width_mean": 4.436568793595841,
-          "perhorizon_coverage_mean": 0.45238095238095233,
-          "perhorizon_width_mean": 2.8685092642157013,
-          "pooled_coverage_mean": 0.4928571428571428,
-          "pooled_width_mean": 2.791173769264077,
-          "q_per_horizon": [
-            0.495163456754355,
-            0.8623131555344372,
-            0.8897926642558076,
-            1.1482011742546945,
-            1.28795516679331,
-            1.6477655987067266,
-            1.7443474583408118,
-            1.5384895904415004,
-            1.803162688834604,
-            1.7685075068830685,
-            1.7186420091775432,
-            1.5470661555772267,
-            1.888659928991629,
-            1.7394982949641928
-          ],
-          "q_pooled": 1.3955868846320385
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.7738095238095236,
-          "bare_width_mean": 5.694274399535953,
-          "perhorizon_coverage_mean": 0.5761904761904761,
-          "perhorizon_width_mean": 3.798189452444865,
-          "pooled_coverage_mean": 0.5809523809523809,
-          "pooled_width_mean": 3.8189608293080823,
-          "q_per_horizon": [
-            0.602618663621783,
-            1.5464872564533323,
-            1.410577522130609,
-            2.006457013067674,
-            1.9326982798289691,
-            1.871741039728505,
-            1.8724724170933484,
-            2.0184353738183205,
-            2.057205707305812,
-            2.300998677577681,
-            2.4584763121956854,
-            2.2610349692604643,
-            2.141044083930069,
-            2.1070788511018037
-          ],
-          "q_pooled": 1.9094804146540412
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.8738095238095237,
-          "bare_width_mean": 6.7851464460479765,
-          "perhorizon_coverage_mean": 0.8023809523809523,
-          "perhorizon_width_mean": 6.101635459825262,
-          "pooled_coverage_mean": 0.6571428571428571,
-          "pooled_width_mean": 4.601997355155362,
-          "q_per_horizon": [
-            0.9380858484970958,
-            2.323515167056655,
-            1.946219636173069,
-            2.2116051075864647,
-            2.7206754280723686,
-            3.562227529556367,
-            3.502961358052417,
-            3.5922479170316564,
-            4.142317883234554,
-            4.062380770386838,
-            3.5722844723094056,
-            3.2623018774721544,
-            3.212317495709044,
-            3.6623077276387335
-          ],
-          "q_pooled": 2.300998677577681
-        }
-      },
-      "chronos": {
-        "forecaster": "chronos",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.7309523809523808,
-          "bare_width_mean": 5.977349718411763,
-          "perhorizon_coverage_mean": 0.47380952380952385,
-          "perhorizon_width_mean": 3.038026166643411,
-          "pooled_coverage_mean": 0.49761904761904757,
-          "pooled_width_mean": 2.8918725585937466,
-          "q_per_horizon": [
-            0.5868325805664085,
-            0.8268566894531233,
-            0.8645288085937466,
-            1.1490182495117125,
-            1.4187112426757835,
-            1.667842102050784,
-            1.8516342163085966,
-            1.6831582641601557,
-            1.5933966064453102,
-            1.7942288208007824,
-            2.1771484374999943,
-            1.8165200805664057,
-            1.8638430786132858,
-            1.9724639892578182
-          ],
-          "q_pooled": 1.4459362792968733
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.7309523809523808,
-          "bare_width_mean": 5.977349718411763,
-          "perhorizon_coverage_mean": 0.6071428571428572,
-          "perhorizon_width_mean": 4.111253226143984,
-          "pooled_coverage_mean": 0.6023809523809524,
-          "pooled_width_mean": 4.0517645263671795,
-          "q_per_horizon": [
-            0.7398001098632818,
-            1.542530517578129,
-            1.4136145019531199,
-            2.0581530761718767,
-            1.8112579345703068,
-            2.3215438842773466,
-            2.0993005371093716,
-            2.064953918457036,
-            2.4423132324218813,
-            2.698671264648439,
-            2.4562600708007807,
-            2.32724975585937,
-            2.5256872558593813,
-            2.277436523437501
-          ],
-          "q_pooled": 2.0258822631835898
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.7309523809523808,
-          "bare_width_mean": 5.977349718411763,
-          "perhorizon_coverage_mean": 0.7190476190476188,
-          "perhorizon_width_mean": 5.96463936941964,
-          "pooled_coverage_mean": 0.6809523809523809,
-          "pooled_width_mean": 5.0513745117187625,
-          "q_per_horizon": [
-            0.930439453125004,
-            2.665478515624997,
-            1.9302044677734358,
-            2.0884591674804653,
-            2.7411437988281193,
-            3.6284613037109352,
-            3.513445739746089,
-            3.5274569702148426,
-            4.001575012207027,
-            3.9003729248046852,
-            3.2779876708984403,
-            3.0333639526367193,
-            3.0030249023437534,
-            3.511061706542975
-          ],
-          "q_pooled": 2.5256872558593813
-        }
-      }
-    },
-    "DEXUSEU": {
-      "arima": {
-        "forecaster": "arima",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.8595238095238095,
-          "bare_width_mean": 0.037255051394705835,
-          "perhorizon_coverage_mean": 0.811904761904762,
-          "perhorizon_width_mean": 0.03243267317446737,
-          "pooled_coverage_mean": 0.8166666666666665,
-          "pooled_width_mean": 0.031645107249388627,
-          "q_per_horizon": [
-            0.006537154478817753,
-            0.007333177556922088,
-            0.012312774872748289,
-            0.014043924961390397,
-            0.016017799097016727,
-            0.015644421534730224,
-            0.016336252170641608,
-            0.016122979608933496,
-            0.01964457489050009,
-            0.02072169154979453,
-            0.024118006869554565,
-            0.018656617879449167,
-            0.017769218599013037,
-            0.021770118151759554
-          ],
-          "q_pooled": 0.015822553624694313
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.9142857142857144,
-          "bare_width_mean": 0.047816340798432555,
-          "perhorizon_coverage_mean": 0.8904761904761905,
-          "perhorizon_width_mean": 0.04285578362084427,
-          "pooled_coverage_mean": 0.8809523809523809,
-          "pooled_width_mean": 0.041073044538626924,
-          "q_per_horizon": [
-            0.006761841674864266,
-            0.01182171512244512,
-            0.015822553624694313,
-            0.02093465874643763,
-            0.019889187414578124,
-            0.01963882946285489,
-            0.02190089656490879,
-            0.021692702530445862,
-            0.024590684771490512,
-            0.024756601121440625,
-            0.02609594060524123,
-            0.02889462135779275,
-            0.02689529861576956,
-            0.030294953732946217
-          ],
-          "q_pooled": 0.020536522269313462
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.9380952380952381,
-          "bare_width_mean": 0.05697668430905675,
-          "perhorizon_coverage_mean": 0.9404761904761906,
-          "perhorizon_width_mean": 0.05919364307194989,
-          "pooled_coverage_mean": 0.9119047619047618,
-          "pooled_width_mean": 0.05176715217769701,
-          "q_per_horizon": [
-            0.011752772972313252,
-            0.01247253748338717,
-            0.01748801536532918,
-            0.02383577073487353,
-            0.02364315675893547,
-            0.02218707632552186,
-            0.03203504055001494,
-            0.030332454296178923,
-            0.03750274950896193,
-            0.03613221732608629,
-            0.039232376756770826,
-            0.04010448928765342,
-            0.04080440634480942,
-            0.046832437792812875
-          ],
-          "q_pooled": 0.025883576088848503
-        }
-      },
-      "chronos": {
-        "forecaster": "chronos",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.8,
-          "bare_width_mean": 0.03301220412055651,
-          "perhorizon_coverage_mean": 0.8071428571428574,
-          "perhorizon_width_mean": 0.03432217042105538,
-          "pooled_coverage_mean": 0.8000000000000002,
-          "pooled_width_mean": 0.03300358161926287,
-          "q_per_horizon": [
-            0.004584144783019939,
-            0.007060681152343706,
-            0.01243185882568354,
-            0.01602103652954101,
-            0.01641003990173351,
-            0.015545682907104563,
-            0.018368010711669935,
-            0.01898662319183342,
-            0.022148969459533596,
-            0.02255078582763681,
-            0.023978458976745554,
-            0.020319693946838413,
-            0.017313012123107985,
-            0.024536194610595752
-          ],
-          "q_pooled": 0.016501790809631434
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.8,
-          "bare_width_mean": 0.03301220412055651,
-          "perhorizon_coverage_mean": 0.9190476190476191,
-          "perhorizon_width_mean": 0.05077633157457622,
-          "pooled_coverage_mean": 0.8904761904761905,
-          "pooled_width_mean": 0.04548504829406719,
-          "q_per_horizon": [
-            0.008554865837097081,
-            0.00971177463531503,
-            0.01530143814086915,
-            0.01911055355072011,
-            0.01780367832183849,
-            0.021554478836059543,
-            0.026538812255859412,
-            0.027544754409789984,
-            0.028936708450317372,
-            0.03478273067474369,
-            0.0382537099838256,
-            0.03136329650878911,
-            0.0327265468597413,
-            0.04325097255706778
-          ],
-          "q_pooled": 0.022742524147033594
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.8,
-          "bare_width_mean": 0.03301220412055651,
-          "perhorizon_coverage_mean": 0.9404761904761905,
-          "perhorizon_width_mean": 0.0633313385554722,
-          "pooled_coverage_mean": 0.9547619047619046,
-          "pooled_width_mean": 0.06135401725769052,
-          "q_per_horizon": [
-            0.011944815063476666,
-            0.01392391796112058,
-            0.017532272148132355,
-            0.022742524147033594,
-            0.02558988399505613,
-            0.02623647480010982,
-            0.03067700862884526,
-            0.034072942352294966,
-            0.04179227085113535,
-            0.0389519283294677,
-            0.042779201126098565,
-            0.04429976444244388,
-            0.044917986869811966,
-            0.04785837917327873
-          ],
-          "q_pooled": 0.03067700862884526
-        }
-      }
-    },
-    "DEXCHUS": {
-      "arima": {
-        "forecaster": "arima",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.8309523809523809,
-          "bare_width_mean": 0.12023258914287749,
-          "perhorizon_coverage_mean": 0.8,
-          "perhorizon_width_mean": 0.10379373004234645,
-          "pooled_coverage_mean": 0.7833333333333333,
-          "pooled_width_mean": 0.0905579673492376,
-          "q_per_horizon": [
-            0.01913552539082275,
-            0.021503803498270635,
-            0.03202273363733443,
-            0.04471228016293516,
-            0.04595743067166769,
-            0.057142529866381686,
-            0.041567074905930035,
-            0.05922440211999547,
-            0.06055238630005544,
-            0.06195863987337091,
-            0.07735612435271388,
-            0.07482211423245033,
-            0.0613510301071134,
-            0.06925003517738304
-          ],
-          "q_pooled": 0.0452789836746188
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.8761904761904763,
-          "bare_width_mean": 0.1543168575080998,
-          "perhorizon_coverage_mean": 0.8857142857142858,
-          "perhorizon_width_mean": 0.1694623051285068,
-          "pooled_coverage_mean": 0.8833333333333333,
-          "pooled_width_mean": 0.14964422846490066,
-          "q_per_horizon": [
-            0.026065770883445083,
-            0.03663070092160048,
-            0.04814005922096687,
-            0.05434837199719045,
-            0.06341843160370875,
-            0.06742875148755179,
-            0.08909509445192665,
-            0.09169474000207156,
-            0.11607218346504666,
-            0.12686121412365825,
-            0.11025109977698122,
-            0.12555183014476246,
-            0.11555182580724122,
-            0.11512606201339626
-          ],
-          "q_pooled": 0.07482211423245033
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.9142857142857144,
-          "bare_width_mean": 0.18387987719237844,
-          "perhorizon_coverage_mean": 0.9523809523809524,
-          "perhorizon_width_mean": 0.2451580685008066,
-          "pooled_coverage_mean": 0.9285714285714286,
-          "pooled_width_mean": 0.22228302327474836,
-          "q_per_horizon": [
-            0.032681838125458995,
-            0.07173662444320072,
-            0.06519382424998543,
-            0.06079908928748701,
-            0.09872806564422376,
-            0.10867467864500302,
-            0.11114151163737418,
-            0.14390234892072673,
-            0.14109477023066574,
-            0.1721305319733375,
-            0.17782669739203882,
-            0.18559857212707964,
-            0.17849914242157627,
-            0.16809878440748793
-          ],
-          "q_pooled": 0.11114151163737418
-        }
-      },
-      "chronos": {
-        "forecaster": "chronos",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.8428571428571429,
-          "bare_width_mean": 0.11959348532060782,
-          "perhorizon_coverage_mean": 0.7833333333333333,
-          "perhorizon_width_mean": 0.10019261191231878,
-          "pooled_coverage_mean": 0.8,
-          "pooled_width_mean": 0.09779591979980395,
-          "q_per_horizon": [
-            0.025188607788085626,
-            0.02532754745483423,
-            0.03890764770507804,
-            0.043802440643310625,
-            0.04915690460205102,
-            0.04680775070190446,
-            0.03916668243408239,
-            0.04809946746826199,
-            0.0576093139648437,
-            0.06108116531372065,
-            0.05864996337890638,
-            0.06179137878417951,
-            0.0701272941589357,
-            0.0756321189880369
-          ],
-          "q_pooled": 0.04889795989990198
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.8428571428571429,
-          "bare_width_mean": 0.11959348532060782,
-          "perhorizon_coverage_mean": 0.869047619047619,
-          "perhorizon_width_mean": 0.16607914559500545,
-          "pooled_coverage_mean": 0.861904761904762,
-          "pooled_width_mean": 0.1402545883178714,
-          "q_per_horizon": [
-            0.030081840515136626,
-            0.04935519256591814,
-            0.046391881561278936,
-            0.050782734680176134,
-            0.06024611434936489,
-            0.06782592163085965,
-            0.08113353042602522,
-            0.09840077590942364,
-            0.11880251922607421,
-            0.12758038635253932,
-            0.10697886581420857,
-            0.12221163177490268,
-            0.10586601409912078,
-            0.09689661026000973
-          ],
-          "q_pooled": 0.0701272941589357
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.8428571428571429,
-          "bare_width_mean": 0.11959348532060782,
-          "perhorizon_coverage_mean": 0.9214285714285714,
-          "perhorizon_width_mean": 0.22292400338309162,
-          "pooled_coverage_mean": 0.9095238095238095,
-          "pooled_width_mean": 0.2085365203857421,
-          "q_per_horizon": [
-            0.03159678268432575,
-            0.07481312255859418,
-            0.07034568023681675,
-            0.05222851562499997,
-            0.070854161071777,
-            0.09303555068969693,
-            0.08751402359008775,
-            0.13737474822998053,
-            0.1317485343933109,
-            0.15814713668823277,
-            0.1641494514465336,
-            0.1720175582885739,
-            0.16296061859130884,
-            0.15368213958740196
-          ],
-          "q_pooled": 0.10426826019287105
-        }
-      }
-    },
-    "DEXKOUS": {
-      "arima": {
-        "forecaster": "arima",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.7071428571428572,
-          "bare_width_mean": 41.40702231782995,
-          "perhorizon_coverage_mean": 0.6809523809523808,
-          "perhorizon_width_mean": 40.33834903476961,
-          "pooled_coverage_mean": 0.738095238095238,
-          "pooled_width_mean": 40.174430225697506,
-          "q_per_horizon": [
-            6.019828757339383,
-            9.23651622262787,
-            11.885457212575375,
-            14.301239776206785,
-            16.538830978627857,
-            21.11794087612452,
-            21.007107424806236,
-            22.089443667480282,
-            22.26134568228099,
-            25.115703414253176,
-            26.282158971560648,
-            28.31230917980338,
-            28.622331265376488,
-            29.57822981432423
-          ],
-          "q_pooled": 20.087215112848753
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.8023809523809522,
-          "bare_width_mean": 53.145337785764546,
-          "perhorizon_coverage_mean": 0.7476190476190475,
-          "perhorizon_width_mean": 47.514067959856646,
-          "pooled_coverage_mean": 0.8166666666666665,
-          "pooled_width_mean": 51.703697664495394,
-          "q_per_horizon": [
-            7.042854649616629,
-            11.217728114270585,
-            13.051289508962782,
-            17.974908318198914,
-            22.696578397519033,
-            24.786648186653792,
-            23.205692899009136,
-            25.439228843483306,
-            28.745883742858496,
-            27.649073917800933,
-            32.25531441260455,
-            33.39915882237847,
-            32.317174372199815,
-            32.81694153344006
-          ],
-          "q_pooled": 25.851848832247697
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.8952380952380953,
-          "bare_width_mean": 63.326575872509096,
-          "perhorizon_coverage_mean": 0.8833333333333332,
-          "perhorizon_width_mean": 62.3317263081943,
-          "pooled_coverage_mean": 0.861904761904762,
-          "pooled_width_mean": 63.003314010262784,
-          "q_per_horizon": [
-            12.416104342710696,
-            13.332090802595758,
-            20.658854986845654,
-            37.144614564726226,
-            31.230195571947434,
-            31.501657005131392,
-            31.466225645210898,
-            32.67178752649829,
-            41.05990019882688,
-            37.85425421989498,
-            37.08859079038166,
-            35.26046070337611,
-            40.538744747242845,
-            34.098603051971395
-          ],
-          "q_pooled": 31.501657005131392
-        }
-      },
-      "chronos": {
-        "forecaster": "chronos",
-        "n_cal": 30,
-        "n_test": 30,
-        "conf=0.8": {
-          "nominal_coverage": 0.8,
-          "bare_coverage_mean": 0.7476190476190475,
-          "bare_width_mean": 47.698866081237796,
-          "perhorizon_coverage_mean": 0.669047619047619,
-          "perhorizon_width_mean": 42.05718540736606,
-          "pooled_coverage_mean": 0.7452380952380951,
-          "pooled_width_mean": 43.94189453125,
-          "q_per_horizon": [
-            6.6086572265624,
-            8.688681640624964,
-            11.395966796874973,
-            12.880576171874964,
-            17.0732275390626,
-            19.5968017578125,
-            19.40576171875,
-            24.150083007812555,
-            24.586870117187573,
-            26.251137695312536,
-            27.594218749999982,
-            32.349785156249936,
-            31.7150732421876,
-            32.103457031249945
-          ],
-          "q_pooled": 21.970947265625
-        },
-        "conf=0.9": {
-          "nominal_coverage": 0.9,
-          "bare_coverage_mean": 0.7476190476190475,
-          "bare_width_mean": 47.698866081237796,
-          "perhorizon_coverage_mean": 0.7714285714285712,
-          "perhorizon_width_mean": 49.80674665178569,
-          "pooled_coverage_mean": 0.8357142857142856,
-          "pooled_width_mean": 56.23533203124998,
-          "q_per_horizon": [
-            8.360268554687536,
-            12.467915039062518,
-            14.159082031249909,
-            18.2329248046874,
-            23.688662109374945,
-            25.474423828125055,
-            24.956616210937455,
-            26.577456054687445,
-            28.821977539062573,
-            30.2672265624999,
-            33.08205566406241,
-            33.05286621093751,
-            33.24584472656261,
-            36.25990722656252
-          ],
-          "q_pooled": 28.11766601562499
-        },
-        "conf=0.95": {
-          "nominal_coverage": 0.95,
-          "bare_coverage_mean": 0.7476190476190475,
-          "bare_width_mean": 47.698866081237796,
-          "perhorizon_coverage_mean": 0.8738095238095237,
-          "perhorizon_width_mean": 65.5785993303571,
-          "pooled_coverage_mean": 0.8666666666666666,
-          "pooled_width_mean": 66.16411132812482,
-          "q_per_horizon": [
-            14.446508789062591,
-            15.035361328124964,
-            21.486127929687427,
-            38.963662109375036,
-            33.86973144531248,
-            34.60525878906242,
-            33.86685546874992,
-            33.722353515624945,
-            41.170214843750045,
-            36.77112792968751,
-            37.77993652343753,
-            39.08779296874991,
-            39.80886230468741,
-            38.4364013671875
-          ],
-          "q_pooled": 33.08205566406241
-        }
-      }
-    }
-  },
-  "elapsed_min": 1.141351056098938
 }

+{
+  "targets": [
+    "DCOILWTICO",
+    "DEXJPUS",
+    "DEXUSEU",
+    "DEXCHUS",
+    "DEXKOUS"
+  ],
+  "horizon": 14,
+  "confs": [
+    0.8,
+    0.9,
+    0.95
+  ],
+  "n_cal": 30,
+  "n_test": 30,
+  "results": {
+    "DCOILWTICO": {
+      "arima": {
+        "forecaster": "arima",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.8095238095238094,
+          "bare_width_mean": 10.867942261555571,
+          "perhorizon_coverage_mean": 0.6857142857142856,
+          "perhorizon_width_mean": 7.990994504643288,
+          "pooled_coverage_mean": 0.6785714285714285,
+          "pooled_width_mean": 8.029568159989491,
+          "q_per_horizon": [
+            2.0917427692512547,
+            2.414564146929898,
+            3.49864771255762,
+            3.783403014989574,
+            3.6514825270864293,
+            3.410638918826429,
+            3.6483267386695672,
+            4.291356370865486,
+            4.148100512774434,
+            4.765242660767733,
+            4.798738782538393,
+            4.648753353034714,
+            5.111777984600735,
+            5.674186039610767
+          ],
+          "q_pooled": 4.014784079994747
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.9214285714285715,
+          "bare_width_mean": 13.948852880392929,
+          "perhorizon_coverage_mean": 0.7809523809523811,
+          "perhorizon_width_mean": 10.031165041917506,
+          "pooled_coverage_mean": 0.7738095238095238,
+          "pooled_width_mean": 10.167074585069713,
+          "q_per_horizon": [
+            2.300277140003125,
+            4.097940221459595,
+            4.076376633492892,
+            4.703831136719856,
+            4.842398951063927,
+            5.337677242975467,
+            4.359396527417836,
+            6.151868291801264,
+            5.051950062063291,
+            5.854070590337393,
+            5.368481950759772,
+            5.284114635080698,
+            6.431339982770957,
+            6.3584319274764525
+          ],
+          "q_pooled": 5.0835372925348565
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.9452380952380951,
+          "bare_width_mean": 16.621083373775793,
+          "perhorizon_coverage_mean": 0.9261904761904761,
+          "perhorizon_width_mean": 14.611219531249459,
+          "pooled_coverage_mean": 0.838095238095238,
+          "pooled_width_mean": 12.16250013730463,
+          "q_per_horizon": [
+            3.0531114213612582,
+            5.059338828648023,
+            5.697604686526287,
+            7.146009479872129,
+            5.3182905673299175,
+            7.39090190741959,
+            6.856329650125417,
+            7.199424687832007,
+            6.523429069811058,
+            6.548845442730201,
+            9.62406528058468,
+            8.603787092463286,
+            11.553679176235391,
+            11.703719427806988
+          ],
+          "q_pooled": 6.0812500686523165
+        }
+      },
+      "chronos": {
+        "forecaster": "chronos",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.7809523809523807,
+          "bare_width_mean": 11.050525585810343,
+          "perhorizon_coverage_mean": 0.6547619047619048,
+          "perhorizon_width_mean": 8.338129283360074,
+          "pooled_coverage_mean": 0.6452380952380952,
+          "pooled_width_mean": 8.036834106445315,
+          "q_per_horizon": [
+            2.1229774475097685,
+            2.4522241210937494,
+            3.261205139160154,
+            3.9071347045898435,
+            3.614091110229495,
+            3.6567034912109406,
+            3.993652496337887,
+            4.4286404418945295,
+            4.545238494873047,
+            5.274034423828127,
+            5.24025115966797,
+            4.8420919799804665,
+            5.316376342773438,
+            5.71228363037109
+          ],
+          "q_pooled": 4.018417053222656
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.7809523809523807,
+          "bare_width_mean": 11.050525585810343,
+          "perhorizon_coverage_mean": 0.7880952380952381,
+          "perhorizon_width_mean": 11.069673222133089,
+          "pooled_coverage_mean": 0.769047619047619,
+          "pooled_width_mean": 10.63275268554687,
+          "q_per_horizon": [
+            2.555929565429693,
+            3.5912300109863295,
+            4.3903402709960915,
+            5.24416809082031,
+            4.982480926513674,
+            5.137361450195314,
+            5.586841278076172,
+            6.765305328369138,
+            6.67245574951172,
+            5.990972595214842,
+            5.718290405273436,
+            5.943902282714845,
+            7.989523162841799,
+            6.918911437988278
+          ],
+          "q_pooled": 5.316376342773438
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.7809523809523807,
+          "bare_width_mean": 11.050525585810343,
+          "perhorizon_coverage_mean": 0.9261904761904761,
+          "perhorizon_width_mean": 16.372548740931915,
+          "pooled_coverage_mean": 0.8547619047619047,
+          "pooled_width_mean": 13.761851806640617,
+          "q_per_horizon": [
+            4.500623779296873,
+            5.796702575683597,
+            4.578687438964849,
+            5.983569641113277,
+            7.369260253906248,
+            8.649095764160151,
+            8.18119262695312,
+            9.151351928710938,
+            8.256888427734381,
+            8.666538696289066,
+            10.109675750732421,
+            9.065566864013675,
+            12.079234161376952,
+            12.219453277587888
+          ],
+          "q_pooled": 6.8809259033203105
+        }
+      }
+    },
+    "DEXJPUS": {
+      "arima": {
+        "forecaster": "arima",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.6357142857142856,
+          "bare_width_mean": 4.436568793595841,
+          "perhorizon_coverage_mean": 0.45238095238095233,
+          "perhorizon_width_mean": 2.8685092642157013,
+          "pooled_coverage_mean": 0.4928571428571428,
+          "pooled_width_mean": 2.791173769264077,
+          "q_per_horizon": [
+            0.495163456754355,
+            0.8623131555344372,
+            0.8897926642558076,
+            1.1482011742546945,
+            1.28795516679331,
+            1.6477655987067266,
+            1.7443474583408118,
+            1.5384895904415004,
+            1.803162688834604,
+            1.7685075068830685,
+            1.7186420091775432,
+            1.5470661555772267,
+            1.888659928991629,
+            1.7394982949641928
+          ],
+          "q_pooled": 1.3955868846320385
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.7738095238095236,
+          "bare_width_mean": 5.694274399535953,
+          "perhorizon_coverage_mean": 0.5761904761904761,
+          "perhorizon_width_mean": 3.798189452444865,
+          "pooled_coverage_mean": 0.5809523809523809,
+          "pooled_width_mean": 3.8189608293080823,
+          "q_per_horizon": [
+            0.602618663621783,
+            1.5464872564533323,
+            1.410577522130609,
+            2.006457013067674,
+            1.9326982798289691,
+            1.871741039728505,
+            1.8724724170933484,
+            2.0184353738183205,
+            2.057205707305812,
+            2.300998677577681,
+            2.4584763121956854,
+            2.2610349692604643,
+            2.141044083930069,
+            2.1070788511018037
+          ],
+          "q_pooled": 1.9094804146540412
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.8738095238095237,
+          "bare_width_mean": 6.7851464460479765,
+          "perhorizon_coverage_mean": 0.8023809523809523,
+          "perhorizon_width_mean": 6.101635459825262,
+          "pooled_coverage_mean": 0.6571428571428571,
+          "pooled_width_mean": 4.601997355155362,
+          "q_per_horizon": [
+            0.9380858484970958,
+            2.323515167056655,
+            1.946219636173069,
+            2.2116051075864647,
+            2.7206754280723686,
+            3.562227529556367,
+            3.502961358052417,
+            3.5922479170316564,
+            4.142317883234554,
+            4.062380770386838,
+            3.5722844723094056,
+            3.2623018774721544,
+            3.212317495709044,
+            3.6623077276387335
+          ],
+          "q_pooled": 2.300998677577681
+        }
+      },
+      "chronos": {
+        "forecaster": "chronos",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.7309523809523808,
+          "bare_width_mean": 5.977349718411763,
+          "perhorizon_coverage_mean": 0.47380952380952385,
+          "perhorizon_width_mean": 3.038026166643411,
+          "pooled_coverage_mean": 0.49761904761904757,
+          "pooled_width_mean": 2.8918725585937466,
+          "q_per_horizon": [
+            0.5868325805664085,
+            0.8268566894531233,
+            0.8645288085937466,
+            1.1490182495117125,
+            1.4187112426757835,
+            1.667842102050784,
+            1.8516342163085966,
+            1.6831582641601557,
+            1.5933966064453102,
+            1.7942288208007824,
+            2.1771484374999943,
+            1.8165200805664057,
+            1.8638430786132858,
+            1.9724639892578182
+          ],
+          "q_pooled": 1.4459362792968733
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.7309523809523808,
+          "bare_width_mean": 5.977349718411763,
+          "perhorizon_coverage_mean": 0.6071428571428572,
+          "perhorizon_width_mean": 4.111253226143984,
+          "pooled_coverage_mean": 0.6023809523809524,
+          "pooled_width_mean": 4.0517645263671795,
+          "q_per_horizon": [
+            0.7398001098632818,
+            1.542530517578129,
+            1.4136145019531199,
+            2.0581530761718767,
+            1.8112579345703068,
+            2.3215438842773466,
+            2.0993005371093716,
+            2.064953918457036,
+            2.4423132324218813,
+            2.698671264648439,
+            2.4562600708007807,
+            2.32724975585937,
+            2.5256872558593813,
+            2.277436523437501
+          ],
+          "q_pooled": 2.0258822631835898
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.7309523809523808,
+          "bare_width_mean": 5.977349718411763,
+          "perhorizon_coverage_mean": 0.7190476190476188,
+          "perhorizon_width_mean": 5.96463936941964,
+          "pooled_coverage_mean": 0.6809523809523809,
+          "pooled_width_mean": 5.0513745117187625,
+          "q_per_horizon": [
+            0.930439453125004,
+            2.665478515624997,
+            1.9302044677734358,
+            2.0884591674804653,
+            2.7411437988281193,
+            3.6284613037109352,
+            3.513445739746089,
+            3.5274569702148426,
+            4.001575012207027,
+            3.9003729248046852,
+            3.2779876708984403,
+            3.0333639526367193,
+            3.0030249023437534,
+            3.511061706542975
+          ],
+          "q_pooled": 2.5256872558593813
+        }
+      }
+    },
+    "DEXUSEU": {
+      "arima": {
+        "forecaster": "arima",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.8595238095238095,
+          "bare_width_mean": 0.037255051394705835,
+          "perhorizon_coverage_mean": 0.811904761904762,
+          "perhorizon_width_mean": 0.03243267317446737,
+          "pooled_coverage_mean": 0.8166666666666665,
+          "pooled_width_mean": 0.031645107249388627,
+          "q_per_horizon": [
+            0.006537154478817753,
+            0.007333177556922088,
+            0.012312774872748289,
+            0.014043924961390397,
+            0.016017799097016727,
+            0.015644421534730224,
+            0.016336252170641608,
+            0.016122979608933496,
+            0.01964457489050009,
+            0.02072169154979453,
+            0.024118006869554565,
+            0.018656617879449167,
+            0.017769218599013037,
+            0.021770118151759554
+          ],
+          "q_pooled": 0.015822553624694313
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.9142857142857144,
+          "bare_width_mean": 0.047816340798432555,
+          "perhorizon_coverage_mean": 0.8904761904761905,
+          "perhorizon_width_mean": 0.04285578362084427,
+          "pooled_coverage_mean": 0.8809523809523809,
+          "pooled_width_mean": 0.041073044538626924,
+          "q_per_horizon": [
+            0.006761841674864266,
+            0.01182171512244512,
+            0.015822553624694313,
+            0.02093465874643763,
+            0.019889187414578124,
+            0.01963882946285489,
+            0.02190089656490879,
+            0.021692702530445862,
+            0.024590684771490512,
+            0.024756601121440625,
+            0.02609594060524123,
+            0.02889462135779275,
+            0.02689529861576956,
+            0.030294953732946217
+          ],
+          "q_pooled": 0.020536522269313462
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.9380952380952381,
+          "bare_width_mean": 0.05697668430905675,
+          "perhorizon_coverage_mean": 0.9404761904761906,
+          "perhorizon_width_mean": 0.05919364307194989,
+          "pooled_coverage_mean": 0.9119047619047618,
+          "pooled_width_mean": 0.05176715217769701,
+          "q_per_horizon": [
+            0.011752772972313252,
+            0.01247253748338717,
+            0.01748801536532918,
+            0.02383577073487353,
+            0.02364315675893547,
+            0.02218707632552186,
+            0.03203504055001494,
+            0.030332454296178923,
+            0.03750274950896193,
+            0.03613221732608629,
+            0.039232376756770826,
+            0.04010448928765342,
+            0.04080440634480942,
+            0.046832437792812875
+          ],
+          "q_pooled": 0.025883576088848503
+        }
+      },
+      "chronos": {
+        "forecaster": "chronos",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.8,
+          "bare_width_mean": 0.03301220412055651,
+          "perhorizon_coverage_mean": 0.8071428571428574,
+          "perhorizon_width_mean": 0.03432217042105538,
+          "pooled_coverage_mean": 0.8000000000000002,
+          "pooled_width_mean": 0.03300358161926287,
+          "q_per_horizon": [
+            0.004584144783019939,
+            0.007060681152343706,
+            0.01243185882568354,
+            0.01602103652954101,
+            0.01641003990173351,
+            0.015545682907104563,
+            0.018368010711669935,
+            0.01898662319183342,
+            0.022148969459533596,
+            0.02255078582763681,
+            0.023978458976745554,
+            0.020319693946838413,
+            0.017313012123107985,
+            0.024536194610595752
+          ],
+          "q_pooled": 0.016501790809631434
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.8,
+          "bare_width_mean": 0.03301220412055651,
+          "perhorizon_coverage_mean": 0.9190476190476191,
+          "perhorizon_width_mean": 0.05077633157457622,
+          "pooled_coverage_mean": 0.8904761904761905,
+          "pooled_width_mean": 0.04548504829406719,
+          "q_per_horizon": [
+            0.008554865837097081,
+            0.00971177463531503,
+            0.01530143814086915,
+            0.01911055355072011,
+            0.01780367832183849,
+            0.021554478836059543,
+            0.026538812255859412,
+            0.027544754409789984,
+            0.028936708450317372,
+            0.03478273067474369,
+            0.0382537099838256,
+            0.03136329650878911,
+            0.0327265468597413,
+            0.04325097255706778
+          ],
+          "q_pooled": 0.022742524147033594
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.8,
+          "bare_width_mean": 0.03301220412055651,
+          "perhorizon_coverage_mean": 0.9404761904761905,
+          "perhorizon_width_mean": 0.0633313385554722,
+          "pooled_coverage_mean": 0.9547619047619046,
+          "pooled_width_mean": 0.06135401725769052,
+          "q_per_horizon": [
+            0.011944815063476666,
+            0.01392391796112058,
+            0.017532272148132355,
+            0.022742524147033594,
+            0.02558988399505613,
+            0.02623647480010982,
+            0.03067700862884526,
+            0.034072942352294966,
+            0.04179227085113535,
+            0.0389519283294677,
+            0.042779201126098565,
+            0.04429976444244388,
+            0.044917986869811966,
+            0.04785837917327873
+          ],
+          "q_pooled": 0.03067700862884526
+        }
+      }
+    },
+    "DEXCHUS": {
+      "arima": {
+        "forecaster": "arima",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.8309523809523809,
+          "bare_width_mean": 0.12023258914287749,
+          "perhorizon_coverage_mean": 0.8,
+          "perhorizon_width_mean": 0.10379373004234645,
+          "pooled_coverage_mean": 0.7833333333333333,
+          "pooled_width_mean": 0.0905579673492376,
+          "q_per_horizon": [
+            0.01913552539082275,
+            0.021503803498270635,
+            0.03202273363733443,
+            0.04471228016293516,
+            0.04595743067166769,
+            0.057142529866381686,
+            0.041567074905930035,
+            0.05922440211999547,
+            0.06055238630005544,
+            0.06195863987337091,
+            0.07735612435271388,
+            0.07482211423245033,
+            0.0613510301071134,
+            0.06925003517738304
+          ],
+          "q_pooled": 0.0452789836746188
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.8761904761904763,
+          "bare_width_mean": 0.1543168575080998,
+          "perhorizon_coverage_mean": 0.8857142857142858,
+          "perhorizon_width_mean": 0.1694623051285068,
+          "pooled_coverage_mean": 0.8833333333333333,
+          "pooled_width_mean": 0.14964422846490066,
+          "q_per_horizon": [
+            0.026065770883445083,
+            0.03663070092160048,
+            0.04814005922096687,
+            0.05434837199719045,
+            0.06341843160370875,
+            0.06742875148755179,
+            0.08909509445192665,
+            0.09169474000207156,
+            0.11607218346504666,
+            0.12686121412365825,
+            0.11025109977698122,
+            0.12555183014476246,
+            0.11555182580724122,
+            0.11512606201339626
+          ],
+          "q_pooled": 0.07482211423245033
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.9142857142857144,
+          "bare_width_mean": 0.18387987719237844,
+          "perhorizon_coverage_mean": 0.9523809523809524,
+          "perhorizon_width_mean": 0.2451580685008066,
+          "pooled_coverage_mean": 0.9285714285714286,
+          "pooled_width_mean": 0.22228302327474836,
+          "q_per_horizon": [
+            0.032681838125458995,
+            0.07173662444320072,
+            0.06519382424998543,
+            0.06079908928748701,
+            0.09872806564422376,
+            0.10867467864500302,
+            0.11114151163737418,
+            0.14390234892072673,
+            0.14109477023066574,
+            0.1721305319733375,
+            0.17782669739203882,
+            0.18559857212707964,
+            0.17849914242157627,
+            0.16809878440748793
+          ],
+          "q_pooled": 0.11114151163737418
+        }
+      },
+      "chronos": {
+        "forecaster": "chronos",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.8428571428571429,
+          "bare_width_mean": 0.11959348532060782,
+          "perhorizon_coverage_mean": 0.7833333333333333,
+          "perhorizon_width_mean": 0.10019261191231878,
+          "pooled_coverage_mean": 0.8,
+          "pooled_width_mean": 0.09779591979980395,
+          "q_per_horizon": [
+            0.025188607788085626,
+            0.02532754745483423,
+            0.03890764770507804,
+            0.043802440643310625,
+            0.04915690460205102,
+            0.04680775070190446,
+            0.03916668243408239,
+            0.04809946746826199,
+            0.0576093139648437,
+            0.06108116531372065,
+            0.05864996337890638,
+            0.06179137878417951,
+            0.0701272941589357,
+            0.0756321189880369
+          ],
+          "q_pooled": 0.04889795989990198
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.8428571428571429,
+          "bare_width_mean": 0.11959348532060782,
+          "perhorizon_coverage_mean": 0.869047619047619,
+          "perhorizon_width_mean": 0.16607914559500545,
+          "pooled_coverage_mean": 0.861904761904762,
+          "pooled_width_mean": 0.1402545883178714,
+          "q_per_horizon": [
+            0.030081840515136626,
+            0.04935519256591814,
+            0.046391881561278936,
+            0.050782734680176134,
+            0.06024611434936489,
+            0.06782592163085965,
+            0.08113353042602522,
+            0.09840077590942364,
+            0.11880251922607421,
+            0.12758038635253932,
+            0.10697886581420857,
+            0.12221163177490268,
+            0.10586601409912078,
+            0.09689661026000973
+          ],
+          "q_pooled": 0.0701272941589357
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.8428571428571429,
+          "bare_width_mean": 0.11959348532060782,
+          "perhorizon_coverage_mean": 0.9214285714285714,
+          "perhorizon_width_mean": 0.22292400338309162,
+          "pooled_coverage_mean": 0.9095238095238095,
+          "pooled_width_mean": 0.2085365203857421,
+          "q_per_horizon": [
+            0.03159678268432575,
+            0.07481312255859418,
+            0.07034568023681675,
+            0.05222851562499997,
+            0.070854161071777,
+            0.09303555068969693,
+            0.08751402359008775,
+            0.13737474822998053,
+            0.1317485343933109,
+            0.15814713668823277,
+            0.1641494514465336,
+            0.1720175582885739,
+            0.16296061859130884,
+            0.15368213958740196
+          ],
+          "q_pooled": 0.10426826019287105
+        }
+      }
+    },
+    "DEXKOUS": {
+      "arima": {
+        "forecaster": "arima",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.7071428571428572,
+          "bare_width_mean": 41.40702231782995,
+          "perhorizon_coverage_mean": 0.6809523809523808,
+          "perhorizon_width_mean": 40.33834903476961,
+          "pooled_coverage_mean": 0.738095238095238,
+          "pooled_width_mean": 40.174430225697506,
+          "q_per_horizon": [
+            6.019828757339383,
+            9.23651622262787,
+            11.885457212575375,
+            14.301239776206785,
+            16.538830978627857,
+            21.11794087612452,
+            21.007107424806236,
+            22.089443667480282,
+            22.26134568228099,
+            25.115703414253176,
+            26.282158971560648,
+            28.31230917980338,
+            28.622331265376488,
+            29.57822981432423
+          ],
+          "q_pooled": 20.087215112848753
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.8023809523809522,
+          "bare_width_mean": 53.145337785764546,
+          "perhorizon_coverage_mean": 0.7476190476190475,
+          "perhorizon_width_mean": 47.514067959856646,
+          "pooled_coverage_mean": 0.8166666666666665,
+          "pooled_width_mean": 51.703697664495394,
+          "q_per_horizon": [
+            7.042854649616629,
+            11.217728114270585,
+            13.051289508962782,
+            17.974908318198914,
+            22.696578397519033,
+            24.786648186653792,
+            23.205692899009136,
+            25.439228843483306,
+            28.745883742858496,
+            27.649073917800933,
+            32.25531441260455,
+            33.39915882237847,
+            32.317174372199815,
+            32.81694153344006
+          ],
+          "q_pooled": 25.851848832247697
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.8952380952380953,
+          "bare_width_mean": 63.326575872509096,
+          "perhorizon_coverage_mean": 0.8833333333333332,
+          "perhorizon_width_mean": 62.3317263081943,
+          "pooled_coverage_mean": 0.861904761904762,
+          "pooled_width_mean": 63.003314010262784,
+          "q_per_horizon": [
+            12.416104342710696,
+            13.332090802595758,
+            20.658854986845654,
+            37.144614564726226,
+            31.230195571947434,
+            31.501657005131392,
+            31.466225645210898,
+            32.67178752649829,
+            41.05990019882688,
+            37.85425421989498,
+            37.08859079038166,
+            35.26046070337611,
+            40.538744747242845,
+            34.098603051971395
+          ],
+          "q_pooled": 31.501657005131392
+        }
+      },
+      "chronos": {
+        "forecaster": "chronos",
+        "n_cal": 30,
+        "n_test": 30,
+        "conf=0.8": {
+          "nominal_coverage": 0.8,
+          "bare_coverage_mean": 0.7476190476190475,
+          "bare_width_mean": 47.698866081237796,
+          "perhorizon_coverage_mean": 0.669047619047619,
+          "perhorizon_width_mean": 42.05718540736606,
+          "pooled_coverage_mean": 0.7452380952380951,
+          "pooled_width_mean": 43.94189453125,
+          "q_per_horizon": [
+            6.6086572265624,
+            8.688681640624964,
+            11.395966796874973,
+            12.880576171874964,
+            17.0732275390626,
+            19.5968017578125,
+            19.40576171875,
+            24.150083007812555,
+            24.586870117187573,
+            26.251137695312536,
+            27.594218749999982,
+            32.349785156249936,
+            31.7150732421876,
+            32.103457031249945
+          ],
+          "q_pooled": 21.970947265625
+        },
+        "conf=0.9": {
+          "nominal_coverage": 0.9,
+          "bare_coverage_mean": 0.7476190476190475,
+          "bare_width_mean": 47.698866081237796,
+          "perhorizon_coverage_mean": 0.7714285714285712,
+          "perhorizon_width_mean": 49.80674665178569,
+          "pooled_coverage_mean": 0.8357142857142856,
+          "pooled_width_mean": 56.23533203124998,
+          "q_per_horizon": [
+            8.360268554687536,
+            12.467915039062518,
+            14.159082031249909,
+            18.2329248046874,
+            23.688662109374945,
+            25.474423828125055,
+            24.956616210937455,
+            26.577456054687445,
+            28.821977539062573,
+            30.2672265624999,
+            33.08205566406241,
+            33.05286621093751,
+            33.24584472656261,
+            36.25990722656252
+          ],
+          "q_pooled": 28.11766601562499
+        },
+        "conf=0.95": {
+          "nominal_coverage": 0.95,
+          "bare_coverage_mean": 0.7476190476190475,
+          "bare_width_mean": 47.698866081237796,
+          "perhorizon_coverage_mean": 0.8738095238095237,
+          "perhorizon_width_mean": 65.5785993303571,
+          "pooled_coverage_mean": 0.8666666666666666,
+          "pooled_width_mean": 66.16411132812482,
+          "q_per_horizon": [
+            14.446508789062591,
+            15.035361328124964,
+            21.486127929687427,
+            38.963662109375036,
+            33.86973144531248,
+            34.60525878906242,
+            33.86685546874992,
+            33.722353515624945,
+            41.170214843750045,
+            36.77112792968751,
+            37.77993652343753,
+            39.08779296874991,
+            39.80886230468741,
+            38.4364013671875
+          ],
+          "q_pooled": 33.08205566406241
+        }
+      }
+    }
+  },
+  "elapsed_min": 1.141351056098938
 }

FINAL_SUBMIT/receipts/R6_GETHSEMANE.json CHANGED Viewed

@@ -1,122 +1,122 @@
-{
-  "tasks": {
-    "easy_typhoon_response": {
-      "ppo_v3": {
-        "policy": "ppo_v3",
-        "n_episodes": 50,
-        "reward_mean": 1.2005000000000001,
-        "reward_std": 0.19939637032804786,
-        "reward_min": 0.643,
-        "reward_max": 1.3435000000000004,
-        "length_mean": 20.0,
-        "violations_mean": 0.0,
-        "violations_max": 0,
-        "train_time_s": 389.36543345451355,
-        "total_timesteps": 100000
-      },
-      "random": {
-        "policy": "random",
-        "n_episodes": 50,
-        "reward_mean": 0.7797316807490356,
-        "reward_std": 0.12419262667905032,
-        "reward_min": 0.5059697476286091,
-        "reward_max": 1.009169047501108,
-        "length_mean": 20.0,
-        "violations_mean": 0.0,
-        "violations_max": 0
-      },
-      "greedy": {
-        "policy": "greedy",
-        "n_episodes": 50,
-        "reward_mean": 0.9803400000000001,
-        "reward_std": 0.0062695215128429176,
-        "reward_min": 0.964,
-        "reward_max": 0.9894999999999999,
-        "length_mean": 20.0,
-        "violations_mean": 0.0,
-        "violations_max": 0
-      }
-    },
-    "medium_multi_front": {
-      "ppo_v3": {
-        "policy": "ppo_v3",
-        "n_episodes": 50,
-        "reward_mean": 2.774816094381805,
-        "reward_std": 0.2510891195507745,
-        "reward_min": 2.2131947145395343,
-        "reward_max": 3.1306422226861352,
-        "length_mean": 44.76,
-        "violations_mean": 0.0,
-        "violations_max": 0,
-        "train_time_s": 1028.4124627113342,
-        "total_timesteps": 100000
-      },
-      "random": {
-        "policy": "random",
-        "n_episodes": 50,
-        "reward_mean": -1.1101909893619986,
-        "reward_std": 0.8109045133638636,
-        "reward_min": -2.3839605638376136,
-        "reward_max": 0.6624458826285525,
-        "length_mean": 44.84,
-        "violations_mean": 0.0,
-        "violations_max": 0
-      },
-      "greedy": {
-        "policy": "greedy",
-        "n_episodes": 50,
-        "reward_mean": -1.7960883333333333,
-        "reward_std": 0.08206659628009437,
-        "reward_min": -1.9960833333333332,
-        "reward_max": -1.6348333333333334,
-        "length_mean": 44.76,
-        "violations_mean": 0.0,
-        "violations_max": 0
-      }
-    },
-    "hard_cascading_crisis": {
-      "ppo_v3": {
-        "policy": "ppo_v3",
-        "n_episodes": 50,
-        "reward_mean": 2.67403629887518,
-        "reward_std": 0.7949077297864112,
-        "reward_min": 0.44374348685637904,
-        "reward_max": 3.4482740553083278,
-        "length_mean": 56.06,
-        "violations_mean": 0.0,
-        "violations_max": 0,
-        "train_time_s": 1359.914410352707,
-        "total_timesteps": 100000
-      },
-      "random": {
-        "policy": "random",
-        "n_episodes": 50,
-        "reward_mean": -1.222005001736981,
-        "reward_std": 0.853497432761393,
-        "reward_min": -3.8651570083150526,
-        "reward_max": 0.6500552441714463,
-        "length_mean": 56.06,
-        "violations_mean": 0.0,
-        "violations_max": 0
-      },
-      "greedy": {
-        "policy": "greedy",
-        "n_episodes": 50,
-        "reward_mean": -1.4125516666666666,
-        "reward_std": 0.4515386177313937,
-        "reward_min": -2.3674999999999997,
-        "reward_max": -0.4405833333333334,
-        "length_mean": 56.06,
-        "violations_mean": 0.0,
-        "violations_max": 0
-      }
-    }
-  },
-  "baselines": {},
-  "config": {
-    "timesteps_per_task": 100000,
-    "eval_episodes": 50,
-    "seed": 42
-  },
-  "elapsed_min": 48.6515386501948
 }

+{
+  "tasks": {
+    "easy_typhoon_response": {
+      "ppo_v3": {
+        "policy": "ppo_v3",
+        "n_episodes": 50,
+        "reward_mean": 1.2005000000000001,
+        "reward_std": 0.19939637032804786,
+        "reward_min": 0.643,
+        "reward_max": 1.3435000000000004,
+        "length_mean": 20.0,
+        "violations_mean": 0.0,
+        "violations_max": 0,
+        "train_time_s": 389.36543345451355,
+        "total_timesteps": 100000
+      },
+      "random": {
+        "policy": "random",
+        "n_episodes": 50,
+        "reward_mean": 0.7797316807490356,
+        "reward_std": 0.12419262667905032,
+        "reward_min": 0.5059697476286091,
+        "reward_max": 1.009169047501108,
+        "length_mean": 20.0,
+        "violations_mean": 0.0,
+        "violations_max": 0
+      },
+      "greedy": {
+        "policy": "greedy",
+        "n_episodes": 50,
+        "reward_mean": 0.9803400000000001,
+        "reward_std": 0.0062695215128429176,
+        "reward_min": 0.964,
+        "reward_max": 0.9894999999999999,
+        "length_mean": 20.0,
+        "violations_mean": 0.0,
+        "violations_max": 0
+      }
+    },
+    "medium_multi_front": {
+      "ppo_v3": {
+        "policy": "ppo_v3",
+        "n_episodes": 50,
+        "reward_mean": 2.774816094381805,
+        "reward_std": 0.2510891195507745,
+        "reward_min": 2.2131947145395343,
+        "reward_max": 3.1306422226861352,
+        "length_mean": 44.76,
+        "violations_mean": 0.0,
+        "violations_max": 0,
+        "train_time_s": 1028.4124627113342,
+        "total_timesteps": 100000
+      },
+      "random": {
+        "policy": "random",
+        "n_episodes": 50,
+        "reward_mean": -1.1101909893619986,
+        "reward_std": 0.8109045133638636,
+        "reward_min": -2.3839605638376136,
+        "reward_max": 0.6624458826285525,
+        "length_mean": 44.84,
+        "violations_mean": 0.0,
+        "violations_max": 0
+      },
+      "greedy": {
+        "policy": "greedy",
+        "n_episodes": 50,
+        "reward_mean": -1.7960883333333333,
+        "reward_std": 0.08206659628009437,
+        "reward_min": -1.9960833333333332,
+        "reward_max": -1.6348333333333334,
+        "length_mean": 44.76,
+        "violations_mean": 0.0,
+        "violations_max": 0
+      }
+    },
+    "hard_cascading_crisis": {
+      "ppo_v3": {
+        "policy": "ppo_v3",
+        "n_episodes": 50,
+        "reward_mean": 2.67403629887518,
+        "reward_std": 0.7949077297864112,
+        "reward_min": 0.44374348685637904,
+        "reward_max": 3.4482740553083278,
+        "length_mean": 56.06,
+        "violations_mean": 0.0,
+        "violations_max": 0,
+        "train_time_s": 1359.914410352707,
+        "total_timesteps": 100000
+      },
+      "random": {
+        "policy": "random",
+        "n_episodes": 50,
+        "reward_mean": -1.222005001736981,
+        "reward_std": 0.853497432761393,
+        "reward_min": -3.8651570083150526,
+        "reward_max": 0.6500552441714463,
+        "length_mean": 56.06,
+        "violations_mean": 0.0,
+        "violations_max": 0
+      },
+      "greedy": {
+        "policy": "greedy",
+        "n_episodes": 50,
+        "reward_mean": -1.4125516666666666,
+        "reward_std": 0.4515386177313937,
+        "reward_min": -2.3674999999999997,
+        "reward_max": -0.4405833333333334,
+        "length_mean": 56.06,
+        "violations_mean": 0.0,
+        "violations_max": 0
+      }
+    }
+  },
+  "baselines": {},
+  "config": {
+    "timesteps_per_task": 100000,
+    "eval_episodes": 50,
+    "seed": 42
+  },
+  "elapsed_min": 48.6515386501948
 }

FINAL_SUBMIT/receipts/R6_GETHSEMANE_ONNX_EXPORT.json CHANGED Viewed

@@ -1,25 +1,25 @@
-{
-  "exports": [
-    {
-      "task": "easy_typhoon_response",
-      "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\v3_arcadia\\checkpoints\\gethsemane\\ppo_easy_typhoon_response.onnx",
-      "size_mb": 0.970768,
-      "verified": true,
-      "max_diff": 1.9073486328125e-06
-    },
-    {
-      "task": "medium_multi_front",
-      "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\v3_arcadia\\checkpoints\\gethsemane\\ppo_medium_multi_front.onnx",
-      "size_mb": 0.970768,
-      "verified": true,
-      "max_diff": 1.9073486328125e-06
-    },
-    {
-      "task": "hard_cascading_crisis",
-      "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\v3_arcadia\\checkpoints\\gethsemane\\ppo_hard_cascading_crisis.onnx",
-      "size_mb": 0.970768,
-      "verified": true,
-      "max_diff": 1.430511474609375e-06
-    }
-  ]
 }

+{
+  "exports": [
+    {
+      "task": "easy_typhoon_response",
+      "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\versions\\v3_arcadia\\checkpoints\\gethsemane\\ppo_easy_typhoon_response.onnx",
+      "size_mb": 0.970768,
+      "verified": true,
+      "max_diff": 1.9073486328125e-06
+    },
+    {
+      "task": "medium_multi_front",
+      "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\versions\\v3_arcadia\\checkpoints\\gethsemane\\ppo_medium_multi_front.onnx",
+      "size_mb": 0.970768,
+      "verified": true,
+      "max_diff": 1.9073486328125e-06
+    },
+    {
+      "task": "hard_cascading_crisis",
+      "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\versions\\v3_arcadia\\checkpoints\\gethsemane\\ppo_hard_cascading_crisis.onnx",
+      "size_mb": 0.970768,
+      "verified": true,
+      "max_diff": 1.430511474609375e-06
+    }
+  ]
 }

FINAL_SUBMIT/receipts/R6_PROVIDER_V2.json CHANGED Viewed

@@ -1,330 +1,330 @@
-{
-  "task": "arrival_time_regression",
-  "task_description": "Predict expected disruption arrival time (continuous) per node, given noisy per-edge lead-times and random source nodes. Non-trivial: requires GNN to learn Dijkstra-like aggregation through the graph.",
-  "lead_time_noise_sigma_relative": 0.2,
-  "graphs": {
-    "easy": {
-      "n_nodes": 12,
-      "n_edges": 12,
-      "gnn_mae": 9.20589906692505,
-      "mlp_mae": 17.712093811035157,
-      "one_hop_mean_mae": 29.553308786787092,
-      "improvement_vs_mlp_pct": 48.0247837147887,
-      "improvement_vs_1hop_pct": 68.84985321494395,
-      "gnn_loss_curve": [
-        983.6469454498291,
-        694.3125346450805,
-        594.0063958816528,
-        548.9563833961487,
-        495.32008571624755,
-        420.9683524398804,
-        364.7742200584412,
-        329.68193370532987,
-        308.9609826283455,
-        305.6601629691124,
-        298.6861881341934,
-        287.8384048962593,
-        303.22127193498613,
-        291.6199851961136,
-        292.3526881427765,
-        286.59378911590574,
-        297.95547390937804,
-        277.4495716457367,
-        278.5004913520813,
-        273.5950565481186,
-        280.847659828186,
-        269.8950548853874,
-        268.0327960948944,
-        272.2881185493469,
-        271.73518936920163,
-        266.2893534479141,
-        268.7633232383728,
-        263.14099113464357,
-        261.69743074321747,
-        262.2134785709381
-      ],
-      "gnn_test_mae_curve": [
-        15.625262084007263,
-        17.273250563144686,
-        15.69198014497757,
-        15.216868221759796,
-        13.83246925830841,
-        12.072544195652007,
-        12.047622272968292,
-        10.346303402781487,
-        10.991831306219101,
-        9.730522887706757,
-        9.387227172255516,
-        12.727755947113037,
-        10.449746668934822,
-        10.917218554019929,
-        9.83320654630661,
-        11.56927591919899,
-        9.640368175506591,
-        9.518106588125228,
-        9.238331428766251,
-        10.004606694579124,
-        9.601016719341278,
-        10.924803348779678,
-        9.062952963709831,
-        11.125388493537903,
-        8.51151149213314,
-        8.760705815553665,
-        8.83567961215973,
-        8.716645919680595,
-        9.704761312007903,
-        9.20589906692505
-      ],
-      "mlp_test_mae_curve": [
-        16.517573373317717,
-        17.61745592355728,
-        17.478831689357758,
-        17.963374128341673,
-        17.317361807823183,
-        17.35558673620224,
-        19.272147517204285,
-        17.29823645591736,
-        18.360565376281738,
-        16.33169244527817,
-        16.291482293605803,
-        20.00996126651764,
-        17.24092205762863,
-        17.935992388725282,
-        18.476314017772676,
-        20.500635390281676,
-        17.64075089454651,
-        19.23261556148529,
-        17.159917891025543,
-        18.033056726455687,
-        17.04588686466217,
-        17.51567750453949,
-        16.925300316810606,
-        19.993932852745058,
-        17.863101620674133,
-        17.46893537759781,
-        17.768136410713197,
-        17.399936029911043,
-        17.271209075450898,
-        17.712093811035157
-      ]
-    },
-    "medium": {
-      "n_nodes": 25,
-      "n_edges": 29,
-      "gnn_mae": 14.05237404346466,
-      "mlp_mae": 27.562243633270263,
-      "one_hop_mean_mae": 23.25141793220304,
-      "improvement_vs_mlp_pct": 49.01585578286486,
-      "improvement_vs_1hop_pct": 39.56336734198809,
-      "gnn_loss_curve": [
-        1455.8575012207032,
-        1070.794164489746,
-        978.3833621215821,
-        878.4453280944824,
-        759.8914498443603,
-        676.4201901473999,
-        592.9840587463378,
-        593.9022348022461,
-        580.474338684082,
-        548.8776502380371,
-        535.7356602172852,
-        524.7076401443481,
-        517.5761855316163,
-        503.14428115844726,
-        504.31373574829104,
-        482.12416637420654,
-        491.71681065368654,
-        476.0351883163452,
-        475.84812075042726,
-        469.6501838378906,
-        473.09340254211423,
-        468.5468386917114,
-        457.8393885040283,
-        461.61461613464354,
-        450.00589713287354,
-        444.84376406097414,
-        448.23634549713137,
-        441.89026587677,
-        436.69793469238283,
-        434.4493161087036
-      ],
-      "gnn_test_mae_curve": [
-        26.63341254234314,
-        23.634564056396485,
-        23.186181049346924,
-        21.077601199150084,
-        21.637806577682497,
-        17.98971748828888,
-        16.306520526409148,
-        17.966433074474335,
-        17.40695864200592,
-        15.116412845849991,
-        15.247849924564362,
-        14.415206160545349,
-        15.09439873456955,
-        14.077203586101533,
-        16.387850997447966,
-        16.519536385536195,
-        15.912737758159638,
-        15.685167801380157,
-        15.163068435192109,
-        15.200627043247223,
-        15.001122550964356,
-        14.351007792949677,
-        15.44103235244751,
-        13.403649566173554,
-        17.10527836084366,
-        14.323340699672698,
-        14.384661407470704,
-        14.556273880004882,
-        13.85397144317627,
-        14.05237404346466
-      ],
-      "mlp_test_mae_curve": [
-        27.1725799369812,
-        26.40243914604187,
-        27.289838228225708,
-        26.334666624069214,
-        28.48377342224121,
-        26.199828100204467,
-        29.151524686813353,
-        28.400241794586183,
-        26.501172218322754,
-        27.04287679672241,
-        27.969863624572753,
-        26.34369418144226,
-        28.614215364456175,
-        26.348094720840454,
-        27.199346466064455,
-        26.72101284980774,
-        26.492710275650026,
-        28.792157373428346,
-        25.963287801742553,
-        27.035139274597167,
-        26.07756766319275,
-        27.420557165145873,
-        28.615666379928587,
-        26.438606796264647,
-        26.199908666610717,
-        26.585446147918702,
-        26.246847848892212,
-        26.238035287857056,
-        26.170038957595825,
-        27.562243633270263
-      ]
-    },
-    "hard": {
-      "n_nodes": 40,
-      "n_edges": 47,
-      "gnn_mae": 10.347342171669005,
-      "mlp_mae": 28.483039016723634,
-      "one_hop_mean_mae": 16.03428017649916,
-      "improvement_vs_mlp_pct": 63.67191659010252,
-      "improvement_vs_1hop_pct": 35.46737329166347,
-      "gnn_loss_curve": [
-        1519.987557739258,
-        1021.7450046386718,
-        815.2417454833984,
-        709.5358395690918,
-        634.4188123474121,
-        560.8865319213867,
-        506.78174713134763,
-        475.7871089630127,
-        451.54362382507327,
-        442.535458694458,
-        425.76794429016115,
-        416.6028264923096,
-        416.2537903900147,
-        416.3216004333496,
-        405.91741243743894,
-        401.3154751739502,
-        403.56236766052245,
-        399.83712251281736,
-        397.13397619628904,
-        396.69007269287107,
-        389.8687892990112,
-        386.671229675293,
-        390.19565746307376,
-        387.47164192962646,
-        384.5350112533569,
-        385.34569120025634,
-        381.3625469284058,
-        380.5953342590332,
-        376.2190606918335,
-        378.44821893310547
-      ],
-      "gnn_test_mae_curve": [
-        25.89111141204834,
-        22.817488927841186,
-        19.102868838310243,
-        21.260897178649902,
-        16.00875702381134,
-        15.999692721366882,
-        14.555557656288148,
-        13.622318716049195,
-        13.0450461602211,
-        13.296297969818115,
-        12.376682465076447,
-        13.256674709320068,
-        11.923482534885407,
-        11.381103422641754,
-        13.629612107276916,
-        13.775573563575744,
-        12.455035951137543,
-        13.674895765781402,
-        12.645530993938445,
-        12.839997906684875,
-        12.782445096969605,
-        11.498445341587066,
-        12.44089034318924,
-        10.853419225215912,
-        11.889822478294372,
-        11.540131111145019,
-        12.30764417886734,
-        10.73738386631012,
-        10.981562974452972,
-        10.347342171669005
-      ],
-      "mlp_test_mae_curve": [
-        28.691825714111328,
-        29.088216686248778,
-        27.926491804122925,
-        32.548833179473874,
-        28.55751530647278,
-        27.89367533683777,
-        28.729960765838623,
-        29.485910148620604,
-        28.418713645935057,
-        29.061994075775146,
-        27.86555823326111,
-        27.882053699493408,
-        28.62539842605591,
-        28.374376544952394,
-        27.627659730911255,
-        29.199770755767823,
-        26.9179744720459,
-        29.280858907699585,
-        28.915042276382447,
-        28.664446725845337,
-        28.888797369003296,
-        29.49649586677551,
-        29.45292121887207,
-        28.840624055862428,
-        27.16323224067688,
-        27.801621007919312,
-        28.310747117996215,
-        28.82351138114929,
-        30.00698434829712,
-        28.483039016723634
-      ]
-    }
-  },
-  "config": {
-    "n_train": 500,
-    "n_test": 200,
-    "hidden": 64,
-    "epochs": 30,
-    "lr": 0.003
-  },
-  "elapsed_min": 4.006023410956065
 }

+{
+  "task": "arrival_time_regression",
+  "task_description": "Predict expected disruption arrival time (continuous) per node, given noisy per-edge lead-times and random source nodes. Non-trivial: requires GNN to learn Dijkstra-like aggregation through the graph.",
+  "lead_time_noise_sigma_relative": 0.2,
+  "graphs": {
+    "easy": {
+      "n_nodes": 12,
+      "n_edges": 12,
+      "gnn_mae": 9.20589906692505,
+      "mlp_mae": 17.712093811035157,
+      "one_hop_mean_mae": 29.553308786787092,
+      "improvement_vs_mlp_pct": 48.0247837147887,
+      "improvement_vs_1hop_pct": 68.84985321494395,
+      "gnn_loss_curve": [
+        983.6469454498291,
+        694.3125346450805,
+        594.0063958816528,
+        548.9563833961487,
+        495.32008571624755,
+        420.9683524398804,
+        364.7742200584412,
+        329.68193370532987,
+        308.9609826283455,
+        305.6601629691124,
+        298.6861881341934,
+        287.8384048962593,
+        303.22127193498613,
+        291.6199851961136,
+        292.3526881427765,
+        286.59378911590574,
+        297.95547390937804,
+        277.4495716457367,
+        278.5004913520813,
+        273.5950565481186,
+        280.847659828186,
+        269.8950548853874,
+        268.0327960948944,
+        272.2881185493469,
+        271.73518936920163,
+        266.2893534479141,
+        268.7633232383728,
+        263.14099113464357,
+        261.69743074321747,
+        262.2134785709381
+      ],
+      "gnn_test_mae_curve": [
+        15.625262084007263,
+        17.273250563144686,
+        15.69198014497757,
+        15.216868221759796,
+        13.83246925830841,
+        12.072544195652007,
+        12.047622272968292,
+        10.346303402781487,
+        10.991831306219101,
+        9.730522887706757,
+        9.387227172255516,
+        12.727755947113037,
+        10.449746668934822,
+        10.917218554019929,
+        9.83320654630661,
+        11.56927591919899,
+        9.640368175506591,
+        9.518106588125228,
+        9.238331428766251,
+        10.004606694579124,
+        9.601016719341278,
+        10.924803348779678,
+        9.062952963709831,
+        11.125388493537903,
+        8.51151149213314,
+        8.760705815553665,
+        8.83567961215973,
+        8.716645919680595,
+        9.704761312007903,
+        9.20589906692505
+      ],
+      "mlp_test_mae_curve": [
+        16.517573373317717,
+        17.61745592355728,
+        17.478831689357758,
+        17.963374128341673,
+        17.317361807823183,
+        17.35558673620224,
+        19.272147517204285,
+        17.29823645591736,
+        18.360565376281738,
+        16.33169244527817,
+        16.291482293605803,
+        20.00996126651764,
+        17.24092205762863,
+        17.935992388725282,
+        18.476314017772676,
+        20.500635390281676,
+        17.64075089454651,
+        19.23261556148529,
+        17.159917891025543,
+        18.033056726455687,
+        17.04588686466217,
+        17.51567750453949,
+        16.925300316810606,
+        19.993932852745058,
+        17.863101620674133,
+        17.46893537759781,
+        17.768136410713197,
+        17.399936029911043,
+        17.271209075450898,
+        17.712093811035157
+      ]
+    },
+    "medium": {
+      "n_nodes": 25,
+      "n_edges": 29,
+      "gnn_mae": 14.05237404346466,
+      "mlp_mae": 27.562243633270263,
+      "one_hop_mean_mae": 23.25141793220304,
+      "improvement_vs_mlp_pct": 49.01585578286486,
+      "improvement_vs_1hop_pct": 39.56336734198809,
+      "gnn_loss_curve": [
+        1455.8575012207032,
+        1070.794164489746,
+        978.3833621215821,
+        878.4453280944824,
+        759.8914498443603,
+        676.4201901473999,
+        592.9840587463378,
+        593.9022348022461,
+        580.474338684082,
+        548.8776502380371,
+        535.7356602172852,
+        524.7076401443481,
+        517.5761855316163,
+        503.14428115844726,
+        504.31373574829104,
+        482.12416637420654,
+        491.71681065368654,
+        476.0351883163452,
+        475.84812075042726,
+        469.6501838378906,
+        473.09340254211423,
+        468.5468386917114,
+        457.8393885040283,
+        461.61461613464354,
+        450.00589713287354,
+        444.84376406097414,
+        448.23634549713137,
+        441.89026587677,
+        436.69793469238283,
+        434.4493161087036
+      ],
+      "gnn_test_mae_curve": [
+        26.63341254234314,
+        23.634564056396485,
+        23.186181049346924,
+        21.077601199150084,
+        21.637806577682497,
+        17.98971748828888,
+        16.306520526409148,
+        17.966433074474335,
+        17.40695864200592,
+        15.116412845849991,
+        15.247849924564362,
+        14.415206160545349,
+        15.09439873456955,
+        14.077203586101533,
+        16.387850997447966,
+        16.519536385536195,
+        15.912737758159638,
+        15.685167801380157,
+        15.163068435192109,
+        15.200627043247223,
+        15.001122550964356,
+        14.351007792949677,
+        15.44103235244751,
+        13.403649566173554,
+        17.10527836084366,
+        14.323340699672698,
+        14.384661407470704,
+        14.556273880004882,
+        13.85397144317627,
+        14.05237404346466
+      ],
+      "mlp_test_mae_curve": [
+        27.1725799369812,
+        26.40243914604187,
+        27.289838228225708,
+        26.334666624069214,
+        28.48377342224121,
+        26.199828100204467,
+        29.151524686813353,
+        28.400241794586183,
+        26.501172218322754,
+        27.04287679672241,
+        27.969863624572753,
+        26.34369418144226,
+        28.614215364456175,
+        26.348094720840454,
+        27.199346466064455,
+        26.72101284980774,
+        26.492710275650026,
+        28.792157373428346,
+        25.963287801742553,
+        27.035139274597167,
+        26.07756766319275,
+        27.420557165145873,
+        28.615666379928587,
+        26.438606796264647,
+        26.199908666610717,
+        26.585446147918702,
+        26.246847848892212,
+        26.238035287857056,
+        26.170038957595825,
+        27.562243633270263
+      ]
+    },
+    "hard": {
+      "n_nodes": 40,
+      "n_edges": 47,
+      "gnn_mae": 10.347342171669005,
+      "mlp_mae": 28.483039016723634,
+      "one_hop_mean_mae": 16.03428017649916,
+      "improvement_vs_mlp_pct": 63.67191659010252,
+      "improvement_vs_1hop_pct": 35.46737329166347,
+      "gnn_loss_curve": [
+        1519.987557739258,
+        1021.7450046386718,
+        815.2417454833984,
+        709.5358395690918,
+        634.4188123474121,
+        560.8865319213867,
+        506.78174713134763,
+        475.7871089630127,
+        451.54362382507327,
+        442.535458694458,
+        425.76794429016115,
+        416.6028264923096,
+        416.2537903900147,
+        416.3216004333496,
+        405.91741243743894,
+        401.3154751739502,
+        403.56236766052245,
+        399.83712251281736,
+        397.13397619628904,
+        396.69007269287107,
+        389.8687892990112,
+        386.671229675293,
+        390.19565746307376,
+        387.47164192962646,
+        384.5350112533569,
+        385.34569120025634,
+        381.3625469284058,
+        380.5953342590332,
+        376.2190606918335,
+        378.44821893310547
+      ],
+      "gnn_test_mae_curve": [
+        25.89111141204834,
+        22.817488927841186,
+        19.102868838310243,
+        21.260897178649902,
+        16.00875702381134,
+        15.999692721366882,
+        14.555557656288148,
+        13.622318716049195,
+        13.0450461602211,
+        13.296297969818115,
+        12.376682465076447,
+        13.256674709320068,
+        11.923482534885407,
+        11.381103422641754,
+        13.629612107276916,
+        13.775573563575744,
+        12.455035951137543,
+        13.674895765781402,
+        12.645530993938445,
+        12.839997906684875,
+        12.782445096969605,
+        11.498445341587066,
+        12.44089034318924,
+        10.853419225215912,
+        11.889822478294372,
+        11.540131111145019,
+        12.30764417886734,
+        10.73738386631012,
+        10.981562974452972,
+        10.347342171669005
+      ],
+      "mlp_test_mae_curve": [
+        28.691825714111328,
+        29.088216686248778,
+        27.926491804122925,
+        32.548833179473874,
+        28.55751530647278,
+        27.89367533683777,
+        28.729960765838623,
+        29.485910148620604,
+        28.418713645935057,
+        29.061994075775146,
+        27.86555823326111,
+        27.882053699493408,
+        28.62539842605591,
+        28.374376544952394,
+        27.627659730911255,
+        29.199770755767823,
+        26.9179744720459,
+        29.280858907699585,
+        28.915042276382447,
+        28.664446725845337,
+        28.888797369003296,
+        29.49649586677551,
+        29.45292121887207,
+        28.840624055862428,
+        27.16323224067688,
+        27.801621007919312,
+        28.310747117996215,
+        28.82351138114929,
+        30.00698434829712,
+        28.483039016723634
+      ]
+    }
+  },
+  "config": {
+    "n_train": 500,
+    "n_test": 200,
+    "hidden": 64,
+    "epochs": 30,
+    "lr": 0.003
+  },
+  "elapsed_min": 4.006023410956065
 }

FINAL_SUBMIT/receipts/R6_PROVIDER_v1_F1.json CHANGED Viewed

@@ -1,1756 +1,1756 @@
-{
-  "graphs": {
-    "easy": {
-      "n_nodes": 12,
-      "n_edges": 10,
-      "gnn_final": {
-        "acc": 1.0,
-        "precision": 1.0,
-        "recall": 1.0,
-        "f1": 1.0
-      },
-      "baseline_direct_neighbors": {
-        "acc": 0.8258333333333333,
-        "precision": 1.0,
-        "recall": 0.6352530541012217,
-        "f1": 0.7769477054429028
-      },
-      "improvement_f1_pp": 22.305229455709718,
-      "train_loss_curve": [
-        0.10601958807871187,
-        0.00014574478766241308,
-        2.1336230871288145e-05,
-        5.904760447787133e-06,
-        0.014828034023753519,
-        0.0001365676538936252,
-        2.800940909035432e-05,
-        7.873948834791846e-06,
-        2.40824965675521e-06,
-        7.439197035413468e-07,
-        2.349434055591839e-07,
-        8.035365056026132e-08,
-        1.866763376779131e-08,
-        6.7128299592450774e-09,
-        3.606812599319898e-09,
-        2.4320182903440704e-09,
-        1.5445408799196548e-09,
-        0.03198392186360504,
-        1.3277981027858794e-05,
-        7.040849976128097e-06,
-        2.0380432214083175e-06,
-        5.154616233541851e-07,
-        0.017213296287886225,
-        0.00023569030925164338,
-        2.4805963813645227e-05,
-        6.058055528068272e-06,
-        1.8203820033098038e-06,
-        6.043328515907098e-07,
-        2.1225388103874568e-07,
-        7.437462508802039e-08,
-        1.902343076246039e-08,
-        6.527784956639485e-09,
-        3.3294667175720776e-09,
-        1.9615958442567566e-09,
-        0.010902570914775889,
-        2.806348171776314e-05,
-        7.667120790626038e-06,
-        2.582107717285551e-06,
-        9.129105348027232e-07,
-        3.106581481139294e-07,
-        1.0230859844032431e-07,
-        2.725160428237702e-08,
-        8.880124408068363e-09,
-        4.4200613740675046e-09,
-        2.8600379247657045e-09,
-        2.2151315261330923e-09,
-        1.7114610773887693e-09,
-        1.4000422095074408e-09,
-        1.0463116296276038e-09,
-        6.4079628731738e-10,
-        0.02516633728286725,
-        0.00012813284900565014,
-        2.3232634050379803e-05,
-        7.066120872802589e-06,
-        2.311430617913936e-06,
-        7.920952698295068e-07,
-        2.5278086959691613e-07,
-        7.818242851037627e-08,
-        1.983640248580842e-08,
-        7.863145182916767e-09,
-        5.0701508055233275e-09,
-        4.364776342121379e-09,
-        3.937454630286758e-09,
-        2.518706138457294e-09,
-        1.9815549914984234e-09,
-        0.018349960519401222,
-        7.85511791638533e-05,
-        2.0063992723006376e-05,
-        6.210748974664104e-06,
-        1.9043317207399904e-06,
-        6.112533347568437e-07,
-        2.0612900407184615e-07,
-        6.247272126631417e-08,
-        1.5818333928198573e-08,
-        5.678499110562204e-09,
-        2.927658185385007e-09,
-        2.2895658619235268e-09,
-        1.9812523096841366e-09,
-        1.418338779821114e-09,
-        9.94527561841937e-10
-      ],
-      "test_metric_curve": [
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        },
-        {
-          "acc": 1.0,
-          "precision": 1.0,
-          "recall": 1.0,
-          "f1": 1.0
-        }
-      ]
-    },
-    "medium": {
-      "n_nodes": 25,
-      "n_edges": 27,
-      "gnn_final": {
-        "acc": 0.9914,
-        "precision": 0.982778750729714,
-        "recall": 0.9920447849145551,
-        "f1": 0.9873900293255131
-      },
-      "baseline_direct_neighbors": {
-        "acc": 0.8301,
-        "precision": 1.0,
-        "recall": 0.4994107248084856,
-        "f1": 0.6661426606405974
-      },
-      "improvement_f1_pp": 32.124736868491574,
-      "train_loss_curve": [
-        0.18512494587464606,
-        0.05774239192842651,
-        0.04035148839658183,
-        0.03685507851154424,
-        0.034016887983169666,
-        0.03193854558186021,
-        0.030314448321928544,
-        0.028890588828011224,
-        0.02627120438580584,
-        0.02676936000857496,
-        0.02735587336003725,
-        0.024704556535801756,
-        0.023389738032454397,
-        0.02484239745095036,
-        0.022598365899086623,
-        0.022097759216314333,
-        0.021880711925624425,
-        0.023672257099118552,
-        0.021815840122002862,
-        0.021538631150760885,
-        0.021590486920307173,
-        0.020993219244996,
-        0.021660113581202914,
-        0.02028199757042485,
-        0.021449406110984975,
-        0.02049649202735325,
-        0.02005596899437715,
-        0.02060316097080978,
-        0.02082035162168178,
-        0.020935066080168856,
-        0.0209964800781561,
-        0.019652295691733542,
-        0.020470858438760543,
-        0.020456047435481396,
-        0.020529603496513553,
-        0.019996260003822708,
-        0.021328506347361064,
-        0.019778630244522907,
-        0.01971426555108731,
-        0.019847191254493045,
-        0.01984119418810368,
-        0.02021396374486143,
-        0.01946370021810413,
-        0.019111871498224214,
-        0.019667785586758944,
-        0.021675049597691873,
-        0.01897557202284267,
-        0.01971483370839516,
-        0.01965866965101487,
-        0.01936112277971507,
-        0.01895255452432814,
-        0.02035098125927439,
-        0.01909720691408324,
-        0.019500281907226687,
-        0.019117790717674256,
-        0.018927754213147425,
-        0.020313845976115717,
-        0.019341792678655486,
-        0.01890229735773205,
-        0.019833170414518056,
-        0.01948640772390163,
-        0.019305320678627013,
-        0.019213381035159603,
-        0.020478221997059808,
-        0.01936127331570382,
-        0.019158014420631225,
-        0.019090143173694583,
-        0.020291763241906225,
-        0.01900654871721499,
-        0.019815083033949698,
-        0.019103285589502736,
-        0.018360809753397392,
-        0.019985065603578676,
-        0.01858524212906661,
-        0.02056734084818314,
-        0.01856864124721938,
-        0.01852369899036554,
-        0.018906581267301003,
-        0.01927234342475787,
-        0.018721831301170885
-      ],
-      "test_metric_curve": [
-        {
-          "acc": 0.9816,
-          "precision": 0.9819819819819819,
-          "recall": 0.9634649381261049,
-          "f1": 0.9726353361094586
-        },
-        {
-          "acc": 0.9885,
-          "precision": 0.9742551345096905,
-          "recall": 0.9923394225103123,
-          "f1": 0.9832141293241862
-        },
-        {
-          "acc": 0.988,
-          "precision": 0.9720299884659747,
-          "recall": 0.993223335297584,
-          "f1": 0.9825123870591663
-        },
-        {
-          "acc": 0.9892,
-          "precision": 0.986094674556213,
-          "recall": 0.9820271066588097,
-          "f1": 0.9840566873339238
-        },
-        {
-          "acc": 0.9916,
-          "precision": 0.9825072886297376,
-          "recall": 0.9929286977018268,
-          "f1": 0.9876905041031652
-        },
-        {
-          "acc": 0.9913,
-          "precision": 0.9824919754887657,
-          "recall": 0.9920447849145551,
-          "f1": 0.9872452719542588
-        },
-        {
-          "acc": 0.9909,
-          "precision": 0.9847373055474024,
-          "recall": 0.9885091337654685,
-          "f1": 0.9866196147625349
-        },
-        {
-          "acc": 0.9857,
-          "precision": 0.9954282231027126,
-          "recall": 0.9622863877430761,
-          "f1": 0.9785767790262172
-        },
-        {
-          "acc": 0.9882,
-          "precision": 0.9761627906976744,
-          "recall": 0.9893930465527401,
-          "f1": 0.9827333918642083
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9833333333333333,
-          "recall": 0.9908662345315262,
-          "f1": 0.9870854123862635
-        },
-        {
-          "acc": 0.9911,
-          "precision": 0.9864586399764498,
-          "recall": 0.9873305833824396,
-          "f1": 0.9868944190840818
-        },
-        {
-          "acc": 0.9842,
-          "precision": 0.997539975399754,
-          "recall": 0.9558043606364172,
-          "f1": 0.9762263015347576
-        },
-        {
-          "acc": 0.9872,
-          "precision": 0.9936517533252721,
-          "recall": 0.9684737772539777,
-          "f1": 0.9809012235153686
-        },
-        {
-          "acc": 0.9919,
-          "precision": 0.9825225750072822,
-          "recall": 0.9938126104890984,
-          "f1": 0.9881353449538597
-        },
-        {
-          "acc": 0.9905,
-          "precision": 0.9864346800353878,
-          "recall": 0.9855627578078963,
-          "f1": 0.9859985261606485
-        },
-        {
-          "acc": 0.9903,
-          "precision": 0.9867139061116031,
-          "recall": 0.9846788450206246,
-          "f1": 0.9856953251732783
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9833333333333333,
-          "recall": 0.9908662345315262,
-          "f1": 0.9870854123862635
-        },
-        {
-          "acc": 0.9917,
-          "precision": 0.9827938174394867,
-          "recall": 0.9929286977018268,
-          "f1": 0.9878352630807563
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.9822157434402332,
-          "recall": 0.9926340601060696,
-          "f1": 0.9873974208675265
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.9833430742255991,
-          "recall": 0.9914555097230406,
-          "f1": 0.9873826291079812
-        },
-        {
-          "acc": 0.9908,
-          "precision": 0.986446670595168,
-          "recall": 0.986446670595168,
-          "f1": 0.986446670595168
-        },
-        {
-          "acc": 0.9908,
-          "precision": 0.986446670595168,
-          "recall": 0.986446670595168,
-          "f1": 0.986446670595168
-        },
-        {
-          "acc": 0.9909,
-          "precision": 0.9858781994704324,
-          "recall": 0.9873305833824396,
-          "f1": 0.9866038569115266
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9833333333333333,
-          "recall": 0.9908662345315262,
-          "f1": 0.9870854123862635
-        },
-        {
-          "acc": 0.9915,
-          "precision": 0.9827837758972863,
-          "recall": 0.9923394225103123,
-          "f1": 0.9875384840932414
-        },
-        {
-          "acc": 0.9907,
-          "precision": 0.9873043991733097,
-          "recall": 0.985268120212139,
-          "f1": 0.9862852086712873
-        },
-        {
-          "acc": 0.9919,
-          "precision": 0.9825225750072822,
-          "recall": 0.9938126104890984,
-          "f1": 0.9881353449538597
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9916,
-          "precision": 0.9777713625866051,
-          "recall": 0.9979375368296994,
-          "f1": 0.9877515310586177
-        },
-        {
-          "acc": 0.9901,
-          "precision": 0.9869937924918711,
-          "recall": 0.983794932233353,
-          "f1": 0.9853917662682603
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9904,
-          "precision": 0.9872931442080378,
-          "recall": 0.9843842074248674,
-          "f1": 0.9858365299498378
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9887,
-          "precision": 0.993680409268733,
-          "recall": 0.9728933411903359,
-          "f1": 0.9831770135477147
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9833333333333333,
-          "recall": 0.9908662345315262,
-          "f1": 0.9870854123862635
-        },
-        {
-          "acc": 0.9913,
-          "precision": 0.983338205203157,
-          "recall": 0.9911608721272834,
-          "f1": 0.9872340425531914
-        },
-        {
-          "acc": 0.9915,
-          "precision": 0.9827837758972863,
-          "recall": 0.9923394225103123,
-          "f1": 0.9875384840932414
-        },
-        {
-          "acc": 0.991,
-          "precision": 0.9858823529411764,
-          "recall": 0.9876252209781968,
-          "f1": 0.986753017368266
-        },
-        {
-          "acc": 0.9905,
-          "precision": 0.9870091526424565,
-          "recall": 0.9849734826163818,
-          "f1": 0.9859902669222829
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9830508474576272,
-          "recall": 0.9911608721272834,
-          "f1": 0.9870892018779343
-        },
-        {
-          "acc": 0.9911,
-          "precision": 0.9822001750802452,
-          "recall": 0.9917501473187978,
-          "f1": 0.9869520598152763
-        },
-        {
-          "acc": 0.9901,
-          "precision": 0.9887273805992287,
-          "recall": 0.9820271066588097,
-          "f1": 0.9853658536585367
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9907,
-          "precision": 0.9833089311859443,
-          "recall": 0.9893930465527401,
-          "f1": 0.9863416066970185
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9908,
-          "precision": 0.986446670595168,
-          "recall": 0.986446670595168,
-          "f1": 0.986446670595168
-        },
-        {
-          "acc": 0.991,
-          "precision": 0.9833235810415447,
-          "recall": 0.9902769593400118,
-          "f1": 0.9867880211391661
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9833333333333333,
-          "recall": 0.9908662345315262,
-          "f1": 0.9870854123862635
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9824868651488616,
-          "recall": 0.9917501473187978,
-          "f1": 0.9870967741935485
-        },
-        {
-          "acc": 0.9909,
-          "precision": 0.9838851450336947,
-          "recall": 0.9893930465527401,
-          "f1": 0.9866314088438372
-        },
-        {
-          "acc": 0.9911,
-          "precision": 0.9833284586136297,
-          "recall": 0.990571596935769,
-          "f1": 0.9869367385879936
-        },
-        {
-          "acc": 0.9913,
-          "precision": 0.9836209417958467,
-          "recall": 0.9908662345315262,
-          "f1": 0.9872302950242183
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.991,
-          "precision": 0.9858823529411764,
-          "recall": 0.9876252209781968,
-          "f1": 0.986753017368266
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9830508474576272,
-          "recall": 0.9911608721272834,
-          "f1": 0.9870892018779343
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9899,
-          "precision": 0.9875629256736749,
-          "recall": 0.9826163818503241,
-          "f1": 0.9850834440998375
-        },
-        {
-          "acc": 0.9908,
-          "precision": 0.986446670595168,
-          "recall": 0.986446670595168,
-          "f1": 0.986446670595168
-        },
-        {
-          "acc": 0.9915,
-          "precision": 0.9819399941741916,
-          "recall": 0.993223335297584,
-          "f1": 0.9875494360626923
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9906,
-          "precision": 0.987012987012987,
-          "recall": 0.985268120212139,
-          "f1": 0.9861397817752875
-        },
-        {
-          "acc": 0.9908,
-          "precision": 0.986446670595168,
-          "recall": 0.986446670595168,
-          "f1": 0.986446670595168
-        },
-        {
-          "acc": 0.991,
-          "precision": 0.9833235810415447,
-          "recall": 0.9902769593400118,
-          "f1": 0.9867880211391661
-        },
-        {
-          "acc": 0.9907,
-          "precision": 0.9864426760978485,
-          "recall": 0.9861520329994107,
-          "f1": 0.9862973331368794
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.9824868651488616,
-          "recall": 0.9917501473187978,
-          "f1": 0.9870967741935485
-        },
-        {
-          "acc": 0.9911,
-          "precision": 0.9833284586136297,
-          "recall": 0.990571596935769,
-          "f1": 0.9869367385879936
-        },
-        {
-          "acc": 0.9908,
-          "precision": 0.986446670595168,
-          "recall": 0.986446670595168,
-          "f1": 0.986446670595168
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9916,
-          "precision": 0.9825072886297376,
-          "recall": 0.9929286977018268,
-          "f1": 0.9876905041031652
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        },
-        {
-          "acc": 0.9913,
-          "precision": 0.9824919754887657,
-          "recall": 0.9920447849145551,
-          "f1": 0.9872452719542588
-        },
-        {
-          "acc": 0.9915,
-          "precision": 0.9827837758972863,
-          "recall": 0.9923394225103123,
-          "f1": 0.9875384840932414
-        },
-        {
-          "acc": 0.9916,
-          "precision": 0.9827887981330222,
-          "recall": 0.9926340601060696,
-          "f1": 0.9876868953386104
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.982768691588785,
-          "recall": 0.9914555097230406,
-          "f1": 0.9870929891463771
-        },
-        {
-          "acc": 0.9909,
-          "precision": 0.9833187006145742,
-          "recall": 0.9899823217442546,
-          "f1": 0.986639260020555
-        },
-        {
-          "acc": 0.9904,
-          "precision": 0.987005316007088,
-          "recall": 0.9846788450206246,
-          "f1": 0.9858407079646017
-        },
-        {
-          "acc": 0.9912,
-          "precision": 0.982768691588785,
-          "recall": 0.9914555097230406,
-          "f1": 0.9870929891463771
-        },
-        {
-          "acc": 0.9914,
-          "precision": 0.982778750729714,
-          "recall": 0.9920447849145551,
-          "f1": 0.9873900293255131
-        }
-      ]
-    },
-    "hard": {
-      "n_nodes": 40,
-      "n_edges": 44,
-      "gnn_final": {
-        "acc": 0.984,
-        "precision": 0.9533980582524272,
-        "recall": 0.9750354609929078,
-        "f1": 0.9640953716690043
-      },
-      "baseline_direct_neighbors": {
-        "acc": 0.88875,
-        "precision": 1.0,
-        "recall": 0.4950354609929078,
-        "f1": 0.6622390891840607
-      },
-      "improvement_f1_pp": 30.185628248494357,
-      "train_loss_curve": [
-        0.15102637716173195,
-        0.052633647776499856,
-        0.04379157433440559,
-        0.04003102573152864,
-        0.03876525610721728,
-        0.0369047760956164,
-        0.036530632421345216,
-        0.035830124779022296,
-        0.0349417570647056,
-        0.035263367522318734,
-        0.03485661885762238,
-        0.03493121563128079,
-        0.032977926293009656,
-        0.03394761107103841,
-        0.033683306101149356,
-        0.033089775294763965,
-        0.0335856751325955,
-        0.03272933466515315,
-        0.032765767610715556,
-        0.032717534617419004,
-        0.03298612758413583,
-        0.03169301031356008,
-        0.0323142114428847,
-        0.03186470089994691,
-        0.032041587697027356,
-        0.03211515340814367,
-        0.032251973500227904,
-        0.031999882343730864,
-        0.03164813786187369,
-        0.03160676156320551,
-        0.031426732700598224,
-        0.031241096474510413,
-        0.03162557367896079,
-        0.03154335625256863,
-        0.03165931336190261,
-        0.03097459732750576,
-        0.03131493923773814,
-        0.0311658642354123,
-        0.030633534374135706,
-        0.031252258909702506,
-        0.030825211223787848,
-        0.03053342323340803,
-        0.030733022628217442,
-        0.030747544990059397,
-        0.030629911747484584,
-        0.030457735169680745,
-        0.03058615475141687,
-        0.030597560634826552,
-        0.030619746312839653,
-        0.03066707000986935,
-        0.03048766604950197,
-        0.030287153372872126,
-        0.0303783905812179,
-        0.030595246432494606,
-        0.03037994001944753,
-        0.030246819483697437,
-        0.03012882444020579,
-        0.03024448805347947,
-        0.030449683469725642,
-        0.03048290506813919,
-        0.030136575797458136,
-        0.02994714516170643,
-        0.030466000927322056,
-        0.03019473605195526,
-        0.02987939404982535,
-        0.030137449657182513,
-        0.030104370625325828,
-        0.030588962311178875,
-        0.029767145353838714,
-        0.030284092916966984,
-        0.03002391016312413,
-        0.02992785992539757,
-        0.030997538813613574,
-        0.029848512160238896,
-        0.030022954882957493,
-        0.030052907403214705,
-        0.02975074222330568,
-        0.029870129619877842,
-        0.02968558935528563,
-        0.029977637300933564
-      ],
-      "test_metric_curve": [
-        {
-          "acc": 0.978625,
-          "precision": 0.9395194697597349,
-          "recall": 0.9651063829787234,
-          "f1": 0.9521410579345089
-        },
-        {
-          "acc": 0.9813125,
-          "precision": 0.9460730088495575,
-          "recall": 0.9704964539007093,
-          "f1": 0.9581291135695281
-        },
-        {
-          "acc": 0.982,
-          "precision": 0.9607173356105893,
-          "recall": 0.9574468085106383,
-          "f1": 0.959079283887468
-        },
-        {
-          "acc": 0.9805625,
-          "precision": 0.9649884259259259,
-          "recall": 0.9460992907801419,
-          "f1": 0.9554505085231342
-        },
-        {
-          "acc": 0.98225,
-          "precision": 0.952274630198158,
-          "recall": 0.9679432624113475,
-          "f1": 0.9600450196961171
-        },
-        {
-          "acc": 0.98225,
-          "precision": 0.9639278557114228,
-          "recall": 0.955177304964539,
-          "f1": 0.9595326303790253
-        },
-        {
-          "acc": 0.982375,
-          "precision": 0.9543289436817035,
-          "recall": 0.9662411347517731,
-          "f1": 0.9602480969833662
-        },
-        {
-          "acc": 0.98375,
-          "precision": 0.9543556916225995,
-          "recall": 0.9727659574468085,
-          "f1": 0.9634728856420341
-        },
-        {
-          "acc": 0.98125,
-          "precision": 0.9680696661828737,
-          "recall": 0.9460992907801419,
-          "f1": 0.9569583931133429
-        },
-        {
-          "acc": 0.983,
-          "precision": 0.965379113018598,
-          "recall": 0.9571631205673758,
-          "f1": 0.9612535612535612
-        },
-        {
-          "acc": 0.984375,
-          "precision": 0.9593267882187938,
-          "recall": 0.9702127659574468,
-          "f1": 0.9647390691114245
-        },
-        {
-          "acc": 0.9836875,
-          "precision": 0.9633730834752982,
-          "recall": 0.9625531914893617,
-          "f1": 0.9629629629629629
-        },
-        {
-          "acc": 0.98425,
-          "precision": 0.9507022858716607,
-          "recall": 0.979290780141844,
-          "f1": 0.9647847959754053
-        },
-        {
-          "acc": 0.983,
-          "precision": 0.9651129539605376,
-          "recall": 0.9574468085106383,
-          "f1": 0.9612645969809172
-        },
-        {
-          "acc": 0.9840625,
-          "precision": 0.9587542087542088,
-          "recall": 0.9693617021276596,
-          "f1": 0.9640287769784174
-        },
-        {
-          "acc": 0.9835625,
-          "precision": 0.966,
-          "recall": 0.9591489361702128,
-          "f1": 0.9625622775800712
-        },
-        {
-          "acc": 0.9839375,
-          "precision": 0.9600225225225225,
-          "recall": 0.9673758865248226,
-          "f1": 0.963685177335029
-        },
-        {
-          "acc": 0.98425,
-          "precision": 0.9405114401076716,
-          "recall": 0.9912056737588653,
-          "f1": 0.9651933701657459
-        },
-        {
-          "acc": 0.9814375,
-          "precision": 0.9686411149825784,
-          "recall": 0.9463829787234043,
-          "f1": 0.9573826947912182
-        },
-        {
-          "acc": 0.9831875,
-          "precision": 0.955512031337437,
-          "recall": 0.9687943262411347,
-          "f1": 0.9621073390618397
-        },
-        {
-          "acc": 0.9836875,
-          "precision": 0.9515771997786386,
-          "recall": 0.9756028368794326,
-          "f1": 0.9634402577391792
-        },
-        {
-          "acc": 0.9860625,
-          "precision": 0.9565818584070797,
-          "recall": 0.9812765957446808,
-          "f1": 0.9687718806889791
-        },
-        {
-          "acc": 0.9835625,
-          "precision": 0.9505524861878453,
-          "recall": 0.9761702127659575,
-          "f1": 0.9631910426871939
-        },
-        {
-          "acc": 0.9853125,
-          "precision": 0.9472539423599783,
-          "recall": 0.9883687943262411,
-          "f1": 0.9673747049840344
-        },
-        {
-          "acc": 0.9860625,
-          "precision": 0.9479110146500271,
-          "recall": 0.9912056737588653,
-          "f1": 0.9690750242684788
-        },
-        {
-          "acc": 0.982875,
-          "precision": 0.9645613032294942,
-          "recall": 0.9574468085106383,
-          "f1": 0.960990888382688
-        },
-        {
-          "acc": 0.9843125,
-          "precision": 0.9606077658975802,
-          "recall": 0.9685106382978723,
-          "f1": 0.9645430145500776
-        },
-        {
-          "acc": 0.9840625,
-          "precision": 0.9501651982378855,
-          "recall": 0.9790070921985815,
-          "f1": 0.9643705463182898
-        },
-        {
-          "acc": 0.983375,
-          "precision": 0.9568264648163723,
-          "recall": 0.9682269503546099,
-          "f1": 0.9624929498025946
-        },
-        {
-          "acc": 0.98375,
-          "precision": 0.9505934308584046,
-          "recall": 0.9770212765957447,
-          "f1": 0.9636261891438165
-        },
-        {
-          "acc": 0.9845,
-          "precision": 0.9555184876285794,
-          "recall": 0.9750354609929078,
-          "f1": 0.9651783206964335
-        },
-        {
-          "acc": 0.9830625,
-          "precision": 0.9557422969187676,
-          "recall": 0.9679432624113475,
-          "f1": 0.9618040873854828
-        },
-        {
-          "acc": 0.983375,
-          "precision": 0.9555493430248811,
-          "recall": 0.969645390070922,
-          "f1": 0.9625457617572516
-        },
-        {
-          "acc": 0.984,
-          "precision": 0.9511454595638973,
-          "recall": 0.9775886524822694,
-          "f1": 0.9641857862339116
-        },
-        {
-          "acc": 0.9845625,
-          "precision": 0.9611705120990434,
-          "recall": 0.9690780141843972,
-          "f1": 0.9651080661110327
-        },
-        {
-          "acc": 0.984625,
-          "precision": 0.9565580618212197,
-          "recall": 0.9744680851063829,
-          "f1": 0.9654300168634065
-        },
-        {
-          "acc": 0.9846875,
-          "precision": 0.9563160823594881,
-          "recall": 0.9750354609929078,
-          "f1": 0.9655850540806294
-        },
-        {
-          "acc": 0.9856875,
-          "precision": 0.9461288576069301,
-          "recall": 0.9914893617021276,
-          "f1": 0.9682781548690954
-        },
-        {
-          "acc": 0.9841875,
-          "precision": 0.9631936579841449,
-          "recall": 0.9651063829787234,
-          "f1": 0.9641490718435596
-        },
-        {
-          "acc": 0.98475,
-          "precision": 0.9560745065332221,
-          "recall": 0.9756028368794326,
-          "f1": 0.9657399606852007
-        },
-        {
-          "acc": 0.9836875,
-          "precision": 0.9558659217877095,
-          "recall": 0.9707801418439717,
-          "f1": 0.963265306122449
-        },
-        {
-          "acc": 0.9854375,
-          "precision": 0.9497267759562842,
-          "recall": 0.9860992907801418,
-          "f1": 0.967571329157968
-        },
-        {
-          "acc": 0.9844375,
-          "precision": 0.9502473886750962,
-          "recall": 0.9807092198581561,
-          "f1": 0.9652380287588997
-        },
-        {
-          "acc": 0.9844375,
-          "precision": 0.9601123595505618,
-          "recall": 0.969645390070922,
-          "f1": 0.9648553281580804
-        },
-        {
-          "acc": 0.98475,
-          "precision": 0.957345971563981,
-          "recall": 0.9741843971631206,
-          "f1": 0.9656917885264341
-        },
-        {
-          "acc": 0.983625,
-          "precision": 0.9543302701197438,
-          "recall": 0.9721985815602837,
-          "f1": 0.9631815626756605
-        },
-        {
-          "acc": 0.9839375,
-          "precision": 0.9526315789473684,
-          "recall": 0.9756028368794326,
-          "f1": 0.9639803784162578
-        },
-        {
-          "acc": 0.9833125,
-          "precision": 0.9509966777408638,
-          "recall": 0.9744680851063829,
-          "f1": 0.962589323245061
-        },
-        {
-          "acc": 0.98425,
-          "precision": 0.9499587572174869,
-          "recall": 0.9801418439716312,
-          "f1": 0.9648142976822116
-        },
-        {
-          "acc": 0.984375,
-          "precision": 0.9590692458648724,
-          "recall": 0.9704964539007093,
-          "f1": 0.9647490129723633
-        },
-        {
-          "acc": 0.9838125,
-          "precision": 0.9528563505268997,
-          "recall": 0.9747517730496454,
-          "f1": 0.9636797083158043
-        },
-        {
-          "acc": 0.9848125,
-          "precision": 0.9553274139844617,
-          "recall": 0.9767375886524823,
-          "f1": 0.965913872913452
-        },
-        {
-          "acc": 0.9836875,
-          "precision": 0.9551031790295594,
-          "recall": 0.9716312056737588,
-          "f1": 0.963296301504711
-        },
-        {
-          "acc": 0.9845,
-          "precision": 0.9429575560962422,
-          "recall": 0.9895035460992908,
-          "f1": 0.965669988925803
-        },
-        {
-          "acc": 0.982375,
-          "precision": 0.9589583923011605,
-          "recall": 0.9611347517730496,
-          "f1": 0.9600453386228394
-        },
-        {
-          "acc": 0.984375,
-          "precision": 0.962439988703756,
-          "recall": 0.9668085106382979,
-          "f1": 0.9646193037078971
-        },
-        {
-          "acc": 0.985625,
-          "precision": 0.9517411571154374,
-          "recall": 0.9846808510638297,
-          "f1": 0.967930842163971
-        },
-        {
-          "acc": 0.98325,
-          "precision": 0.9596387242449901,
-          "recall": 0.9645390070921985,
-          "f1": 0.9620826259196378
-        },
-        {
-          "acc": 0.984,
-          "precision": 0.9647426784191072,
-          "recall": 0.9625531914893617,
-          "f1": 0.9636466912808862
-        },
-        {
-          "acc": 0.984875,
-          "precision": 0.9586476669460743,
-          "recall": 0.9733333333333334,
-          "f1": 0.9659346846846848
-        },
-        {
-          "acc": 0.9850625,
-          "precision": 0.9581706636921361,
-          "recall": 0.9747517730496454,
-          "f1": 0.9663900998453102
-        },
-        {
-          "acc": 0.9836875,
-          "precision": 0.9493392070484582,
-          "recall": 0.9781560283687943,
-          "f1": 0.9635322062316614
-        },
-        {
-          "acc": 0.983125,
-          "precision": 0.9575484959235311,
-          "recall": 0.9662411347517731,
-          "f1": 0.9618751765038125
-        },
-        {
-          "acc": 0.98425,
-          "precision": 0.9492176777381279,
-          "recall": 0.9809929078014185,
-          "f1": 0.9648437500000001
-        },
-        {
-          "acc": 0.9826875,
-          "precision": 0.9672036823935558,
-          "recall": 0.953758865248227,
-          "f1": 0.960434223682331
-        },
-        {
-          "acc": 0.9845,
-          "precision": 0.961679346294731,
-          "recall": 0.9682269503546099,
-          "f1": 0.964942041277919
-        },
-        {
-          "acc": 0.9845,
-          "precision": 0.960900140646976,
-          "recall": 0.9690780141843972,
-          "f1": 0.9649717514124294
-        },
-        {
-          "acc": 0.984125,
-          "precision": 0.9623975120158327,
-          "recall": 0.9656737588652482,
-          "f1": 0.9640328518833192
-        },
-        {
-          "acc": 0.984875,
-          "precision": 0.9571150097465887,
-          "recall": 0.9750354609929078,
-          "f1": 0.9659921304103429
-        },
-        {
-          "acc": 0.984625,
-          "precision": 0.9598877980364656,
-          "recall": 0.9707801418439717,
-          "f1": 0.9653032440056418
-        },
-        {
-          "acc": 0.98375,
-          "precision": 0.9546087440824282,
-          "recall": 0.9724822695035461,
-          "f1": 0.9634626194491286
-        },
-        {
-          "acc": 0.984125,
-          "precision": 0.9501789154968345,
-          "recall": 0.979290780141844,
-          "f1": 0.9645152277172394
-        },
-        {
-          "acc": 0.9849375,
-          "precision": 0.9607182940516273,
-          "recall": 0.9713475177304964,
-          "f1": 0.9660036676541119
-        },
-        {
-          "acc": 0.984875,
-          "precision": 0.956606397774687,
-          "recall": 0.9756028368794326,
-          "f1": 0.9660112359550562
-        },
-        {
-          "acc": 0.984625,
-          "precision": 0.9570671870643992,
-          "recall": 0.9739007092198582,
-          "f1": 0.9654105736782902
-        },
-        {
-          "acc": 0.9849375,
-          "precision": 0.9584031267448353,
-          "recall": 0.9739007092198582,
-          "f1": 0.9660897706486562
-        },
-        {
-          "acc": 0.98375,
-          "precision": 0.9523413688002217,
-          "recall": 0.9750354609929078,
-          "f1": 0.9635548079618728
-        },
-        {
-          "acc": 0.984,
-          "precision": 0.9536497363308354,
-          "recall": 0.9747517730496454,
-          "f1": 0.9640852974186307
-        },
-        {
-          "acc": 0.98375,
-          "precision": 0.9505934308584046,
-          "recall": 0.9770212765957447,
-          "f1": 0.9636261891438165
-        },
-        {
-          "acc": 0.984,
-          "precision": 0.9533980582524272,
-          "recall": 0.9750354609929078,
-          "f1": 0.9640953716690043
-        }
-      ]
-    }
-  },
-  "config": {
-    "n_train": 2000,
-    "n_test": 400,
-    "hidden_dim": 64,
-    "epochs": 80,
-    "lr": 0.002,
-    "max_hops": 3
-  },
-  "elapsed_min": 21.402417866388955
 }

+{
+  "graphs": {
+    "easy": {
+      "n_nodes": 12,
+      "n_edges": 10,
+      "gnn_final": {
+        "acc": 1.0,
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1": 1.0
+      },
+      "baseline_direct_neighbors": {
+        "acc": 0.8258333333333333,
+        "precision": 1.0,
+        "recall": 0.6352530541012217,
+        "f1": 0.7769477054429028
+      },
+      "improvement_f1_pp": 22.305229455709718,
+      "train_loss_curve": [
+        0.10601958807871187,
+        0.00014574478766241308,
+        2.1336230871288145e-05,
+        5.904760447787133e-06,
+        0.014828034023753519,
+        0.0001365676538936252,
+        2.800940909035432e-05,
+        7.873948834791846e-06,
+        2.40824965675521e-06,
+        7.439197035413468e-07,
+        2.349434055591839e-07,
+        8.035365056026132e-08,
+        1.866763376779131e-08,
+        6.7128299592450774e-09,
+        3.606812599319898e-09,
+        2.4320182903440704e-09,
+        1.5445408799196548e-09,
+        0.03198392186360504,
+        1.3277981027858794e-05,
+        7.040849976128097e-06,
+        2.0380432214083175e-06,
+        5.154616233541851e-07,
+        0.017213296287886225,
+        0.00023569030925164338,
+        2.4805963813645227e-05,
+        6.058055528068272e-06,
+        1.8203820033098038e-06,
+        6.043328515907098e-07,
+        2.1225388103874568e-07,
+        7.437462508802039e-08,
+        1.902343076246039e-08,
+        6.527784956639485e-09,
+        3.3294667175720776e-09,
+        1.9615958442567566e-09,
+        0.010902570914775889,
+        2.806348171776314e-05,
+        7.667120790626038e-06,
+        2.582107717285551e-06,
+        9.129105348027232e-07,
+        3.106581481139294e-07,
+        1.0230859844032431e-07,
+        2.725160428237702e-08,
+        8.880124408068363e-09,
+        4.4200613740675046e-09,
+        2.8600379247657045e-09,
+        2.2151315261330923e-09,
+        1.7114610773887693e-09,
+        1.4000422095074408e-09,
+        1.0463116296276038e-09,
+        6.4079628731738e-10,
+        0.02516633728286725,
+        0.00012813284900565014,
+        2.3232634050379803e-05,
+        7.066120872802589e-06,
+        2.311430617913936e-06,
+        7.920952698295068e-07,
+        2.5278086959691613e-07,
+        7.818242851037627e-08,
+        1.983640248580842e-08,
+        7.863145182916767e-09,
+        5.0701508055233275e-09,
+        4.364776342121379e-09,
+        3.937454630286758e-09,
+        2.518706138457294e-09,
+        1.9815549914984234e-09,
+        0.018349960519401222,
+        7.85511791638533e-05,
+        2.0063992723006376e-05,
+        6.210748974664104e-06,
+        1.9043317207399904e-06,
+        6.112533347568437e-07,
+        2.0612900407184615e-07,
+        6.247272126631417e-08,
+        1.5818333928198573e-08,
+        5.678499110562204e-09,
+        2.927658185385007e-09,
+        2.2895658619235268e-09,
+        1.9812523096841366e-09,
+        1.418338779821114e-09,
+        9.94527561841937e-10
+      ],
+      "test_metric_curve": [
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        },
+        {
+          "acc": 1.0,
+          "precision": 1.0,
+          "recall": 1.0,
+          "f1": 1.0
+        }
+      ]
+    },
+    "medium": {
+      "n_nodes": 25,
+      "n_edges": 27,
+      "gnn_final": {
+        "acc": 0.9914,
+        "precision": 0.982778750729714,
+        "recall": 0.9920447849145551,
+        "f1": 0.9873900293255131
+      },
+      "baseline_direct_neighbors": {
+        "acc": 0.8301,
+        "precision": 1.0,
+        "recall": 0.4994107248084856,
+        "f1": 0.6661426606405974
+      },
+      "improvement_f1_pp": 32.124736868491574,
+      "train_loss_curve": [
+        0.18512494587464606,
+        0.05774239192842651,
+        0.04035148839658183,
+        0.03685507851154424,
+        0.034016887983169666,
+        0.03193854558186021,
+        0.030314448321928544,
+        0.028890588828011224,
+        0.02627120438580584,
+        0.02676936000857496,
+        0.02735587336003725,
+        0.024704556535801756,
+        0.023389738032454397,
+        0.02484239745095036,
+        0.022598365899086623,
+        0.022097759216314333,
+        0.021880711925624425,
+        0.023672257099118552,
+        0.021815840122002862,
+        0.021538631150760885,
+        0.021590486920307173,
+        0.020993219244996,
+        0.021660113581202914,
+        0.02028199757042485,
+        0.021449406110984975,
+        0.02049649202735325,
+        0.02005596899437715,
+        0.02060316097080978,
+        0.02082035162168178,
+        0.020935066080168856,
+        0.0209964800781561,
+        0.019652295691733542,
+        0.020470858438760543,
+        0.020456047435481396,
+        0.020529603496513553,
+        0.019996260003822708,
+        0.021328506347361064,
+        0.019778630244522907,
+        0.01971426555108731,
+        0.019847191254493045,
+        0.01984119418810368,
+        0.02021396374486143,
+        0.01946370021810413,
+        0.019111871498224214,
+        0.019667785586758944,
+        0.021675049597691873,
+        0.01897557202284267,
+        0.01971483370839516,
+        0.01965866965101487,
+        0.01936112277971507,
+        0.01895255452432814,
+        0.02035098125927439,
+        0.01909720691408324,
+        0.019500281907226687,
+        0.019117790717674256,
+        0.018927754213147425,
+        0.020313845976115717,
+        0.019341792678655486,
+        0.01890229735773205,
+        0.019833170414518056,
+        0.01948640772390163,
+        0.019305320678627013,
+        0.019213381035159603,
+        0.020478221997059808,
+        0.01936127331570382,
+        0.019158014420631225,
+        0.019090143173694583,
+        0.020291763241906225,
+        0.01900654871721499,
+        0.019815083033949698,
+        0.019103285589502736,
+        0.018360809753397392,
+        0.019985065603578676,
+        0.01858524212906661,
+        0.02056734084818314,
+        0.01856864124721938,
+        0.01852369899036554,
+        0.018906581267301003,
+        0.01927234342475787,
+        0.018721831301170885
+      ],
+      "test_metric_curve": [
+        {
+          "acc": 0.9816,
+          "precision": 0.9819819819819819,
+          "recall": 0.9634649381261049,
+          "f1": 0.9726353361094586
+        },
+        {
+          "acc": 0.9885,
+          "precision": 0.9742551345096905,
+          "recall": 0.9923394225103123,
+          "f1": 0.9832141293241862
+        },
+        {
+          "acc": 0.988,
+          "precision": 0.9720299884659747,
+          "recall": 0.993223335297584,
+          "f1": 0.9825123870591663
+        },
+        {
+          "acc": 0.9892,
+          "precision": 0.986094674556213,
+          "recall": 0.9820271066588097,
+          "f1": 0.9840566873339238
+        },
+        {
+          "acc": 0.9916,
+          "precision": 0.9825072886297376,
+          "recall": 0.9929286977018268,
+          "f1": 0.9876905041031652
+        },
+        {
+          "acc": 0.9913,
+          "precision": 0.9824919754887657,
+          "recall": 0.9920447849145551,
+          "f1": 0.9872452719542588
+        },
+        {
+          "acc": 0.9909,
+          "precision": 0.9847373055474024,
+          "recall": 0.9885091337654685,
+          "f1": 0.9866196147625349
+        },
+        {
+          "acc": 0.9857,
+          "precision": 0.9954282231027126,
+          "recall": 0.9622863877430761,
+          "f1": 0.9785767790262172
+        },
+        {
+          "acc": 0.9882,
+          "precision": 0.9761627906976744,
+          "recall": 0.9893930465527401,
+          "f1": 0.9827333918642083
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9833333333333333,
+          "recall": 0.9908662345315262,
+          "f1": 0.9870854123862635
+        },
+        {
+          "acc": 0.9911,
+          "precision": 0.9864586399764498,
+          "recall": 0.9873305833824396,
+          "f1": 0.9868944190840818
+        },
+        {
+          "acc": 0.9842,
+          "precision": 0.997539975399754,
+          "recall": 0.9558043606364172,
+          "f1": 0.9762263015347576
+        },
+        {
+          "acc": 0.9872,
+          "precision": 0.9936517533252721,
+          "recall": 0.9684737772539777,
+          "f1": 0.9809012235153686
+        },
+        {
+          "acc": 0.9919,
+          "precision": 0.9825225750072822,
+          "recall": 0.9938126104890984,
+          "f1": 0.9881353449538597
+        },
+        {
+          "acc": 0.9905,
+          "precision": 0.9864346800353878,
+          "recall": 0.9855627578078963,
+          "f1": 0.9859985261606485
+        },
+        {
+          "acc": 0.9903,
+          "precision": 0.9867139061116031,
+          "recall": 0.9846788450206246,
+          "f1": 0.9856953251732783
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9833333333333333,
+          "recall": 0.9908662345315262,
+          "f1": 0.9870854123862635
+        },
+        {
+          "acc": 0.9917,
+          "precision": 0.9827938174394867,
+          "recall": 0.9929286977018268,
+          "f1": 0.9878352630807563
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.9822157434402332,
+          "recall": 0.9926340601060696,
+          "f1": 0.9873974208675265
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.9833430742255991,
+          "recall": 0.9914555097230406,
+          "f1": 0.9873826291079812
+        },
+        {
+          "acc": 0.9908,
+          "precision": 0.986446670595168,
+          "recall": 0.986446670595168,
+          "f1": 0.986446670595168
+        },
+        {
+          "acc": 0.9908,
+          "precision": 0.986446670595168,
+          "recall": 0.986446670595168,
+          "f1": 0.986446670595168
+        },
+        {
+          "acc": 0.9909,
+          "precision": 0.9858781994704324,
+          "recall": 0.9873305833824396,
+          "f1": 0.9866038569115266
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9833333333333333,
+          "recall": 0.9908662345315262,
+          "f1": 0.9870854123862635
+        },
+        {
+          "acc": 0.9915,
+          "precision": 0.9827837758972863,
+          "recall": 0.9923394225103123,
+          "f1": 0.9875384840932414
+        },
+        {
+          "acc": 0.9907,
+          "precision": 0.9873043991733097,
+          "recall": 0.985268120212139,
+          "f1": 0.9862852086712873
+        },
+        {
+          "acc": 0.9919,
+          "precision": 0.9825225750072822,
+          "recall": 0.9938126104890984,
+          "f1": 0.9881353449538597
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9916,
+          "precision": 0.9777713625866051,
+          "recall": 0.9979375368296994,
+          "f1": 0.9877515310586177
+        },
+        {
+          "acc": 0.9901,
+          "precision": 0.9869937924918711,
+          "recall": 0.983794932233353,
+          "f1": 0.9853917662682603
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9904,
+          "precision": 0.9872931442080378,
+          "recall": 0.9843842074248674,
+          "f1": 0.9858365299498378
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9887,
+          "precision": 0.993680409268733,
+          "recall": 0.9728933411903359,
+          "f1": 0.9831770135477147
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9833333333333333,
+          "recall": 0.9908662345315262,
+          "f1": 0.9870854123862635
+        },
+        {
+          "acc": 0.9913,
+          "precision": 0.983338205203157,
+          "recall": 0.9911608721272834,
+          "f1": 0.9872340425531914
+        },
+        {
+          "acc": 0.9915,
+          "precision": 0.9827837758972863,
+          "recall": 0.9923394225103123,
+          "f1": 0.9875384840932414
+        },
+        {
+          "acc": 0.991,
+          "precision": 0.9858823529411764,
+          "recall": 0.9876252209781968,
+          "f1": 0.986753017368266
+        },
+        {
+          "acc": 0.9905,
+          "precision": 0.9870091526424565,
+          "recall": 0.9849734826163818,
+          "f1": 0.9859902669222829
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9830508474576272,
+          "recall": 0.9911608721272834,
+          "f1": 0.9870892018779343
+        },
+        {
+          "acc": 0.9911,
+          "precision": 0.9822001750802452,
+          "recall": 0.9917501473187978,
+          "f1": 0.9869520598152763
+        },
+        {
+          "acc": 0.9901,
+          "precision": 0.9887273805992287,
+          "recall": 0.9820271066588097,
+          "f1": 0.9853658536585367
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9907,
+          "precision": 0.9833089311859443,
+          "recall": 0.9893930465527401,
+          "f1": 0.9863416066970185
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9908,
+          "precision": 0.986446670595168,
+          "recall": 0.986446670595168,
+          "f1": 0.986446670595168
+        },
+        {
+          "acc": 0.991,
+          "precision": 0.9833235810415447,
+          "recall": 0.9902769593400118,
+          "f1": 0.9867880211391661
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9833333333333333,
+          "recall": 0.9908662345315262,
+          "f1": 0.9870854123862635
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9824868651488616,
+          "recall": 0.9917501473187978,
+          "f1": 0.9870967741935485
+        },
+        {
+          "acc": 0.9909,
+          "precision": 0.9838851450336947,
+          "recall": 0.9893930465527401,
+          "f1": 0.9866314088438372
+        },
+        {
+          "acc": 0.9911,
+          "precision": 0.9833284586136297,
+          "recall": 0.990571596935769,
+          "f1": 0.9869367385879936
+        },
+        {
+          "acc": 0.9913,
+          "precision": 0.9836209417958467,
+          "recall": 0.9908662345315262,
+          "f1": 0.9872302950242183
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.991,
+          "precision": 0.9858823529411764,
+          "recall": 0.9876252209781968,
+          "f1": 0.986753017368266
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9830508474576272,
+          "recall": 0.9911608721272834,
+          "f1": 0.9870892018779343
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9899,
+          "precision": 0.9875629256736749,
+          "recall": 0.9826163818503241,
+          "f1": 0.9850834440998375
+        },
+        {
+          "acc": 0.9908,
+          "precision": 0.986446670595168,
+          "recall": 0.986446670595168,
+          "f1": 0.986446670595168
+        },
+        {
+          "acc": 0.9915,
+          "precision": 0.9819399941741916,
+          "recall": 0.993223335297584,
+          "f1": 0.9875494360626923
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9906,
+          "precision": 0.987012987012987,
+          "recall": 0.985268120212139,
+          "f1": 0.9861397817752875
+        },
+        {
+          "acc": 0.9908,
+          "precision": 0.986446670595168,
+          "recall": 0.986446670595168,
+          "f1": 0.986446670595168
+        },
+        {
+          "acc": 0.991,
+          "precision": 0.9833235810415447,
+          "recall": 0.9902769593400118,
+          "f1": 0.9867880211391661
+        },
+        {
+          "acc": 0.9907,
+          "precision": 0.9864426760978485,
+          "recall": 0.9861520329994107,
+          "f1": 0.9862973331368794
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.9824868651488616,
+          "recall": 0.9917501473187978,
+          "f1": 0.9870967741935485
+        },
+        {
+          "acc": 0.9911,
+          "precision": 0.9833284586136297,
+          "recall": 0.990571596935769,
+          "f1": 0.9869367385879936
+        },
+        {
+          "acc": 0.9908,
+          "precision": 0.986446670595168,
+          "recall": 0.986446670595168,
+          "f1": 0.986446670595168
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9916,
+          "precision": 0.9825072886297376,
+          "recall": 0.9929286977018268,
+          "f1": 0.9876905041031652
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        },
+        {
+          "acc": 0.9913,
+          "precision": 0.9824919754887657,
+          "recall": 0.9920447849145551,
+          "f1": 0.9872452719542588
+        },
+        {
+          "acc": 0.9915,
+          "precision": 0.9827837758972863,
+          "recall": 0.9923394225103123,
+          "f1": 0.9875384840932414
+        },
+        {
+          "acc": 0.9916,
+          "precision": 0.9827887981330222,
+          "recall": 0.9926340601060696,
+          "f1": 0.9876868953386104
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.982768691588785,
+          "recall": 0.9914555097230406,
+          "f1": 0.9870929891463771
+        },
+        {
+          "acc": 0.9909,
+          "precision": 0.9833187006145742,
+          "recall": 0.9899823217442546,
+          "f1": 0.986639260020555
+        },
+        {
+          "acc": 0.9904,
+          "precision": 0.987005316007088,
+          "recall": 0.9846788450206246,
+          "f1": 0.9858407079646017
+        },
+        {
+          "acc": 0.9912,
+          "precision": 0.982768691588785,
+          "recall": 0.9914555097230406,
+          "f1": 0.9870929891463771
+        },
+        {
+          "acc": 0.9914,
+          "precision": 0.982778750729714,
+          "recall": 0.9920447849145551,
+          "f1": 0.9873900293255131
+        }
+      ]
+    },
+    "hard": {
+      "n_nodes": 40,
+      "n_edges": 44,
+      "gnn_final": {
+        "acc": 0.984,
+        "precision": 0.9533980582524272,
+        "recall": 0.9750354609929078,
+        "f1": 0.9640953716690043
+      },
+      "baseline_direct_neighbors": {
+        "acc": 0.88875,
+        "precision": 1.0,
+        "recall": 0.4950354609929078,
+        "f1": 0.6622390891840607
+      },
+      "improvement_f1_pp": 30.185628248494357,
+      "train_loss_curve": [
+        0.15102637716173195,
+        0.052633647776499856,
+        0.04379157433440559,
+        0.04003102573152864,
+        0.03876525610721728,
+        0.0369047760956164,
+        0.036530632421345216,
+        0.035830124779022296,
+        0.0349417570647056,
+        0.035263367522318734,
+        0.03485661885762238,
+        0.03493121563128079,
+        0.032977926293009656,
+        0.03394761107103841,
+        0.033683306101149356,
+        0.033089775294763965,
+        0.0335856751325955,
+        0.03272933466515315,
+        0.032765767610715556,
+        0.032717534617419004,
+        0.03298612758413583,
+        0.03169301031356008,
+        0.0323142114428847,
+        0.03186470089994691,
+        0.032041587697027356,
+        0.03211515340814367,
+        0.032251973500227904,
+        0.031999882343730864,
+        0.03164813786187369,
+        0.03160676156320551,
+        0.031426732700598224,
+        0.031241096474510413,
+        0.03162557367896079,
+        0.03154335625256863,
+        0.03165931336190261,
+        0.03097459732750576,
+        0.03131493923773814,
+        0.0311658642354123,
+        0.030633534374135706,
+        0.031252258909702506,
+        0.030825211223787848,
+        0.03053342323340803,
+        0.030733022628217442,
+        0.030747544990059397,
+        0.030629911747484584,
+        0.030457735169680745,
+        0.03058615475141687,
+        0.030597560634826552,
+        0.030619746312839653,
+        0.03066707000986935,
+        0.03048766604950197,
+        0.030287153372872126,
+        0.0303783905812179,
+        0.030595246432494606,
+        0.03037994001944753,
+        0.030246819483697437,
+        0.03012882444020579,
+        0.03024448805347947,
+        0.030449683469725642,
+        0.03048290506813919,
+        0.030136575797458136,
+        0.02994714516170643,
+        0.030466000927322056,
+        0.03019473605195526,
+        0.02987939404982535,
+        0.030137449657182513,
+        0.030104370625325828,
+        0.030588962311178875,
+        0.029767145353838714,
+        0.030284092916966984,
+        0.03002391016312413,
+        0.02992785992539757,
+        0.030997538813613574,
+        0.029848512160238896,
+        0.030022954882957493,
+        0.030052907403214705,
+        0.02975074222330568,
+        0.029870129619877842,
+        0.02968558935528563,
+        0.029977637300933564
+      ],
+      "test_metric_curve": [
+        {
+          "acc": 0.978625,
+          "precision": 0.9395194697597349,
+          "recall": 0.9651063829787234,
+          "f1": 0.9521410579345089
+        },
+        {
+          "acc": 0.9813125,
+          "precision": 0.9460730088495575,
+          "recall": 0.9704964539007093,
+          "f1": 0.9581291135695281
+        },
+        {
+          "acc": 0.982,
+          "precision": 0.9607173356105893,
+          "recall": 0.9574468085106383,
+          "f1": 0.959079283887468
+        },
+        {
+          "acc": 0.9805625,
+          "precision": 0.9649884259259259,
+          "recall": 0.9460992907801419,
+          "f1": 0.9554505085231342
+        },
+        {
+          "acc": 0.98225,
+          "precision": 0.952274630198158,
+          "recall": 0.9679432624113475,
+          "f1": 0.9600450196961171
+        },
+        {
+          "acc": 0.98225,
+          "precision": 0.9639278557114228,
+          "recall": 0.955177304964539,
+          "f1": 0.9595326303790253
+        },
+        {
+          "acc": 0.982375,
+          "precision": 0.9543289436817035,
+          "recall": 0.9662411347517731,
+          "f1": 0.9602480969833662
+        },
+        {
+          "acc": 0.98375,
+          "precision": 0.9543556916225995,
+          "recall": 0.9727659574468085,
+          "f1": 0.9634728856420341
+        },
+        {
+          "acc": 0.98125,
+          "precision": 0.9680696661828737,
+          "recall": 0.9460992907801419,
+          "f1": 0.9569583931133429
+        },
+        {
+          "acc": 0.983,
+          "precision": 0.965379113018598,
+          "recall": 0.9571631205673758,
+          "f1": 0.9612535612535612
+        },
+        {
+          "acc": 0.984375,
+          "precision": 0.9593267882187938,
+          "recall": 0.9702127659574468,
+          "f1": 0.9647390691114245
+        },
+        {
+          "acc": 0.9836875,
+          "precision": 0.9633730834752982,
+          "recall": 0.9625531914893617,
+          "f1": 0.9629629629629629
+        },
+        {
+          "acc": 0.98425,
+          "precision": 0.9507022858716607,
+          "recall": 0.979290780141844,
+          "f1": 0.9647847959754053
+        },
+        {
+          "acc": 0.983,
+          "precision": 0.9651129539605376,
+          "recall": 0.9574468085106383,
+          "f1": 0.9612645969809172
+        },
+        {
+          "acc": 0.9840625,
+          "precision": 0.9587542087542088,
+          "recall": 0.9693617021276596,
+          "f1": 0.9640287769784174
+        },
+        {
+          "acc": 0.9835625,
+          "precision": 0.966,
+          "recall": 0.9591489361702128,
+          "f1": 0.9625622775800712
+        },
+        {
+          "acc": 0.9839375,
+          "precision": 0.9600225225225225,
+          "recall": 0.9673758865248226,
+          "f1": 0.963685177335029
+        },
+        {
+          "acc": 0.98425,
+          "precision": 0.9405114401076716,
+          "recall": 0.9912056737588653,
+          "f1": 0.9651933701657459
+        },
+        {
+          "acc": 0.9814375,
+          "precision": 0.9686411149825784,
+          "recall": 0.9463829787234043,
+          "f1": 0.9573826947912182
+        },
+        {
+          "acc": 0.9831875,
+          "precision": 0.955512031337437,
+          "recall": 0.9687943262411347,
+          "f1": 0.9621073390618397
+        },
+        {
+          "acc": 0.9836875,
+          "precision": 0.9515771997786386,
+          "recall": 0.9756028368794326,
+          "f1": 0.9634402577391792
+        },
+        {
+          "acc": 0.9860625,
+          "precision": 0.9565818584070797,
+          "recall": 0.9812765957446808,
+          "f1": 0.9687718806889791
+        },
+        {
+          "acc": 0.9835625,
+          "precision": 0.9505524861878453,
+          "recall": 0.9761702127659575,
+          "f1": 0.9631910426871939
+        },
+        {
+          "acc": 0.9853125,
+          "precision": 0.9472539423599783,
+          "recall": 0.9883687943262411,
+          "f1": 0.9673747049840344
+        },
+        {
+          "acc": 0.9860625,
+          "precision": 0.9479110146500271,
+          "recall": 0.9912056737588653,
+          "f1": 0.9690750242684788
+        },
+        {
+          "acc": 0.982875,
+          "precision": 0.9645613032294942,
+          "recall": 0.9574468085106383,
+          "f1": 0.960990888382688
+        },
+        {
+          "acc": 0.9843125,
+          "precision": 0.9606077658975802,
+          "recall": 0.9685106382978723,
+          "f1": 0.9645430145500776
+        },
+        {
+          "acc": 0.9840625,
+          "precision": 0.9501651982378855,
+          "recall": 0.9790070921985815,
+          "f1": 0.9643705463182898
+        },
+        {
+          "acc": 0.983375,
+          "precision": 0.9568264648163723,
+          "recall": 0.9682269503546099,
+          "f1": 0.9624929498025946
+        },
+        {
+          "acc": 0.98375,
+          "precision": 0.9505934308584046,
+          "recall": 0.9770212765957447,
+          "f1": 0.9636261891438165
+        },
+        {
+          "acc": 0.9845,
+          "precision": 0.9555184876285794,
+          "recall": 0.9750354609929078,
+          "f1": 0.9651783206964335
+        },
+        {
+          "acc": 0.9830625,
+          "precision": 0.9557422969187676,
+          "recall": 0.9679432624113475,
+          "f1": 0.9618040873854828
+        },
+        {
+          "acc": 0.983375,
+          "precision": 0.9555493430248811,
+          "recall": 0.969645390070922,
+          "f1": 0.9625457617572516
+        },
+        {
+          "acc": 0.984,
+          "precision": 0.9511454595638973,
+          "recall": 0.9775886524822694,
+          "f1": 0.9641857862339116
+        },
+        {
+          "acc": 0.9845625,
+          "precision": 0.9611705120990434,
+          "recall": 0.9690780141843972,
+          "f1": 0.9651080661110327
+        },
+        {
+          "acc": 0.984625,
+          "precision": 0.9565580618212197,
+          "recall": 0.9744680851063829,
+          "f1": 0.9654300168634065
+        },
+        {
+          "acc": 0.9846875,
+          "precision": 0.9563160823594881,
+          "recall": 0.9750354609929078,
+          "f1": 0.9655850540806294
+        },
+        {
+          "acc": 0.9856875,
+          "precision": 0.9461288576069301,
+          "recall": 0.9914893617021276,
+          "f1": 0.9682781548690954
+        },
+        {
+          "acc": 0.9841875,
+          "precision": 0.9631936579841449,
+          "recall": 0.9651063829787234,
+          "f1": 0.9641490718435596
+        },
+        {
+          "acc": 0.98475,
+          "precision": 0.9560745065332221,
+          "recall": 0.9756028368794326,
+          "f1": 0.9657399606852007
+        },
+        {
+          "acc": 0.9836875,
+          "precision": 0.9558659217877095,
+          "recall": 0.9707801418439717,
+          "f1": 0.963265306122449
+        },
+        {
+          "acc": 0.9854375,
+          "precision": 0.9497267759562842,
+          "recall": 0.9860992907801418,
+          "f1": 0.967571329157968
+        },
+        {
+          "acc": 0.9844375,
+          "precision": 0.9502473886750962,
+          "recall": 0.9807092198581561,
+          "f1": 0.9652380287588997
+        },
+        {
+          "acc": 0.9844375,
+          "precision": 0.9601123595505618,
+          "recall": 0.969645390070922,
+          "f1": 0.9648553281580804
+        },
+        {
+          "acc": 0.98475,
+          "precision": 0.957345971563981,
+          "recall": 0.9741843971631206,
+          "f1": 0.9656917885264341
+        },
+        {
+          "acc": 0.983625,
+          "precision": 0.9543302701197438,
+          "recall": 0.9721985815602837,
+          "f1": 0.9631815626756605
+        },
+        {
+          "acc": 0.9839375,
+          "precision": 0.9526315789473684,
+          "recall": 0.9756028368794326,
+          "f1": 0.9639803784162578
+        },
+        {
+          "acc": 0.9833125,
+          "precision": 0.9509966777408638,
+          "recall": 0.9744680851063829,
+          "f1": 0.962589323245061
+        },
+        {
+          "acc": 0.98425,
+          "precision": 0.9499587572174869,
+          "recall": 0.9801418439716312,
+          "f1": 0.9648142976822116
+        },
+        {
+          "acc": 0.984375,
+          "precision": 0.9590692458648724,
+          "recall": 0.9704964539007093,
+          "f1": 0.9647490129723633
+        },
+        {
+          "acc": 0.9838125,
+          "precision": 0.9528563505268997,
+          "recall": 0.9747517730496454,
+          "f1": 0.9636797083158043
+        },
+        {
+          "acc": 0.9848125,
+          "precision": 0.9553274139844617,
+          "recall": 0.9767375886524823,
+          "f1": 0.965913872913452
+        },
+        {
+          "acc": 0.9836875,
+          "precision": 0.9551031790295594,
+          "recall": 0.9716312056737588,
+          "f1": 0.963296301504711
+        },
+        {
+          "acc": 0.9845,
+          "precision": 0.9429575560962422,
+          "recall": 0.9895035460992908,
+          "f1": 0.965669988925803
+        },
+        {
+          "acc": 0.982375,
+          "precision": 0.9589583923011605,
+          "recall": 0.9611347517730496,
+          "f1": 0.9600453386228394
+        },
+        {
+          "acc": 0.984375,
+          "precision": 0.962439988703756,
+          "recall": 0.9668085106382979,
+          "f1": 0.9646193037078971
+        },
+        {
+          "acc": 0.985625,
+          "precision": 0.9517411571154374,
+          "recall": 0.9846808510638297,
+          "f1": 0.967930842163971
+        },
+        {
+          "acc": 0.98325,
+          "precision": 0.9596387242449901,
+          "recall": 0.9645390070921985,
+          "f1": 0.9620826259196378
+        },
+        {
+          "acc": 0.984,
+          "precision": 0.9647426784191072,
+          "recall": 0.9625531914893617,
+          "f1": 0.9636466912808862
+        },
+        {
+          "acc": 0.984875,
+          "precision": 0.9586476669460743,
+          "recall": 0.9733333333333334,
+          "f1": 0.9659346846846848
+        },
+        {
+          "acc": 0.9850625,
+          "precision": 0.9581706636921361,
+          "recall": 0.9747517730496454,
+          "f1": 0.9663900998453102
+        },
+        {
+          "acc": 0.9836875,
+          "precision": 0.9493392070484582,
+          "recall": 0.9781560283687943,
+          "f1": 0.9635322062316614
+        },
+        {
+          "acc": 0.983125,
+          "precision": 0.9575484959235311,
+          "recall": 0.9662411347517731,
+          "f1": 0.9618751765038125
+        },
+        {
+          "acc": 0.98425,
+          "precision": 0.9492176777381279,
+          "recall": 0.9809929078014185,
+          "f1": 0.9648437500000001
+        },
+        {
+          "acc": 0.9826875,
+          "precision": 0.9672036823935558,
+          "recall": 0.953758865248227,
+          "f1": 0.960434223682331
+        },
+        {
+          "acc": 0.9845,
+          "precision": 0.961679346294731,
+          "recall": 0.9682269503546099,
+          "f1": 0.964942041277919
+        },
+        {
+          "acc": 0.9845,
+          "precision": 0.960900140646976,
+          "recall": 0.9690780141843972,
+          "f1": 0.9649717514124294
+        },
+        {
+          "acc": 0.984125,
+          "precision": 0.9623975120158327,
+          "recall": 0.9656737588652482,
+          "f1": 0.9640328518833192
+        },
+        {
+          "acc": 0.984875,
+          "precision": 0.9571150097465887,
+          "recall": 0.9750354609929078,
+          "f1": 0.9659921304103429
+        },
+        {
+          "acc": 0.984625,
+          "precision": 0.9598877980364656,
+          "recall": 0.9707801418439717,
+          "f1": 0.9653032440056418
+        },
+        {
+          "acc": 0.98375,
+          "precision": 0.9546087440824282,
+          "recall": 0.9724822695035461,
+          "f1": 0.9634626194491286
+        },
+        {
+          "acc": 0.984125,
+          "precision": 0.9501789154968345,
+          "recall": 0.979290780141844,
+          "f1": 0.9645152277172394
+        },
+        {
+          "acc": 0.9849375,
+          "precision": 0.9607182940516273,
+          "recall": 0.9713475177304964,
+          "f1": 0.9660036676541119
+        },
+        {
+          "acc": 0.984875,
+          "precision": 0.956606397774687,
+          "recall": 0.9756028368794326,
+          "f1": 0.9660112359550562
+        },
+        {
+          "acc": 0.984625,
+          "precision": 0.9570671870643992,
+          "recall": 0.9739007092198582,
+          "f1": 0.9654105736782902
+        },
+        {
+          "acc": 0.9849375,
+          "precision": 0.9584031267448353,
+          "recall": 0.9739007092198582,
+          "f1": 0.9660897706486562
+        },
+        {
+          "acc": 0.98375,
+          "precision": 0.9523413688002217,
+          "recall": 0.9750354609929078,
+          "f1": 0.9635548079618728
+        },
+        {
+          "acc": 0.984,
+          "precision": 0.9536497363308354,
+          "recall": 0.9747517730496454,
+          "f1": 0.9640852974186307
+        },
+        {
+          "acc": 0.98375,
+          "precision": 0.9505934308584046,
+          "recall": 0.9770212765957447,
+          "f1": 0.9636261891438165
+        },
+        {
+          "acc": 0.984,
+          "precision": 0.9533980582524272,
+          "recall": 0.9750354609929078,
+          "f1": 0.9640953716690043
+        }
+      ]
+    }
+  },
+  "config": {
+    "n_train": 2000,
+    "n_test": 400,
+    "hidden_dim": 64,
+    "epochs": 80,
+    "lr": 0.002,
+    "max_hops": 3
+  },
+  "elapsed_min": 21.402417866388955
 }

FINAL_SUBMIT/receipts/ablation_matrix.json CHANGED Viewed

@@ -1,95 +1,95 @@
-{
-  "framework": "leave-one-out reward ablation per RL guide \u00a77-8",
-  "n_episodes_per_trial": 100,
-  "baseline": {
-    "disabled": "none",
-    "mean_return": 0.6742,
-    "solve_rate": 0.27,
-    "n_episodes": 100
-  },
-  "ablations": [
-    {
-      "disabled": "green_credit",
-      "mean_return": 0.2152,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.459,
-      "pct_change": -68.08
-    },
-    {
-      "disabled": "yellow_credit",
-      "mean_return": 0.613,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.0612,
-      "pct_change": -9.08
-    },
-    {
-      "disabled": "solve_bonus",
-      "mean_return": 0.4042,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.27,
-      "pct_change": -40.05
-    },
-    {
-      "disabled": "guess_count_bonus",
-      "mean_return": 0.6442,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.03,
-      "pct_change": -4.45
-    },
-    {
-      "disabled": "timeout_penalty",
-      "mean_return": 0.8202,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": 0.146,
-      "pct_change": 21.66
-    }
-  ],
-  "ranked_by_impact": [
-    {
-      "disabled": "green_credit",
-      "mean_return": 0.2152,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.459,
-      "pct_change": -68.08
-    },
-    {
-      "disabled": "solve_bonus",
-      "mean_return": 0.4042,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.27,
-      "pct_change": -40.05
-    },
-    {
-      "disabled": "timeout_penalty",
-      "mean_return": 0.8202,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": 0.146,
-      "pct_change": 21.66
-    },
-    {
-      "disabled": "yellow_credit",
-      "mean_return": 0.613,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.0612,
-      "pct_change": -9.08
-    },
-    {
-      "disabled": "guess_count_bonus",
-      "mean_return": 0.6442,
-      "solve_rate": 0.27,
-      "n_episodes": 100,
-      "delta_mean_return": -0.03,
-      "pct_change": -4.45
-    }
-  ],
-  "insight": "components ranked by metric drop when removed reveal which reward signals are load-bearing"
 }

+{
+  "framework": "leave-one-out reward ablation per RL guide \u00a77-8",
+  "n_episodes_per_trial": 100,
+  "baseline": {
+    "disabled": "none",
+    "mean_return": 0.6742,
+    "solve_rate": 0.27,
+    "n_episodes": 100
+  },
+  "ablations": [
+    {
+      "disabled": "green_credit",
+      "mean_return": 0.2152,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.459,
+      "pct_change": -68.08
+    },
+    {
+      "disabled": "yellow_credit",
+      "mean_return": 0.613,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.0612,
+      "pct_change": -9.08
+    },
+    {
+      "disabled": "solve_bonus",
+      "mean_return": 0.4042,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.27,
+      "pct_change": -40.05
+    },
+    {
+      "disabled": "guess_count_bonus",
+      "mean_return": 0.6442,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.03,
+      "pct_change": -4.45
+    },
+    {
+      "disabled": "timeout_penalty",
+      "mean_return": 0.8202,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": 0.146,
+      "pct_change": 21.66
+    }
+  ],
+  "ranked_by_impact": [
+    {
+      "disabled": "green_credit",
+      "mean_return": 0.2152,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.459,
+      "pct_change": -68.08
+    },
+    {
+      "disabled": "solve_bonus",
+      "mean_return": 0.4042,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.27,
+      "pct_change": -40.05
+    },
+    {
+      "disabled": "timeout_penalty",
+      "mean_return": 0.8202,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": 0.146,
+      "pct_change": 21.66
+    },
+    {
+      "disabled": "yellow_credit",
+      "mean_return": 0.613,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.0612,
+      "pct_change": -9.08
+    },
+    {
+      "disabled": "guess_count_bonus",
+      "mean_return": 0.6442,
+      "solve_rate": 0.27,
+      "n_episodes": 100,
+      "delta_mean_return": -0.03,
+      "pct_change": -4.45
+    }
+  ],
+  "insight": "components ranked by metric drop when removed reveal which reward signals are load-bearing"
 }

FINAL_SUBMIT/receipts/adversarial_20_attack_gauntlet.json CHANGED Viewed

@@ -1,217 +1,217 @@
-{
-  "started_at": 1777142712.0545185,
-  "framework": "RL guide \u00a738-44 + Skalse 2022 + Krakovna 2020",
-  "n_total": 20,
-  "results": [
-    {
-      "id": 1,
-      "name": "empty_string",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 2,
-      "name": "single_letter",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 3,
-      "name": "digits_only",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 4,
-      "name": "unicode_homoglyph",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 5,
-      "name": "6char_word",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 6,
-      "name": "4char_word",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 7,
-      "name": "uppercase_only",
-      "expected_reason": "accepted_after_normalize",
-      "actual_reason": "accepted_after_normalize",
-      "reward_extracted": 0.0,
-      "blocked": false,
-      "is_legit_test": true,
-      "verdict": "LEGIT_ACCEPTED"
-    },
-    {
-      "id": 8,
-      "name": "whitespace_padded",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 9,
-      "name": "null_action",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 10,
-      "name": "dict_lookalike_zzzzz",
-      "expected_reason": "non_dictionary",
-      "actual_reason": "non_dictionary",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 11,
-      "name": "repeat_same_guess_5x",
-      "expected_reason": "rolling_no_progress",
-      "actual_reason": "rolling_no_progress",
-      "reward_extracted": 0.07,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 12,
-      "name": "solved_word_repeat",
-      "expected_reason": "episode_done",
-      "actual_reason": "episode_done",
-      "reward_extracted": 1.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 13,
-      "name": "unicode_zero_width",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 14,
-      "name": "sql_injection",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 15,
-      "name": "path_traversal",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 16,
-      "name": "extremely_long_string",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 17,
-      "name": "json_object_payload",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 18,
-      "name": "negative_action_index",
-      "expected_reason": "out_of_bounds",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 19,
-      "name": "sleep_inside_action",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    },
-    {
-      "id": 20,
-      "name": "base64_encoded",
-      "expected_reason": "format_invalid",
-      "actual_reason": "format_invalid",
-      "reward_extracted": 0.0,
-      "blocked": true,
-      "is_legit_test": false,
-      "verdict": "BLOCKED"
-    }
-  ],
-  "summary": {
-    "n_total_tests": 20,
-    "n_attacks": 19,
-    "n_blocked": 19,
-    "block_rate_pct": 100.0,
-    "n_legit": 1,
-    "n_legit_accepted": 1,
-    "false_positive_rate_pct": 0.0,
-    "verdict": "PASS"
-  }
 }

+{
+  "started_at": 1777142712.0545185,
+  "framework": "RL guide \u00a738-44 + Skalse 2022 + Krakovna 2020",
+  "n_total": 20,
+  "results": [
+    {
+      "id": 1,
+      "name": "empty_string",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 2,
+      "name": "single_letter",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 3,
+      "name": "digits_only",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 4,
+      "name": "unicode_homoglyph",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 5,
+      "name": "6char_word",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 6,
+      "name": "4char_word",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 7,
+      "name": "uppercase_only",
+      "expected_reason": "accepted_after_normalize",
+      "actual_reason": "accepted_after_normalize",
+      "reward_extracted": 0.0,
+      "blocked": false,
+      "is_legit_test": true,
+      "verdict": "LEGIT_ACCEPTED"
+    },
+    {
+      "id": 8,
+      "name": "whitespace_padded",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 9,
+      "name": "null_action",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 10,
+      "name": "dict_lookalike_zzzzz",
+      "expected_reason": "non_dictionary",
+      "actual_reason": "non_dictionary",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 11,
+      "name": "repeat_same_guess_5x",
+      "expected_reason": "rolling_no_progress",
+      "actual_reason": "rolling_no_progress",
+      "reward_extracted": 0.07,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 12,
+      "name": "solved_word_repeat",
+      "expected_reason": "episode_done",
+      "actual_reason": "episode_done",
+      "reward_extracted": 1.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 13,
+      "name": "unicode_zero_width",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 14,
+      "name": "sql_injection",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 15,
+      "name": "path_traversal",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 16,
+      "name": "extremely_long_string",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 17,
+      "name": "json_object_payload",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 18,
+      "name": "negative_action_index",
+      "expected_reason": "out_of_bounds",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 19,
+      "name": "sleep_inside_action",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    },
+    {
+      "id": 20,
+      "name": "base64_encoded",
+      "expected_reason": "format_invalid",
+      "actual_reason": "format_invalid",
+      "reward_extracted": 0.0,
+      "blocked": true,
+      "is_legit_test": false,
+      "verdict": "BLOCKED"
+    }
+  ],
+  "summary": {
+    "n_total_tests": 20,
+    "n_attacks": 19,
+    "n_blocked": 19,
+    "block_rate_pct": 100.0,
+    "n_legit": 1,
+    "n_legit_accepted": 1,
+    "false_positive_rate_pct": 0.0,
+    "verdict": "PASS"
+  }
 }

FINAL_SUBMIT/receipts/adversarial_reward_audit.json CHANGED Viewed

@@ -1,132 +1,132 @@
-{
-  "scenario_id": "2011_Tōhoku_earthquake_and_tsunami",
-  "ground_truth": "CRITICAL",
-  "reward_formula": "0.7 * match + 0.2 * format + 0.1 * length",
-  "n_attacks": 6,
-  "n_rejected": 6,
-  "attacks": [
-    {
-      "attack": "A1_empty_string",
-      "description": "Degenerate empty output — no info at all.",
-      "reward": 0.0,
-      "breakdown": {
-        "match": 0.0,
-        "format": 0.0,
-        "length": 0.0,
-        "weights": [
-          0.7,
-          0.2,
-          0.1
-        ],
-        "n_tokens": 1
-      },
-      "max_expected": 0.01,
-      "rejected": true
-    },
-    {
-      "attack": "A2_risk_only_short_circuit",
-      "description": "Pure short-circuit: output the ground-truth label with no structure. Length-guard should reject.",
-      "reward": 0.7,
-      "breakdown": {
-        "match": 1.0,
-        "format": 0.0,
-        "length": 0.0,
-        "weights": [
-          0.7,
-          0.2,
-          0.1
-        ],
-        "n_tokens": 1
-      },
-      "max_expected": 0.71,
-      "rejected": true
-    },
-    {
-      "attack": "A3_long_spam_no_json",
-      "description": "Pad with junk to beat length-guard while omitting structured output. Format-guard should reject.",
-      "reward": 0.8,
-      "breakdown": {
-        "match": 1.0,
-        "format": 0.0,
-        "length": 1.0,
-        "weights": [
-          0.7,
-          0.2,
-          0.1
-        ],
-        "n_tokens": 200
-      },
-      "max_expected": 0.81,
-      "rejected": true
-    },
-    {
-      "attack": "A4_over_length_attack",
-      "description": "Generate a massive output to dilute detection. Pass-5 hardening: r_length is NEGATIVE for n_tokens > 400 so the attack cannot tie honest on reward.",
-      "reward": 0.85,
-      "breakdown": {
-        "match": 1.0,
-        "format": 1.0,
-        "length": -0.5,
-        "weights": [
-          0.7,
-          0.2,
-          0.1
-        ],
-        "n_tokens": 500
-      },
-      "max_expected": 0.86,
-      "rejected": true
-    },
-    {
-      "attack": "A5_adjacent_tier_guess",
-      "description": "Always guess the adjacent tier to hedge. Ordinal proximity pays only 0.5 not 1.0 on match.",
-      "reward": 0.65,
-      "breakdown": {
-        "match": 0.5,
-        "format": 1.0,
-        "length": 1.0,
-        "weights": [
-          0.7,
-          0.2,
-          0.1
-        ],
-        "n_tokens": 60
-      },
-      "max_expected": 0.66,
-      "rejected": true
-    },
-    {
-      "attack": "A6_wrong_tier_confident",
-      "description": "Always guess LOW — opposite end of ordinal scale. Far-from-GT reward must be 0.0, not just adjacent.",
-      "reward": 0.3,
-      "breakdown": {
-        "match": 0.0,
-        "format": 1.0,
-        "length": 1.0,
-        "weights": [
-          0.7,
-          0.2,
-          0.1
-        ],
-        "n_tokens": 60
-      },
-      "max_expected": 0.31,
-      "rejected": true
-    }
-  ],
-  "honest_baseline": {
-    "reward": 0.9,
-    "breakdown": {
-      "match": 1.0,
-      "format": 1.0,
-      "length": 0.0,
-      "weights": [
-        0.7,
-        0.2,
-        0.1
-      ],
-      "n_tokens": 20
-    }
-  },
-  "verdict": "All attack vectors score strictly below an honest full answer. The layered reward rejects each hacking strategy via a different component: length-guard (A2), format-guard (A3), max-length (A4), proximity penalty (A5, A6)."
 }

+{
+  "scenario_id": "2011_Tōhoku_earthquake_and_tsunami",
+  "ground_truth": "CRITICAL",
+  "reward_formula": "0.7 * match + 0.2 * format + 0.1 * length",
+  "n_attacks": 6,
+  "n_rejected": 6,
+  "attacks": [
+    {
+      "attack": "A1_empty_string",
+      "description": "Degenerate empty output — no info at all.",
+      "reward": 0.0,
+      "breakdown": {
+        "match": 0.0,
+        "format": 0.0,
+        "length": 0.0,
+        "weights": [
+          0.7,
+          0.2,
+          0.1
+        ],
+        "n_tokens": 1
+      },
+      "max_expected": 0.01,
+      "rejected": true
+    },
+    {
+      "attack": "A2_risk_only_short_circuit",
+      "description": "Pure short-circuit: output the ground-truth label with no structure. Length-guard should reject.",
+      "reward": 0.7,
+      "breakdown": {
+        "match": 1.0,
+        "format": 0.0,
+        "length": 0.0,
+        "weights": [
+          0.7,
+          0.2,
+          0.1
+        ],
+        "n_tokens": 1
+      },
+      "max_expected": 0.71,
+      "rejected": true
+    },
+    {
+      "attack": "A3_long_spam_no_json",
+      "description": "Pad with junk to beat length-guard while omitting structured output. Format-guard should reject.",
+      "reward": 0.8,
+      "breakdown": {
+        "match": 1.0,
+        "format": 0.0,
+        "length": 1.0,
+        "weights": [
+          0.7,
+          0.2,
+          0.1
+        ],
+        "n_tokens": 200
+      },
+      "max_expected": 0.81,
+      "rejected": true
+    },
+    {
+      "attack": "A4_over_length_attack",
+      "description": "Generate a massive output to dilute detection. Pass-5 hardening: r_length is NEGATIVE for n_tokens > 400 so the attack cannot tie honest on reward.",
+      "reward": 0.85,
+      "breakdown": {
+        "match": 1.0,
+        "format": 1.0,
+        "length": -0.5,
+        "weights": [
+          0.7,
+          0.2,
+          0.1
+        ],
+        "n_tokens": 500
+      },
+      "max_expected": 0.86,
+      "rejected": true
+    },
+    {
+      "attack": "A5_adjacent_tier_guess",
+      "description": "Always guess the adjacent tier to hedge. Ordinal proximity pays only 0.5 not 1.0 on match.",
+      "reward": 0.65,
+      "breakdown": {
+        "match": 0.5,
+        "format": 1.0,
+        "length": 1.0,
+        "weights": [
+          0.7,
+          0.2,
+          0.1
+        ],
+        "n_tokens": 60
+      },
+      "max_expected": 0.66,
+      "rejected": true
+    },
+    {
+      "attack": "A6_wrong_tier_confident",
+      "description": "Always guess LOW — opposite end of ordinal scale. Far-from-GT reward must be 0.0, not just adjacent.",
+      "reward": 0.3,
+      "breakdown": {
+        "match": 0.0,
+        "format": 1.0,
+        "length": 1.0,
+        "weights": [
+          0.7,
+          0.2,
+          0.1
+        ],
+        "n_tokens": 60
+      },
+      "max_expected": 0.31,
+      "rejected": true
+    }
+  ],
+  "honest_baseline": {
+    "reward": 0.9,
+    "breakdown": {
+      "match": 1.0,
+      "format": 1.0,
+      "length": 0.0,
+      "weights": [
+        0.7,
+        0.2,
+        0.1
+      ],
+      "n_tokens": 20
+    }
+  },
+  "verdict": "All attack vectors score strictly below an honest full answer. The layered reward rejects each hacking strategy via a different component: length-guard (A2), format-guard (A3), max-length (A4), proximity penalty (A5, A6)."
 }