Shaurya-Noodle committed on
Commit
9f8371c
·
verified ·
1 Parent(s): 4b96949

Deploy v6.0-genesis from GitHub main

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +17 -17
  2. .gitattributes +15 -0
  3. .gitignore +46 -14
  4. Dockerfile +32 -32
  5. Dockerfile.damocles +4 -4
  6. FINAL_SUBMIT/ALL_250_FEATURES_LIVE_PROOF.md +3 -3
  7. FINAL_SUBMIT/ARCHITECTURE.md +3 -3
  8. FINAL_SUBMIT/BENCHMARK_REPORT.md +3 -3
  9. FINAL_SUBMIT/COLD_OPEN_OPENING_LINES.md +26 -26
  10. FINAL_SUBMIT/DATASET_CARD.md +3 -3
  11. FINAL_SUBMIT/ENV_CARD.md +1 -1
  12. FINAL_SUBMIT/FEATURE_INVENTORY.md +27 -27
  13. FINAL_SUBMIT/FEATURE_INVENTORY_DI.md +7 -7
  14. FINAL_SUBMIT/FEATURE_INVENTORY_JT.md +13 -13
  15. FINAL_SUBMIT/FEATURE_INVENTORY_UBB.md +40 -40
  16. FINAL_SUBMIT/HACKATHON_README.md +2 -2
  17. FINAL_SUBMIT/JUDGE_FAQ_30.md +1 -1
  18. FINAL_SUBMIT/JUDGE_OBJECTION_HANDBOOK.md +1 -1
  19. FINAL_SUBMIT/MASTER_FEATURE_USECASE_MAP_250.md +10 -10
  20. FINAL_SUBMIT/README.md +12 -12
  21. FINAL_SUBMIT/RELIANCE_HORMUZ_DEEP_DIVE.md +1 -1
  22. FINAL_SUBMIT/REPRODUCE.md +2 -2
  23. FINAL_SUBMIT/REPRODUCE_ONE_BASH.sh +3 -3
  24. FINAL_SUBMIT/RL_GUIDE_59POINT_ALIGNMENT.md +2 -2
  25. FINAL_SUBMIT/docker/Dockerfile.api +25 -0
  26. FINAL_SUBMIT/docker/docker-compose.yml +41 -0
  27. FINAL_SUBMIT/receipts/F2_multi_agent_apple_samsung_toyota.json +116 -116
  28. FINAL_SUBMIT/receipts/ONNX_BUNDLE_MANIFEST.json +71 -71
  29. FINAL_SUBMIT/receipts/R2_SHAP_FAIRNESS_CALIBRATION.json +501 -501
  30. FINAL_SUBMIT/receipts/R3_BIGTFT_INTEGRATION.json +51 -51
  31. FINAL_SUBMIT/receipts/R3_PAST_SELF.json +0 -0
  32. FINAL_SUBMIT/receipts/R3_STACKING_V2.json +1187 -1187
  33. FINAL_SUBMIT/receipts/R3_STACKING_V3_POINTLEVEL.json +226 -226
  34. FINAL_SUBMIT/receipts/R3_TIMESFM_QUANTILE.json +129 -129
  35. FINAL_SUBMIT/receipts/R4_DANGEROUS_V2.json +0 -0
  36. FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_ABLATION.json +396 -396
  37. FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_LIVE.json +63 -63
  38. FINAL_SUBMIT/receipts/R4_FRONTIER_PANEL_V2.json +0 -0
  39. FINAL_SUBMIT/receipts/R5_BEIR_MANUAL.json +1022 -1022
  40. FINAL_SUBMIT/receipts/R5_GRANITE.json +0 -0
  41. FINAL_SUBMIT/receipts/R5_GRANITE_HARD.json +0 -0
  42. FINAL_SUBMIT/receipts/R6_ALGO_COMPARISON.json +71 -71
  43. FINAL_SUBMIT/receipts/R6_AQUA_REGIA_V2.json +859 -859
  44. FINAL_SUBMIT/receipts/R6_GETHSEMANE.json +121 -121
  45. FINAL_SUBMIT/receipts/R6_GETHSEMANE_ONNX_EXPORT.json +24 -24
  46. FINAL_SUBMIT/receipts/R6_PROVIDER_V2.json +329 -329
  47. FINAL_SUBMIT/receipts/R6_PROVIDER_v1_F1.json +1755 -1755
  48. FINAL_SUBMIT/receipts/ablation_matrix.json +94 -94
  49. FINAL_SUBMIT/receipts/adversarial_20_attack_gauntlet.json +216 -216
  50. FINAL_SUBMIT/receipts/adversarial_reward_audit.json +131 -131
.dockerignore CHANGED
@@ -1,17 +1,17 @@
1
- .git
2
- __pycache__
3
- *.pyc
4
- .pytest_cache
5
- .mypy_cache
6
- .ruff_cache
7
- tests/
8
- .env
9
- .env.*
10
- *.egg-info
11
- dist/
12
- build/
13
- .vscode/
14
- .idea/
15
- *.md
16
- !README.md
17
- LICENSE
 
1
+ .git
2
+ __pycache__
3
+ *.pyc
4
+ .pytest_cache
5
+ .mypy_cache
6
+ .ruff_cache
7
+ tests/
8
+ .env
9
+ .env.*
10
+ *.egg-info
11
+ dist/
12
+ build/
13
+ .vscode/
14
+ .idea/
15
+ *.md
16
+ !README.md
17
+ LICENSE
.gitattributes CHANGED
@@ -68,3 +68,18 @@ ShAuRyA_Phoenix/autoresearch_fixed/experiments/seed1000_candidate/policy.zip fil
68
  ShAuRyA_Phoenix/autoresearch_fixed/experiments/seed1001_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
69
  FINAL_SUBMIT/plots/real_reinforce_curve.png filter=lfs diff=lfs merge=lfs -text
70
  FINAL_SUBMIT/plots/real_reinforce_curve_v2.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  ShAuRyA_Phoenix/autoresearch_fixed/experiments/seed1001_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
69
  FINAL_SUBMIT/plots/real_reinforce_curve.png filter=lfs diff=lfs merge=lfs -text
70
  FINAL_SUBMIT/plots/real_reinforce_curve_v2.png filter=lfs diff=lfs merge=lfs -text
71
+ versions/v3_arcadia/plots/aqua_regia/r6_aqua_regia.png filter=lfs diff=lfs merge=lfs -text
72
+ versions/v3_arcadia/plots/dangerous/r4_summary.png filter=lfs diff=lfs merge=lfs -text
73
+ versions/v3_arcadia/plots/dangerous/r4v2_heatmap.png filter=lfs diff=lfs merge=lfs -text
74
+ versions/v3_arcadia/plots/gethsemane/learning_curves.png filter=lfs diff=lfs merge=lfs -text
75
+ versions/v3_arcadia/plots/granite/r5_per_query_heatmap.png filter=lfs diff=lfs merge=lfs -text
76
+ versions/v3_arcadia/plots/hero_result_card.png filter=lfs diff=lfs merge=lfs -text
77
+ versions/v3_arcadia/plots/past_self/r3_summary.png filter=lfs diff=lfs merge=lfs -text
78
+ versions/v4_arcadia_live/features/gcn_attn/gcn_attn_easy_graph.png filter=lfs diff=lfs merge=lfs -text
79
+ versions/v4_arcadia_live/features/gcn_attn/gcn_attn_hard_graph.png filter=lfs diff=lfs merge=lfs -text
80
+ versions/v4_arcadia_live/features/gcn_attn/gcn_attn_medium_graph.png filter=lfs diff=lfs merge=lfs -text
81
+ versions/v4_arcadia_live/scenarios/crisis_library_v2.faiss filter=lfs diff=lfs merge=lfs -text
82
+ versions/v4_arcadia_live/scenarios/crisis_library_v2_emb.npz filter=lfs diff=lfs merge=lfs -text
83
+ versions/v5_phoenix/action_v2/conformal_calibrated.pt filter=lfs diff=lfs merge=lfs -text
84
+ versions/v5_phoenix/autoresearch_fixed/experiments/seed1000_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
85
+ versions/v5_phoenix/autoresearch_fixed/experiments/seed1001_candidate/policy.zip filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -12,6 +12,11 @@ env/
12
 
13
  # Tooling state
14
  .claude/
 
 
 
 
 
15
 
16
  # Stray pip version artifacts
17
  0.*/
@@ -56,21 +61,48 @@ models/
56
  sota-bundle/
57
  external_data/
58
  catboost_info/
59
- v3_arcadia/tools/
60
- v3_arcadia/gguf_out/
61
 
62
  # Auto-generated embedding caches + SB3 best/ dirs
63
- v3_arcadia/checkpoints/granite/corpus_emb_*.npy
64
- v3_arcadia/checkpoints/gethsemane/best_*/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  # v4 arcadia-live auto-generated state
67
- ShAuRyA_Supplymind/realtime/events.db
68
- ShAuRyA_Supplymind/realtime/events.db-journal
69
- ShAuRyA_Supplymind/realtime/library_embeddings.pkl
70
- ShAuRyA_Supplymind/realtime/vessel_snapshot_hormuz.json
71
- ShAuRyA_Supplymind/autoresearch/experiments/
72
- ShAuRyA_Supplymind/autoresearch/state.json
73
- ShAuRyA_Supplymind/autoresearch/stop_autoresearch.flag
74
- ShAuRyA_Supplymind/autoresearch/candidate_train.py.bak
75
- ShAuRyA_Supplymind/autoresearch/AUTORESEARCH_LAB_NOTEBOOK.md
76
- ShAuRyA_Supplymind/autoresearch/AUTORESEARCH_REJECTED.md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Tooling state
14
  .claude/
15
+ .agents/
16
+ .source_cache/
17
+ .tmp_pytest/
18
+ .pytest_cache/
19
+ wandb/
20
 
21
  # Stray pip version artifacts
22
  0.*/
 
61
  sota-bundle/
62
  external_data/
63
  catboost_info/
64
+ versions/v3_arcadia/tools/
65
+ versions/v3_arcadia/gguf_out/
66
 
67
  # Auto-generated embedding caches + SB3 best/ dirs
68
+ versions/v3_arcadia/checkpoints/granite/corpus_emb_*.npy
69
+ versions/v3_arcadia/checkpoints/gethsemane/best_*/
70
+
71
+ # Third-party source checkouts (not our code) — vendored under vendor/
72
+ vendor/
73
+
74
+ # Phoenix v5 auto-generated state (keep source code, exclude heavy + auto-gen)
75
+ versions/v5_phoenix/.venv-roll/
76
+ versions/v5_phoenix/.venv/
77
+ versions/v5_phoenix/experiments/dpo_judge_v1/checkpoints/
78
+ versions/v5_phoenix/experiments/dpo_judge_v1/adapter/
79
+ versions/v5_phoenix/roll_integration/dpo_judge/adapter/
80
+ versions/v5_phoenix/**/__pycache__/
81
+ versions/v5_phoenix/**/*.pyc
82
+ versions/v5_phoenix/**/*.log
83
+ versions/v5_phoenix/receipts_v2/*.stdout
84
 
85
  # v4 arcadia-live auto-generated state
86
+ versions/v4_arcadia_live/realtime/events.db
87
+ versions/v4_arcadia_live/realtime/events.db-journal
88
+ versions/v4_arcadia_live/realtime/library_embeddings.pkl
89
+ versions/v4_arcadia_live/realtime/vessel_snapshot_hormuz.json
90
+ versions/v4_arcadia_live/autoresearch/experiments/
91
+ versions/v4_arcadia_live/autoresearch/stop_autoresearch.flag
92
+ versions/v4_arcadia_live/autoresearch/candidate_train.py.bak
93
+ # Lab notebook, rejected log, and state.json ARE committed — they document
94
+ # real autoresearch execution history (provenance for judges).
95
+
96
+ # OpenRouter usage audit log (per-call timestamps, no keys)
97
+ .openrouter_usage.jsonl
98
+ # Frontier panel run intermediate caches
99
+ .openrouter_cache/
100
+ lora_stdout.log
101
+
102
+ # Pass 8 — large harvest data (regenerable via train.py harvest_trajectories)
103
+ versions/v5_phoenix/experiments/rap_xc_v1/transitions.npz
104
+ versions/v5_phoenix/experiments/rap_xc_v1/transitions_synth.npz
105
+ versions/v5_phoenix/experiments/rap_xc_v1/smoke*.npz
106
+ versions/v5_phoenix/experiments/rap_xc_v1/rapxc_synth.pt
107
+ versions/v5_phoenix/experiments/rap_xc_v1/*.log
108
+ tests/receipts/*.log
Dockerfile CHANGED
@@ -1,32 +1,32 @@
1
- # ── Stage 1: Install dependencies ──────────────────────────────────
2
- FROM python:3.11-slim AS builder
3
-
4
- WORKDIR /build
5
- COPY requirements.txt .
6
- RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
7
-
8
- # ── Stage 2: Production image ─────────────────────────────────────
9
- FROM python:3.11-slim
10
-
11
- # Non-root user for security (UID 1000 is conventional)
12
- RUN useradd --create-home --uid 1000 appuser
13
-
14
- WORKDIR /app
15
-
16
- # Copy installed packages from builder
17
- COPY --from=builder /install /usr/local
18
-
19
- # Copy application code
20
- COPY . .
21
-
22
- # Own the app directory
23
- RUN chown -R appuser:appuser /app
24
-
25
- USER appuser
26
-
27
- EXPOSE 8000
28
-
29
- HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
30
- CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
31
-
32
- CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
 
1
+ # ── Stage 1: Install dependencies ──────────────────────────────────
2
+ FROM python:3.11-slim AS builder
3
+
4
+ WORKDIR /build
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
7
+
8
+ # ── Stage 2: Production image ─────────────────────────────────────
9
+ FROM python:3.11-slim
10
+
11
+ # Non-root user for security (UID 1000 is conventional)
12
+ RUN useradd --create-home --uid 1000 appuser
13
+
14
+ WORKDIR /app
15
+
16
+ # Copy installed packages from builder
17
+ COPY --from=builder /install /usr/local
18
+
19
+ # Copy application code
20
+ COPY . .
21
+
22
+ # Own the app directory
23
+ RUN chown -R appuser:appuser /app
24
+
25
+ USER appuser
26
+
27
+ EXPOSE 8000
28
+
29
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
30
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
31
+
32
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
Dockerfile.damocles CHANGED
@@ -1,5 +1,5 @@
1
  # SupplyMind v3.0-arcadia — Damocles API (FastAPI)
2
- # Deploys v3_arcadia/90_damocles/app.py with /assess, /forecast, /rag, /rl/act, /health
3
  #
4
  # Build: docker build -f Dockerfile.damocles -t supplymind-damocles:v3.0-arcadia .
5
  # Run: docker run -p 8765:8765 supplymind-damocles:v3.0-arcadia
@@ -18,10 +18,10 @@ COPY requirements-damocles.txt .
18
  RUN pip install --no-cache-dir -r requirements-damocles.txt
19
 
20
  # App code
21
- COPY v3_arcadia/90_damocles/ /app/v3_arcadia/90_damocles/
22
- COPY v3_arcadia/checkpoints/granite/corpus_chunks.pkl /app/v3_arcadia/checkpoints/granite/
23
  # Embeddings loaded lazily from cached .npy (mounted at runtime or fetched via env)
24
- COPY v3_arcadia/checkpoints/gethsemane/ppo_easy_typhoon_response.onnx /app/v3_arcadia/checkpoints/gethsemane/
25
  COPY models/mxbai-embed-large/ /app/models/mxbai-embed-large/
26
 
27
  # Healthcheck
 
1
  # SupplyMind v3.0-arcadia — Damocles API (FastAPI)
2
+ # Deploys versions/v3_arcadia/90_damocles/app.py with /assess, /forecast, /rag, /rl/act, /health
3
  #
4
  # Build: docker build -f Dockerfile.damocles -t supplymind-damocles:v3.0-arcadia .
5
  # Run: docker run -p 8765:8765 supplymind-damocles:v3.0-arcadia
 
18
  RUN pip install --no-cache-dir -r requirements-damocles.txt
19
 
20
  # App code
21
+ COPY versions/v3_arcadia/90_damocles/ /app/versions/v3_arcadia/90_damocles/
22
+ COPY versions/v3_arcadia/checkpoints/granite/corpus_chunks.pkl /app/versions/v3_arcadia/checkpoints/granite/
23
  # Embeddings loaded lazily from cached .npy (mounted at runtime or fetched via env)
24
+ COPY versions/v3_arcadia/checkpoints/gethsemane/ppo_easy_typhoon_response.onnx /app/versions/v3_arcadia/checkpoints/gethsemane/
25
  COPY models/mxbai-embed-large/ /app/models/mxbai-embed-large/
26
 
27
  # Healthcheck
FINAL_SUBMIT/ALL_250_FEATURES_LIVE_PROOF.md CHANGED
@@ -25,9 +25,9 @@ Status legend:
25
  | A7 | 30-step horizon | `server/supply_environment.py` | reset config | ✅ |
26
  | A8 | $5–15M budget tasks | `data/disruptions.json` | task manifest | ✅ |
27
  | A9 | TSMC/Samsung coords | `data/companies_real.json` | n_real_nodes=40 | ✅ |
28
- | A10 | 8-event crisis library v1 | `ShAuRyA_Supplymind/realtime/crisis_library.py` | 8 events indexed | ✅ |
29
- | A11 | Wordle RLVR mini-env | `ShAuRyA_Phoenix/wordle_env/env.py` | `wordle_real_reinforce_v2_curve.json` | ✅ |
30
- | A12 | RLVE adaptive curriculum | `ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py` | `rlve_curriculum_smoke.json` | ✅ |
31
 
32
  ## B · Reward engineering (14) — 14/14 ✅
33
 
 
25
  | A7 | 30-step horizon | `server/supply_environment.py` | reset config | ✅ |
26
  | A8 | $5–15M budget tasks | `data/disruptions.json` | task manifest | ✅ |
27
  | A9 | TSMC/Samsung coords | `data/companies_real.json` | n_real_nodes=40 | ✅ |
28
+ | A10 | 8-event crisis library v1 | `versions/v4_arcadia_live/realtime/crisis_library.py` | 8 events indexed | ✅ |
29
+ | A11 | Wordle RLVR mini-env | `versions/v5_phoenix/wordle_env/env.py` | `wordle_real_reinforce_v2_curve.json` | ✅ |
30
+ | A12 | RLVE adaptive curriculum | `versions/v5_phoenix/wordle_env/rlve_curriculum.py` | `rlve_curriculum_smoke.json` | ✅ |
31
 
32
  ## B · Reward engineering (14) — 14/14 ✅
33
 
FINAL_SUBMIT/ARCHITECTURE.md CHANGED
@@ -90,7 +90,7 @@
90
 
91
  Plus 4 base wrappers: `qwen25-14b-local`, `qwen25-coder-local`, `mistral-nemo-local`, `deepseek-r1-local-q4`.
92
 
93
- 5 Modelfiles committed at `rl/lora/Modelfile`, `Modelfile.v2-v4`, `ShAuRyA_Supplymind/features/Modelfile.analyst_v5`.
94
 
95
  ### 4. LoRA fine-tuning track
96
 
@@ -100,7 +100,7 @@ Qwen-2.5-1.5B → PEFT/LoRA → 4-bit NF4 (bitsandbytes) → TRL → 225 instruc
100
 
101
  Qwen-2.5-3B-Instruct base. 21 preference pairs from R4 ground truth at `dpo_judge/data/preference_pairs.jsonl`. DPO sigmoid loss, β=0.1, LoRA r=8 / α=16, hf strategy (single-GPU 12GB), per_device_train_batch_size=1, gradient_accumulation_steps=4, lr=5e-5, save_adapter_only.
102
 
103
- 5 trainers in `ShAuRyA_Phoenix/roll_integration/dpo_judge/`:
104
  - `train_dpo_trl.py` — TRL standalone (ROLL-free fallback)
105
  - `train_dpo_roll.py` — ROLL-integrated
106
  - `train_grpo_env.py` — GRPO multi-turn
@@ -178,7 +178,7 @@ dag_feats (80-d) ──→ DAGEncoder
178
 
179
  ### 14. Live data layer (20 sources)
180
 
181
- `ShAuRyA_Supplymind/realtime/orchestrator_v2.py` fans out to 20 sources via ThreadPoolExecutor with per-source timeouts and graceful failure:
182
 
183
  NewsAPI · GDELT · GDELT-Conflict · GDELT-Humanitarian · USGS earthquakes · NOAA NDBC buoys · NOAA Tides · NASA EONET · NASA FIRMS fires · EIA Brent · EIA WTI · EIA natgas · MarineTraffic AIS · Global Fishing Watch · World Bank commodities · WHO DON · SEC EDGAR · CISA KEV · OFAC sanctions · Wikipedia pageviews · HN tech ticker
184
 
 
90
 
91
  Plus 4 base wrappers: `qwen25-14b-local`, `qwen25-coder-local`, `mistral-nemo-local`, `deepseek-r1-local-q4`.
92
 
93
+ 5 Modelfiles committed at `rl/lora/Modelfile`, `Modelfile.v2-v4`, `versions/v4_arcadia_live/features/Modelfile.analyst_v5`.
94
 
95
  ### 4. LoRA fine-tuning track
96
 
 
100
 
101
  Qwen-2.5-3B-Instruct base. 21 preference pairs from R4 ground truth at `dpo_judge/data/preference_pairs.jsonl`. DPO sigmoid loss, β=0.1, LoRA r=8 / α=16, hf strategy (single-GPU 12GB), per_device_train_batch_size=1, gradient_accumulation_steps=4, lr=5e-5, save_adapter_only.
102
 
103
+ 5 trainers in `versions/v5_phoenix/roll_integration/dpo_judge/`:
104
  - `train_dpo_trl.py` — TRL standalone (ROLL-free fallback)
105
  - `train_dpo_roll.py` — ROLL-integrated
106
  - `train_grpo_env.py` — GRPO multi-turn
 
178
 
179
  ### 14. Live data layer (20 sources)
180
 
181
+ `versions/v4_arcadia_live/realtime/orchestrator_v2.py` fans out to 20 sources via ThreadPoolExecutor with per-source timeouts and graceful failure:
182
 
183
  NewsAPI · GDELT · GDELT-Conflict · GDELT-Humanitarian · USGS earthquakes · NOAA NDBC buoys · NOAA Tides · NASA EONET · NASA FIRMS fires · EIA Brent · EIA WTI · EIA natgas · MarineTraffic AIS · Global Fishing Watch · World Bank commodities · WHO DON · SEC EDGAR · CISA KEV · OFAC sanctions · Wikipedia pageviews · HN tech ticker
184
 
FINAL_SUBMIT/BENCHMARK_REPORT.md CHANGED
@@ -69,7 +69,7 @@ Tested on 32k held-out training rows of real harvested transitions. The split-co
69
 
70
  ## 4. RAP-XC training on real harvest
71
 
72
- `ShAuRyA_Phoenix/rap_xc/train.py` → `ShAuRyA_Phoenix/experiments/rap_xc_v1/rapxc.pt`
73
 
74
  | Metric | Result |
75
  |---|---|
@@ -83,7 +83,7 @@ Tested on 32k held-out training rows of real harvested transitions. The split-co
83
 
84
  ## 5. HetTemporalGAT vs v1 GCN cascade
85
 
86
- `ShAuRyA_Phoenix/gnn_v2/train_hetgat.py` → `ShAuRyA_Phoenix/experiments/hetgat_v1/report.json`
87
 
88
  Task: arrival-time regression on R6 cascade graphs (real semiconductor supply-chain).
89
 
@@ -111,7 +111,7 @@ Strong cross-corpus stability — same panel produces near-identical α on indep
111
 
112
  ## 7. Tohoku 2011 Platinum counterfactual replication
113
 
114
- `ShAuRyA_Phoenix/counterfactual_v2/platinum.py` synthetic-control method on real Tohoku 2011 economic data.
115
 
116
  | Metric | Value |
117
  |---|---|
 
69
 
70
  ## 4. RAP-XC training on real harvest
71
 
72
+ `versions/v5_phoenix/rap_xc/train.py` → `versions/v5_phoenix/experiments/rap_xc_v1/rapxc.pt`
73
 
74
  | Metric | Result |
75
  |---|---|
 
83
 
84
  ## 5. HetTemporalGAT vs v1 GCN cascade
85
 
86
+ `versions/v5_phoenix/gnn_v2/train_hetgat.py` → `versions/v5_phoenix/experiments/hetgat_v1/report.json`
87
 
88
  Task: arrival-time regression on R6 cascade graphs (real semiconductor supply-chain).
89
 
 
111
 
112
  ## 7. Tohoku 2011 Platinum counterfactual replication
113
 
114
+ `versions/v5_phoenix/counterfactual_v2/platinum.py` synthetic-control method on real Tohoku 2011 economic data.
115
 
116
  | Metric | Value |
117
  |---|---|
FINAL_SUBMIT/COLD_OPEN_OPENING_LINES.md CHANGED
@@ -1,26 +1,26 @@
1
- # COLD OPEN -- opening lines for judge pitch (<= 8 sec each)
2
-
3
- ## Three variants depending on judge persona
4
-
5
- ### A -- Technical depth judge (academic/research)
6
- > "REINFORCE on Wordle: 100% solve rate, Wilcoxon p=1.87e-34, Cohen's d=3.89, 9.8 seconds on a single CPU thread. Same loop drives a 280-action supply-chain RL env with 1500-event EMDAT RAG corpus and conformal action filter at 0.9001 empirical coverage."
7
-
8
- ### B -- Industry pragmatist (engineer/PM)
9
- > "If Hormuz closes tomorrow, India loses INR X-trillion in 30 days. Watch what one LLM, RL-trained, does about it -- live API calls, real EIA price data, real NASA fire feed, end-to-end in 7 seconds with a sha256 receipt for every claim."
10
-
11
- ### C -- Storyteller (DevRel/PM)
12
- > "Most hackathon entries train on Wordle. We ALSO train on Wordle -- and use the same canonical loop on a real-world supply-chain crisis simulator with 9 live data feeds. One submission, all three hackathon themes, every claim sha256-replayable."
13
-
14
- ## Use-case map
15
-
16
- | Persona | Likely panel weight | Use line |
17
- |---|---|---|
18
- | Academic/research | 40% (per VICTORY_CALCULUS) | A |
19
- | Industry/PM | 35% | B |
20
- | Storyteller/DevRel | 25% | C |
21
-
22
- ## Backup ultra-short variants (<= 4 sec)
23
-
24
- - "100% solve, p=1e-34, 9.8 seconds, CPU only."
25
- - "9 live APIs. 1500 events. 7-second war room."
26
- - "Three themes. One env. Every claim hashed."
 
1
+ # COLD OPEN -- opening lines for judge pitch (<= 8 sec each)
2
+
3
+ ## Three variants depending on judge persona
4
+
5
+ ### A -- Technical depth judge (academic/research)
6
+ > "REINFORCE on Wordle: 100% solve rate, Wilcoxon p=1.87e-34, Cohen's d=3.89, 9.8 seconds on a single CPU thread. Same loop drives a 280-action supply-chain RL env with 1500-event EMDAT RAG corpus and conformal action filter at 0.9001 empirical coverage."
7
+
8
+ ### B -- Industry pragmatist (engineer/PM)
9
+ > "If Hormuz closes tomorrow, India loses INR X-trillion in 30 days. Watch what one LLM, RL-trained, does about it -- live API calls, real EIA price data, real NASA fire feed, end-to-end in 7 seconds with a sha256 receipt for every claim."
10
+
11
+ ### C -- Storyteller (DevRel/PM)
12
+ > "Most hackathon entries train on Wordle. We ALSO train on Wordle -- and use the same canonical loop on a real-world supply-chain crisis simulator with 9 live data feeds. One submission, all three hackathon themes, every claim sha256-replayable."
13
+
14
+ ## Use-case map
15
+
16
+ | Persona | Likely panel weight | Use line |
17
+ |---|---|---|
18
+ | Academic/research | 40% (per VICTORY_CALCULUS) | A |
19
+ | Industry/PM | 35% | B |
20
+ | Storyteller/DevRel | 25% | C |
21
+
22
+ ## Backup ultra-short variants (<= 4 sec)
23
+
24
+ - "100% solve, p=1e-34, 9.8 seconds, CPU only."
25
+ - "9 live APIs. 1500 events. 7-second war room."
26
+ - "Three themes. One env. Every claim hashed."
FINAL_SUBMIT/DATASET_CARD.md CHANGED
@@ -23,11 +23,11 @@
23
  ## Static datasets
24
  | Name | Size | Description | Path |
25
  |------|------|-------------|------|
26
- | EMDAT crisis library v2 | ~1500 events | historical disaster impact records | `ShAuRyA_Supplymind/scenarios/` |
27
- | Hand-curated 8 events | 8 events | Iran/Israel/Hormuz/Red-Sea/Suez/Taiwan/Thailand/Tohoku | `ShAuRyA_Supplymind/realtime/crisis_library.py` |
28
  | WTI crude time-series | 2,818 windows | DCOILWTICO from FRED | TFT training |
29
  | Real company nodes | 40 nodes | TSMC/Samsung/Toyota etc with real coords | `data/companies_real.json` |
30
- | Wordle dictionary | 102 words | 5-letter common words (tier-0 baseline) | `ShAuRyA_Phoenix/wordle_env/env.py` |
31
  | Wordle tier 1+ | +200/+150/+80 words | RLVE expansion tiers | `rlve_curriculum.py` |
32
  | RAG corpus | 6,483 chunks | wiki_crisis 564 + sec_10k 5790 + policy 129 | `R5_GRANITE.json` |
33
  | Conformal calibration NLLs | 5,696 (v2) / 16,000 (v3) | nonconformity scores | `conformal_*.json` |
 
23
  ## Static datasets
24
  | Name | Size | Description | Path |
25
  |------|------|-------------|------|
26
+ | EMDAT crisis library v2 | ~1500 events | historical disaster impact records | `versions/v4_arcadia_live/scenarios/` |
27
+ | Hand-curated 8 events | 8 events | Iran/Israel/Hormuz/Red-Sea/Suez/Taiwan/Thailand/Tohoku | `versions/v4_arcadia_live/realtime/crisis_library.py` |
28
  | WTI crude time-series | 2,818 windows | DCOILWTICO from FRED | TFT training |
29
  | Real company nodes | 40 nodes | TSMC/Samsung/Toyota etc with real coords | `data/companies_real.json` |
30
+ | Wordle dictionary | 102 words | 5-letter common words (tier-0 baseline) | `versions/v5_phoenix/wordle_env/env.py` |
31
  | Wordle tier 1+ | +200/+150/+80 words | RLVE expansion tiers | `rlve_curriculum.py` |
32
  | RAG corpus | 6,483 chunks | wiki_crisis 564 + sec_10k 5790 + policy 129 | `R5_GRANITE.json` |
33
  | Conformal calibration NLLs | 5,696 (v2) / 16,000 (v3) | nonconformity scores | `conformal_*.json` |
FINAL_SUBMIT/ENV_CARD.md CHANGED
@@ -45,7 +45,7 @@
45
  - **hard_cascading_crisis** — 40 nodes, 60 days, $15M budget, cascading
46
 
47
  ## Wordle Companion Environment
48
- - **Class**: `ShAuRyA_Phoenix.wordle_env.env`
49
  - **Type**: Canonical RLVR mini-env
50
  - **Action space**: `Discrete(102)` (102-word baseline) or restricted by curriculum tier
51
  - **State**: 188-dim (rich encoding per `final_real_reinforce_wordle_v2.py`)
 
45
  - **hard_cascading_crisis** — 40 nodes, 60 days, $15M budget, cascading
46
 
47
  ## Wordle Companion Environment
48
+ - **Class**: `versions.v5_phoenix.wordle_env.env`
49
  - **Type**: Canonical RLVR mini-env
50
  - **Action space**: `Discrete(102)` (102-word baseline) or restricted by curriculum tier
51
  - **State**: 188-dim (rich encoding per `final_real_reinforce_wordle_v2.py`)
FINAL_SUBMIT/FEATURE_INVENTORY.md CHANGED
@@ -8,9 +8,9 @@ Verification: every bullet point in the project plan mapped to file:line.
8
 
9
  | Component | Previous | Now wired in |
10
  |---|---|---|
11
- | Chronos-Bolt-base | PARTIAL (verify only) | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:53-71` |
12
- | TimesFM-2 | PARTIAL (verify only) | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:74-99` |
13
- | TabPFN-v2 regressor | PARTIAL (verify only) | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:101-145` |
14
 
15
  Closed Brent backtest gap from 6/8 to **8/8 within ±30%** (median rel err 3.3%).
16
  See `tests/receipts/ensemble_brent_validation.json`.
@@ -22,12 +22,12 @@ See `tests/receipts/ensemble_brent_validation.json`.
22
  | Bullet | Status | Path(s) | Note |
23
  |---|---|---|---|
24
  | supplymind-analyst:v1 | MISSING | — | only v2-v5 retained; v1 superseded |
25
- | supplymind-analyst:v2-v5 | PRESENT | `rl/lora/Modelfile.v2:1-20`, `Modelfile.v3:1-20`, `Modelfile.v4:1-20`, `ShAuRyA_Supplymind/features/Modelfile.analyst_v5:1-20` | 4 versions |
26
- | qwen25-14b-local Modelfile | PRESENT | `v3_arcadia/00_emergence/qwen25-14b.Modelfile:1-19` | Q4_K_M |
27
- | qwen25-coder-local Modelfile | PRESENT | `v3_arcadia/00_emergence/qwen25-coder-14b.Modelfile:1-19` | JSON-mode |
28
- | mistral-nemo-local Modelfile | PRESENT | `v3_arcadia/00_emergence/mistral-nemo.Modelfile:1-18` | num_ctx 32768 |
29
  | deepseek-r1-local-q4 Modelfile | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | Q4_K_M reference |
30
- | 5 Modelfile files (rl/lora/*) | PRESENT | `rl/lora/Modelfile, .v2, .v3, .v4` + `ShAuRyA_Supplymind/features/Modelfile.analyst_v5` | All 5 present |
31
 
32
  ## A.2 Modelfile Crafting
33
 
@@ -54,7 +54,7 @@ See `tests/receipts/ensemble_brent_validation.json`.
54
 
55
  | Bullet | Status | Path(s) | Note |
56
  |---|---|---|---|
57
- | `dpo_judge/*` directory | PRESENT | `ShAuRyA_Phoenix/roll_integration/dpo_judge/` | 6 files |
58
  | `prepare_preference_data.py` | PRESENT | `dpo_judge/prepare_preference_data.py:1-50+` | DPO pair builder |
59
  | `train_dpo_trl.py` | PRESENT | `dpo_judge/train_dpo_trl.py:1-50+` | TRL trainer |
60
  | `train_dpo_roll.py` | PRESENT | `dpo_judge/train_dpo_roll.py:1-30+` | ROLL-integrated |
@@ -81,7 +81,7 @@ See `tests/receipts/ensemble_brent_validation.json`.
81
  |---|---|---|---|
82
  | Q4_K_M references | PRESENT | `mistral-nemo.Modelfile:1`, `qwen25-14b.Modelfile:1`, `qwen25-coder-14b.Modelfile:1` | all 3 specify q4km |
83
  | `OLLAMA_MAX_LOADED_MODELS=1` | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | VRAM discipline |
84
- | `convert_bge_to_safetensors.py` | PRESENT | `v3_arcadia/00_emergence/convert_bge_to_safetensors.py:1-45` | CVE-2025-32434 workaround |
85
  | 2GB safetensors output | PRESENT | `models/bge-m3/model.safetensors` | 2.2GB verified |
86
 
87
  ## B. 13 Foundation Models
@@ -106,14 +106,14 @@ See `tests/receipts/ensemble_brent_validation.json`.
106
 
107
  | Script | Status | Path |
108
  |---|---|---|
109
- | `verify_qwen14b.py` | PRESENT | `v3_arcadia/00_emergence/verify_qwen14b.py` |
110
- | `verify_mistral_nemo.py` | PRESENT | `v3_arcadia/00_emergence/verify_mistral_nemo.py` |
111
- | `verify_qwen_coder.py` | PRESENT | `v3_arcadia/00_emergence/verify_qwen_coder.py` |
112
- | `verify_qwen_vl.py` | PRESENT | `v3_arcadia/00_emergence/verify_qwen_vl.py` |
113
- | `verify_tabpfn.py` | PRESENT | `v3_arcadia/00_emergence/verify_tabpfn.py` |
114
- | `verify_timesfm.py` | PRESENT | `v3_arcadia/00_emergence/verify_timesfm.py` |
115
- | `verify_embedders_chronos.py` | PRESENT | `v3_arcadia/00_emergence/verify_embedders_chronos.py` |
116
- | `r1_qwen_vl_downstream.py` | PRESENT | `v3_arcadia/00_emergence/r1_qwen_vl_downstream.py` |
117
 
118
  ## C.1 Game-Engine Tasks & Action Space
119
 
@@ -203,19 +203,19 @@ See `tests/receipts/ensemble_brent_validation.json`.
203
 
204
  | Component | Path | Purpose |
205
  |---|---|---|
206
- | Hormuz War Room orchestrator | `ShAuRyA_Supplymind/realtime/hormuz_war_room_router.py` | `/demo/hormuz-war-room` POST + UI route |
207
- | India 7-sector exposure | `ShAuRyA_Supplymind/scenarios/india_industry_exposure.py` | 7 cited sectors + deterministic scorer |
208
- | Gulf 7-sector exposure | `ShAuRyA_Supplymind/scenarios/gulf_industry_exposure.py` | 7 cited sectors + bypass-credit scorer |
209
- | Hormuz chokepoint graph | `ShAuRyA_Supplymind/scenarios/hormuz_chokepoint_graph.py` | 14 nodes + 18 edges + 5 IEA facts |
210
- | OpenRouter 6-judge cross-check | `ShAuRyA_Supplymind/realtime/openrouter_war_room_panel.py` | gpt-oss-120b, gemma, glm, minimax, nemotron, gemma-26b |
211
  | War-Room dashboard HTML | `server/static/hormuz_war_room.html` | dark-mode 6-panel UI |
212
  | War-Room validation harness | `scripts/validate_war_room.py` | 8-event historical backtest |
213
- | Ensemble Brent forecaster | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py` | Chronos+TimesFM+TabPFN, 8/8 ±30% |
214
  | Ensemble Brent validator | `scripts/validate_ensemble_brent.py` | 8-event closed-form backtest |
215
  | Master demo HTML | `server/static/master.html` | 9-card live integration page |
216
- | RAP-XC weights | `ShAuRyA_Phoenix/experiments/rap_xc_v1/rapxc.pt` | 3.14M params, BC 5.62→0.23 |
217
- | Conformal weights | `ShAuRyA_Phoenix/action_v2/conformal_calibrated.pt` | α=0.1, coverage 0.9001 |
218
- | HetGAT report | `ShAuRyA_Phoenix/experiments/hetgat_v1/report.json` | +7.77/+12.15/+10.03% |
219
 
220
  ## API Keys (every key reaches a UI element)
221
 
 
8
 
9
  | Component | Previous | Now wired in |
10
  |---|---|---|
11
+ | Chronos-Bolt-base | PARTIAL (verify only) | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:53-71` |
12
+ | TimesFM-2 | PARTIAL (verify only) | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:74-99` |
13
+ | TabPFN-v2 regressor | PARTIAL (verify only) | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:101-145` |
14
 
15
  Closed Brent backtest gap from 6/8 to **8/8 within ±30%** (median rel err 3.3%).
16
  See `tests/receipts/ensemble_brent_validation.json`.
 
22
  | Bullet | Status | Path(s) | Note |
23
  |---|---|---|---|
24
  | supplymind-analyst:v1 | MISSING | — | only v2-v5 retained; v1 superseded |
25
+ | supplymind-analyst:v2-v5 | PRESENT | `rl/lora/Modelfile.v2:1-20`, `Modelfile.v3:1-20`, `Modelfile.v4:1-20`, `versions/v4_arcadia_live/features/Modelfile.analyst_v5:1-20` | 4 versions |
26
+ | qwen25-14b-local Modelfile | PRESENT | `versions/v3_arcadia/00_emergence/qwen25-14b.Modelfile:1-19` | Q4_K_M |
27
+ | qwen25-coder-local Modelfile | PRESENT | `versions/v3_arcadia/00_emergence/qwen25-coder-14b.Modelfile:1-19` | JSON-mode |
28
+ | mistral-nemo-local Modelfile | PRESENT | `versions/v3_arcadia/00_emergence/mistral-nemo.Modelfile:1-18` | num_ctx 32768 |
29
  | deepseek-r1-local-q4 Modelfile | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | Q4_K_M reference |
30
+ | 5 Modelfile files (rl/lora/*) | PRESENT | `rl/lora/Modelfile, .v2, .v3, .v4` + `versions/v4_arcadia_live/features/Modelfile.analyst_v5` | All 5 present |
31
 
32
  ## A.2 Modelfile Crafting
33
 
 
54
 
55
  | Bullet | Status | Path(s) | Note |
56
  |---|---|---|---|
57
+ | `dpo_judge/*` directory | PRESENT | `versions/v5_phoenix/roll_integration/dpo_judge/` | 6 files |
58
  | `prepare_preference_data.py` | PRESENT | `dpo_judge/prepare_preference_data.py:1-50+` | DPO pair builder |
59
  | `train_dpo_trl.py` | PRESENT | `dpo_judge/train_dpo_trl.py:1-50+` | TRL trainer |
60
  | `train_dpo_roll.py` | PRESENT | `dpo_judge/train_dpo_roll.py:1-30+` | ROLL-integrated |
 
81
  |---|---|---|---|
82
  | Q4_K_M references | PRESENT | `mistral-nemo.Modelfile:1`, `qwen25-14b.Modelfile:1`, `qwen25-coder-14b.Modelfile:1` | all 3 specify Q4_K_M |
83
  | `OLLAMA_MAX_LOADED_MODELS=1` | PRESENT | `docs/OLLAMA_FINE_TUNING_FINAL_UPGRADE.md` | VRAM discipline |
84
+ | `convert_bge_to_safetensors.py` | PRESENT | `versions/v3_arcadia/00_emergence/convert_bge_to_safetensors.py:1-45` | CVE-2025-32434 workaround |
85
  | 2GB safetensors output | PRESENT | `models/bge-m3/model.safetensors` | 2.2GB verified |
86
 
87
  ## B. 13 Foundation Models
 
106
 
107
  | Script | Status | Path |
108
  |---|---|---|
109
+ | `verify_qwen14b.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_qwen14b.py` |
110
+ | `verify_mistral_nemo.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_mistral_nemo.py` |
111
+ | `verify_qwen_coder.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_qwen_coder.py` |
112
+ | `verify_qwen_vl.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_qwen_vl.py` |
113
+ | `verify_tabpfn.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_tabpfn.py` |
114
+ | `verify_timesfm.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_timesfm.py` |
115
+ | `verify_embedders_chronos.py` | PRESENT | `versions/v3_arcadia/00_emergence/verify_embedders_chronos.py` |
116
+ | `r1_qwen_vl_downstream.py` | PRESENT | `versions/v3_arcadia/00_emergence/r1_qwen_vl_downstream.py` |
117
 
118
  ## C.1 Game-Engine Tasks & Action Space
119
 
 
203
 
204
  | Component | Path | Purpose |
205
  |---|---|---|
206
+ | Hormuz War Room orchestrator | `versions/v4_arcadia_live/realtime/hormuz_war_room_router.py` | `/demo/hormuz-war-room` POST + UI route |
207
+ | India 7-sector exposure | `versions/v4_arcadia_live/scenarios/india_industry_exposure.py` | 7 cited sectors + deterministic scorer |
208
+ | Gulf 7-sector exposure | `versions/v4_arcadia_live/scenarios/gulf_industry_exposure.py` | 7 cited sectors + bypass-credit scorer |
209
+ | Hormuz chokepoint graph | `versions/v4_arcadia_live/scenarios/hormuz_chokepoint_graph.py` | 14 nodes + 18 edges + 5 IEA facts |
210
+ | OpenRouter 6-judge cross-check | `versions/v4_arcadia_live/realtime/openrouter_war_room_panel.py` | gpt-oss-120b, gemma, glm, minimax, nemotron, gemma-26b |
211
  | War-Room dashboard HTML | `server/static/hormuz_war_room.html` | dark-mode 6-panel UI |
212
  | War-Room validation harness | `scripts/validate_war_room.py` | 8-event historical backtest |
213
+ | Ensemble Brent forecaster | `versions/v5_phoenix/forecast_v2/ensemble_brent.py` | Chronos+TimesFM+TabPFN, 8/8 ±30% |
214
  | Ensemble Brent validator | `scripts/validate_ensemble_brent.py` | 8-event closed-form backtest |
215
  | Master demo HTML | `server/static/master.html` | 9-card live integration page |
216
+ | RAP-XC weights | `versions/v5_phoenix/experiments/rap_xc_v1/rapxc.pt` | 3.14M params, BC 5.62→0.23 |
217
+ | Conformal weights | `versions/v5_phoenix/action_v2/conformal_calibrated.pt` | α=0.1, coverage 0.9001 |
218
+ | HetGAT report | `versions/v5_phoenix/experiments/hetgat_v1/report.json` | +7.77/+12.15/+10.03% |
219
 
220
  ## API Keys (every key reaches a UI element)
221
 
FINAL_SUBMIT/FEATURE_INVENTORY_DI.md CHANGED
@@ -2,7 +2,7 @@
2
 
3
  Bullet-by-bullet status across the 6 sections D, E, F, G, H, I (~140 bullets). Each row links to a file or a JSON receipt that proves the claim.
4
 
5
- **Note:** receipts named `R*_*.json` are mirrored from `v3_arcadia/results/` to `FINAL_SUBMIT/receipts/`.
6
 
7
  ---
8
 
@@ -86,7 +86,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
86
  |---|---|---|---|
87
  | 44 | Custom TFT 513K params on 3-target FRED | ✅ | `tft_real_metrics.json: params, test_mae_p50: {DCOILWTICO, PCOPPUSDM, PPICMM}` |
88
  | 45 | Custom TFT 90K params on real WTI MAE $7.83/bbl | ✅ | `tft_v2_metrics.json: params, test_mae_p50.DCOILWTICO` |
89
- | 46 | Chronos-Bolt 14-step quantile [0.1, 0.5, 0.9] | ✅ | `ShAuRyA_Phoenix/forecast_v2/ensemble_brent.py:53-71` (pass-10), `R3_TIMESFM_QUANTILE.json` |
90
  | 47 | TimesFM-2 + synthesized quantile via residual regression | ✅ | `forecast_v2/ensemble_brent.py:74-99`, `R3_TIMESFM_QUANTILE.json` |
91
  | 48 | Prophet weekly+yearly | ✅ | `R3_PAST_SELF.json` ensemble row (Prophet seasonality) |
92
  | 49 | ARIMA(5,1,0) classical baseline | ✅ | `R3_PAST_SELF.json` ensemble row |
@@ -98,7 +98,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
98
  | 55 | 8 FRED targets (WTI, copper, EUR/USD, JPY/USD, CNY/USD, KRW/USD, TWD/USD, PPICMM) | ✅ | `tft_v2_metrics.json: targets` (7 targets confirmed in train_tft_real.py: DCOILWTICO/PCOPPUSDM/DEXTAUS/DEXKOUS/DEXJPUS/DEXUSEU/DEXCHUS); 8th = PPICMM in tft_real_metrics |
99
  | 56 | 3 horizons (7, 14, 28 days) | ⚠️ | `train_tft_real.py:39 HORIZON=14`; 7 and 28-day variants in `R3_PAST_SELF` rolling_backtest fields |
100
  | 57 | PICP@80/90/95% calibration | ✅ | `R3_PAST_SELF.json: per_target_horizon.picp_*` |
101
- | 58 | Per-horizon split-conformal (Foygel Barber 2022) | ✅ | `ShAuRyA_Phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
102
  | 59 | TimesFM-CP residual quantile regression | ✅ | `R3_TIMESFM_QUANTILE.json` + `R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
103
  | 60 | Heteroscedastic Ridge widths | ✅ | `R3_PAST_SELF.json: ridge_widths` |
104
  | 61 | 2,883 business days (2015-2026) | ✅ | `tft_v2_metrics.json: train_size`, `rl/data/fred_cache.json` |
@@ -114,7 +114,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
114
  |---|---|---|---|
115
  | 63 | MC Dropout 50 forward passes | ✅ | `rl/uncertainty.py:1-50, n_passes=50`, `mc_dropout_v2.json` |
116
  | 64 | Epistemic σ correlates accuracy (Q1=99.76%, Q4=55.92%) | ✅ | `mc_dropout_v2.json: reliability_full[bins]` |
117
- | 65 | Conformal RL on Q-values (3 alpha 0.05/0.05/0.1) | ✅ | `ShAuRyA_Supplymind/features/conformal_rl.py:1-50` + `ShAuRyA_Phoenix/action_v2/conformal.py` |
118
  | 66 | Confidence-damped projection | ✅ | `rl/uncertainty.py: confidence_damping` + `crisis_library.py: damp_on_weak_match` |
119
  | 67 | Beta-severity + Lognormal-duration MC | ✅ | `rl/surrogate/fast_monte_carlo.py: scenarios` |
120
  | 68 | Numba JIT MC hotloop (10-50× speedup) | ✅ | `rl/surrogate/fast_monte_carlo.py: @numba.jit` |
@@ -147,7 +147,7 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
147
  | 86 | 26 BEIR Wikipedia subset | ✅ | `R5_BEIR_MANUAL.json` |
148
  | 87 | ChromaDB persistent at rl/rag/chroma_db/ | ✅ | dir present |
149
  | 88 | Ollama nomic-embed-text (768d) | ✅ | `rl/rag/indexer.py:29-30 EMBEDDING_MODEL=nomic-embed-text` |
150
- | 89 | mxbai-embed-large for crisis library | ✅ | `ShAuRyA_Supplymind/scenarios/library_v2_search.py` (pass-6) |
151
  | 90 | Corpus SHA-256 hash caching | ⚠️ | grep finds `corpus_hash` references in some scripts; not in indexer.py directly |
152
  | 91 | min_score=0.60 | ✅ | `rl/rag/indexer.py:31 MIN_SCORE=0.60` |
153
  | 92 | chunk_words=256, overlap=32, min=30 | ⚠️ | `indexer.py:32-33 chunk_words=300` (slightly different); overlap+min not in source |
@@ -207,8 +207,8 @@ Bullet-by-bullet status across the 4 sections D, E, F, G, H, I (~140 bullets). E
207
  | 128 | 50 cached explanations | ✅ | cache implementation present |
208
  | 129 | 3-4s per explanation on RTX 4080 | ✅ | latency profiled |
209
  | 130 | Explainer stress test 50/50 pass | ✅ | `explainer_stress_v2.json: n_test=50, passed=50, pass_rate=1.0` (exact) |
210
- | 131 | GCN edge attention PNG heatmaps | ✅ | `ShAuRyA_Supplymind/features/gcn_attention_viz.py` |
211
- | 132 | Provenance 5-tier trust classifier (regulatory/academic/reference/industry/uncertain) | ✅ | `ShAuRyA_Supplymind/features/rag_provenance.py:39-49` (5 tiers) |
212
 
213
  **Section I total: 13 ✅ + 1 ⚠️ = 14/14 = 100%**
214
 
 
2
 
3
  Bullet-by-bullet status across the 6 sections D, E, F, G, H, I (~140 bullets). Each row links to a file or a JSON receipt that proves the claim.
4
 
5
+ **Note:** receipts named `R*_*.json` are mirrored from `versions/v3_arcadia/results/` to `FINAL_SUBMIT/receipts/`.
6
 
7
  ---
8
 
 
86
  |---|---|---|---|
87
  | 44 | Custom TFT 513K params on 3-target FRED | ✅ | `tft_real_metrics.json: params, test_mae_p50: {DCOILWTICO, PCOPPUSDM, PPICMM}` |
88
  | 45 | Custom TFT 90K params on real WTI MAE $7.83/bbl | ✅ | `tft_v2_metrics.json: params, test_mae_p50.DCOILWTICO` |
89
+ | 46 | Chronos-Bolt 14-step quantile [0.1, 0.5, 0.9] | ✅ | `versions/v5_phoenix/forecast_v2/ensemble_brent.py:53-71` (pass-10), `R3_TIMESFM_QUANTILE.json` |
90
  | 47 | TimesFM-2 + synthesized quantile via residual regression | ✅ | `forecast_v2/ensemble_brent.py:74-99`, `R3_TIMESFM_QUANTILE.json` |
91
  | 48 | Prophet weekly+yearly | ✅ | `R3_PAST_SELF.json` ensemble row (Prophet seasonality) |
92
  | 49 | ARIMA(5,1,0) classical baseline | ✅ | `R3_PAST_SELF.json` ensemble row |
 
98
  | 55 | 8 FRED targets (WTI, copper, EUR/USD, JPY/USD, CNY/USD, KRW/USD, TWD/USD, PPICMM) | ✅ | `tft_v2_metrics.json: targets` (7 targets confirmed in train_tft_real.py: DCOILWTICO/PCOPPUSDM/DEXTAUS/DEXKOUS/DEXJPUS/DEXUSEU/DEXCHUS); 8th = PPICMM in tft_real_metrics |
99
  | 56 | 3 horizons (7, 14, 28 days) | ⚠️ | `train_tft_real.py:39 HORIZON=14`; 7 and 28-day variants in `R3_PAST_SELF` rolling_backtest fields |
100
  | 57 | PICP@80/90/95% calibration | ✅ | `R3_PAST_SELF.json: per_target_horizon.picp_*` |
101
+ | 58 | Per-horizon split-conformal (Foygel Barber 2022) | ✅ | `versions/v5_phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
102
  | 59 | TimesFM-CP residual quantile regression | ✅ | `R3_TIMESFM_QUANTILE.json` + `R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
103
  | 60 | Heteroscedastic Ridge widths | ✅ | `R3_PAST_SELF.json: ridge_widths` |
104
  | 61 | 2,883 business days (2015-2026) | ✅ | `tft_v2_metrics.json: train_size`, `rl/data/fred_cache.json` |
 
114
  |---|---|---|---|
115
  | 63 | MC Dropout 50 forward passes | ✅ | `rl/uncertainty.py:1-50, n_passes=50`, `mc_dropout_v2.json` |
116
  | 64 | Epistemic σ correlates accuracy (Q1=99.76%, Q4=55.92%) | ✅ | `mc_dropout_v2.json: reliability_full[bins]` |
117
+ | 65 | Conformal RL on Q-values (3 alpha 0.05/0.05/0.1) | ✅ | `versions/v4_arcadia_live/features/conformal_rl.py:1-50` + `versions/v5_phoenix/action_v2/conformal.py` |
118
  | 66 | Confidence-damped projection | ✅ | `rl/uncertainty.py: confidence_damping` + `crisis_library.py: damp_on_weak_match` |
119
  | 67 | Beta-severity + Lognormal-duration MC | ✅ | `rl/surrogate/fast_monte_carlo.py: scenarios` |
120
  | 68 | Numba JIT MC hotloop (10-50× speedup) | ✅ | `rl/surrogate/fast_monte_carlo.py: @numba.jit` |
 
147
  | 86 | 26 BEIR Wikipedia subset | ✅ | `R5_BEIR_MANUAL.json` |
148
  | 87 | ChromaDB persistent at rl/rag/chroma_db/ | ✅ | dir present |
149
  | 88 | Ollama nomic-embed-text (768d) | ✅ | `rl/rag/indexer.py:29-30 EMBEDDING_MODEL=nomic-embed-text` |
150
+ | 89 | mxbai-embed-large for crisis library | ✅ | `versions/v4_arcadia_live/scenarios/library_v2_search.py` (pass-6) |
151
  | 90 | Corpus SHA-256 hash caching | ⚠️ | grep finds `corpus_hash` references in some scripts; not in indexer.py directly |
152
  | 91 | min_score=0.60 | ✅ | `rl/rag/indexer.py:31 MIN_SCORE=0.60` |
153
  | 92 | chunk_words=256, overlap=32, min=30 | ⚠️ | `indexer.py:32-33 chunk_words=300` (slightly different); overlap+min not in source |
 
207
  | 128 | 50 cached explanations | ✅ | cache implementation present |
208
  | 129 | 3-4s per explanation on RTX 4080 | ✅ | latency profiled |
209
  | 130 | Explainer stress test 50/50 pass | ✅ | `explainer_stress_v2.json: n_test=50, passed=50, pass_rate=1.0` (exact) |
210
+ | 131 | GCN edge attention PNG heatmaps | ✅ | `versions/v4_arcadia_live/features/gcn_attention_viz.py` |
211
+ | 132 | Provenance 5-tier trust classifier (regulatory/academic/reference/industry/uncertain) | ✅ | `versions/v4_arcadia_live/features/rag_provenance.py:39-49` (5 tiers) |
212
 
213
  **Section I total: 13 ✅ + 1 ⚠️ = 14/14 = 100%**
214
 
FINAL_SUBMIT/FEATURE_INVENTORY_JT.md CHANGED
@@ -48,7 +48,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
48
  |---|---|---|---|
49
  | 21 | NSGA2 via pymoo | ✅ | `rl/pareto/*` (pymoo import) |
50
  | 22 | 3 objectives (cost, resilience_loss, carbon) | ✅ | `pareto_results.json: objective_names` exact |
51
- | 23 | Carbon factors per IMO/EPA/ICAO | ✅ | `ShAuRyA_Supplymind/features/pareto_carbon.py` constants |
52
  | 24 | Air 0.82 / Sea 0.013 / Sea express 0.026 / Rail 0.028 / Road 0.096 kg CO2/tonne-km | ✅ | constants in source |
53
  | 25 | 20 mitigation plans tested | ⚠️ | `pareto_results.json: n_policies=5` (smaller run); 20-plan run may be older or in `pareto_frontier_v2.json` |
54
  | 26 | 11 Pareto-frontier plans (55%) | ⚠️ | current receipts 2/5 and 3/5; 11/20 from older run |
@@ -73,11 +73,11 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
73
  | 36 | GPU MC: 1 state → 100K with noise linspace(0.01-0.3) | ✅ | `rl/surrogate/gpu_monte_carlo.py` |
74
  | 37 | 80ms for 100K scenarios | ✅ | profiled in module |
75
  | 38 | p5/p50/p95/p99/cvar_10 outputs | ✅ | gpu_monte_carlo.py |
76
- | 39 | Counterfactual digital twin (100 rollouts MC) | ✅ | `ShAuRyA_Phoenix/counterfactual_twin/twin.py` |
77
  | 40 | REVENUE_AT_RISK_USD: easy $200M / med $320M / hard $400M | ✅ | constants in twin.py |
78
  | 41 | Severity multiplier 0.5 + 1.0 × clamp(severity, 0, 1) | ✅ | twin.py formula |
79
  | 42 | TwinReport dataclass (median, p95, savings, CI95, savings_pct) | ✅ | twin.py |
80
- | 43 | Receipt: $178.68M saved (48%) at sev=0.85, brent=$123, n=30 | ✅ | `ShAuRyA_Phoenix/receipts_v2/V5_Twin_savings_gt_zero.receipt.yaml` |
81
 
82
  **M: 13/13 = 100%**
83
 
@@ -87,7 +87,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
87
 
88
  | # | Bullet | Status | Evidence |
89
  |---|---|---|---|
90
- | 44 | NewsAPI (5 keyword queries, 7-day, 100 req/day) | ✅ | `ShAuRyA_Supplymind/realtime/sources/newsapi.py` |
91
  | 45 | GDELT 2.0 Doc API (15-min refresh, tone severity) | ✅ | `sources/gdelt.py` |
92
  | 46 | USGS M4.5+ in last 24h, 6 region boxes | ✅ | `sources/usgs.py` |
93
  | 47 | FRED Brent DCOILBRENTEU daily spot | ✅ | `sources/fred_brent.py` |
@@ -96,7 +96,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
96
  | 50 | FRED severity max(\|DoD\|/5%, \|WoW\|/10%) capped 1.0 | ✅ | `fred_brent.py: compute_severity` |
97
  | 51 | GDELT tone-derived severity | ✅ | `gdelt.py: tone_to_severity` |
98
  | 52 | USGS magnitude-based severity | ✅ | `usgs.py: magnitude_to_severity` |
99
- | 53 | SQLite events.db with full schema | ✅ | `ShAuRyA_Supplymind/realtime/store.py: DB_PATH` |
100
  | 54 | 4 indices (source+hash, ts, region, type) | ✅ | `store.py: CREATE INDEX` × 4 |
101
  | 55 | SHA-256 dedup hash 16 chars | ✅ | `store.py: hashlib.sha256(...).hexdigest()[:16]` |
102
  | 56 | 24-hour dedup window | ✅ | `store.py: DEDUP_WINDOW_S = 86400` |
@@ -112,7 +112,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
112
 
113
  | # | Bullet | Status | Evidence |
114
  |---|---|---|---|
115
- | 60 | 8 hand-curated real events (2022-2026) | ✅ | `ShAuRyA_Supplymind/scenarios/iran_israel_hormuz_2024_2026.json: 8 events` exact |
116
  | 61 | 3-4 citations per event (Reuters/BBC/CNBC/FRED/IDF/DoD/UNCTAD/Lloyd's) | ✅ | each event.citations[] in JSON has 3-4 entries with publisher field |
117
  | 62 | Curation policy ≥3 citations | ✅ | grep `citations` in v1 library |
118
  | 63 | mxbai embedding mode | ✅ | `realtime/crisis_library.py` SentenceTransformer |
@@ -172,7 +172,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
172
  | 110 | CatBoost (1500 iters, depth 8, GPU) | ✅ | DataCo training receipt |
173
  | 111 | TabPFN-v2 classifier (zero-shot) | ✅ | `tabpfn_verify.json + tabpfn_risk_judge.py` |
174
  | 112 | TabPFN-v2 regressor | ✅ | wired in pass-10 ensemble |
175
- | 113 | TabPFN bagging | ✅ | `v3_arcadia/10_caramel/r2_tabpfn_bagging.py` + `R2_BENEFIT_FIX.json` |
176
  | 114 | Stacking with Ridge meta-learner | ✅ | `R3_STACKING_V2.json` |
177
  | 115 | 5-fold CV | ✅ | rolling-fold in stacking |
178
  | 116 | OOF predictions | ✅ | `R3_STACKING_V2.json: oof_predictions` |
@@ -194,7 +194,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
194
 
195
  | # | Bullet | Status | Evidence |
196
  |---|---|---|---|
197
- | 126 | Political risk GBR R²=0.994, MAE=0.0095 on 214 countries | ✅ | `ShAuRyA_Supplymind/features/political_risk.py + receipts/F12_*.json` |
198
  | 127 | Political risk LSTM | ✅ | alternate model in same module |
199
  | 128 | Dependency MLP acc=97.45% on 144K | ✅ | `features/dependency_mlp.py + F11_*.json` |
200
  | 129 | Financial impact Ridge R²=0.736, MAE=$26.04 | ✅ | `features/financial_ridge.py + F8_*.json` |
@@ -241,7 +241,7 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
241
  | 152 | R6_MaskingAblation_easy_lift = 26.768% | ✅ | `R6_GETHSEMANE_MASKING_ABLATION.json` |
242
  | 153 | R6_GCN_easy_MAE_vs_MLP = 48.025% | ✅ | `R6_PROVIDER_V2.json: easy.improvement_vs_mlp_pct = 48.025` exact |
243
  | 154 | R6_AquaRegia_WTI_dev95 = 0.0238 | ✅ | `R6_AQUA_REGIA_V2.json` |
244
- | 155 | R3_TimesFM_CP_WTI_dev95 = 0.050 | ✅ | `ShAuRyA_Phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
245
  | 156 | V4_SPOF_V2_F1 = 1.0 | ✅ | F23 receipt |
246
  | 157 | V4_STACKING_V2_lift_vs_WV = 0.0045 | ✅ | `R3_STACKING_V2.json` |
247
  | 158 | V4_Live_Brent_202604 = $123.28 | ✅ | live FRED fetch on 2026-04-21 |
@@ -253,15 +253,15 @@ Bullet-by-bullet status across J/K/L/M/N/O/P/Q/R/S/T (~200 bullets). Same legend
253
  | 164 | V5_Arena_baseline_leaderboard = 6 baselines | ✅ | `R6_ALGO_COMPARISON.json` per_algorithm has 4 + 2 implicit = 6 |
254
  | 165 | V5_Twin_savings_gt_zero = $178,684,200 | ✅ | twin receipt |
255
  | 166 | V5_DPO_JUDGE_preference_pairs_built = 21 | ✅ | `dpo_judge/data/preference_pairs.jsonl: 21 lines` exact |
256
- | 167 | V5_Skill_pack_shipped = 4 files | ✅ | `ShAuRyA_Phoenix/supplymind_skills/*` 4+ skills |
257
  | 168 | V5_Phoenix_tests_green = 15 passed | ✅ | phoenix smoke = 15 |
258
- | 169 | SHA-256 stdout tracking | ✅ | `ShAuRyA_Phoenix/receipts_v2/framework.py` |
259
  | 170 | Hardware capture (CUDA detection) | ✅ | framework.py |
260
  | 171 | Runtime tracking | ✅ | framework.py |
261
  | 172 | 5 comparators (==, >=, <=, in_range, regex) | ✅ | framework.py |
262
- | 173 | Tamper-evident SHA-256 + INDEX.json + INDEX.md auto-generated | ✅ | `ShAuRyA_Phoenix/receipts_v2/INDEX.{json,md}` |
263
  | 174 | Tiny YAML parser (no PyYAML dep) | ✅ | framework.py |
264
- | 175 | 271-line framework.py | ✅ | `wc -l ShAuRyA_Phoenix/receipts_v2/framework.py` |
265
 
266
  **T: 28/28 = 100%**
267
 
 
48
  |---|---|---|---|
49
  | 21 | NSGA2 via pymoo | ✅ | `rl/pareto/*` (pymoo import) |
50
  | 22 | 3 objectives (cost, resilience_loss, carbon) | ✅ | `pareto_results.json: objective_names` exact |
51
+ | 23 | Carbon factors per IMO/EPA/ICAO | ✅ | `versions/v4_arcadia_live/features/pareto_carbon.py` constants |
52
  | 24 | Air 0.82 / Sea 0.013 / Sea express 0.026 / Rail 0.028 / Road 0.096 kg CO2/tonne-km | ✅ | constants in source |
53
  | 25 | 20 mitigation plans tested | ⚠️ | `pareto_results.json: n_policies=5` (smaller run); 20-plan run may be older or in `pareto_frontier_v2.json` |
54
  | 26 | 11 Pareto-frontier plans (55%) | ⚠️ | current receipts 2/5 and 3/5; 11/20 from older run |
 
73
  | 36 | GPU MC: 1 state → 100K with noise linspace(0.01-0.3) | ✅ | `rl/surrogate/gpu_monte_carlo.py` |
74
  | 37 | 80ms for 100K scenarios | ✅ | profiled in module |
75
  | 38 | p5/p50/p95/p99/cvar_10 outputs | ✅ | gpu_monte_carlo.py |
76
+ | 39 | Counterfactual digital twin (100 rollouts MC) | ✅ | `versions/v5_phoenix/counterfactual_twin/twin.py` |
77
  | 40 | REVENUE_AT_RISK_USD: easy $200M / med $320M / hard $400M | ✅ | constants in twin.py |
78
  | 41 | Severity multiplier 0.5 + 1.0 × clamp(severity, 0, 1) | ✅ | twin.py formula |
79
  | 42 | TwinReport dataclass (median, p95, savings, CI95, savings_pct) | ✅ | twin.py |
80
+ | 43 | Receipt: $178.68M saved (48%) at sev=0.85, brent=$123, n=30 | ✅ | `versions/v5_phoenix/receipts_v2/V5_Twin_savings_gt_zero.receipt.yaml` |
81
 
82
  **M: 13/13 = 100%**
83
 
 
87
 
88
  | # | Bullet | Status | Evidence |
89
  |---|---|---|---|
90
+ | 44 | NewsAPI (5 keyword queries, 7-day, 100 req/day) | ✅ | `versions/v4_arcadia_live/realtime/sources/newsapi.py` |
91
  | 45 | GDELT 2.0 Doc API (15-min refresh, tone severity) | ✅ | `sources/gdelt.py` |
92
  | 46 | USGS M4.5+ in last 24h, 6 region boxes | ✅ | `sources/usgs.py` |
93
  | 47 | FRED Brent DCOILBRENTEU daily spot | ✅ | `sources/fred_brent.py` |
 
96
  | 50 | FRED severity max(\|DoD\|/5%, \|WoW\|/10%) capped 1.0 | ✅ | `fred_brent.py: compute_severity` |
97
  | 51 | GDELT tone-derived severity | ✅ | `gdelt.py: tone_to_severity` |
98
  | 52 | USGS magnitude-based severity | ✅ | `usgs.py: magnitude_to_severity` |
99
+ | 53 | SQLite events.db with full schema | ✅ | `versions/v4_arcadia_live/realtime/store.py: DB_PATH` |
100
  | 54 | 4 indices (source+hash, ts, region, type) | ✅ | `store.py: CREATE INDEX` × 4 |
101
  | 55 | SHA-256 dedup hash 16 chars | ✅ | `store.py: hashlib.sha256(...).hexdigest()[:16]` |
102
  | 56 | 24-hour dedup window | ✅ | `store.py: DEDUP_WINDOW_S = 86400` |
 
112
 
113
  | # | Bullet | Status | Evidence |
114
  |---|---|---|---|
115
+ | 60 | 8 hand-curated real events (2022-2026) | ✅ | `versions/v4_arcadia_live/scenarios/iran_israel_hormuz_2024_2026.json: 8 events` exact |
116
  | 61 | 3-4 citations per event (Reuters/BBC/CNBC/FRED/IDF/DoD/UNCTAD/Lloyd's) | ✅ | each event.citations[] in JSON has 3-4 entries with publisher field |
117
  | 62 | Curation policy ≥3 citations | ✅ | grep `citations` in v1 library |
118
  | 63 | mxbai embedding mode | ✅ | `realtime/crisis_library.py` SentenceTransformer |
 
172
  | 110 | CatBoost (1500 iters, depth 8, GPU) | ✅ | DataCo training receipt |
173
  | 111 | TabPFN-v2 classifier (zero-shot) | ✅ | `tabpfn_verify.json + tabpfn_risk_judge.py` |
174
  | 112 | TabPFN-v2 regressor | ✅ | wired in pass-10 ensemble |
175
+ | 113 | TabPFN bagging | ✅ | `versions/v3_arcadia/10_caramel/r2_tabpfn_bagging.py` + `R2_BENEFIT_FIX.json` |
176
  | 114 | Stacking with Ridge meta-learner | ✅ | `R3_STACKING_V2.json` |
177
  | 115 | 5-fold CV | ✅ | rolling-fold in stacking |
178
  | 116 | OOF predictions | ✅ | `R3_STACKING_V2.json: oof_predictions` |
 
194
 
195
  | # | Bullet | Status | Evidence |
196
  |---|---|---|---|
197
+ | 126 | Political risk GBR R²=0.994, MAE=0.0095 on 214 countries | ✅ | `versions/v4_arcadia_live/features/political_risk.py + receipts/F12_*.json` |
198
  | 127 | Political risk LSTM | ✅ | alternate model in same module |
199
  | 128 | Dependency MLP acc=97.45% on 144K | ✅ | `features/dependency_mlp.py + F11_*.json` |
200
  | 129 | Financial impact Ridge R²=0.736, MAE=$26.04 | ✅ | `features/financial_ridge.py + F8_*.json` |
 
241
  | 152 | R6_MaskingAblation_easy_lift = 26.768% | ✅ | `R6_GETHSEMANE_MASKING_ABLATION.json` |
242
  | 153 | R6_GCN_easy_MAE_vs_MLP = 48.025% | ✅ | `R6_PROVIDER_V2.json: easy.improvement_vs_mlp_pct = 48.025` exact |
243
  | 154 | R6_AquaRegia_WTI_dev95 = 0.0238 | ✅ | `R6_AQUA_REGIA_V2.json` |
244
+ | 155 | R3_TimesFM_CP_WTI_dev95 = 0.050 | ✅ | `versions/v5_phoenix/receipts_v2/R3_TimesFM_CP_WTI_dev95.receipt.yaml` |
245
  | 156 | V4_SPOF_V2_F1 = 1.0 | ✅ | F23 receipt |
246
  | 157 | V4_STACKING_V2_lift_vs_WV = 0.0045 | ✅ | `R3_STACKING_V2.json` |
247
  | 158 | V4_Live_Brent_202604 = $123.28 | ✅ | live FRED fetch on 2026-04-21 |
 
253
  | 164 | V5_Arena_baseline_leaderboard = 6 baselines | ✅ | `R6_ALGO_COMPARISON.json` per_algorithm has 4 + 2 implicit = 6 |
254
  | 165 | V5_Twin_savings_gt_zero = $178,684,200 | ✅ | twin receipt |
255
  | 166 | V5_DPO_JUDGE_preference_pairs_built = 21 | ✅ | `dpo_judge/data/preference_pairs.jsonl: 21 lines` exact |
256
+ | 167 | V5_Skill_pack_shipped = 4 files | ✅ | `versions/v5_phoenix/supplymind_skills/*` 4+ skills |
257
  | 168 | V5_Phoenix_tests_green = 15 passed | ✅ | phoenix smoke = 15 |
258
+ | 169 | SHA-256 stdout tracking | ✅ | `versions/v5_phoenix/receipts_v2/framework.py` |
259
  | 170 | Hardware capture (CUDA detection) | ✅ | framework.py |
260
  | 171 | Runtime tracking | ✅ | framework.py |
261
  | 172 | 5 comparators (==, >=, <=, in_range, regex) | ✅ | framework.py |
262
+ | 173 | Tamper-evident SHA-256 + INDEX.json + INDEX.md auto-generated | ✅ | `versions/v5_phoenix/receipts_v2/INDEX.{json,md}` |
263
  | 174 | Tiny YAML parser (no PyYAML dep) | ✅ | framework.py |
264
+ | 175 | 271-line framework.py | ✅ | `wc -l versions/v5_phoenix/receipts_v2/framework.py` |
265
 
266
  **T: 28/28 = 100%**
267
 
FINAL_SUBMIT/FEATURE_INVENTORY_UBB.md CHANGED
@@ -8,7 +8,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
8
 
9
  | # | Bullet | Status | Evidence |
10
  |---|---|---|---|
11
- | 1 | Karpathy-pattern overnight loop | ✅ | `ShAuRyA_Phoenix/autoresearch_fixed/orchestrator.py` |
12
  | 2 | LLM hypothesis generation (Qwen-14B local or Claude) | ✅ | `hypothesis_engine.py` |
13
  | 3 | Mutable `candidate_train.py` with safe-to-modify markers | ✅ | `autoresearch_fixed/candidate_train.py` |
14
  | 4 | Frozen `program.md` (immutable) | ✅ | `autoresearch_fixed/program.md` |
@@ -42,11 +42,11 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
42
 
43
  | # | Bullet | Status | Evidence |
44
  |---|---|---|---|
45
- | 26 | Counterfactual digital twin 100 rollouts MC | ✅ | `ShAuRyA_Phoenix/counterfactual_twin/twin.py` |
46
  | 27 | Arena leaderboard 6 baselines pre-seeded | ✅ | `arena/leaderboard.json: n_baselines=6` exact |
47
  | 28 | MaskablePPO #1 mean=2.209 CI95=[2.178,2.239] | ✅ | `arena/leaderboard.json: rows[0] = MaskablePPO-v3 (ours), overall_reward_mean=2.209, overall_ci95=[2.178,2.239]` EXACT |
48
  | 29 | runner.py with TaskResult + ArenaResult dataclasses | ✅ | `arena/runner.py` |
49
- | 30 | 3 Claude Code skills (benchmark-runner, autoresearch-experiment, live-demo-orchestrator) | ✅ | `ShAuRyA_Phoenix/supplymind_skills/` 3 dirs |
50
  | 31 | plugin.json v1.0.0 manifest | ✅ | `supplymind_skills/plugin.json` |
51
  | 32 | Replay cache 8 events frozen | ✅ | `realtime_v5/replay_cache_latest.json: n_events=8` exact |
52
  | 33 | replay_cache_latest.json + timestamped snapshot | ✅ | dir contains both |
@@ -56,7 +56,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
56
  | 37 | DPO 21 pairs Qwen-2.5-3B LoRA r=8 | ✅ | `dpo_judge/data/preference_pairs.jsonl` 21 lines |
57
  | 38 | TRL fallback for ROLL fragility | ✅ | `dpo_judge/train_dpo_trl.py` |
58
  | 39 | Two upstream PRs ready (Meta OpenEnv + Alibaba ROLL) | ✅ | `docs/PHOENIX_PUSH_REPORT.md` |
59
- | 40 | build_pr_branch.sh | ✅ | `ShAuRyA_Phoenix/build_pr_branch.sh` |
60
  | 41 | Phoenix isolation (v3+v4 untouched) + copy-before-edit + .venv-roll/ | ✅ | docs note |
61
  | 42 | phoenix_app.py mounts /arena /twin /replay + /phoenix/status | ✅ | `phoenix_app.py` + server/app.py mount |
62
 
@@ -74,7 +74,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
74
  | 46 | Non-root appuser UID 1000 | ✅ | Dockerfile RUN useradd |
75
  | 47 | HEALTHCHECK curl /health every 30s | ✅ | Dockerfile HEALTHCHECK |
76
  | 48 | uvicorn server.app:app entry | ✅ | Dockerfile CMD |
77
- | 49 | HF Space at huggingface.co/spaces/Shaurya-Noodle/Supplymind | ✅ | `DEPLOY_HF_SPACE.md` |
78
  | 50 | ONNX <5e-5 roundtrip 4 models | ✅ | `onnx_roundtrip.json` (BC 3.05e-5, CQL 5.22e-8, IQL 3.05e-5, TD3+BC 1.53e-5) all <5e-5 |
79
  | 51 | .gitignore excludes 159GB models/ | ✅ | `.gitignore` line `models/` |
80
  | 52 | <2GB container size | ✅ | DEPLOY_HF_SPACE notes |
@@ -153,7 +153,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
153
  | 107 | 15+ disruption taxonomy | ✅ | `server/data/disruptions.json` |
154
  | 108 | 15 leading indicators with correlations | ✅ | `rl/leading_indicators.py` |
155
  | 109 | FRED state[400:407] features | ✅ | `rl/state_builder.py` slice |
156
- | 110 | 40+ industry citations DATA_SOURCES.md | ✅ | `DATA_SOURCES.md` |
157
 
158
  **Y: 21 ✅ + 2 ⚠️ = 23/23 = 100%**
159
 
@@ -164,42 +164,42 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
164
  | # | Doc | Status | Path |
165
  |---|---|---|---|
166
  | 111 | README.md (40KB) | ✅ | repo root |
167
- | 112 | SUPPLYMIND_BLUEPRINT.md (81KB) | ✅ | repo root |
168
- | 113 | ALIENWARE_KICKOFF.md (53KB) | ✅ | repo root |
169
- | 114 | AUDIT_PLAN.md (22KB) | ✅ | repo root |
170
  | 115 | MODEL_CARD.md (19KB) | ✅ | repo root |
171
- | 116 | PYTORCH_STORY.md | ✅ | repo root |
172
- | 117 | BENCHMARKS_VS_PUBLIC.md | ✅ | repo root |
173
- | 118 | DATA_SOURCES.md | ✅ | repo root |
174
- | 119 | EXTERNAL_CREDIBILITY.md | ✅ | repo root |
175
- | 120 | JUDGES.md | ✅ | repo root |
176
- | 121 | FINAL_DEMO.md | ✅ | repo root |
177
- | 122 | DEMO_SCRIPT.md | ✅ | repo root |
178
- | 123 | DEPLOY_HF_SPACE.md | ✅ | repo root |
179
- | 124 | EXECUTIVE_SUMMARY.md | ✅ | repo root |
180
- | 125 | RESULTS.md | ✅ | repo root |
181
  | 126 | CLONE_AND_STUDY.md | ✅ | docs/ |
182
  | 127 | FINAL_AUDIT_REPORT.md | ✅ | docs/ |
183
  | 128 | MULTI_TURN_GRPO_ROADMAP.md | ✅ | docs/ |
184
  | 129 | LIVE_DEMO_HORMUZ.md | ✅ | demo/ or root |
185
- | 130 | PREPRINT.md | ✅ | ShAuRyA_Supplymind/docs/ |
186
- | 131 | PREPRINT_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
187
  | 132 | PITCH_DECK.md | ✅ | demo/ |
188
- | 133 | PITCH_DECK_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
189
  | 134 | DEMO_VIDEO_SCRIPT.md | ✅ | demo/ |
190
- | 135 | DEMO_VIDEO_SCRIPT_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
191
- | 136 | JUDGES_V5.md | ✅ | ShAuRyA_Phoenix/docs/ |
192
  | 137 | CHECKLIST.md | ✅ | demo/ |
193
  | 138 | LANDING_PAGE.md | ✅ | demo/ |
194
  | 139 | EXTERNAL_OUTREACH.md | ✅ | demo/ |
195
  | 140 | SECRETS_ROTATION.md | ✅ | docs/ |
196
- | 141 | PHOENIX_PLAN_V5.md | ✅ | ShAuRyA_Supplymind/docs/ |
197
- | 142 | PHOENIX_COMPLETION_AUDIT.md | ✅ | ShAuRyA_Phoenix/docs/ |
198
- | 143 | PHOENIX_PUSH_REPORT.md | ✅ | ShAuRyA_Phoenix/docs/ |
199
  | 144 | HF_DEPLOY_V4.md | ✅ | docs/ |
200
  | 145 | R4_RUBRIC_CHALLENGE.md | ✅ | challenges/ |
201
  | 146 | FAILURE_TABLE.md | ✅ | repo root |
202
- | 147 | 12 Sleep Token album-track stages (00_emergence → 95_arcadia) | ✅ | `v3_arcadia/` 12 dirs verified exact |
203
  | 148 | Notebook 01_environment_quickstart | ✅ | `notebooks/01_environment_quickstart.ipynb` |
204
  | 149 | Notebook 02_training_your_own_agent | ✅ | `notebooks/02_*.ipynb` |
205
  | 150 | Notebook 03_reproducing_benchmarks | ✅ | same |
@@ -216,16 +216,16 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
216
 
217
  | # | Bullet | Status | Evidence |
218
  |---|---|---|---|
219
- | 155 | Hero result card 10-number 2×5 grid | ✅ | `make_hero_card.py` + `v3_arcadia/plots/hero_*.png` |
220
  | 156 | make_hero_card.py | ✅ | repo |
221
- | 157 | Caramel reliability calibration curves | ✅ | `v3_arcadia/plots/r2_caramel_*` |
222
- | 158 | R4 dangerous 7 plots | ✅ | `v3_arcadia/plots/r4_dangerous_*.png` |
223
- | 159 | R5 granite 5 plots | ✅ | `v3_arcadia/plots/r5_granite_*.png` |
224
- | 160 | R6 gethsemane 3 plots | ✅ | `v3_arcadia/plots/r6_gethsemane_*.png` |
225
- | 161 | R3 past-self 2 plots | ✅ | `v3_arcadia/plots/r3_past_self_*.png` |
226
- | 162 | R6 provider network graph | ✅ | `v3_arcadia/plots/r6_provider_graph.png` |
227
- | 163 | R6 euclidian bootstrap CI bands | ✅ | `v3_arcadia/plots/r6_euclidian_*.png` |
228
- | 164 | R6 aqua-regia coverage plot | ✅ | `v3_arcadia/plots/r6_aqua_regia_coverage.png` |
229
  | 165 | GCN attention heatmaps 3 graphs | ✅ | `rl/gnn/attention.py` outputs PNG |
230
  | 166 | Streamlit dashboard 12 panels | ✅ | `dashboard/streamlit_app.py` |
231
  | 167 | Pareto 3D scatter Plotly | ✅ | `rl/pareto/visualize.py` |
@@ -246,7 +246,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
246
  | 173 | Two-pass DeepSeek extraction (free CoT → Qwen JSON parse) | ✅ | `R4_DANGEROUS_V2.json: extractor field` 100% parse rate |
247
  | 174 | Phoenix isolation guarantee 3 layers | ✅ | `PHOENIX_COMPLETION_AUDIT.md` |
248
  | 175 | Copy-before-edit discipline | ✅ | `PHOENIX_PUSH_REPORT.md` |
249
- | 176 | Tiny YAML parser (no PyYAML) | ✅ | `ShAuRyA_Phoenix/receipts_v2/framework.py` |
250
  | 177 | _corpus_hash SHA-256 embedding cache invalidation | ✅ | `crisis_library.py: corpus_hash` |
251
  | 178 | Token-bucket OpenRouter limiter | ✅ | `openrouter_client.py: per_minute=18` |
252
  | 179 | .openrouter_cache/ API caching | ✅ | dir exists |
@@ -263,7 +263,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
263
  | 190 | Honest fallback labeling | ✅ | `data_source_flags.live_pipeline = "deterministic_rubric_fallback"` |
264
  | 191 | judge_source field | ✅ | `_call_ollama_judge: judge_source = ollama:<model>` |
265
  | 192 | Scenario JSON ingestion_note | ✅ | crisis library schema |
266
- | 193 | 4-minute judge path designed | ✅ | `JUDGES.md` |
267
  | 194 | 30-second receipt verification target | ✅ | `framework.py` design |
268
  | 195 | Sleep Token thesis "Even in Arcadia, disruptions happen" | ✅ | tagline in docs |
269
 
@@ -301,7 +301,7 @@ Bullet-by-bullet status across U/V/W/X/Y/Z/AA/BB (~180 bullets). Same legend as
301
  | 6 baselines pre-seeded | `n_baselines=6` ✅ EXACT |
302
  | Replay cache 8 events | `replay_cache_latest.json: n_events=8` ✅ EXACT |
303
  | Phoenix INDEX 20 receipts | `INDEX.json: list[20]` ✅ EXACT |
304
- | 12 Sleep Token stages | `v3_arcadia/` 12 dirs ✅ EXACT |
305
  | 125 .md docs | `find *.md` 125 ✅ |
306
  | 4 ONNX <5e-5 | onnx_roundtrip ✅ |
307
  | Token-bucket 18 req/min, 950 req/day | `openrouter_client.py` ✅ EXACT |
 
8
 
9
  | # | Bullet | Status | Evidence |
10
  |---|---|---|---|
11
+ | 1 | Karpathy-pattern overnight loop | ✅ | `versions/v5_phoenix/autoresearch_fixed/orchestrator.py` |
12
  | 2 | LLM hypothesis generation (Qwen-14B local or Claude) | ✅ | `hypothesis_engine.py` |
13
  | 3 | Mutable `candidate_train.py` with safe-to-modify markers | ✅ | `autoresearch_fixed/candidate_train.py` |
14
  | 4 | Frozen `program.md` (immutable) | ✅ | `autoresearch_fixed/program.md` |
 
42
 
43
  | # | Bullet | Status | Evidence |
44
  |---|---|---|---|
45
+ | 26 | Counterfactual digital twin 100 rollouts MC | ✅ | `versions/v5_phoenix/counterfactual_twin/twin.py` |
46
  | 27 | Arena leaderboard 6 baselines pre-seeded | ✅ | `arena/leaderboard.json: n_baselines=6` exact |
47
  | 28 | MaskablePPO #1 mean=2.209 CI95=[2.178,2.239] | ✅ | `arena/leaderboard.json: rows[0] = MaskablePPO-v3 (ours), overall_reward_mean=2.209, overall_ci95=[2.178,2.239]` EXACT |
48
  | 29 | runner.py with TaskResult + ArenaResult dataclasses | ✅ | `arena/runner.py` |
49
+ | 30 | 3 Claude Code skills (benchmark-runner, autoresearch-experiment, live-demo-orchestrator) | ✅ | `versions/v5_phoenix/supplymind_skills/` 3 dirs |
50
  | 31 | plugin.json v1.0.0 manifest | ✅ | `supplymind_skills/plugin.json` |
51
  | 32 | Replay cache 8 events frozen | ✅ | `realtime_v5/replay_cache_latest.json: n_events=8` exact |
52
  | 33 | replay_cache_latest.json + timestamped snapshot | ✅ | dir contains both |
 
56
  | 37 | DPO 21 pairs Qwen-2.5-3B LoRA r=8 | ✅ | `dpo_judge/data/preference_pairs.jsonl` 21 lines |
57
  | 38 | TRL fallback for ROLL fragility | ✅ | `dpo_judge/train_dpo_trl.py` |
58
  | 39 | Two upstream PRs ready (Meta OpenEnv + Alibaba ROLL) | ✅ | `docs/PHOENIX_PUSH_REPORT.md` |
59
+ | 40 | build_pr_branch.sh | ✅ | `versions/v5_phoenix/build_pr_branch.sh` |
60
  | 41 | Phoenix isolation (v3+v4 untouched) + copy-before-edit + .venv-roll/ | ✅ | docs note |
61
  | 42 | phoenix_app.py mounts /arena /twin /replay + /phoenix/status | ✅ | `phoenix_app.py` + server/app.py mount |
62
 
 
74
  | 46 | Non-root appuser UID 1000 | ✅ | Dockerfile RUN useradd |
75
  | 47 | HEALTHCHECK curl /health every 30s | ✅ | Dockerfile HEALTHCHECK |
76
  | 48 | uvicorn server.app:app entry | ✅ | Dockerfile CMD |
77
+ | 49 | HF Space at huggingface.co/spaces/Shaurya-Noodle/Supplymind | ✅ | `docs/v3/DEPLOY_HF_SPACE.md` |
78
  | 50 | ONNX <5e-5 roundtrip 4 models | ✅ | `onnx_roundtrip.json` (BC 3.05e-5, CQL 5.22e-8, IQL 3.05e-5, TD3+BC 1.53e-5) all <5e-5 |
79
  | 51 | .gitignore excludes 159GB models/ | ✅ | `.gitignore` line `models/` |
80
  | 52 | <2GB container size | ✅ | DEPLOY_HF_SPACE notes |
 
153
  | 107 | 15+ disruption taxonomy | ✅ | `server/data/disruptions.json` |
154
  | 108 | 15 leading indicators with correlations | ✅ | `rl/leading_indicators.py` |
155
  | 109 | FRED state[400:407] features | ✅ | `rl/state_builder.py` slice |
156
+ | 110 | 40+ industry citations in DATA_SOURCES.md | ✅ | `docs/core/DATA_SOURCES.md` |
157
 
158
  **Y: 21 ✅ + 2 ⚠️ = 23/23 = 100%**
159
 
 
164
  | # | Doc | Status | Path |
165
  |---|---|---|---|
166
  | 111 | README.md (40KB) | ✅ | repo root |
167
+ | 112 | SUPPLYMIND_BLUEPRINT.md (81KB) | ✅ | docs/core/ |
168
+ | 113 | ALIENWARE_KICKOFF.md (53KB) | ✅ | docs/dev_log/ |
169
+ | 114 | AUDIT_PLAN.md (22KB) | ✅ | docs/v4/ |
170
  | 115 | MODEL_CARD.md (19KB) | ✅ | repo root |
171
+ | 116 | PYTORCH_STORY.md | ✅ | docs/v3/ |
172
+ | 117 | BENCHMARKS_VS_PUBLIC.md | ✅ | docs/v3/ |
173
+ | 118 | DATA_SOURCES.md | ✅ | docs/core/ |
174
+ | 119 | EXTERNAL_CREDIBILITY.md | ✅ | docs/core/ |
175
+ | 120 | JUDGES.md | ✅ | docs/v4/ |
176
+ | 121 | FINAL_DEMO.md | ✅ | docs/v3/ |
177
+ | 122 | DEMO_SCRIPT.md | ✅ | docs/v3/ |
178
+ | 123 | DEPLOY_HF_SPACE.md | ✅ | docs/v3/ |
179
+ | 124 | EXECUTIVE_SUMMARY.md | ✅ | docs/v3/ |
180
+ | 125 | RESULTS.md | ✅ | docs/v3/ |
181
  | 126 | CLONE_AND_STUDY.md | ✅ | docs/ |
182
  | 127 | FINAL_AUDIT_REPORT.md | ✅ | docs/ |
183
  | 128 | MULTI_TURN_GRPO_ROADMAP.md | ✅ | docs/ |
184
  | 129 | LIVE_DEMO_HORMUZ.md | ✅ | demo/ or root |
185
+ | 130 | PREPRINT.md | ✅ | versions/v4_arcadia_live/docs/ |
186
+ | 131 | PREPRINT_V5.md | ✅ | versions/v5_phoenix/docs/ |
187
  | 132 | PITCH_DECK.md | ✅ | demo/ |
188
+ | 133 | PITCH_DECK_V5.md | ✅ | versions/v5_phoenix/docs/ |
189
  | 134 | DEMO_VIDEO_SCRIPT.md | ✅ | demo/ |
190
+ | 135 | DEMO_VIDEO_SCRIPT_V5.md | ✅ | versions/v5_phoenix/docs/ |
191
+ | 136 | JUDGES_V5.md | ✅ | versions/v5_phoenix/docs/ |
192
  | 137 | CHECKLIST.md | ✅ | demo/ |
193
  | 138 | LANDING_PAGE.md | ✅ | demo/ |
194
  | 139 | EXTERNAL_OUTREACH.md | ✅ | demo/ |
195
  | 140 | SECRETS_ROTATION.md | ✅ | docs/ |
196
+ | 141 | PHOENIX_PLAN_V5.md | ✅ | versions/v4_arcadia_live/docs/ |
197
+ | 142 | PHOENIX_COMPLETION_AUDIT.md | ✅ | versions/v5_phoenix/docs/ |
198
+ | 143 | PHOENIX_PUSH_REPORT.md | ✅ | versions/v5_phoenix/docs/ |
199
  | 144 | HF_DEPLOY_V4.md | ✅ | docs/ |
200
  | 145 | R4_RUBRIC_CHALLENGE.md | ✅ | challenges/ |
201
  | 146 | FAILURE_TABLE.md | ✅ | repo root |
202
+ | 147 | 12 Sleep Token album-track stages (00_emergence → 95_arcadia) | ✅ | `versions/v3_arcadia/` 12 dirs verified exact |
203
  | 148 | Notebook 01_environment_quickstart | ✅ | `notebooks/01_environment_quickstart.ipynb` |
204
  | 149 | Notebook 02_training_your_own_agent | ✅ | `notebooks/02_*.ipynb` |
205
  | 150 | Notebook 03_reproducing_benchmarks | ✅ | same |
 
216
 
217
  | # | Bullet | Status | Evidence |
218
  |---|---|---|---|
219
+ | 155 | Hero result card 10-number 2×5 grid | ✅ | `make_hero_card.py` + `versions/v3_arcadia/plots/hero_*.png` |
220
  | 156 | make_hero_card.py | ✅ | repo |
221
+ | 157 | Caramel reliability calibration curves | ✅ | `versions/v3_arcadia/plots/r2_caramel_*` |
222
+ | 158 | R4 dangerous 7 plots | ✅ | `versions/v3_arcadia/plots/r4_dangerous_*.png` |
223
+ | 159 | R5 granite 5 plots | ✅ | `versions/v3_arcadia/plots/r5_granite_*.png` |
224
+ | 160 | R6 gethsemane 3 plots | ✅ | `versions/v3_arcadia/plots/r6_gethsemane_*.png` |
225
+ | 161 | R3 past-self 2 plots | ✅ | `versions/v3_arcadia/plots/r3_past_self_*.png` |
226
+ | 162 | R6 provider network graph | ✅ | `versions/v3_arcadia/plots/r6_provider_graph.png` |
227
+ | 163 | R6 euclidian bootstrap CI bands | ✅ | `versions/v3_arcadia/plots/r6_euclidian_*.png` |
228
+ | 164 | R6 aqua-regia coverage plot | ✅ | `versions/v3_arcadia/plots/r6_aqua_regia_coverage.png` |
229
  | 165 | GCN attention heatmaps 3 graphs | ✅ | `rl/gnn/attention.py` outputs PNG |
230
  | 166 | Streamlit dashboard 12 panels | ✅ | `dashboard/streamlit_app.py` |
231
  | 167 | Pareto 3D scatter Plotly | ✅ | `rl/pareto/visualize.py` |
 
246
  | 173 | Two-pass DeepSeek extraction (free CoT → Qwen JSON parse) | ✅ | `R4_DANGEROUS_V2.json: extractor field` 100% parse rate |
247
  | 174 | Phoenix isolation guarantee 3 layers | ✅ | `PHOENIX_COMPLETION_AUDIT.md` |
248
  | 175 | Copy-before-edit discipline | ✅ | `PHOENIX_PUSH_REPORT.md` |
249
+ | 176 | Tiny YAML parser (no PyYAML) | ✅ | `versions/v5_phoenix/receipts_v2/framework.py` |
250
  | 177 | _corpus_hash SHA-256 embedding cache invalidation | ✅ | `crisis_library.py: corpus_hash` |
251
  | 178 | Token-bucket OpenRouter limiter | ✅ | `openrouter_client.py: per_minute=18` |
252
  | 179 | .openrouter_cache/ API caching | ✅ | dir exists |
 
263
  | 190 | Honest fallback labeling | ✅ | `data_source_flags.live_pipeline = "deterministic_rubric_fallback"` |
264
  | 191 | judge_source field | ✅ | `_call_ollama_judge: judge_source = ollama:<model>` |
265
  | 192 | Scenario JSON ingestion_note | ✅ | crisis library schema |
266
+ | 193 | 4-minute judge path designed | ✅ | `docs/v4/JUDGES.md` |
267
  | 194 | 30-second receipt verification target | ✅ | `framework.py` design |
268
  | 195 | Sleep Token thesis "Even in Arcadia, disruptions happen" | ✅ | tagline in docs |
269
 
 
301
  | 6 baselines pre-seeded | `n_baselines=6` ✅ EXACT |
302
  | Replay cache 8 events | `replay_cache_latest.json: n_events=8` ✅ EXACT |
303
  | Phoenix INDEX 20 receipts | `INDEX.json: list[20]` ✅ EXACT |
304
+ | 12 Sleep Token stages | `versions/v3_arcadia/` 12 dirs ✅ EXACT |
305
  | 125 .md docs | `find *.md` 125 ✅ |
306
  | 4 ONNX <5e-5 | onnx_roundtrip ✅ |
307
  | Token-bucket 18 req/min, 950 req/day | `openrouter_client.py` ✅ EXACT |
FINAL_SUBMIT/HACKATHON_README.md CHANGED
@@ -260,7 +260,7 @@ python scripts/generate_hackathon_plots.py # all 7 plots
260
  ## 4.5 · RLVE adaptive curriculum + RLVR dual-verifier (per RL guide §22-23 + §31-33)
261
 
262
  ### RLVE adaptive curriculum controller
263
- File: [`ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py`](../ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py)
264
 
265
  Per RL guide §22-23 (procedural verifiable environments — beyond static RLVR):
266
  - **Tier 0** = 100 most-common 5-letter words (baseline)
@@ -274,7 +274,7 @@ Per RL guide §22-23 (procedural verifiable environments — beyond static RLVR)
274
  Smoke (200 episodes, synthetic policy): 4 tier shifts captured. Receipt: [`rlve_curriculum_smoke.json`](receipts/rlve_curriculum_smoke.json).
275
 
276
  ### RLVR dual-verifier framework
277
- File: [`ShAuRyA_Phoenix/wordle_env/dual_verifier.py`](../ShAuRyA_Phoenix/wordle_env/dual_verifier.py)
278
 
279
  Per RL guide §31-33 (rule-based verifiers brittle, model-based exploitable):
280
  - **Rule layer**: word ∈ dict, format valid, exact green/yellow scoring
 
260
  ## 4.5 · RLVE adaptive curriculum + RLVR dual-verifier (per RL guide §22-23 + §31-33)
261
 
262
  ### RLVE adaptive curriculum controller
263
+ File: [`versions/v5_phoenix/wordle_env/rlve_curriculum.py`](../versions/v5_phoenix/wordle_env/rlve_curriculum.py)
264
 
265
  Per RL guide §22-23 (procedural verifiable environments — beyond static RLVR):
266
  - **Tier 0** = 100 most-common 5-letter words (baseline)
 
274
  Smoke (200 episodes, synthetic policy): 4 tier shifts captured. Receipt: [`rlve_curriculum_smoke.json`](receipts/rlve_curriculum_smoke.json).
275
 
276
  ### RLVR dual-verifier framework
277
+ File: [`versions/v5_phoenix/wordle_env/dual_verifier.py`](../versions/v5_phoenix/wordle_env/dual_verifier.py)
278
 
279
  Per RL guide §31-33 (rule-based verifiers brittle, model-based exploitable):
280
  - **Rule layer**: word ∈ dict, format valid, exact green/yellow scoring
FINAL_SUBMIT/JUDGE_FAQ_30.md CHANGED
@@ -69,7 +69,7 @@ Global Fishing Watch — vessel positions feed into Hormuz/Red Sea route-disrupt
69
  Compatibility with PEFT 0.19 + Unsloth current pin. `requirements.txt` locks the stack.
70
 
71
  ### 23. "Reward function code?"
72
- `server/engine/rewards.py` (SupplyMind 7-component) + `ShAuRyA_Phoenix/wordle_env/env.py` (Wordle 6-component). Both verifiable.
73
 
74
  ### 24. "Forecasting baselines?"
75
  TFT (513,534 steps), TFT-v2, BigTFT (90,602), TimesFM zero-shot, Granite, Stacking-v3, Brent ensemble. NOAA 60.07% accuracy. Receipts each.
 
69
  Compatibility with PEFT 0.19 + Unsloth current pin. `requirements.txt` locks the stack.
70
 
71
  ### 23. "Reward function code?"
72
+ `server/engine/rewards.py` (SupplyMind 7-component) + `versions/v5_phoenix/wordle_env/env.py` (Wordle 6-component). Both verifiable.
73
 
74
  ### 24. "Forecasting baselines?"
75
  TFT (513,534 steps), TFT-v2, BigTFT (90,602), TimesFM zero-shot, Granite, Stacking-v3, Brent ensemble. NOAA 60.07% accuracy. Receipts each.
FINAL_SUBMIT/JUDGE_OBJECTION_HANDBOOK.md CHANGED
@@ -26,7 +26,7 @@ Format: **Q** = the objection · **A** = the rebuttal · **Receipt** = the on-di
26
 
27
  **Q5**. "Why supply chain over a research-paper-novel domain?"
28
  **A**. Picked deliberately: (1) supply-chain has crisp economic verifiers (Brent prices, agency-published loss bands), (2) it has rich partial observability (20 live data sources), (3) it's professionally relevant (Theme 3 explicit fit). It is also underexplored in the OpenEnv community — most submissions are grid worlds or web tasks.
29
- **Receipt**: `DATA_SOURCES.md` lists 20 sources with their epistemic role.
30
 
31
  ---
32
 
 
26
 
27
  **Q5**. "Why supply chain over a research-paper-novel domain?"
28
  **A**. Picked deliberately: (1) supply-chain has crisp economic verifiers (Brent prices, agency-published loss bands), (2) it has rich partial observability (20 live data sources), (3) it's professionally relevant (Theme 3 explicit fit). It is also underexplored in the OpenEnv community — most submissions are grid worlds or web tasks.
29
+ **Receipt**: `docs/core/DATA_SOURCES.md` lists 20 sources with their epistemic role.
30
 
31
  ---
32
 
FINAL_SUBMIT/MASTER_FEATURE_USECASE_MAP_250.md CHANGED
@@ -18,22 +18,22 @@ Sections A through BB + RL/RLVR/RLVE knowledge alignment.
18
  | A7 | 30-step episode horizon | `server/supply_environment.py` | bounded RL episode | reset config |
19
  | A8 | $5M-$15M budget tasks | `data/disruptions.json` | sparse-reward shaping | task manifest |
20
  | A9 | Real-world coordinates (TSMC, Samsung) | `data/companies_real.json` | Theme #3 Professional Tasks | n_real_nodes=40 |
21
- | A10 | 8 v1 events crisis library | `ShAuRyA_Supplymind/realtime/crisis_library.py` | RAG analog retrieval | 8 events indexed |
22
- | A11 | Wordle RLVR mini-env | `ShAuRyA_Phoenix/wordle_env/env.py` | canonical hackathon flow | `wordle_real_reinforce_curve.json` |
23
- | A12 | RLVE adaptive curriculum | `ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py` | §22-23 Procaccia-style | `rlve_curriculum_smoke.json` (4 tier shifts) |
24
 
25
  ## B. REWARD ENGINEERING — 14 features
26
  | # | Feature | File | Use case | Receipt |
27
  |---|---------|------|----------|---------|
28
  | B1 | 7-component shaped reward | `server/engine/rewards.py` | RL guide §7 multi-component | rewards module |
29
- | B2 | Format gate | `ShAuRyA_Phoenix/wordle_env/env.py` | reject malformed actions | adv-20 attacks 1-9 blocked |
30
- | B3 | Dictionary gate | `ShAuRyA_Phoenix/wordle_env/env.py` | reject non-dict words | adv-20 attack #10 blocked |
31
- | B4 | Timeout penalty | `ShAuRyA_Phoenix/wordle_env/env.py` | RL guide §15 timeout monitor | -0.2 if 6 guesses fail |
32
- | B5 | Solve bonus + step-count bonus | `ShAuRyA_Phoenix/wordle_env/env.py` | richer signal | ablation_matrix.json |
33
  | B6 | Green credit | env.py | per-letter success | ablation: -0.459 if removed |
34
  | B7 | Yellow credit | env.py | partial info credit | ablation: small drop if removed |
35
  | B8 | Process supervision (line-level) | `scripts/final_validation_bundle.py:process_supervision` | RL guide §9 Lightman 2023 | `process_supervision.json` (var amp 2735×) |
36
- | B9 | Dual-verifier composite | `ShAuRyA_Phoenix/wordle_env/dual_verifier.py` | rule × (0.5 + 0.5×model) | `dual_verifier_smoke.json` |
37
  | B10 | Disagreement alarm | `dual_verifier.py:DISAGREEMENT_THRESHOLD` | §43 anti-hacking monitoring | rolling alarm 0.30 |
38
  | B11 | Ablation receipts (5 components) | `final_validation_bundle.py` | leave-one-out analysis | `ablation_matrix.json` |
39
  | B12 | Variance reduction baseline | `final_real_reinforce_wordle.py` | Williams 1992 REINFORCE | running_baseline EMA |
@@ -98,8 +98,8 @@ Receipt: `adversarial_20_attack_gauntlet.json` (sha 082a3c57…)
98
  ## G. RAG / RETRIEVAL — 8 features
99
  | # | Feature | File | Use case | Receipt |
100
  |---|---------|------|----------|---------|
101
- | G1 | FAISS index | `ShAuRyA_Supplymind/realtime/store.py` | top-K retrieval | store.query_recent |
102
- | G2 | BGE-rerank | `ShAuRyA_Supplymind/realtime/rerank.py` | quality boost | falls back gracefully on Win |
103
  | G3 | Crisis library 8 events | `realtime/crisis_library.py` | analog retrieval | RAG against Iran/Hormuz/Suez |
104
  | G4 | NewsAPI live ingest | `realtime/news_ingest.py` | recent events | event store |
105
  | G5 | GDELT integration | `realtime/gdelt.py` | global events | event store |
 
18
  | A7 | 30-step episode horizon | `server/supply_environment.py` | bounded RL episode | reset config |
19
  | A8 | $5M-$15M budget tasks | `data/disruptions.json` | sparse-reward shaping | task manifest |
20
  | A9 | Real-world coordinates (TSMC, Samsung) | `data/companies_real.json` | Theme #3 Professional Tasks | n_real_nodes=40 |
21
+ | A10 | 8 v1 events crisis library | `versions/v4_arcadia_live/realtime/crisis_library.py` | RAG analog retrieval | 8 events indexed |
22
+ | A11 | Wordle RLVR mini-env | `versions/v5_phoenix/wordle_env/env.py` | canonical hackathon flow | `wordle_real_reinforce_curve.json` |
23
+ | A12 | RLVE adaptive curriculum | `versions/v5_phoenix/wordle_env/rlve_curriculum.py` | §22-23 Procaccia-style | `rlve_curriculum_smoke.json` (4 tier shifts) |
24
 
25
  ## B. REWARD ENGINEERING — 14 features
26
  | # | Feature | File | Use case | Receipt |
27
  |---|---------|------|----------|---------|
28
  | B1 | 7-component shaped reward | `server/engine/rewards.py` | RL guide §7 multi-component | rewards module |
29
+ | B2 | Format gate | `versions/v5_phoenix/wordle_env/env.py` | reject malformed actions | adv-20 attacks 1-9 blocked |
30
+ | B3 | Dictionary gate | `versions/v5_phoenix/wordle_env/env.py` | reject non-dict words | adv-20 attack #10 blocked |
31
+ | B4 | Timeout penalty | `versions/v5_phoenix/wordle_env/env.py` | RL guide §15 timeout monitor | -0.2 if 6 guesses fail |
32
+ | B5 | Solve bonus + step-count bonus | `versions/v5_phoenix/wordle_env/env.py` | richer signal | ablation_matrix.json |
33
  | B6 | Green credit | env.py | per-letter success | ablation: -0.459 if removed |
34
  | B7 | Yellow credit | env.py | partial info credit | ablation: small drop if removed |
35
  | B8 | Process supervision (line-level) | `scripts/final_validation_bundle.py:process_supervision` | RL guide §9 Lightman 2023 | `process_supervision.json` (var amp 2735×) |
36
+ | B9 | Dual-verifier composite | `versions/v5_phoenix/wordle_env/dual_verifier.py` | rule × (0.5 + 0.5×model) | `dual_verifier_smoke.json` |
37
  | B10 | Disagreement alarm | `dual_verifier.py:DISAGREEMENT_THRESHOLD` | §43 anti-hacking monitoring | rolling alarm 0.30 |
38
  | B11 | Ablation receipts (5 components) | `final_validation_bundle.py` | leave-one-out analysis | `ablation_matrix.json` |
39
  | B12 | Variance reduction baseline | `final_real_reinforce_wordle.py` | Williams 1992 REINFORCE | running_baseline EMA |
 
98
  ## G. RAG / RETRIEVAL — 8 features
99
  | # | Feature | File | Use case | Receipt |
100
  |---|---------|------|----------|---------|
101
+ | G1 | FAISS index | `versions/v4_arcadia_live/realtime/store.py` | top-K retrieval | store.query_recent |
102
+ | G2 | BGE-rerank | `versions/v4_arcadia_live/realtime/rerank.py` | quality boost | falls back gracefully on Win |
103
  | G3 | Crisis library 8 events | `realtime/crisis_library.py` | analog retrieval | RAG against Iran/Hormuz/Suez |
104
  | G4 | NewsAPI live ingest | `realtime/news_ingest.py` | recent events | event store |
105
  | G5 | GDELT integration | `realtime/gdelt.py` | global events | event store |
FINAL_SUBMIT/README.md CHANGED
@@ -37,13 +37,13 @@ http://127.0.0.1:8000/demo/master
37
  | Conformal action coverage | **0.9001** | `tests/receipts/conformal_calibration.json` |
38
  | Cross-corpus α (frontier 6, v2 EMDAT) | **0.5436** | `tests/receipts/cross_corpus_alpha.json` |
39
  | 12-frontier panel α (R4 corpus) | **0.5669** | `tests/receipts/panel_agreement_R4.json` |
40
- | HetGAT vs v1 GCN MAE | **+7.77 / +12.15 / +10.03 %** | `ShAuRyA_Phoenix/experiments/hetgat_v1/report.json` |
41
- | RAP-XC training loss | BC **5.62 → 0.23** | `ShAuRyA_Phoenix/experiments/rap_xc_v1/rapxc.pt` |
42
  | RAP-XC parameters | **3,137,049** | same |
43
  | Tohoku 2011 replicated | **$276 B vs $235 B published (+18%)** | `tests/receipts/platinum_counterfactual.json` |
44
- | Live data sources | **20** | `ShAuRyA_Supplymind/realtime/orchestrator_v2.py` |
45
- | Crisis library | **1,500 EMDAT events** | `ShAuRyA_Supplymind/scenarios/crisis_library_v2.json` |
46
- | Foundation models verified | **13/13** | `v3_arcadia/00_emergence/verify_*.py` |
47
  | Custom Ollama analyst models | **5 (v1→v5)** | `rl/lora/Modelfile.v[2-4]`, `Modelfile.analyst_v5` |
48
  | LoRA training pairs | **225** | `rl/data/lora_training_data.json` |
49
  | DPO preference pairs | **21** | `dpo_judge/data/preference_pairs.jsonl` |
@@ -98,13 +98,13 @@ Detailed: see [REPRODUCE.md](REPRODUCE.md).
98
  | Section | Where |
99
  |---|---|
100
  | Game engine (OpenEnv) | `server/app.py`, `server/supply_environment.py`, `server/engine/` |
101
- | 9 RL agents | `ShAuRyA_Phoenix/arena/`, `ShAuRyA_Phoenix/rap_xc/` |
102
- | 13 foundation models | `models/`, `v3_arcadia/00_emergence/verify_*.py` |
103
- | Custom Ollama analyst models | `rl/lora/Modelfile.v[2-4]`, `ShAuRyA_Supplymind/features/Modelfile.analyst_v5` |
104
- | LoRA + DPO + GRPO training | `rl/lora/`, `ShAuRyA_Phoenix/roll_integration/dpo_judge/` |
105
- | 1500-event crisis library | `ShAuRyA_Supplymind/scenarios/crisis_library_v2.{json,faiss}` |
106
- | 4-method counterfactual | `ShAuRyA_Phoenix/counterfactual_v2/platinum.py` |
107
- | Hormuz War Room | `ShAuRyA_Supplymind/realtime/hormuz_war_room_router.py`, `server/static/hormuz_war_room.html` |
108
  | Master demo page | `server/static/master.html` |
109
  | Receipts | `tests/receipts/*.json` |
110
 
 
37
  | Conformal action coverage | **0.9001** | `tests/receipts/conformal_calibration.json` |
38
  | Cross-corpus α (frontier 6, v2 EMDAT) | **0.5436** | `tests/receipts/cross_corpus_alpha.json` |
39
  | 12-frontier panel α (R4 corpus) | **0.5669** | `tests/receipts/panel_agreement_R4.json` |
40
+ | HetGAT vs v1 GCN MAE | **+7.77 / +12.15 / +10.03 %** | `versions/v5_phoenix/experiments/hetgat_v1/report.json` |
41
+ | RAP-XC training loss | BC **5.62 → 0.23** | `versions/v5_phoenix/experiments/rap_xc_v1/rapxc.pt` |
42
  | RAP-XC parameters | **3,137,049** | same |
43
  | Tohoku 2011 replicated | **$276 B vs $235 B published (+18%)** | `tests/receipts/platinum_counterfactual.json` |
44
+ | Live data sources | **20** | `versions/v4_arcadia_live/realtime/orchestrator_v2.py` |
45
+ | Crisis library | **1,500 EMDAT events** | `versions/v4_arcadia_live/scenarios/crisis_library_v2.json` |
46
+ | Foundation models verified | **13/13** | `versions/v3_arcadia/00_emergence/verify_*.py` |
47
  | Custom Ollama analyst models | **5 (v1→v5)** | `rl/lora/Modelfile.v[2-4]`, `Modelfile.analyst_v5` |
48
  | LoRA training pairs | **225** | `rl/data/lora_training_data.json` |
49
  | DPO preference pairs | **21** | `dpo_judge/data/preference_pairs.jsonl` |
 
98
  | Section | Where |
99
  |---|---|
100
  | Game engine (OpenEnv) | `server/app.py`, `server/supply_environment.py`, `server/engine/` |
101
+ | 9 RL agents | `versions/v5_phoenix/arena/`, `versions/v5_phoenix/rap_xc/` |
102
+ | 13 foundation models | `models/`, `versions/v3_arcadia/00_emergence/verify_*.py` |
103
+ | Custom Ollama analyst models | `rl/lora/Modelfile.v[2-4]`, `versions/v4_arcadia_live/features/Modelfile.analyst_v5` |
104
+ | LoRA + DPO + GRPO training | `rl/lora/`, `versions/v5_phoenix/roll_integration/dpo_judge/` |
105
+ | 1500-event crisis library | `versions/v4_arcadia_live/scenarios/crisis_library_v2.{json,faiss}` |
106
+ | 4-method counterfactual | `versions/v5_phoenix/counterfactual_v2/platinum.py` |
107
+ | Hormuz War Room | `versions/v4_arcadia_live/realtime/hormuz_war_room_router.py`, `server/static/hormuz_war_room.html` |
108
  | Master demo page | `server/static/master.html` |
109
  | Receipts | `tests/receipts/*.json` |
110
 
FINAL_SUBMIT/RELIANCE_HORMUZ_DEEP_DIVE.md CHANGED
@@ -119,4 +119,4 @@ The remaining seven subsidiaries collectively account for ~15% of impact.
119
 
120
  **Key insight**: highest *score* node (RIIL pipelines 0.916) has lowest *absolute* impact (₹35 Cr) because it is a small-revenue stub. Highest *absolute* impact (Jamnagar ₹12,194 Cr) has lower score (0.824) but the largest revenue base. **Score and absolute impact tell different stories — both matter.**
121
 
122
- Receipt: deterministic (no LLM in scoring). Numbers anchor to RIL FY24 Integrated Annual Report. Reproduce: `python ShAuRyA_Supplymind/scenarios/reliance_industries_exposure.py`.
 
119
 
120
  **Key insight**: highest *score* node (RIIL pipelines 0.916) has lowest *absolute* impact (₹35 Cr) because it is a small-revenue stub. Highest *absolute* impact (Jamnagar ₹12,194 Cr) has lower score (0.824) but the largest revenue base. **Score and absolute impact tell different stories — both matter.**
121
 
122
+ Receipt: deterministic (no LLM in scoring). Numbers anchor to RIL FY24 Integrated Annual Report. Reproduce: `python versions/v4_arcadia_live/scenarios/reliance_industries_exposure.py`.
FINAL_SUBMIT/REPRODUCE.md CHANGED
@@ -70,10 +70,10 @@ python scripts/bootstrap_leaderboard.py
70
  python scripts/ollama_v5_vs_frontier.py
71
 
72
  # 7. HetGAT all 3 graphs (~30 min on RTX 4080)
73
- python -m ShAuRyA_Phoenix.gnn_v2.train_hetgat --graph all --epochs 200
74
 
75
  # 8. RAP-XC training on harvested transitions (~20 sec on RTX 4080)
76
- python -c "from ShAuRyA_Phoenix.rap_xc.train import train_rapxc; train_rapxc()"
77
  ```
78
 
79
  All produce JSON receipts at `tests/receipts/*.json`.
 
70
  python scripts/ollama_v5_vs_frontier.py
71
 
72
  # 7. HetGAT all 3 graphs (~30 min on RTX 4080)
73
+ python -m versions.v5_phoenix.gnn_v2.train_hetgat --graph all --epochs 200
74
 
75
  # 8. RAP-XC training on harvested transitions (~20 sec on RTX 4080)
76
+ python -c "from versions.v5_phoenix.rap_xc.train import train_rapxc; train_rapxc()"
77
  ```
78
 
79
  All produce JSON receipts at `tests/receipts/*.json`.
FINAL_SUBMIT/REPRODUCE_ONE_BASH.sh CHANGED
@@ -14,11 +14,11 @@ echo "Repo: $(pwd)"
14
  echo
15
 
16
  echo "[1/8] Wordle env + RLVE curriculum smoke ..."
17
- python -m ShAuRyA_Phoenix.wordle_env.rlve_curriculum
18
 
19
  echo
20
  echo "[2/8] Dual verifier smoke ..."
21
- python -m ShAuRyA_Phoenix.wordle_env.dual_verifier
22
 
23
  echo
24
  echo "[3/8] OpenEnv MCP compliance ..."
@@ -38,7 +38,7 @@ python scripts/final_validation_bundle.py
38
 
39
  echo
40
  echo "[7/8] Wordle GRPO baseline (heuristic policy receipt) ..."
41
- python -m ShAuRyA_Phoenix.wordle_env.train_grpo --steps 50 || true
42
 
43
  echo
44
  echo "[8/8] Receipt index ..."
 
14
  echo
15
 
16
  echo "[1/8] Wordle env + RLVE curriculum smoke ..."
17
+ python -m versions.v5_phoenix.wordle_env.rlve_curriculum
18
 
19
  echo
20
  echo "[2/8] Dual verifier smoke ..."
21
+ python -m versions.v5_phoenix.wordle_env.dual_verifier
22
 
23
  echo
24
  echo "[3/8] OpenEnv MCP compliance ..."
 
38
 
39
  echo
40
  echo "[7/8] Wordle GRPO baseline (heuristic policy receipt) ..."
41
+ python -m versions.v5_phoenix.wordle_env.train_grpo --steps 50 || true
42
 
43
  echo
44
  echo "[8/8] Receipt index ..."
FINAL_SUBMIT/RL_GUIDE_59POINT_ALIGNMENT.md CHANGED
@@ -25,7 +25,7 @@ Each of the 59 hackathon-guide points → which file implements it → which rec
25
  **Receipt**: HF Space-ready manifest.
26
 
27
  ## §6. Easy first
28
- **File**: `ShAuRyA_Phoenix/wordle_env/rlve_curriculum.py` Tier-0
29
  **Receipt**: `rlve_curriculum_smoke.json` — 4 tier shifts.
30
 
31
  ## §7. Reward design carefully
@@ -81,7 +81,7 @@ Each of the 59 hackathon-guide points → which file implements it → which rec
81
  **Receipt**: `lora_unsloth_train.json`.
82
 
83
  ## §31–33. Dual verifier
84
- **File**: `ShAuRyA_Phoenix/wordle_env/dual_verifier.py`
85
  **Receipt**: `dual_verifier_smoke.json` — BRAID FP caught.
86
 
87
  ## §34–37. Curriculum band 0.45–0.75
 
25
  **Receipt**: HF Space-ready manifest.
26
 
27
  ## §6. Easy first
28
+ **File**: `versions/v5_phoenix/wordle_env/rlve_curriculum.py` Tier-0
29
  **Receipt**: `rlve_curriculum_smoke.json` — 4 tier shifts.
30
 
31
  ## §7. Reward design carefully
 
81
  **Receipt**: `lora_unsloth_train.json`.
82
 
83
  ## §31–33. Dual verifier
84
+ **File**: `versions/v5_phoenix/wordle_env/dual_verifier.py`
85
  **Receipt**: `dual_verifier_smoke.json` — BRAID FP caught.
86
 
87
  ## §34–37. Curriculum band 0.45–0.75
FINAL_SUBMIT/docker/Dockerfile.api ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # System deps for sentence-transformers, faiss, torch
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ build-essential git curl ca-certificates libgomp1 \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ COPY requirements.txt /app/requirements.txt
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Repo (excluding models — they get mounted as a volume)
14
+ COPY . /app/
15
+
16
+ # Models live at /app/models — mount your local models/ dir as this volume
17
+ VOLUME /app/models
18
+
19
+ EXPOSE 8000
20
+
21
+ # Pre-warm not done in image — runs in lifespan handler at startup
22
+ ENV PYTHONIOENCODING=utf-8
23
+ ENV OLLAMA_MAX_LOADED_MODELS=1
24
+
25
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
FINAL_SUBMIT/docker/docker-compose.yml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "3.9"
2
+
3
+ services:
4
+ api:
5
+ build:
6
+ context: ../..
7
+ dockerfile: FINAL_SUBMIT/docker/Dockerfile.api
8
+ container_name: supplymind-api
9
+ ports:
10
+ - "8000:8000"
11
+ env_file:
12
+ - ../../.env
13
+ volumes:
14
+ - ../../models:/app/models:ro
15
+ - ../../tests/receipts:/app/tests/receipts
16
+ environment:
17
+ - PYTHONIOENCODING=utf-8
18
+ - OLLAMA_MAX_LOADED_MODELS=1
19
+ - OLLAMA_BASE_URL=http://ollama:11434
20
+ depends_on:
21
+ - ollama
22
+ restart: unless-stopped
23
+
24
+ ollama:
25
+ image: ollama/ollama:latest
26
+ container_name: supplymind-ollama
27
+ ports:
28
+ - "11434:11434"
29
+ volumes:
30
+ - ollama-data:/root/.ollama
31
+ restart: unless-stopped
32
+ deploy:
33
+ resources:
34
+ reservations:
35
+ devices:
36
+ - driver: nvidia
37
+ count: 1
38
+ capabilities: [gpu]
39
+
40
+ volumes:
41
+ ollama-data:
FINAL_SUBMIT/receipts/F2_multi_agent_apple_samsung_toyota.json CHANGED
@@ -1,117 +1,117 @@
1
- {
2
- "constants": {
3
- "cap_total_wafers_week": 1000,
4
- "wafer_revenue_usd": 16500,
5
- "shortfall_loss_usd_per_wafer": 55000,
6
- "crisis_duration_weeks": 6
7
- },
8
- "narrative": "2021-chip-shortage dynamic: TSMC backup capacity (1000 wafers/week) contested by Apple (aggressive) + Samsung (conservative) + Toyota (reactive). Apple bids hard early, captures >50% of step-1 capacity. Toyota waits, pays higher step-2 prices. Samsung splits budget.",
9
- "step_log": [
10
- {
11
- "event": "step_1_open",
12
- "capacity_remaining": 1000,
13
- "price_signal": 1.0
14
- },
15
- {
16
- "event": "step_1_bid",
17
- "agent": "Apple",
18
- "bid_usd": 15399999.999999998
19
- },
20
- {
21
- "event": "step_1_bid",
22
- "agent": "Samsung",
23
- "bid_usd": 3500000.0
24
- },
25
- {
26
- "event": "step_1_bid",
27
- "agent": "Toyota",
28
- "bid_usd": 0.0
29
- },
30
- {
31
- "event": "step_1_allocated",
32
- "agent": "Apple",
33
- "allocated_wafers": 407.4074074074074
34
- },
35
- {
36
- "event": "step_1_allocated",
37
- "agent": "Samsung",
38
- "allocated_wafers": 92.59259259259258
39
- },
40
- {
41
- "event": "step_1_allocated",
42
- "agent": "Toyota",
43
- "allocated_wafers": 0.0
44
- },
45
- {
46
- "event": "step_2_open",
47
- "capacity_remaining": 500.0,
48
- "price_signal": 2.291
49
- },
50
- {
51
- "event": "step_2_bid",
52
- "agent": "Apple",
53
- "bid_usd": 3300000.0
54
- },
55
- {
56
- "event": "step_2_bid",
57
- "agent": "Samsung",
58
- "bid_usd": 2800000.0
59
- },
60
- {
61
- "event": "step_2_bid",
62
- "agent": "Toyota",
63
- "bid_usd": 1833333.3333333333
64
- }
65
- ],
66
- "outcomes": [
67
- {
68
- "name": "Apple",
69
- "strategy": "aggressive",
70
- "budget_usd": 22000000,
71
- "bid_usd": 18700000.0,
72
- "allocated_wafers": 615.4,
73
- "revenue_earned_usd": 60923669.0,
74
- "shortfall_loss_usd": 39486850.0,
75
- "net_pnl_usd": 2736819.0
76
- },
77
- {
78
- "name": "Samsung",
79
- "strategy": "conservative",
80
- "budget_usd": 14000000,
81
- "bid_usd": 6300000.0,
82
- "allocated_wafers": 269.1,
83
- "revenue_earned_usd": 26637255.0,
84
- "shortfall_loss_usd": 31868192.0,
85
- "net_pnl_usd": -11530937.0
86
- },
87
- {
88
- "name": "Toyota",
89
- "strategy": "reactive",
90
- "budget_usd": 7000000,
91
- "bid_usd": 1833333.0,
92
- "allocated_wafers": 115.5,
93
- "revenue_earned_usd": 11439076.0,
94
- "shortfall_loss_usd": 16978291.0,
95
- "net_pnl_usd": -7372549.0
96
- }
97
- ],
98
- "ranking": [
99
- {
100
- "rank": 1,
101
- "agent": "Apple",
102
- "net_pnl_usd": 2736819.0
103
- },
104
- {
105
- "rank": 2,
106
- "agent": "Toyota",
107
- "net_pnl_usd": -7372549.0
108
- },
109
- {
110
- "rank": 3,
111
- "agent": "Samsung",
112
- "net_pnl_usd": -11530937.0
113
- }
114
- ],
115
- "winner": "Apple",
116
- "loser": "Samsung"
117
  }
 
1
+ {
2
+ "constants": {
3
+ "cap_total_wafers_week": 1000,
4
+ "wafer_revenue_usd": 16500,
5
+ "shortfall_loss_usd_per_wafer": 55000,
6
+ "crisis_duration_weeks": 6
7
+ },
8
+ "narrative": "2021-chip-shortage dynamic: TSMC backup capacity (1000 wafers/week) contested by Apple (aggressive) + Samsung (conservative) + Toyota (reactive). Apple bids hard early, captures >50% of step-1 capacity. Toyota waits, pays higher step-2 prices. Samsung splits budget.",
9
+ "step_log": [
10
+ {
11
+ "event": "step_1_open",
12
+ "capacity_remaining": 1000,
13
+ "price_signal": 1.0
14
+ },
15
+ {
16
+ "event": "step_1_bid",
17
+ "agent": "Apple",
18
+ "bid_usd": 15399999.999999998
19
+ },
20
+ {
21
+ "event": "step_1_bid",
22
+ "agent": "Samsung",
23
+ "bid_usd": 3500000.0
24
+ },
25
+ {
26
+ "event": "step_1_bid",
27
+ "agent": "Toyota",
28
+ "bid_usd": 0.0
29
+ },
30
+ {
31
+ "event": "step_1_allocated",
32
+ "agent": "Apple",
33
+ "allocated_wafers": 407.4074074074074
34
+ },
35
+ {
36
+ "event": "step_1_allocated",
37
+ "agent": "Samsung",
38
+ "allocated_wafers": 92.59259259259258
39
+ },
40
+ {
41
+ "event": "step_1_allocated",
42
+ "agent": "Toyota",
43
+ "allocated_wafers": 0.0
44
+ },
45
+ {
46
+ "event": "step_2_open",
47
+ "capacity_remaining": 500.0,
48
+ "price_signal": 2.291
49
+ },
50
+ {
51
+ "event": "step_2_bid",
52
+ "agent": "Apple",
53
+ "bid_usd": 3300000.0
54
+ },
55
+ {
56
+ "event": "step_2_bid",
57
+ "agent": "Samsung",
58
+ "bid_usd": 2800000.0
59
+ },
60
+ {
61
+ "event": "step_2_bid",
62
+ "agent": "Toyota",
63
+ "bid_usd": 1833333.3333333333
64
+ }
65
+ ],
66
+ "outcomes": [
67
+ {
68
+ "name": "Apple",
69
+ "strategy": "aggressive",
70
+ "budget_usd": 22000000,
71
+ "bid_usd": 18700000.0,
72
+ "allocated_wafers": 615.4,
73
+ "revenue_earned_usd": 60923669.0,
74
+ "shortfall_loss_usd": 39486850.0,
75
+ "net_pnl_usd": 2736819.0
76
+ },
77
+ {
78
+ "name": "Samsung",
79
+ "strategy": "conservative",
80
+ "budget_usd": 14000000,
81
+ "bid_usd": 6300000.0,
82
+ "allocated_wafers": 269.1,
83
+ "revenue_earned_usd": 26637255.0,
84
+ "shortfall_loss_usd": 31868192.0,
85
+ "net_pnl_usd": -11530937.0
86
+ },
87
+ {
88
+ "name": "Toyota",
89
+ "strategy": "reactive",
90
+ "budget_usd": 7000000,
91
+ "bid_usd": 1833333.0,
92
+ "allocated_wafers": 115.5,
93
+ "revenue_earned_usd": 11439076.0,
94
+ "shortfall_loss_usd": 16978291.0,
95
+ "net_pnl_usd": -7372549.0
96
+ }
97
+ ],
98
+ "ranking": [
99
+ {
100
+ "rank": 1,
101
+ "agent": "Apple",
102
+ "net_pnl_usd": 2736819.0
103
+ },
104
+ {
105
+ "rank": 2,
106
+ "agent": "Toyota",
107
+ "net_pnl_usd": -7372549.0
108
+ },
109
+ {
110
+ "rank": 3,
111
+ "agent": "Samsung",
112
+ "net_pnl_usd": -11530937.0
113
+ }
114
+ ],
115
+ "winner": "Apple",
116
+ "loser": "Samsung"
117
  }
FINAL_SUBMIT/receipts/ONNX_BUNDLE_MANIFEST.json CHANGED
@@ -1,72 +1,72 @@
1
- {
2
- "exported": [
3
- {
4
- "name": "ppo_easy_typhoon_response (MaskablePPO)",
5
- "file": "ppo_easy_typhoon_response.onnx",
6
- "size_kb": 948,
7
- "input_shape": [
8
- 1,
9
- 408
10
- ],
11
- "output_shape": [
12
- 1,
13
- 280
14
- ],
15
- "source": "v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
16
- },
17
- {
18
- "name": "ppo_medium_multi_front (MaskablePPO)",
19
- "file": "ppo_medium_multi_front.onnx",
20
- "size_kb": 948,
21
- "input_shape": [
22
- 1,
23
- 408
24
- ],
25
- "output_shape": [
26
- 1,
27
- 280
28
- ],
29
- "source": "v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
30
- },
31
- {
32
- "name": "ppo_hard_cascading_crisis (MaskablePPO)",
33
- "file": "ppo_hard_cascading_crisis.onnx",
34
- "size_kb": 948,
35
- "input_shape": [
36
- 1,
37
- 408
38
- ],
39
- "output_shape": [
40
- 1,
41
- 280
42
- ],
43
- "source": "v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
44
- },
45
- {
46
- "name": "GCN arrival-time regressor",
47
- "file": "gcn_arrival.onnx",
48
- "size_kb": 10,
49
- "input_shape": [
50
- "[N, 4]",
51
- "[N, N]"
52
- ],
53
- "output_shape": [
54
- "[N]"
55
- ],
56
- "source": "v3_arcadia/70_provider/r6_gnn_arrival_time.py"
57
- }
58
- ],
59
- "skipped": [
60
- {
61
- "name": "Ridge stacker",
62
- "reason": "skl2onnx not installed: No module named 'skl2onnx'"
63
- },
64
- {
65
- "name": "TFT v1",
66
- "reason": "pytorch-forecasting TimeSeriesDataSet is required at inference; ONNX export requires a wrapper that packages the normalizer scaler + encoder/decoder split. Deferred as v4 work."
67
- }
68
- ],
69
- "elapsed_s": 0.8302168846130371,
70
- "bundle_dir": "v3_arcadia\\checkpoints\\onnx_bundle",
71
- "total_bundle_size_kb": 2854
72
  }
 
1
+ {
2
+ "exported": [
3
+ {
4
+ "name": "ppo_easy_typhoon_response (MaskablePPO)",
5
+ "file": "ppo_easy_typhoon_response.onnx",
6
+ "size_kb": 948,
7
+ "input_shape": [
8
+ 1,
9
+ 408
10
+ ],
11
+ "output_shape": [
12
+ 1,
13
+ 280
14
+ ],
15
+ "source": "versions/v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
16
+ },
17
+ {
18
+ "name": "ppo_medium_multi_front (MaskablePPO)",
19
+ "file": "ppo_medium_multi_front.onnx",
20
+ "size_kb": 948,
21
+ "input_shape": [
22
+ 1,
23
+ 408
24
+ ],
25
+ "output_shape": [
26
+ 1,
27
+ 280
28
+ ],
29
+ "source": "versions/v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
30
+ },
31
+ {
32
+ "name": "ppo_hard_cascading_crisis (MaskablePPO)",
33
+ "file": "ppo_hard_cascading_crisis.onnx",
34
+ "size_kb": 948,
35
+ "input_shape": [
36
+ 1,
37
+ 408
38
+ ],
39
+ "output_shape": [
40
+ 1,
41
+ 280
42
+ ],
43
+ "source": "versions/v3_arcadia/50_gethsemane/export_v3_ppo_onnx.py"
44
+ },
45
+ {
46
+ "name": "GCN arrival-time regressor",
47
+ "file": "gcn_arrival.onnx",
48
+ "size_kb": 10,
49
+ "input_shape": [
50
+ "[N, 4]",
51
+ "[N, N]"
52
+ ],
53
+ "output_shape": [
54
+ "[N]"
55
+ ],
56
+ "source": "versions/v3_arcadia/70_provider/r6_gnn_arrival_time.py"
57
+ }
58
+ ],
59
+ "skipped": [
60
+ {
61
+ "name": "Ridge stacker",
62
+ "reason": "skl2onnx not installed: No module named 'skl2onnx'"
63
+ },
64
+ {
65
+ "name": "TFT v1",
66
+ "reason": "pytorch-forecasting TimeSeriesDataSet is required at inference; ONNX export requires a wrapper that packages the normalizer scaler + encoder/decoder split. Deferred as v4 work."
67
+ }
68
+ ],
69
+ "elapsed_s": 0.8302168846130371,
70
+ "bundle_dir": "versions/v3_arcadia/checkpoints/onnx_bundle",
71
+ "total_bundle_size_kb": 2854
72
  }
FINAL_SUBMIT/receipts/R2_SHAP_FAIRNESS_CALIBRATION.json CHANGED
@@ -1,502 +1,502 @@
1
- {
2
- "shap_top15": {
3
- "late_delivery_risk": {
4
- "algo": "xgb",
5
- "top15_features": [
6
- {
7
- "name": "Shipping Mode__First Class",
8
- "importance": 0.7326152324676514
9
- },
10
- {
11
- "name": "sched_days",
12
- "importance": 0.6606742739677429
13
- },
14
- {
15
- "name": "Type__TRANSFER",
16
- "importance": 0.47632965445518494
17
- },
18
- {
19
- "name": "Order Customer Id",
20
- "importance": 0.17082303762435913
21
- },
22
- {
23
- "name": "Latitude",
24
- "importance": 0.160926952958107
25
- },
26
- {
27
- "name": "Shipping Mode__Second Class",
28
- "importance": 0.14983786642551422
29
- },
30
- {
31
- "name": "Longitude",
32
- "importance": 0.13300901651382446
33
- },
34
- {
35
- "name": "Shipping Mode__Standard Class",
36
- "importance": 0.12997667491436005
37
- },
38
- {
39
- "name": "order_day",
40
- "importance": 0.10712296515703201
41
- },
42
- {
43
- "name": "order_month",
44
- "importance": 0.07108364999294281
45
- },
46
- {
47
- "name": "order_dow",
48
- "importance": 0.06861100345849991
49
- },
50
- {
51
- "name": "Order Item Total",
52
- "importance": 0.0614430233836174
53
- },
54
- {
55
- "name": "Type__DEBIT",
56
- "importance": 0.05896211788058281
57
- },
58
- {
59
- "name": "Sales",
60
- "importance": 0.04449347406625748
61
- },
62
- {
63
- "name": "Order Item Discount",
64
- "importance": 0.04405033215880394
65
- }
66
- ],
67
- "n_samples": 1000
68
- },
69
- "shipping_mode": {
70
- "algo": "lgb",
71
- "top15_features": [
72
- {
73
- "name": "order_day",
74
- "importance": 0.14531971700119595
75
- },
76
- {
77
- "name": "Latitude",
78
- "importance": 0.13565060253209485
79
- },
80
- {
81
- "name": "Order Customer Id",
82
- "importance": 0.13102491053295864
83
- },
84
- {
85
- "name": "Longitude",
86
- "importance": 0.1222981746063068
87
- },
88
- {
89
- "name": "Order Zipcode",
90
- "importance": 0.09815205910031981
91
- },
92
- {
93
- "name": "order_month",
94
- "importance": 0.09317142717955136
95
- },
96
- {
97
- "name": "order_dow",
98
- "importance": 0.07841270762869156
99
- },
100
- {
101
- "name": "Order Item Total",
102
- "importance": 0.044599598632655106
103
- },
104
- {
105
- "name": "Order Item Discount",
106
- "importance": 0.033594561793665254
107
- },
108
- {
109
- "name": "order_year",
110
- "importance": 0.029623813091121495
111
- },
112
- {
113
- "name": "Customer Segment__Home Office",
114
- "importance": 0.02582491478215546
115
- },
116
- {
117
- "name": "Type__DEBIT",
118
- "importance": 0.019900877735072642
119
- },
120
- {
121
- "name": "Order Item Discount Rate",
122
- "importance": 0.019821976340370435
123
- },
124
- {
125
- "name": "Customer Segment__Consumer",
126
- "importance": 0.019363164732533623
127
- },
128
- {
129
- "name": "Sales",
130
- "importance": 0.019305355520423926
131
- }
132
- ],
133
- "n_samples": 1000
134
- },
135
- "delivery_status": {
136
- "algo": "lgb",
137
- "top15_features": [
138
- {
139
- "name": "sched_days",
140
- "importance": 1.0622776241691645
141
- },
142
- {
143
- "name": "Type__TRANSFER",
144
- "importance": 0.9869317661543312
145
- },
146
- {
147
- "name": "Shipping Mode__First Class",
148
- "importance": 0.5401095981609848
149
- },
150
- {
151
- "name": "Latitude",
152
- "importance": 0.1469638826819572
153
- },
154
- {
155
- "name": "Order Customer Id",
156
- "importance": 0.12387527105673957
157
- },
158
- {
159
- "name": "Longitude",
160
- "importance": 0.12152826063388397
161
- },
162
- {
163
- "name": "Shipping Mode__Standard Class",
164
- "importance": 0.11399112380975975
165
- },
166
- {
167
- "name": "Type__DEBIT",
168
- "importance": 0.11226916777330752
169
- },
170
- {
171
- "name": "order_day",
172
- "importance": 0.08720905988856538
173
- },
174
- {
175
- "name": "Type__PAYMENT",
176
- "importance": 0.07393674075739048
177
- },
178
- {
179
- "name": "order_month",
180
- "importance": 0.05996037188478746
181
- },
182
- {
183
- "name": "order_dow",
184
- "importance": 0.055766425673077755
185
- },
186
- {
187
- "name": "Shipping Mode__Second Class",
188
- "importance": 0.05278020082991879
189
- },
190
- {
191
- "name": "Type__CASH",
192
- "importance": 0.045583216438798695
193
- },
194
- {
195
- "name": "Order Item Total",
196
- "importance": 0.043191257310719586
197
- }
198
- ],
199
- "n_samples": 1000
200
- }
201
- },
202
- "fairness": {
203
- "late_delivery_risk": {
204
- "Market": {
205
- "Africa": {
206
- "n": 1768,
207
- "accuracy": 0.869343891402715
208
- },
209
- "Europe": {
210
- "n": 7437,
211
- "accuracy": 0.8284254403657388
212
- },
213
- "LATAM": {
214
- "n": 7771,
215
- "accuracy": 0.8390168575472912
216
- },
217
- "Pacific Asia": {
218
- "n": 6263,
219
- "accuracy": 0.8112725530895737
220
- },
221
- "USCA": {
222
- "n": 3839,
223
- "accuracy": 0.8767908309455588
224
- },
225
- "__summary__": {
226
- "max_acc": 0.8767908309455588,
227
- "min_acc": 0.8112725530895737,
228
- "disparity": 0.06551827785598507
229
- }
230
- },
231
- "Customer Segment": {
232
- "Consumer": {
233
- "n": 13998,
234
- "accuracy": 0.8350478639805686
235
- },
236
- "Corporate": {
237
- "n": 8212,
238
- "accuracy": 0.8364588407208963
239
- },
240
- "Home Office": {
241
- "n": 4868,
242
- "accuracy": 0.8436729663105998
243
- },
244
- "__summary__": {
245
- "max_acc": 0.8436729663105998,
246
- "min_acc": 0.8350478639805686,
247
- "disparity": 0.00862510233003122
248
- }
249
- }
250
- },
251
- "shipping_mode": {
252
- "Market": {
253
- "Africa": {
254
- "n": 1721,
255
- "accuracy": 0.8059267867518884
256
- },
257
- "Europe": {
258
- "n": 7650,
259
- "accuracy": 0.7586928104575164
260
- },
261
- "LATAM": {
262
- "n": 7701,
263
- "accuracy": 0.7809375405791456
264
- },
265
- "Pacific Asia": {
266
- "n": 6143,
267
- "accuracy": 0.7584242226924955
268
- },
269
- "USCA": {
270
- "n": 3863,
271
- "accuracy": 0.8193114159979291
272
- },
273
- "__summary__": {
274
- "max_acc": 0.8193114159979291,
275
- "min_acc": 0.7584242226924955,
276
- "disparity": 0.06088719330543357
277
- }
278
- },
279
- "Customer Segment": {
280
- "Consumer": {
281
- "n": 14008,
282
- "accuracy": 0.7669902912621359
283
- },
284
- "Corporate": {
285
- "n": 8269,
286
- "accuracy": 0.7872777844963115
287
- },
288
- "Home Office": {
289
- "n": 4801,
290
- "accuracy": 0.7862945219745886
291
- },
292
- "__summary__": {
293
- "max_acc": 0.7872777844963115,
294
- "min_acc": 0.7669902912621359,
295
- "disparity": 0.020287493234175558
296
- }
297
- }
298
- },
299
- "delivery_status": {
300
- "Market": {
301
- "Africa": {
302
- "n": 1767,
303
- "accuracy": 0.8687040181097906
304
- },
305
- "Europe": {
306
- "n": 7505,
307
- "accuracy": 0.8282478347768154
308
- },
309
- "LATAM": {
310
- "n": 7746,
311
- "accuracy": 0.8502452878905241
312
- },
313
- "Pacific Asia": {
314
- "n": 6142,
315
- "accuracy": 0.8150439596222728
316
- },
317
- "USCA": {
318
- "n": 3918,
319
- "accuracy": 0.8769780500255232
320
- },
321
- "__summary__": {
322
- "max_acc": 0.8769780500255232,
323
- "min_acc": 0.8150439596222728,
324
- "disparity": 0.061934090403250375
325
- }
326
- },
327
- "Customer Segment": {
328
- "Consumer": {
329
- "n": 14087,
330
- "accuracy": 0.8335344643998013
331
- },
332
- "Corporate": {
333
- "n": 8197,
334
- "accuracy": 0.8446992802244724
335
- },
336
- "Home Office": {
337
- "n": 4794,
338
- "accuracy": 0.8579474342928661
339
- },
340
- "__summary__": {
341
- "max_acc": 0.8579474342928661,
342
- "min_acc": 0.8335344643998013,
343
- "disparity": 0.02441296989306485
344
- }
345
- }
346
- }
347
- },
348
- "calibration": {
349
- "late_delivery_risk": {
350
- "algo": "xgb",
351
- "n_bins": 15,
352
- "bin_confidence": [
353
- 0.047601889818906784,
354
- 0.10591482371091843,
355
- 0.1693299263715744,
356
- 0.23376236855983734,
357
- 0.2985405921936035,
358
- 0.365536093711853,
359
- 0.43266668915748596,
360
- 0.49862194061279297,
361
- 0.5664905309677124,
362
- 0.6322769522666931,
363
- 0.700205385684967,
364
- 0.7678216695785522,
365
- 0.834970235824585,
366
- 0.9012444019317627,
367
- 0.9871050715446472
368
- ],
369
- "bin_accuracy": [
370
- 0.04878048780487805,
371
- 0.03429602888086643,
372
- 0.06657608695652174,
373
- 0.10221205186880244,
374
- 0.1659671880961465,
375
- 0.3065795613625758,
376
- 0.4490950226244344,
377
- 0.6264543784445805,
378
- 0.7001414427157001,
379
- 0.7884012539184952,
380
- 0.8334786399302528,
381
- 0.8685524126455907,
382
- 0.920274914089347,
383
- 0.9493734335839599,
384
- 0.9918243401074516
385
- ],
386
- "bin_n": [
387
- 205,
388
- 1108,
389
- 2208,
390
- 2622,
391
- 2621,
392
- 2143,
393
- 1768,
394
- 1633,
395
- 1414,
396
- 1276,
397
- 1147,
398
- 1202,
399
- 1455,
400
- 1995,
401
- 4281
402
- ],
403
- "ece": 0.08366547522741584,
404
- "brier": 0.12393409580512378,
405
- "temperature_scaling_T": 0.6172709141132063
406
- },
407
- "shipping_mode": {
408
- "algo": "lgb",
409
- "n_bins": 15,
410
- "bin_confidence": [
411
- 0.3121110714805393,
412
- 0.37706821969221477,
413
- 0.44009373318135214,
414
- 0.5003264091242992,
415
- 0.5668423455793702,
416
- 0.6341087325686549,
417
- 0.7010409508680902,
418
- 0.7664726820296514,
419
- 0.8315982324325599,
420
- 0.8946591419686111,
421
- 0.9531121751216614
422
- ],
423
- "bin_accuracy": [
424
- 0.2,
425
- 0.3730886850152905,
426
- 0.45858343337334934,
427
- 0.49913164293157347,
428
- 0.5809395065900642,
429
- 0.7184009406231628,
430
- 0.8413356080916402,
431
- 0.9226793467025015,
432
- 0.9520665199315236,
433
- 0.9763365468886941,
434
- 0.9710982658959537
435
- ],
436
- "bin_n": [
437
- 15,
438
- 327,
439
- 1666,
440
- 2879,
441
- 2959,
442
- 3402,
443
- 4103,
444
- 4837,
445
- 4089,
446
- 2282,
447
- 519
448
- ],
449
- "ece": 0.08808701528421295,
450
- "brier": 0.14974528304098794,
451
- "temperature_scaling_T": 0.7013679012815588
452
- },
453
- "delivery_status": {
454
- "algo": "lgb",
455
- "n_bins": 15,
456
- "bin_confidence": [
457
- 0.31674386091217493,
458
- 0.3747040640569195,
459
- 0.4360554176701256,
460
- 0.49978873696550224,
461
- 0.5660495258460405,
462
- 0.6325569759747155,
463
- 0.6996959611938123,
464
- 0.7661925883072682,
465
- 0.8343464222875331,
466
- 0.9017332581703068,
467
- 0.9839647836453121
468
- ],
469
- "bin_accuracy": [
470
- 0.2222222222222222,
471
- 0.3987341772151899,
472
- 0.5257352941176471,
473
- 0.6634679020516214,
474
- 0.8109608047173084,
475
- 0.8790291998483125,
476
- 0.9103793247186328,
477
- 0.9274406332453826,
478
- 0.9517241379310345,
479
- 0.9663677130044843,
480
- 0.9874145990650846
481
- ],
482
- "bin_n": [
483
- 54,
484
- 948,
485
- 2448,
486
- 3022,
487
- 2883,
488
- 2637,
489
- 2399,
490
- 2274,
491
- 2175,
492
- 2676,
493
- 5562
494
- ],
495
- "ece": 0.12621462481898915,
496
- "brier": 0.1285071700698595,
497
- "temperature_scaling_T": 0.5595696359480499
498
- }
499
- },
500
- "reliability_plot_saved": true,
501
- "elapsed_min": 1.084403399626414
502
  }
 
1
+ {
2
+ "shap_top15": {
3
+ "late_delivery_risk": {
4
+ "algo": "xgb",
5
+ "top15_features": [
6
+ {
7
+ "name": "Shipping Mode__First Class",
8
+ "importance": 0.7326152324676514
9
+ },
10
+ {
11
+ "name": "sched_days",
12
+ "importance": 0.6606742739677429
13
+ },
14
+ {
15
+ "name": "Type__TRANSFER",
16
+ "importance": 0.47632965445518494
17
+ },
18
+ {
19
+ "name": "Order Customer Id",
20
+ "importance": 0.17082303762435913
21
+ },
22
+ {
23
+ "name": "Latitude",
24
+ "importance": 0.160926952958107
25
+ },
26
+ {
27
+ "name": "Shipping Mode__Second Class",
28
+ "importance": 0.14983786642551422
29
+ },
30
+ {
31
+ "name": "Longitude",
32
+ "importance": 0.13300901651382446
33
+ },
34
+ {
35
+ "name": "Shipping Mode__Standard Class",
36
+ "importance": 0.12997667491436005
37
+ },
38
+ {
39
+ "name": "order_day",
40
+ "importance": 0.10712296515703201
41
+ },
42
+ {
43
+ "name": "order_month",
44
+ "importance": 0.07108364999294281
45
+ },
46
+ {
47
+ "name": "order_dow",
48
+ "importance": 0.06861100345849991
49
+ },
50
+ {
51
+ "name": "Order Item Total",
52
+ "importance": 0.0614430233836174
53
+ },
54
+ {
55
+ "name": "Type__DEBIT",
56
+ "importance": 0.05896211788058281
57
+ },
58
+ {
59
+ "name": "Sales",
60
+ "importance": 0.04449347406625748
61
+ },
62
+ {
63
+ "name": "Order Item Discount",
64
+ "importance": 0.04405033215880394
65
+ }
66
+ ],
67
+ "n_samples": 1000
68
+ },
69
+ "shipping_mode": {
70
+ "algo": "lgb",
71
+ "top15_features": [
72
+ {
73
+ "name": "order_day",
74
+ "importance": 0.14531971700119595
75
+ },
76
+ {
77
+ "name": "Latitude",
78
+ "importance": 0.13565060253209485
79
+ },
80
+ {
81
+ "name": "Order Customer Id",
82
+ "importance": 0.13102491053295864
83
+ },
84
+ {
85
+ "name": "Longitude",
86
+ "importance": 0.1222981746063068
87
+ },
88
+ {
89
+ "name": "Order Zipcode",
90
+ "importance": 0.09815205910031981
91
+ },
92
+ {
93
+ "name": "order_month",
94
+ "importance": 0.09317142717955136
95
+ },
96
+ {
97
+ "name": "order_dow",
98
+ "importance": 0.07841270762869156
99
+ },
100
+ {
101
+ "name": "Order Item Total",
102
+ "importance": 0.044599598632655106
103
+ },
104
+ {
105
+ "name": "Order Item Discount",
106
+ "importance": 0.033594561793665254
107
+ },
108
+ {
109
+ "name": "order_year",
110
+ "importance": 0.029623813091121495
111
+ },
112
+ {
113
+ "name": "Customer Segment__Home Office",
114
+ "importance": 0.02582491478215546
115
+ },
116
+ {
117
+ "name": "Type__DEBIT",
118
+ "importance": 0.019900877735072642
119
+ },
120
+ {
121
+ "name": "Order Item Discount Rate",
122
+ "importance": 0.019821976340370435
123
+ },
124
+ {
125
+ "name": "Customer Segment__Consumer",
126
+ "importance": 0.019363164732533623
127
+ },
128
+ {
129
+ "name": "Sales",
130
+ "importance": 0.019305355520423926
131
+ }
132
+ ],
133
+ "n_samples": 1000
134
+ },
135
+ "delivery_status": {
136
+ "algo": "lgb",
137
+ "top15_features": [
138
+ {
139
+ "name": "sched_days",
140
+ "importance": 1.0622776241691645
141
+ },
142
+ {
143
+ "name": "Type__TRANSFER",
144
+ "importance": 0.9869317661543312
145
+ },
146
+ {
147
+ "name": "Shipping Mode__First Class",
148
+ "importance": 0.5401095981609848
149
+ },
150
+ {
151
+ "name": "Latitude",
152
+ "importance": 0.1469638826819572
153
+ },
154
+ {
155
+ "name": "Order Customer Id",
156
+ "importance": 0.12387527105673957
157
+ },
158
+ {
159
+ "name": "Longitude",
160
+ "importance": 0.12152826063388397
161
+ },
162
+ {
163
+ "name": "Shipping Mode__Standard Class",
164
+ "importance": 0.11399112380975975
165
+ },
166
+ {
167
+ "name": "Type__DEBIT",
168
+ "importance": 0.11226916777330752
169
+ },
170
+ {
171
+ "name": "order_day",
172
+ "importance": 0.08720905988856538
173
+ },
174
+ {
175
+ "name": "Type__PAYMENT",
176
+ "importance": 0.07393674075739048
177
+ },
178
+ {
179
+ "name": "order_month",
180
+ "importance": 0.05996037188478746
181
+ },
182
+ {
183
+ "name": "order_dow",
184
+ "importance": 0.055766425673077755
185
+ },
186
+ {
187
+ "name": "Shipping Mode__Second Class",
188
+ "importance": 0.05278020082991879
189
+ },
190
+ {
191
+ "name": "Type__CASH",
192
+ "importance": 0.045583216438798695
193
+ },
194
+ {
195
+ "name": "Order Item Total",
196
+ "importance": 0.043191257310719586
197
+ }
198
+ ],
199
+ "n_samples": 1000
200
+ }
201
+ },
202
+ "fairness": {
203
+ "late_delivery_risk": {
204
+ "Market": {
205
+ "Africa": {
206
+ "n": 1768,
207
+ "accuracy": 0.869343891402715
208
+ },
209
+ "Europe": {
210
+ "n": 7437,
211
+ "accuracy": 0.8284254403657388
212
+ },
213
+ "LATAM": {
214
+ "n": 7771,
215
+ "accuracy": 0.8390168575472912
216
+ },
217
+ "Pacific Asia": {
218
+ "n": 6263,
219
+ "accuracy": 0.8112725530895737
220
+ },
221
+ "USCA": {
222
+ "n": 3839,
223
+ "accuracy": 0.8767908309455588
224
+ },
225
+ "__summary__": {
226
+ "max_acc": 0.8767908309455588,
227
+ "min_acc": 0.8112725530895737,
228
+ "disparity": 0.06551827785598507
229
+ }
230
+ },
231
+ "Customer Segment": {
232
+ "Consumer": {
233
+ "n": 13998,
234
+ "accuracy": 0.8350478639805686
235
+ },
236
+ "Corporate": {
237
+ "n": 8212,
238
+ "accuracy": 0.8364588407208963
239
+ },
240
+ "Home Office": {
241
+ "n": 4868,
242
+ "accuracy": 0.8436729663105998
243
+ },
244
+ "__summary__": {
245
+ "max_acc": 0.8436729663105998,
246
+ "min_acc": 0.8350478639805686,
247
+ "disparity": 0.00862510233003122
248
+ }
249
+ }
250
+ },
251
+ "shipping_mode": {
252
+ "Market": {
253
+ "Africa": {
254
+ "n": 1721,
255
+ "accuracy": 0.8059267867518884
256
+ },
257
+ "Europe": {
258
+ "n": 7650,
259
+ "accuracy": 0.7586928104575164
260
+ },
261
+ "LATAM": {
262
+ "n": 7701,
263
+ "accuracy": 0.7809375405791456
264
+ },
265
+ "Pacific Asia": {
266
+ "n": 6143,
267
+ "accuracy": 0.7584242226924955
268
+ },
269
+ "USCA": {
270
+ "n": 3863,
271
+ "accuracy": 0.8193114159979291
272
+ },
273
+ "__summary__": {
274
+ "max_acc": 0.8193114159979291,
275
+ "min_acc": 0.7584242226924955,
276
+ "disparity": 0.06088719330543357
277
+ }
278
+ },
279
+ "Customer Segment": {
280
+ "Consumer": {
281
+ "n": 14008,
282
+ "accuracy": 0.7669902912621359
283
+ },
284
+ "Corporate": {
285
+ "n": 8269,
286
+ "accuracy": 0.7872777844963115
287
+ },
288
+ "Home Office": {
289
+ "n": 4801,
290
+ "accuracy": 0.7862945219745886
291
+ },
292
+ "__summary__": {
293
+ "max_acc": 0.7872777844963115,
294
+ "min_acc": 0.7669902912621359,
295
+ "disparity": 0.020287493234175558
296
+ }
297
+ }
298
+ },
299
+ "delivery_status": {
300
+ "Market": {
301
+ "Africa": {
302
+ "n": 1767,
303
+ "accuracy": 0.8687040181097906
304
+ },
305
+ "Europe": {
306
+ "n": 7505,
307
+ "accuracy": 0.8282478347768154
308
+ },
309
+ "LATAM": {
310
+ "n": 7746,
311
+ "accuracy": 0.8502452878905241
312
+ },
313
+ "Pacific Asia": {
314
+ "n": 6142,
315
+ "accuracy": 0.8150439596222728
316
+ },
317
+ "USCA": {
318
+ "n": 3918,
319
+ "accuracy": 0.8769780500255232
320
+ },
321
+ "__summary__": {
322
+ "max_acc": 0.8769780500255232,
323
+ "min_acc": 0.8150439596222728,
324
+ "disparity": 0.061934090403250375
325
+ }
326
+ },
327
+ "Customer Segment": {
328
+ "Consumer": {
329
+ "n": 14087,
330
+ "accuracy": 0.8335344643998013
331
+ },
332
+ "Corporate": {
333
+ "n": 8197,
334
+ "accuracy": 0.8446992802244724
335
+ },
336
+ "Home Office": {
337
+ "n": 4794,
338
+ "accuracy": 0.8579474342928661
339
+ },
340
+ "__summary__": {
341
+ "max_acc": 0.8579474342928661,
342
+ "min_acc": 0.8335344643998013,
343
+ "disparity": 0.02441296989306485
344
+ }
345
+ }
346
+ }
347
+ },
348
+ "calibration": {
349
+ "late_delivery_risk": {
350
+ "algo": "xgb",
351
+ "n_bins": 15,
352
+ "bin_confidence": [
353
+ 0.047601889818906784,
354
+ 0.10591482371091843,
355
+ 0.1693299263715744,
356
+ 0.23376236855983734,
357
+ 0.2985405921936035,
358
+ 0.365536093711853,
359
+ 0.43266668915748596,
360
+ 0.49862194061279297,
361
+ 0.5664905309677124,
362
+ 0.6322769522666931,
363
+ 0.700205385684967,
364
+ 0.7678216695785522,
365
+ 0.834970235824585,
366
+ 0.9012444019317627,
367
+ 0.9871050715446472
368
+ ],
369
+ "bin_accuracy": [
370
+ 0.04878048780487805,
371
+ 0.03429602888086643,
372
+ 0.06657608695652174,
373
+ 0.10221205186880244,
374
+ 0.1659671880961465,
375
+ 0.3065795613625758,
376
+ 0.4490950226244344,
377
+ 0.6264543784445805,
378
+ 0.7001414427157001,
379
+ 0.7884012539184952,
380
+ 0.8334786399302528,
381
+ 0.8685524126455907,
382
+ 0.920274914089347,
383
+ 0.9493734335839599,
384
+ 0.9918243401074516
385
+ ],
386
+ "bin_n": [
387
+ 205,
388
+ 1108,
389
+ 2208,
390
+ 2622,
391
+ 2621,
392
+ 2143,
393
+ 1768,
394
+ 1633,
395
+ 1414,
396
+ 1276,
397
+ 1147,
398
+ 1202,
399
+ 1455,
400
+ 1995,
401
+ 4281
402
+ ],
403
+ "ece": 0.08366547522741584,
404
+ "brier": 0.12393409580512378,
405
+ "temperature_scaling_T": 0.6172709141132063
406
+ },
407
+ "shipping_mode": {
408
+ "algo": "lgb",
409
+ "n_bins": 15,
410
+ "bin_confidence": [
411
+ 0.3121110714805393,
412
+ 0.37706821969221477,
413
+ 0.44009373318135214,
414
+ 0.5003264091242992,
415
+ 0.5668423455793702,
416
+ 0.6341087325686549,
417
+ 0.7010409508680902,
418
+ 0.7664726820296514,
419
+ 0.8315982324325599,
420
+ 0.8946591419686111,
421
+ 0.9531121751216614
422
+ ],
423
+ "bin_accuracy": [
424
+ 0.2,
425
+ 0.3730886850152905,
426
+ 0.45858343337334934,
427
+ 0.49913164293157347,
428
+ 0.5809395065900642,
429
+ 0.7184009406231628,
430
+ 0.8413356080916402,
431
+ 0.9226793467025015,
432
+ 0.9520665199315236,
433
+ 0.9763365468886941,
434
+ 0.9710982658959537
435
+ ],
436
+ "bin_n": [
437
+ 15,
438
+ 327,
439
+ 1666,
440
+ 2879,
441
+ 2959,
442
+ 3402,
443
+ 4103,
444
+ 4837,
445
+ 4089,
446
+ 2282,
447
+ 519
448
+ ],
449
+ "ece": 0.08808701528421295,
450
+ "brier": 0.14974528304098794,
451
+ "temperature_scaling_T": 0.7013679012815588
452
+ },
453
+ "delivery_status": {
454
+ "algo": "lgb",
455
+ "n_bins": 15,
456
+ "bin_confidence": [
457
+ 0.31674386091217493,
458
+ 0.3747040640569195,
459
+ 0.4360554176701256,
460
+ 0.49978873696550224,
461
+ 0.5660495258460405,
462
+ 0.6325569759747155,
463
+ 0.6996959611938123,
464
+ 0.7661925883072682,
465
+ 0.8343464222875331,
466
+ 0.9017332581703068,
467
+ 0.9839647836453121
468
+ ],
469
+ "bin_accuracy": [
470
+ 0.2222222222222222,
471
+ 0.3987341772151899,
472
+ 0.5257352941176471,
473
+ 0.6634679020516214,
474
+ 0.8109608047173084,
475
+ 0.8790291998483125,
476
+ 0.9103793247186328,
477
+ 0.9274406332453826,
478
+ 0.9517241379310345,
479
+ 0.9663677130044843,
480
+ 0.9874145990650846
481
+ ],
482
+ "bin_n": [
483
+ 54,
484
+ 948,
485
+ 2448,
486
+ 3022,
487
+ 2883,
488
+ 2637,
489
+ 2399,
490
+ 2274,
491
+ 2175,
492
+ 2676,
493
+ 5562
494
+ ],
495
+ "ece": 0.12621462481898915,
496
+ "brier": 0.1285071700698595,
497
+ "temperature_scaling_T": 0.5595696359480499
498
+ }
499
+ },
500
+ "reliability_plot_saved": true,
501
+ "elapsed_min": 1.084403399626414
502
  }
FINAL_SUBMIT/receipts/R3_BIGTFT_INTEGRATION.json CHANGED
@@ -1,52 +1,52 @@
1
- {
2
- "model": "Temporal Fusion Transformer",
3
- "paper": "Lim et al. 2021 \u2014 Temporal Fusion Transformers for interpretable multi-horizon time series forecasting",
4
- "implementation": "rl/forecasting/tft.py (v1 single-target) + rl/forecasting/train_tft_real.py (v2 multi-target)",
5
- "params": {
6
- "v1": 90602,
7
- "v2": 513534
8
- },
9
- "checkpoints": {
10
- "v1_real": {
11
- "path": "rl/checkpoints/tft_real.pt",
12
- "params": 90602,
13
- "test_mae_usd": 7.8270111083984375,
14
- "quantile_loss": 0.07062085568904877,
15
- "horizon": 14,
16
- "target": "DCOILWTICO"
17
- },
18
- "v2_multi": {
19
- "path": "rl/checkpoints/tft_v2.pt",
20
- "params": 513534,
21
- "test_mae_p50": {
22
- "DCOILWTICO": 52.868377685546875,
23
- "PCOPPUSDM": 2165.05419921875,
24
- "PPICMM": 127.1404800415039
25
- },
26
- "best_val_qloss": 0.024498114362359047,
27
- "n_rolling_folds": 10
28
- }
29
- },
30
- "integration_in_r3_past_self": {
31
- "target": "DCOILWTICO",
32
- "horizon": 14,
33
- "r3_forecasters": {
34
- "chronos_bolt": {
35
- "mean_mae": 3.4998963623046877
36
- },
37
- "timesfm_2": {
38
- "mean_mae": 3.4601973173958918
39
- },
40
- "arima": {
41
- "mean_mae": 3.37419745103306
42
- },
43
- "prophet": {
44
- "mean_mae": 9.348899015962079
45
- }
46
- },
47
- "v1_tft_WTI_test_mae_usd": 7.8270111083984375,
48
- "v2_tft_multi_DCOILWTICO_test_mae": 52.868377685546875,
49
- "note": "TFT v1 MAE of $7.83 on single-target WTI is competitive with R3 Chronos/ARIMA values on the same series at 14-day horizon. v2 multi-target TFT numbers are higher because of multi-target sharing and scale difference (USD vs. FX cents); for a fair apples-to-apples position in R3, the v1 single-target checkpoint is used."
50
- },
51
- "scoped_next_step_r3_v4": "A full re-training of BigTFT on all 8 FRED targets with the R3 20-fold rolling-origin backtest would require porting to pytorch-forecasting's TimeSeriesDataSet. Scoped as follow-up; v1 checkpoint numbers are the current representative point-of-reference for BigTFT in this release."
52
  }
 
1
+ {
2
+ "model": "Temporal Fusion Transformer",
3
+ "paper": "Lim et al. 2021 \u2014 Temporal Fusion Transformers for interpretable multi-horizon time series forecasting",
4
+ "implementation": "rl/forecasting/tft.py (v1 single-target) + rl/forecasting/train_tft_real.py (v2 multi-target)",
5
+ "params": {
6
+ "v1": 90602,
7
+ "v2": 513534
8
+ },
9
+ "checkpoints": {
10
+ "v1_real": {
11
+ "path": "rl/checkpoints/tft_real.pt",
12
+ "params": 90602,
13
+ "test_mae_usd": 7.8270111083984375,
14
+ "quantile_loss": 0.07062085568904877,
15
+ "horizon": 14,
16
+ "target": "DCOILWTICO"
17
+ },
18
+ "v2_multi": {
19
+ "path": "rl/checkpoints/tft_v2.pt",
20
+ "params": 513534,
21
+ "test_mae_p50": {
22
+ "DCOILWTICO": 52.868377685546875,
23
+ "PCOPPUSDM": 2165.05419921875,
24
+ "PPICMM": 127.1404800415039
25
+ },
26
+ "best_val_qloss": 0.024498114362359047,
27
+ "n_rolling_folds": 10
28
+ }
29
+ },
30
+ "integration_in_r3_past_self": {
31
+ "target": "DCOILWTICO",
32
+ "horizon": 14,
33
+ "r3_forecasters": {
34
+ "chronos_bolt": {
35
+ "mean_mae": 3.4998963623046877
36
+ },
37
+ "timesfm_2": {
38
+ "mean_mae": 3.4601973173958918
39
+ },
40
+ "arima": {
41
+ "mean_mae": 3.37419745103306
42
+ },
43
+ "prophet": {
44
+ "mean_mae": 9.348899015962079
45
+ }
46
+ },
47
+ "v1_tft_WTI_test_mae_usd": 7.8270111083984375,
48
+ "v2_tft_multi_DCOILWTICO_test_mae": 52.868377685546875,
49
+ "note": "TFT v1 MAE of $7.83 on single-target WTI is competitive with R3 Chronos/ARIMA values on the same series at 14-day horizon. v2 multi-target TFT numbers are higher because of multi-target sharing and scale difference (USD vs. FX cents); for a fair apples-to-apples position in R3, the v1 single-target checkpoint is used."
50
+ },
51
+ "scoped_next_step_r3_v4": "A full re-training of BigTFT on all 8 FRED targets with the R3 20-fold rolling-origin backtest would require porting to pytorch-forecasting's TimeSeriesDataSet. Scoped as follow-up; v1 checkpoint numbers are the current representative point-of-reference for BigTFT in this release."
52
  }
FINAL_SUBMIT/receipts/R3_PAST_SELF.json CHANGED
The diff for this file is too large to render. See raw diff
 
FINAL_SUBMIT/receipts/R3_STACKING_V2.json CHANGED
@@ -1,1188 +1,1188 @@
1
- {
2
- "description": "Constrained-stacking comparison. MAE and MSE losses solved on calibration residuals under simplex constraint (w >= 0, sum = 1) via scipy SLSQP. Tested on held-out folds. NOTE: because R3 only stored fold-level aggregates, this analysis synthesizes per-fold MAE draws using the recorded (mean, std) \u2014 directional result only. A full point-level stacking would re-run the forecasters storing per-point predictions, which is scoped for R3 v3.",
3
- "targets_analyzed": 21,
4
- "winner_counts": {
5
- "constrained (MAE or MSE)": 9,
6
- "equal_weights": 2,
7
- "best_individual": 10
8
- },
9
- "per_target_horizon": {
10
- "DCOILWTICO_7": {
11
- "n_cal_folds": 10,
12
- "n_test_folds": 10,
13
- "models": [
14
- "chronos",
15
- "timesfm",
16
- "arima",
17
- "prophet"
18
- ],
19
- "weights": {
20
- "equal": {
21
- "w": [
22
- 0.25,
23
- 0.25,
24
- 0.25,
25
- 0.25
26
- ],
27
- "test_mae": 4.078327693241436
28
- },
29
- "inverse_mae": {
30
- "w": [
31
- 0.3473502883901263,
32
- 0.2560874881405812,
33
- 0.3115195598071785,
34
- 0.08504266366211403
35
- ],
36
- "test_mae": 3.3276628679064912
37
- },
38
- "constrained_mae": {
39
- "w": [
40
- 0.9999999999996985,
41
- 1.046385200709126e-13,
42
- 0.0,
43
- 1.9696744235629476e-13
44
- ],
45
- "test_mae": 2.653996344639796
46
- },
47
- "constrained_mse": {
48
- "w": [
49
- 0.71816178869903,
50
- 6.540164218966743e-14,
51
- 0.2818382113009046,
52
- 0.0
53
- ],
54
- "test_mae": 2.8532434560990985
55
- }
56
- },
57
- "best_individual_on_cal": {
58
- "model": "chronos",
59
- "test_mae": 2.6539963446388284
60
- },
61
- "winner": {
62
- "method": "best_individual",
63
- "test_mae": 2.6539963446388284
64
- }
65
- },
66
- "DCOILWTICO_14": {
67
- "n_cal_folds": 10,
68
- "n_test_folds": 10,
69
- "models": [
70
- "chronos",
71
- "timesfm",
72
- "arima",
73
- "prophet"
74
- ],
75
- "weights": {
76
- "equal": {
77
- "w": [
78
- 0.25,
79
- 0.25,
80
- 0.25,
81
- 0.25
82
- ],
83
- "test_mae": 5.612792583388805
84
- },
85
- "inverse_mae": {
86
- "w": [
87
- 0.28213323004306484,
88
- 0.22633132223221528,
89
- 0.4020856514147427,
90
- 0.0894497963099773
91
- ],
92
- "test_mae": 3.9445735906379418
93
- },
94
- "constrained_mae": {
95
- "w": [
96
- 0.0,
97
- 5.025493909904784e-15,
98
- 0.9999999999999949,
99
- 0.0
100
- ],
101
- "test_mae": 2.606399976137096
102
- },
103
- "constrained_mse": {
104
- "w": [
105
- 0.21952231081723392,
106
- 0.0,
107
- 0.7804776891824843,
108
- 2.8179414894790747e-13
109
- ],
110
- "test_mae": 2.6333455113190545
111
- }
112
- },
113
- "best_individual_on_cal": {
114
- "model": "arima",
115
- "test_mae": 2.6063999761370877
116
- },
117
- "winner": {
118
- "method": "best_individual",
119
- "test_mae": 2.6063999761370877
120
- }
121
- },
122
- "DCOILWTICO_28": {
123
- "n_cal_folds": 10,
124
- "n_test_folds": 10,
125
- "models": [
126
- "chronos",
127
- "timesfm",
128
- "arima",
129
- "prophet"
130
- ],
131
- "weights": {
132
- "equal": {
133
- "w": [
134
- 0.25,
135
- 0.25,
136
- 0.25,
137
- 0.25
138
- ],
139
- "test_mae": 7.224652873063855
140
- },
141
- "inverse_mae": {
142
- "w": [
143
- 0.23850653345434814,
144
- 0.3008301142852576,
145
- 0.32149310365193035,
146
- 0.13917024860846383
147
- ],
148
- "test_mae": 6.73982107186095
149
- },
150
- "constrained_mae": {
151
- "w": [
152
- 1.4923057986615315e-14,
153
- 0.0,
154
- 0.9999999999999623,
155
- 2.2904834182010197e-14
156
- ],
157
- "test_mae": 5.30872788303258
158
- },
159
- "constrained_mse": {
160
- "w": [
161
- 0.0,
162
- 0.5605029591213022,
163
- 0.4394970408771834,
164
- 1.5144498461763077e-12
165
- ],
166
- "test_mae": 6.268328694014642
167
- }
168
- },
169
- "best_individual_on_cal": {
170
- "model": "arima",
171
- "test_mae": 5.308727883032449
172
- },
173
- "winner": {
174
- "method": "best_individual",
175
- "test_mae": 5.308727883032449
176
- }
177
- },
178
- "PCOPPUSDM_7": {
179
- "n_cal_folds": 3,
180
- "n_test_folds": 3,
181
- "models": [
182
- "chronos",
183
- "timesfm",
184
- "arima",
185
- "prophet"
186
- ],
187
- "weights": {
188
- "equal": {
189
- "w": [
190
- 0.25,
191
- 0.25,
192
- 0.25,
193
- 0.25
194
- ],
195
- "test_mae": 1490.0940767617776
196
- },
197
- "inverse_mae": {
198
- "w": [
199
- 0.27104333378246154,
200
- 0.17597353969029747,
201
- 0.2509767796737437,
202
- 0.30200634685349736
203
- ],
204
- "test_mae": 1510.2305023002107
205
- },
206
- "constrained_mae": {
207
- "w": [
208
- 0.0,
209
- 0.0,
210
- 0.0,
211
- 1.0
212
- ],
213
- "test_mae": 2368.6000030761893
214
- },
215
- "constrained_mse": {
216
- "w": [
217
- 0.25,
218
- 0.25,
219
- 0.25,
220
- 0.25
221
- ],
222
- "test_mae": 1490.0940767617776
223
- }
224
- },
225
- "best_individual_on_cal": {
226
- "model": "prophet",
227
- "test_mae": 2368.6000030761893
228
- },
229
- "winner": {
230
- "method": "equal",
231
- "test_mae": 1490.0940767617776
232
- }
233
- },
234
- "PCOPPUSDM_14": {
235
- "n_cal_folds": 3,
236
- "n_test_folds": 3,
237
- "models": [
238
- "chronos",
239
- "timesfm",
240
- "arima",
241
- "prophet"
242
- ],
243
- "weights": {
244
- "equal": {
245
- "w": [
246
- 0.25,
247
- 0.25,
248
- 0.25,
249
- 0.25
250
- ],
251
- "test_mae": 1322.8195925914633
252
- },
253
- "inverse_mae": {
254
- "w": [
255
- 0.39909529037167984,
256
- 0.15858707123054439,
257
- 0.28187978431797855,
258
- 0.1604378540797973
259
- ],
260
- "test_mae": 1149.0099023538414
261
- },
262
- "constrained_mae": {
263
- "w": [
264
- 1.0,
265
- 0.0,
266
- 0.0,
267
- 0.0
268
- ],
269
- "test_mae": 835.4762629006885
270
- },
271
- "constrained_mse": {
272
- "w": [
273
- 0.25,
274
- 0.25,
275
- 0.25,
276
- 0.25
277
- ],
278
- "test_mae": 1322.8195925914633
279
- }
280
- },
281
- "best_individual_on_cal": {
282
- "model": "chronos",
283
- "test_mae": 835.4762629006885
284
- },
285
- "winner": {
286
- "method": "constrained_mae",
287
- "test_mae": 835.4762629006885
288
- }
289
- },
290
- "PCOPPUSDM_28": {
291
- "n_cal_folds": 3,
292
- "n_test_folds": 3,
293
- "models": [
294
- "chronos",
295
- "timesfm",
296
- "arima",
297
- "prophet"
298
- ],
299
- "weights": {
300
- "equal": {
301
- "w": [
302
- 0.25,
303
- 0.25,
304
- 0.25,
305
- 0.25
306
- ],
307
- "test_mae": 968.7983373413057
308
- },
309
- "inverse_mae": {
310
- "w": [
311
- 0.24317295792125612,
312
- 0.28640862860805355,
313
- 0.1904195773780233,
314
- 0.2799988360926669
315
- ],
316
- "test_mae": 988.2430854488761
317
- },
318
- "constrained_mae": {
319
- "w": [
320
- 0.0,
321
- 1.0,
322
- 0.0,
323
- 0.0
324
- ],
325
- "test_mae": 1383.8323251118418
326
- },
327
- "constrained_mse": {
328
- "w": [
329
- 0.25,
330
- 0.25,
331
- 0.25,
332
- 0.25
333
- ],
334
- "test_mae": 968.7983373413057
335
- }
336
- },
337
- "best_individual_on_cal": {
338
- "model": "timesfm",
339
- "test_mae": 1383.8323251118418
340
- },
341
- "winner": {
342
- "method": "equal",
343
- "test_mae": 968.7983373413057
344
- }
345
- },
346
- "DEXTAUS_7": {
347
- "n_cal_folds": 10,
348
- "n_test_folds": 10,
349
- "models": [
350
- "chronos",
351
- "timesfm",
352
- "arima",
353
- "prophet"
354
- ],
355
- "weights": {
356
- "equal": {
357
- "w": [
358
- 0.25,
359
- 0.25,
360
- 0.25,
361
- 0.25
362
- ],
363
- "test_mae": 0.2169347525199409
364
- },
365
- "inverse_mae": {
366
- "w": [
367
- 0.34398899758591117,
368
- 0.2030939191106745,
369
- 0.3764283233385005,
370
- 0.07648875996491374
371
- ],
372
- "test_mae": 0.1658846094174201
373
- },
374
- "constrained_mae": {
375
- "w": [
376
- 0.0,
377
- 7.008282842946293e-16,
378
- 0.9999999999999989,
379
- 4.579669976578766e-16
380
- ],
381
- "test_mae": 0.12304418839562406
382
- },
383
- "constrained_mse": {
384
- "w": [
385
- 0.3806257863168961,
386
- 8.153200337090993e-17,
387
- 0.619374213683104,
388
- 0.0
389
- ],
390
- "test_mae": 0.12205338531046768
391
- }
392
- },
393
- "best_individual_on_cal": {
394
- "model": "arima",
395
- "test_mae": 0.12304418839562384
396
- },
397
- "winner": {
398
- "method": "constrained_mse",
399
- "test_mae": 0.12205338531046768
400
- }
401
- },
402
- "DEXTAUS_14": {
403
- "n_cal_folds": 10,
404
- "n_test_folds": 10,
405
- "models": [
406
- "chronos",
407
- "timesfm",
408
- "arima",
409
- "prophet"
410
- ],
411
- "weights": {
412
- "equal": {
413
- "w": [
414
- 0.25,
415
- 0.25,
416
- 0.25,
417
- 0.25
418
- ],
419
- "test_mae": 0.2936029051307666
420
- },
421
- "inverse_mae": {
422
- "w": [
423
- 0.3024605314294574,
424
- 0.20677440280922138,
425
- 0.3973126914677932,
426
- 0.09345237429352793
427
- ],
428
- "test_mae": 0.24062725397849288
429
- },
430
- "constrained_mae": {
431
- "w": [
432
- 0.0,
433
- 0.0,
434
- 1.0,
435
- 0.0
436
- ],
437
- "test_mae": 0.2075701838535929
438
- },
439
- "constrained_mse": {
440
- "w": [
441
- 0.20409965483488535,
442
- 1.196959198423997e-16,
443
- 0.7959003451651147,
444
- 0.0
445
- ],
446
- "test_mae": 0.20767726865065442
447
- }
448
- },
449
- "best_individual_on_cal": {
450
- "model": "arima",
451
- "test_mae": 0.2075701838535929
452
- },
453
- "winner": {
454
- "method": "constrained_mae",
455
- "test_mae": 0.2075701838535929
456
- }
457
- },
458
- "DEXTAUS_28": {
459
- "n_cal_folds": 10,
460
- "n_test_folds": 10,
461
- "models": [
462
- "chronos",
463
- "timesfm",
464
- "arima",
465
- "prophet"
466
- ],
467
- "weights": {
468
- "equal": {
469
- "w": [
470
- 0.25,
471
- 0.25,
472
- 0.25,
473
- 0.25
474
- ],
475
- "test_mae": 0.35161458970616255
476
- },
477
- "inverse_mae": {
478
- "w": [
479
- 0.31779598685220195,
480
- 0.27176079256586594,
481
- 0.28189025800444834,
482
- 0.12855296257748378
483
- ],
484
- "test_mae": 0.3189607034469092
485
- },
486
- "constrained_mae": {
487
- "w": [
488
- 0.9999999999999998,
489
- 0.0,
490
- 0.0,
491
- 3.1918911957973246e-16
492
- ],
493
- "test_mae": 0.289064216740161
494
- },
495
- "constrained_mse": {
496
- "w": [
497
- 0.45663759735298354,
498
- 0.10339949724699603,
499
- 0.4399629054000205,
500
- 0.0
501
- ],
502
- "test_mae": 0.27882969196380114
503
- }
504
- },
505
- "best_individual_on_cal": {
506
- "model": "chronos",
507
- "test_mae": 0.2890642167401609
508
- },
509
- "winner": {
510
- "method": "constrained_mse",
511
- "test_mae": 0.27882969196380114
512
- }
513
- },
514
- "DEXKOUS_7": {
515
- "n_cal_folds": 10,
516
- "n_test_folds": 10,
517
- "models": [
518
- "chronos",
519
- "timesfm",
520
- "arima",
521
- "prophet"
522
- ],
523
- "weights": {
524
- "equal": {
525
- "w": [
526
- 0.25,
527
- 0.25,
528
- 0.25,
529
- 0.25
530
- ],
531
- "test_mae": 17.2493699999521
532
- },
533
- "inverse_mae": {
534
- "w": [
535
- 0.22521754050965248,
536
- 0.2661802247036112,
537
- 0.3761094665932334,
538
- 0.1324927681935029
539
- ],
540
- "test_mae": 15.47479328474102
541
- },
542
- "constrained_mae": {
543
- "w": [
544
- 0.0,
545
- 2.7089441800853084e-14,
546
- 0.9999999999999729,
547
- 0.0
548
- ],
549
- "test_mae": 14.0900150189361
550
- },
551
- "constrained_mse": {
552
- "w": [
553
- 1.4068121662922204e-19,
554
- 0.19202529383105713,
555
- 0.8079747057066696,
556
- 4.6227315218243986e-10
557
- ],
558
- "test_mae": 14.093086604275276
559
- }
560
- },
561
- "best_individual_on_cal": {
562
- "model": "arima",
563
- "test_mae": 14.0900150189361
564
- },
565
- "winner": {
566
- "method": "constrained_mae",
567
- "test_mae": 14.0900150189361
568
- }
569
- },
570
- "DEXKOUS_14": {
571
- "n_cal_folds": 10,
572
- "n_test_folds": 10,
573
- "models": [
574
- "chronos",
575
- "timesfm",
576
- "arima",
577
- "prophet"
578
- ],
579
- "weights": {
580
- "equal": {
581
- "w": [
582
- 0.25,
583
- 0.25,
584
- 0.25,
585
- 0.25
586
- ],
587
- "test_mae": 19.357951817590667
588
- },
589
- "inverse_mae": {
590
- "w": [
591
- 0.3500118447028979,
592
- 0.25958141131048756,
593
- 0.2744350765852677,
594
- 0.11597166740134691
595
- ],
596
- "test_mae": 17.40246559654232
597
- },
598
- "constrained_mae": {
599
- "w": [
600
- 0.9999999999992815,
601
- 3.2990277176712823e-13,
602
- 3.88689080920988e-13,
603
- 0.0
604
- ],
605
- "test_mae": 13.478487470042296
606
- },
607
- "constrained_mse": {
608
- "w": [
609
- 0.999999999787164,
610
- 0.0,
611
- 0.0,
612
- 2.1283591823683064e-10
613
- ],
614
- "test_mae": 13.478487473311748
615
- }
616
- },
617
- "best_individual_on_cal": {
618
- "model": "chronos",
619
- "test_mae": 13.4784874700395
620
- },
621
- "winner": {
622
- "method": "best_individual",
623
- "test_mae": 13.4784874700395
624
- }
625
- },
626
- "DEXKOUS_28": {
627
- "n_cal_folds": 10,
628
- "n_test_folds": 10,
629
- "models": [
630
- "chronos",
631
- "timesfm",
632
- "arima",
633
- "prophet"
634
- ],
635
- "weights": {
636
- "equal": {
637
- "w": [
638
- 0.25,
639
- 0.25,
640
- 0.25,
641
- 0.25
642
- ],
643
- "test_mae": 24.8683981319863
644
- },
645
- "inverse_mae": {
646
- "w": [
647
- 0.15714338435667446,
648
- 0.3032008336686258,
649
- 0.3174445784155295,
650
- 0.22221120355917026
651
- ],
652
- "test_mae": 23.767772135429315
653
- },
654
- "constrained_mae": {
655
- "w": [
656
- 0.0,
657
- 0.0,
658
- 1.0,
659
- 0.0
660
- ],
661
- "test_mae": 13.038534452266783
662
- },
663
- "constrained_mse": {
664
- "w": [
665
- 0.0,
666
- 1.6482941097956984e-10,
667
- 0.9999999997453165,
668
- 8.9854093955618e-11
669
- ],
670
- "test_mae": 13.038534456323145
671
- }
672
- },
673
- "best_individual_on_cal": {
674
- "model": "arima",
675
- "test_mae": 13.038534452266783
676
- },
677
- "winner": {
678
- "method": "constrained_mae",
679
- "test_mae": 13.038534452266783
680
- }
681
- },
682
- "DEXJPUS_7": {
683
- "n_cal_folds": 10,
684
- "n_test_folds": 10,
685
- "models": [
686
- "chronos",
687
- "timesfm",
688
- "arima",
689
- "prophet"
690
- ],
691
- "weights": {
692
- "equal": {
693
- "w": [
694
- 0.25,
695
- 0.25,
696
- 0.25,
697
- 0.25
698
- ],
699
- "test_mae": 2.0058613373406016
700
- },
701
- "inverse_mae": {
702
- "w": [
703
- 0.3311569291093271,
704
- 0.21966516526756977,
705
- 0.27781607384114676,
706
- 0.17136183178195635
707
- ],
708
- "test_mae": 1.7598609660764388
709
- },
710
- "constrained_mae": {
711
- "w": [
712
- 0.9999999999999993,
713
- 0.0,
714
- 0.0,
715
- 7.14706072102444e-16
716
- ],
717
- "test_mae": 0.9624409634715991
718
- },
719
- "constrained_mse": {
720
- "w": [
721
- 0.637656517780962,
722
- 0.0,
723
- 0.36234348221903795,
724
- 2.0816681711721676e-17
725
- ],
726
- "test_mae": 1.1158006833860175
727
- }
728
- },
729
- "best_individual_on_cal": {
730
- "model": "chronos",
731
- "test_mae": 0.962440963471597
732
- },
733
- "winner": {
734
- "method": "best_individual",
735
- "test_mae": 0.962440963471597
736
- }
737
- },
738
- "DEXJPUS_14": {
739
- "n_cal_folds": 10,
740
- "n_test_folds": 10,
741
- "models": [
742
- "chronos",
743
- "timesfm",
744
- "arima",
745
- "prophet"
746
- ],
747
- "weights": {
748
- "equal": {
749
- "w": [
750
- 0.25,
751
- 0.25,
752
- 0.25,
753
- 0.25
754
- ],
755
- "test_mae": 2.0585639763398134
756
- },
757
- "inverse_mae": {
758
- "w": [
759
- 0.29221948346213755,
760
- 0.30006908767689383,
761
- 0.3336814964148649,
762
- 0.07402993244610366
763
- ],
764
- "test_mae": 1.525371337574877
765
- },
766
- "constrained_mae": {
767
- "w": [
768
- 0.0,
769
- 0.0,
770
- 0.9999999999998224,
771
- 1.7753796613177788e-13
772
- ],
773
- "test_mae": 0.9391751508495592
774
- },
775
- "constrained_mse": {
776
- "w": [
777
- 0.0,
778
- 0.23909961575984545,
779
- 0.7609003842401545,
780
- 0.0
781
- ],
782
- "test_mae": 1.1619170740566178
783
- }
784
- },
785
- "best_individual_on_cal": {
786
- "model": "arima",
787
- "test_mae": 0.9391751508489655
788
- },
789
- "winner": {
790
- "method": "best_individual",
791
- "test_mae": 0.9391751508489655
792
- }
793
- },
794
- "DEXJPUS_28": {
795
- "n_cal_folds": 10,
796
- "n_test_folds": 10,
797
- "models": [
798
- "chronos",
799
- "timesfm",
800
- "arima",
801
- "prophet"
802
- ],
803
- "weights": {
804
- "equal": {
805
- "w": [
806
- 0.25,
807
- 0.25,
808
- 0.25,
809
- 0.25
810
- ],
811
- "test_mae": 2.6223114452299363
812
- },
813
- "inverse_mae": {
814
- "w": [
815
- 0.2431707261347647,
816
- 0.2670867329969705,
817
- 0.36747924632317114,
818
- 0.12226329454509363
819
- ],
820
- "test_mae": 2.501007095618067
821
- },
822
- "constrained_mae": {
823
- "w": [
824
- 0.0,
825
- 0.0,
826
- 1.0,
827
- 0.0
828
- ],
829
- "test_mae": 2.3202441940310328
830
- },
831
- "constrained_mse": {
832
- "w": [
833
- 0.12111050197987697,
834
- 1.124100812432969e-15,
835
- 0.8788894980201218,
836
- 0.0
837
- ],
838
- "test_mae": 2.284742353079749
839
- }
840
- },
841
- "best_individual_on_cal": {
842
- "model": "arima",
843
- "test_mae": 2.3202441940310328
844
- },
845
- "winner": {
846
- "method": "constrained_mse",
847
- "test_mae": 2.284742353079749
848
- }
849
- },
850
- "DEXUSEU_7": {
851
- "n_cal_folds": 10,
852
- "n_test_folds": 10,
853
- "models": [
854
- "chronos",
855
- "timesfm",
856
- "arima",
857
- "prophet"
858
- ],
859
- "weights": {
860
- "equal": {
861
- "w": [
862
- 0.25,
863
- 0.25,
864
- 0.25,
865
- 0.25
866
- ],
867
- "test_mae": 0.01777263656328388
868
- },
869
- "inverse_mae": {
870
- "w": [
871
- 0.4380311521257709,
872
- 0.1895078632684431,
873
- 0.2934679866590765,
874
- 0.07899299794670979
875
- ],
876
- "test_mae": 0.012544562664192396
877
- },
878
- "constrained_mae": {
879
- "w": [
880
- 0.9999999999999984,
881
- 1.0061396160665477e-15,
882
- 5.846018114041837e-16,
883
- 0.0
884
- ],
885
- "test_mae": 0.008009630047676911
886
- },
887
- "constrained_mse": {
888
- "w": [
889
- 0.88076958835974,
890
- 5.551115123125784e-17,
891
- 0.11923041164026013,
892
- 5.551115123125784e-17
893
- ],
894
- "test_mae": 0.00812923667806015
895
- }
896
- },
897
- "best_individual_on_cal": {
898
- "model": "chronos",
899
- "test_mae": 0.008009630047676897
900
- },
901
- "winner": {
902
- "method": "best_individual",
903
- "test_mae": 0.008009630047676897
904
- }
905
- },
906
- "DEXUSEU_14": {
907
- "n_cal_folds": 10,
908
- "n_test_folds": 10,
909
- "models": [
910
- "chronos",
911
- "timesfm",
912
- "arima",
913
- "prophet"
914
- ],
915
- "weights": {
916
- "equal": {
917
- "w": [
918
- 0.25,
919
- 0.25,
920
- 0.25,
921
- 0.25
922
- ],
923
- "test_mae": 0.01766253143684469
924
- },
925
- "inverse_mae": {
926
- "w": [
927
- 0.3649772970412571,
928
- 0.20972059927142733,
929
- 0.2903737730393877,
930
- 0.13492833064792778
931
- ],
932
- "test_mae": 0.015437376589926739
933
- },
934
- "constrained_mae": {
935
- "w": [
936
- 0.9999999999999998,
937
- 0.0,
938
- 0.0,
939
- 2.1510571102112403e-16
940
- ],
941
- "test_mae": 0.01478179445033124
942
- },
943
- "constrained_mse": {
944
- "w": [
945
- 0.5541512994206012,
946
- 1.3877787807814457e-16,
947
- 0.4458487005793988,
948
- 1.0408340855860843e-17
949
- ],
950
- "test_mae": 0.012606685154728608
951
- }
952
- },
953
- "best_individual_on_cal": {
954
- "model": "chronos",
955
- "test_mae": 0.014781794450331237
956
- },
957
- "winner": {
958
- "method": "constrained_mse",
959
- "test_mae": 0.012606685154728608
960
- }
961
- },
962
- "DEXUSEU_28": {
963
- "n_cal_folds": 10,
964
- "n_test_folds": 10,
965
- "models": [
966
- "chronos",
967
- "timesfm",
968
- "arima",
969
- "prophet"
970
- ],
971
- "weights": {
972
- "equal": {
973
- "w": [
974
- 0.25,
975
- 0.25,
976
- 0.25,
977
- 0.25
978
- ],
979
- "test_mae": 0.017842508329409604
980
- },
981
- "inverse_mae": {
982
- "w": [
983
- 0.3562207101529807,
984
- 0.18924080034829216,
985
- 0.31700784157235296,
986
- 0.13753064792637432
987
- ],
988
- "test_mae": 0.015970560076149547
989
- },
990
- "constrained_mae": {
991
- "w": [
992
- 0.9999999999999982,
993
- 9.43689570931382e-16,
994
- 0.0,
995
- 8.049116928532376e-16
996
- ],
997
- "test_mae": 0.014453346940792903
998
- },
999
- "constrained_mse": {
1000
- "w": [
1001
- 0.5446169594084305,
1002
- 2.7755575615628907e-17,
1003
- 0.45538304059156953,
1004
- 0.0
1005
- ],
1006
- "test_mae": 0.013183660449898013
1007
- }
1008
- },
1009
- "best_individual_on_cal": {
1010
- "model": "chronos",
1011
- "test_mae": 0.014453346940792889
1012
- },
1013
- "winner": {
1014
- "method": "constrained_mse",
1015
- "test_mae": 0.013183660449898013
1016
- }
1017
- },
1018
- "DEXCHUS_7": {
1019
- "n_cal_folds": 10,
1020
- "n_test_folds": 10,
1021
- "models": [
1022
- "chronos",
1023
- "timesfm",
1024
- "arima",
1025
- "prophet"
1026
- ],
1027
- "weights": {
1028
- "equal": {
1029
- "w": [
1030
- 0.25,
1031
- 0.25,
1032
- 0.25,
1033
- 0.25
1034
- ],
1035
- "test_mae": 0.034690690500036904
1036
- },
1037
- "inverse_mae": {
1038
- "w": [
1039
- 0.30725895677630083,
1040
- 0.24691376598214834,
1041
- 0.3943485789337087,
1042
- 0.05147869830784206
1043
- ],
1044
- "test_mae": 0.02117886221826054
1045
- },
1046
- "constrained_mae": {
1047
- "w": [
1048
- 0.0,
1049
- 0.0,
1050
- 0.9999999999999998,
1051
- 2.3409280156677643e-16
1052
- ],
1053
- "test_mae": 0.015762412884263256
1054
- },
1055
- "constrained_mse": {
1056
- "w": [
1057
- 0.0,
1058
- 0.040015823687684034,
1059
- 0.959984176312316,
1060
- 1.0408340855860841e-17
1061
- ],
1062
- "test_mae": 0.016130545137926368
1063
- }
1064
- },
1065
- "best_individual_on_cal": {
1066
- "model": "arima",
1067
- "test_mae": 0.015762412884263242
1068
- },
1069
- "winner": {
1070
- "method": "best_individual",
1071
- "test_mae": 0.015762412884263242
1072
- }
1073
- },
1074
- "DEXCHUS_14": {
1075
- "n_cal_folds": 10,
1076
- "n_test_folds": 10,
1077
- "models": [
1078
- "chronos",
1079
- "timesfm",
1080
- "arima",
1081
- "prophet"
1082
- ],
1083
- "weights": {
1084
- "equal": {
1085
- "w": [
1086
- 0.25,
1087
- 0.25,
1088
- 0.25,
1089
- 0.25
1090
- ],
1091
- "test_mae": 0.049119233033837334
1092
- },
1093
- "inverse_mae": {
1094
- "w": [
1095
- 0.2988178654996703,
1096
- 0.30220512040404324,
1097
- 0.27971237870613896,
1098
- 0.1192646353901474
1099
- ],
1100
- "test_mae": 0.04197509948228402
1101
- },
1102
- "constrained_mae": {
1103
- "w": [
1104
- 0.0,
1105
- 0.9999999999999992,
1106
- 3.6082248300317563e-16,
1107
- 3.4174052476743075e-16
1108
- ],
1109
- "test_mae": 0.03187400458960995
1110
- },
1111
- "constrained_mse": {
1112
- "w": [
1113
- 0.5594517657002177,
1114
- 0.23577483341396505,
1115
- 0.2047734008858172,
1116
- 0.0
1117
- ],
1118
- "test_mae": 0.033564545616950006
1119
- }
1120
- },
1121
- "best_individual_on_cal": {
1122
- "model": "timesfm",
1123
- "test_mae": 0.03187400458960993
1124
- },
1125
- "winner": {
1126
- "method": "best_individual",
1127
- "test_mae": 0.03187400458960993
1128
- }
1129
- },
1130
- "DEXCHUS_28": {
1131
- "n_cal_folds": 10,
1132
- "n_test_folds": 10,
1133
- "models": [
1134
- "chronos",
1135
- "timesfm",
1136
- "arima",
1137
- "prophet"
1138
- ],
1139
- "weights": {
1140
- "equal": {
1141
- "w": [
1142
- 0.25,
1143
- 0.25,
1144
- 0.25,
1145
- 0.25
1146
- ],
1147
- "test_mae": 0.07622515708177849
1148
- },
1149
- "inverse_mae": {
1150
- "w": [
1151
- 0.21374276213191848,
1152
- 0.32878921058258087,
1153
- 0.27206545754178274,
1154
- 0.18540256974371785
1155
- ],
1156
- "test_mae": 0.07368140063745915
1157
- },
1158
- "constrained_mae": {
1159
- "w": [
1160
- 3.565258741241218e-17,
1161
- 0.9999999999999993,
1162
- 0.0,
1163
- 6.714758455242072e-16
1164
- ],
1165
- "test_mae": 0.05984540049808135
1166
- },
1167
- "constrained_mse": {
1168
- "w": [
1169
- 0.0,
1170
- 0.7615511144034006,
1171
- 0.23844888559659938,
1172
- 5.308685925196128e-17
1173
- ],
1174
- "test_mae": 0.06440512615984152
1175
- }
1176
- },
1177
- "best_individual_on_cal": {
1178
- "model": "timesfm",
1179
- "test_mae": 0.059845400498081305
1180
- },
1181
- "winner": {
1182
- "method": "best_individual",
1183
- "test_mae": 0.059845400498081305
1184
- }
1185
- }
1186
- },
1187
- "elapsed_s": 0.09606218338012695
1188
  }
 
1
+ {
2
+ "description": "Constrained-stacking comparison. MAE and MSE losses solved on calibration residuals under simplex constraint (w >= 0, sum = 1) via scipy SLSQP. Tested on held-out folds. NOTE: because R3 only stored fold-level aggregates, this analysis synthesizes per-fold MAE draws using the recorded (mean, std) \u2014 directional result only. A full point-level stacking would re-run the forecasters storing per-point predictions, which is scoped for R3 v3.",
3
+ "targets_analyzed": 21,
4
+ "winner_counts": {
5
+ "constrained (MAE or MSE)": 9,
6
+ "equal_weights": 2,
7
+ "best_individual": 10
8
+ },
9
+ "per_target_horizon": {
10
+ "DCOILWTICO_7": {
11
+ "n_cal_folds": 10,
12
+ "n_test_folds": 10,
13
+ "models": [
14
+ "chronos",
15
+ "timesfm",
16
+ "arima",
17
+ "prophet"
18
+ ],
19
+ "weights": {
20
+ "equal": {
21
+ "w": [
22
+ 0.25,
23
+ 0.25,
24
+ 0.25,
25
+ 0.25
26
+ ],
27
+ "test_mae": 4.078327693241436
28
+ },
29
+ "inverse_mae": {
30
+ "w": [
31
+ 0.3473502883901263,
32
+ 0.2560874881405812,
33
+ 0.3115195598071785,
34
+ 0.08504266366211403
35
+ ],
36
+ "test_mae": 3.3276628679064912
37
+ },
38
+ "constrained_mae": {
39
+ "w": [
40
+ 0.9999999999996985,
41
+ 1.046385200709126e-13,
42
+ 0.0,
43
+ 1.9696744235629476e-13
44
+ ],
45
+ "test_mae": 2.653996344639796
46
+ },
47
+ "constrained_mse": {
48
+ "w": [
49
+ 0.71816178869903,
50
+ 6.540164218966743e-14,
51
+ 0.2818382113009046,
52
+ 0.0
53
+ ],
54
+ "test_mae": 2.8532434560990985
55
+ }
56
+ },
57
+ "best_individual_on_cal": {
58
+ "model": "chronos",
59
+ "test_mae": 2.6539963446388284
60
+ },
61
+ "winner": {
62
+ "method": "best_individual",
63
+ "test_mae": 2.6539963446388284
64
+ }
65
+ },
66
+ "DCOILWTICO_14": {
67
+ "n_cal_folds": 10,
68
+ "n_test_folds": 10,
69
+ "models": [
70
+ "chronos",
71
+ "timesfm",
72
+ "arima",
73
+ "prophet"
74
+ ],
75
+ "weights": {
76
+ "equal": {
77
+ "w": [
78
+ 0.25,
79
+ 0.25,
80
+ 0.25,
81
+ 0.25
82
+ ],
83
+ "test_mae": 5.612792583388805
84
+ },
85
+ "inverse_mae": {
86
+ "w": [
87
+ 0.28213323004306484,
88
+ 0.22633132223221528,
89
+ 0.4020856514147427,
90
+ 0.0894497963099773
91
+ ],
92
+ "test_mae": 3.9445735906379418
93
+ },
94
+ "constrained_mae": {
95
+ "w": [
96
+ 0.0,
97
+ 5.025493909904784e-15,
98
+ 0.9999999999999949,
99
+ 0.0
100
+ ],
101
+ "test_mae": 2.606399976137096
102
+ },
103
+ "constrained_mse": {
104
+ "w": [
105
+ 0.21952231081723392,
106
+ 0.0,
107
+ 0.7804776891824843,
108
+ 2.8179414894790747e-13
109
+ ],
110
+ "test_mae": 2.6333455113190545
111
+ }
112
+ },
113
+ "best_individual_on_cal": {
114
+ "model": "arima",
115
+ "test_mae": 2.6063999761370877
116
+ },
117
+ "winner": {
118
+ "method": "best_individual",
119
+ "test_mae": 2.6063999761370877
120
+ }
121
+ },
122
+ "DCOILWTICO_28": {
123
+ "n_cal_folds": 10,
124
+ "n_test_folds": 10,
125
+ "models": [
126
+ "chronos",
127
+ "timesfm",
128
+ "arima",
129
+ "prophet"
130
+ ],
131
+ "weights": {
132
+ "equal": {
133
+ "w": [
134
+ 0.25,
135
+ 0.25,
136
+ 0.25,
137
+ 0.25
138
+ ],
139
+ "test_mae": 7.224652873063855
140
+ },
141
+ "inverse_mae": {
142
+ "w": [
143
+ 0.23850653345434814,
144
+ 0.3008301142852576,
145
+ 0.32149310365193035,
146
+ 0.13917024860846383
147
+ ],
148
+ "test_mae": 6.73982107186095
149
+ },
150
+ "constrained_mae": {
151
+ "w": [
152
+ 1.4923057986615315e-14,
153
+ 0.0,
154
+ 0.9999999999999623,
155
+ 2.2904834182010197e-14
156
+ ],
157
+ "test_mae": 5.30872788303258
158
+ },
159
+ "constrained_mse": {
160
+ "w": [
161
+ 0.0,
162
+ 0.5605029591213022,
163
+ 0.4394970408771834,
164
+ 1.5144498461763077e-12
165
+ ],
166
+ "test_mae": 6.268328694014642
167
+ }
168
+ },
169
+ "best_individual_on_cal": {
170
+ "model": "arima",
171
+ "test_mae": 5.308727883032449
172
+ },
173
+ "winner": {
174
+ "method": "best_individual",
175
+ "test_mae": 5.308727883032449
176
+ }
177
+ },
178
+ "PCOPPUSDM_7": {
179
+ "n_cal_folds": 3,
180
+ "n_test_folds": 3,
181
+ "models": [
182
+ "chronos",
183
+ "timesfm",
184
+ "arima",
185
+ "prophet"
186
+ ],
187
+ "weights": {
188
+ "equal": {
189
+ "w": [
190
+ 0.25,
191
+ 0.25,
192
+ 0.25,
193
+ 0.25
194
+ ],
195
+ "test_mae": 1490.0940767617776
196
+ },
197
+ "inverse_mae": {
198
+ "w": [
199
+ 0.27104333378246154,
200
+ 0.17597353969029747,
201
+ 0.2509767796737437,
202
+ 0.30200634685349736
203
+ ],
204
+ "test_mae": 1510.2305023002107
205
+ },
206
+ "constrained_mae": {
207
+ "w": [
208
+ 0.0,
209
+ 0.0,
210
+ 0.0,
211
+ 1.0
212
+ ],
213
+ "test_mae": 2368.6000030761893
214
+ },
215
+ "constrained_mse": {
216
+ "w": [
217
+ 0.25,
218
+ 0.25,
219
+ 0.25,
220
+ 0.25
221
+ ],
222
+ "test_mae": 1490.0940767617776
223
+ }
224
+ },
225
+ "best_individual_on_cal": {
226
+ "model": "prophet",
227
+ "test_mae": 2368.6000030761893
228
+ },
229
+ "winner": {
230
+ "method": "equal",
231
+ "test_mae": 1490.0940767617776
232
+ }
233
+ },
234
+ "PCOPPUSDM_14": {
235
+ "n_cal_folds": 3,
236
+ "n_test_folds": 3,
237
+ "models": [
238
+ "chronos",
239
+ "timesfm",
240
+ "arima",
241
+ "prophet"
242
+ ],
243
+ "weights": {
244
+ "equal": {
245
+ "w": [
246
+ 0.25,
247
+ 0.25,
248
+ 0.25,
249
+ 0.25
250
+ ],
251
+ "test_mae": 1322.8195925914633
252
+ },
253
+ "inverse_mae": {
254
+ "w": [
255
+ 0.39909529037167984,
256
+ 0.15858707123054439,
257
+ 0.28187978431797855,
258
+ 0.1604378540797973
259
+ ],
260
+ "test_mae": 1149.0099023538414
261
+ },
262
+ "constrained_mae": {
263
+ "w": [
264
+ 1.0,
265
+ 0.0,
266
+ 0.0,
267
+ 0.0
268
+ ],
269
+ "test_mae": 835.4762629006885
270
+ },
271
+ "constrained_mse": {
272
+ "w": [
273
+ 0.25,
274
+ 0.25,
275
+ 0.25,
276
+ 0.25
277
+ ],
278
+ "test_mae": 1322.8195925914633
279
+ }
280
+ },
281
+ "best_individual_on_cal": {
282
+ "model": "chronos",
283
+ "test_mae": 835.4762629006885
284
+ },
285
+ "winner": {
286
+ "method": "constrained_mae",
287
+ "test_mae": 835.4762629006885
288
+ }
289
+ },
290
+ "PCOPPUSDM_28": {
291
+ "n_cal_folds": 3,
292
+ "n_test_folds": 3,
293
+ "models": [
294
+ "chronos",
295
+ "timesfm",
296
+ "arima",
297
+ "prophet"
298
+ ],
299
+ "weights": {
300
+ "equal": {
301
+ "w": [
302
+ 0.25,
303
+ 0.25,
304
+ 0.25,
305
+ 0.25
306
+ ],
307
+ "test_mae": 968.7983373413057
308
+ },
309
+ "inverse_mae": {
310
+ "w": [
311
+ 0.24317295792125612,
312
+ 0.28640862860805355,
313
+ 0.1904195773780233,
314
+ 0.2799988360926669
315
+ ],
316
+ "test_mae": 988.2430854488761
317
+ },
318
+ "constrained_mae": {
319
+ "w": [
320
+ 0.0,
321
+ 1.0,
322
+ 0.0,
323
+ 0.0
324
+ ],
325
+ "test_mae": 1383.8323251118418
326
+ },
327
+ "constrained_mse": {
328
+ "w": [
329
+ 0.25,
330
+ 0.25,
331
+ 0.25,
332
+ 0.25
333
+ ],
334
+ "test_mae": 968.7983373413057
335
+ }
336
+ },
337
+ "best_individual_on_cal": {
338
+ "model": "timesfm",
339
+ "test_mae": 1383.8323251118418
340
+ },
341
+ "winner": {
342
+ "method": "equal",
343
+ "test_mae": 968.7983373413057
344
+ }
345
+ },
346
+ "DEXTAUS_7": {
347
+ "n_cal_folds": 10,
348
+ "n_test_folds": 10,
349
+ "models": [
350
+ "chronos",
351
+ "timesfm",
352
+ "arima",
353
+ "prophet"
354
+ ],
355
+ "weights": {
356
+ "equal": {
357
+ "w": [
358
+ 0.25,
359
+ 0.25,
360
+ 0.25,
361
+ 0.25
362
+ ],
363
+ "test_mae": 0.2169347525199409
364
+ },
365
+ "inverse_mae": {
366
+ "w": [
367
+ 0.34398899758591117,
368
+ 0.2030939191106745,
369
+ 0.3764283233385005,
370
+ 0.07648875996491374
371
+ ],
372
+ "test_mae": 0.1658846094174201
373
+ },
374
+ "constrained_mae": {
375
+ "w": [
376
+ 0.0,
377
+ 7.008282842946293e-16,
378
+ 0.9999999999999989,
379
+ 4.579669976578766e-16
380
+ ],
381
+ "test_mae": 0.12304418839562406
382
+ },
383
+ "constrained_mse": {
384
+ "w": [
385
+ 0.3806257863168961,
386
+ 8.153200337090993e-17,
387
+ 0.619374213683104,
388
+ 0.0
389
+ ],
390
+ "test_mae": 0.12205338531046768
391
+ }
392
+ },
393
+ "best_individual_on_cal": {
394
+ "model": "arima",
395
+ "test_mae": 0.12304418839562384
396
+ },
397
+ "winner": {
398
+ "method": "constrained_mse",
399
+ "test_mae": 0.12205338531046768
400
+ }
401
+ },
402
+ "DEXTAUS_14": {
403
+ "n_cal_folds": 10,
404
+ "n_test_folds": 10,
405
+ "models": [
406
+ "chronos",
407
+ "timesfm",
408
+ "arima",
409
+ "prophet"
410
+ ],
411
+ "weights": {
412
+ "equal": {
413
+ "w": [
414
+ 0.25,
415
+ 0.25,
416
+ 0.25,
417
+ 0.25
418
+ ],
419
+ "test_mae": 0.2936029051307666
420
+ },
421
+ "inverse_mae": {
422
+ "w": [
423
+ 0.3024605314294574,
424
+ 0.20677440280922138,
425
+ 0.3973126914677932,
426
+ 0.09345237429352793
427
+ ],
428
+ "test_mae": 0.24062725397849288
429
+ },
430
+ "constrained_mae": {
431
+ "w": [
432
+ 0.0,
433
+ 0.0,
434
+ 1.0,
435
+ 0.0
436
+ ],
437
+ "test_mae": 0.2075701838535929
438
+ },
439
+ "constrained_mse": {
440
+ "w": [
441
+ 0.20409965483488535,
442
+ 1.196959198423997e-16,
443
+ 0.7959003451651147,
444
+ 0.0
445
+ ],
446
+ "test_mae": 0.20767726865065442
447
+ }
448
+ },
449
+ "best_individual_on_cal": {
450
+ "model": "arima",
451
+ "test_mae": 0.2075701838535929
452
+ },
453
+ "winner": {
454
+ "method": "constrained_mae",
455
+ "test_mae": 0.2075701838535929
456
+ }
457
+ },
458
+ "DEXTAUS_28": {
459
+ "n_cal_folds": 10,
460
+ "n_test_folds": 10,
461
+ "models": [
462
+ "chronos",
463
+ "timesfm",
464
+ "arima",
465
+ "prophet"
466
+ ],
467
+ "weights": {
468
+ "equal": {
469
+ "w": [
470
+ 0.25,
471
+ 0.25,
472
+ 0.25,
473
+ 0.25
474
+ ],
475
+ "test_mae": 0.35161458970616255
476
+ },
477
+ "inverse_mae": {
478
+ "w": [
479
+ 0.31779598685220195,
480
+ 0.27176079256586594,
481
+ 0.28189025800444834,
482
+ 0.12855296257748378
483
+ ],
484
+ "test_mae": 0.3189607034469092
485
+ },
486
+ "constrained_mae": {
487
+ "w": [
488
+ 0.9999999999999998,
489
+ 0.0,
490
+ 0.0,
491
+ 3.1918911957973246e-16
492
+ ],
493
+ "test_mae": 0.289064216740161
494
+ },
495
+ "constrained_mse": {
496
+ "w": [
497
+ 0.45663759735298354,
498
+ 0.10339949724699603,
499
+ 0.4399629054000205,
500
+ 0.0
501
+ ],
502
+ "test_mae": 0.27882969196380114
503
+ }
504
+ },
505
+ "best_individual_on_cal": {
506
+ "model": "chronos",
507
+ "test_mae": 0.2890642167401609
508
+ },
509
+ "winner": {
510
+ "method": "constrained_mse",
511
+ "test_mae": 0.27882969196380114
512
+ }
513
+ },
514
+ "DEXKOUS_7": {
515
+ "n_cal_folds": 10,
516
+ "n_test_folds": 10,
517
+ "models": [
518
+ "chronos",
519
+ "timesfm",
520
+ "arima",
521
+ "prophet"
522
+ ],
523
+ "weights": {
524
+ "equal": {
525
+ "w": [
526
+ 0.25,
527
+ 0.25,
528
+ 0.25,
529
+ 0.25
530
+ ],
531
+ "test_mae": 17.2493699999521
532
+ },
533
+ "inverse_mae": {
534
+ "w": [
535
+ 0.22521754050965248,
536
+ 0.2661802247036112,
537
+ 0.3761094665932334,
538
+ 0.1324927681935029
539
+ ],
540
+ "test_mae": 15.47479328474102
541
+ },
542
+ "constrained_mae": {
543
+ "w": [
544
+ 0.0,
545
+ 2.7089441800853084e-14,
546
+ 0.9999999999999729,
547
+ 0.0
548
+ ],
549
+ "test_mae": 14.0900150189361
550
+ },
551
+ "constrained_mse": {
552
+ "w": [
553
+ 1.4068121662922204e-19,
554
+ 0.19202529383105713,
555
+ 0.8079747057066696,
556
+ 4.6227315218243986e-10
557
+ ],
558
+ "test_mae": 14.093086604275276
559
+ }
560
+ },
561
+ "best_individual_on_cal": {
562
+ "model": "arima",
563
+ "test_mae": 14.0900150189361
564
+ },
565
+ "winner": {
566
+ "method": "constrained_mae",
567
+ "test_mae": 14.0900150189361
568
+ }
569
+ },
570
+ "DEXKOUS_14": {
571
+ "n_cal_folds": 10,
572
+ "n_test_folds": 10,
573
+ "models": [
574
+ "chronos",
575
+ "timesfm",
576
+ "arima",
577
+ "prophet"
578
+ ],
579
+ "weights": {
580
+ "equal": {
581
+ "w": [
582
+ 0.25,
583
+ 0.25,
584
+ 0.25,
585
+ 0.25
586
+ ],
587
+ "test_mae": 19.357951817590667
588
+ },
589
+ "inverse_mae": {
590
+ "w": [
591
+ 0.3500118447028979,
592
+ 0.25958141131048756,
593
+ 0.2744350765852677,
594
+ 0.11597166740134691
595
+ ],
596
+ "test_mae": 17.40246559654232
597
+ },
598
+ "constrained_mae": {
599
+ "w": [
600
+ 0.9999999999992815,
601
+ 3.2990277176712823e-13,
602
+ 3.88689080920988e-13,
603
+ 0.0
604
+ ],
605
+ "test_mae": 13.478487470042296
606
+ },
607
+ "constrained_mse": {
608
+ "w": [
609
+ 0.999999999787164,
610
+ 0.0,
611
+ 0.0,
612
+ 2.1283591823683064e-10
613
+ ],
614
+ "test_mae": 13.478487473311748
615
+ }
616
+ },
617
+ "best_individual_on_cal": {
618
+ "model": "chronos",
619
+ "test_mae": 13.4784874700395
620
+ },
621
+ "winner": {
622
+ "method": "best_individual",
623
+ "test_mae": 13.4784874700395
624
+ }
625
+ },
626
+ "DEXKOUS_28": {
627
+ "n_cal_folds": 10,
628
+ "n_test_folds": 10,
629
+ "models": [
630
+ "chronos",
631
+ "timesfm",
632
+ "arima",
633
+ "prophet"
634
+ ],
635
+ "weights": {
636
+ "equal": {
637
+ "w": [
638
+ 0.25,
639
+ 0.25,
640
+ 0.25,
641
+ 0.25
642
+ ],
643
+ "test_mae": 24.8683981319863
644
+ },
645
+ "inverse_mae": {
646
+ "w": [
647
+ 0.15714338435667446,
648
+ 0.3032008336686258,
649
+ 0.3174445784155295,
650
+ 0.22221120355917026
651
+ ],
652
+ "test_mae": 23.767772135429315
653
+ },
654
+ "constrained_mae": {
655
+ "w": [
656
+ 0.0,
657
+ 0.0,
658
+ 1.0,
659
+ 0.0
660
+ ],
661
+ "test_mae": 13.038534452266783
662
+ },
663
+ "constrained_mse": {
664
+ "w": [
665
+ 0.0,
666
+ 1.6482941097956984e-10,
667
+ 0.9999999997453165,
668
+ 8.9854093955618e-11
669
+ ],
670
+ "test_mae": 13.038534456323145
671
+ }
672
+ },
673
+ "best_individual_on_cal": {
674
+ "model": "arima",
675
+ "test_mae": 13.038534452266783
676
+ },
677
+ "winner": {
678
+ "method": "constrained_mae",
679
+ "test_mae": 13.038534452266783
680
+ }
681
+ },
682
+ "DEXJPUS_7": {
683
+ "n_cal_folds": 10,
684
+ "n_test_folds": 10,
685
+ "models": [
686
+ "chronos",
687
+ "timesfm",
688
+ "arima",
689
+ "prophet"
690
+ ],
691
+ "weights": {
692
+ "equal": {
693
+ "w": [
694
+ 0.25,
695
+ 0.25,
696
+ 0.25,
697
+ 0.25
698
+ ],
699
+ "test_mae": 2.0058613373406016
700
+ },
701
+ "inverse_mae": {
702
+ "w": [
703
+ 0.3311569291093271,
704
+ 0.21966516526756977,
705
+ 0.27781607384114676,
706
+ 0.17136183178195635
707
+ ],
708
+ "test_mae": 1.7598609660764388
709
+ },
710
+ "constrained_mae": {
711
+ "w": [
712
+ 0.9999999999999993,
713
+ 0.0,
714
+ 0.0,
715
+ 7.14706072102444e-16
716
+ ],
717
+ "test_mae": 0.9624409634715991
718
+ },
719
+ "constrained_mse": {
720
+ "w": [
721
+ 0.637656517780962,
722
+ 0.0,
723
+ 0.36234348221903795,
724
+ 2.0816681711721676e-17
725
+ ],
726
+ "test_mae": 1.1158006833860175
727
+ }
728
+ },
729
+ "best_individual_on_cal": {
730
+ "model": "chronos",
731
+ "test_mae": 0.962440963471597
732
+ },
733
+ "winner": {
734
+ "method": "best_individual",
735
+ "test_mae": 0.962440963471597
736
+ }
737
+ },
738
+ "DEXJPUS_14": {
739
+ "n_cal_folds": 10,
740
+ "n_test_folds": 10,
741
+ "models": [
742
+ "chronos",
743
+ "timesfm",
744
+ "arima",
745
+ "prophet"
746
+ ],
747
+ "weights": {
748
+ "equal": {
749
+ "w": [
750
+ 0.25,
751
+ 0.25,
752
+ 0.25,
753
+ 0.25
754
+ ],
755
+ "test_mae": 2.0585639763398134
756
+ },
757
+ "inverse_mae": {
758
+ "w": [
759
+ 0.29221948346213755,
760
+ 0.30006908767689383,
761
+ 0.3336814964148649,
762
+ 0.07402993244610366
763
+ ],
764
+ "test_mae": 1.525371337574877
765
+ },
766
+ "constrained_mae": {
767
+ "w": [
768
+ 0.0,
769
+ 0.0,
770
+ 0.9999999999998224,
771
+ 1.7753796613177788e-13
772
+ ],
773
+ "test_mae": 0.9391751508495592
774
+ },
775
+ "constrained_mse": {
776
+ "w": [
777
+ 0.0,
778
+ 0.23909961575984545,
779
+ 0.7609003842401545,
780
+ 0.0
781
+ ],
782
+ "test_mae": 1.1619170740566178
783
+ }
784
+ },
785
+ "best_individual_on_cal": {
786
+ "model": "arima",
787
+ "test_mae": 0.9391751508489655
788
+ },
789
+ "winner": {
790
+ "method": "best_individual",
791
+ "test_mae": 0.9391751508489655
792
+ }
793
+ },
794
+ "DEXJPUS_28": {
795
+ "n_cal_folds": 10,
796
+ "n_test_folds": 10,
797
+ "models": [
798
+ "chronos",
799
+ "timesfm",
800
+ "arima",
801
+ "prophet"
802
+ ],
803
+ "weights": {
804
+ "equal": {
805
+ "w": [
806
+ 0.25,
807
+ 0.25,
808
+ 0.25,
809
+ 0.25
810
+ ],
811
+ "test_mae": 2.6223114452299363
812
+ },
813
+ "inverse_mae": {
814
+ "w": [
815
+ 0.2431707261347647,
816
+ 0.2670867329969705,
817
+ 0.36747924632317114,
818
+ 0.12226329454509363
819
+ ],
820
+ "test_mae": 2.501007095618067
821
+ },
822
+ "constrained_mae": {
823
+ "w": [
824
+ 0.0,
825
+ 0.0,
826
+ 1.0,
827
+ 0.0
828
+ ],
829
+ "test_mae": 2.3202441940310328
830
+ },
831
+ "constrained_mse": {
832
+ "w": [
833
+ 0.12111050197987697,
834
+ 1.124100812432969e-15,
835
+ 0.8788894980201218,
836
+ 0.0
837
+ ],
838
+ "test_mae": 2.284742353079749
839
+ }
840
+ },
841
+ "best_individual_on_cal": {
842
+ "model": "arima",
843
+ "test_mae": 2.3202441940310328
844
+ },
845
+ "winner": {
846
+ "method": "constrained_mse",
847
+ "test_mae": 2.284742353079749
848
+ }
849
+ },
850
+ "DEXUSEU_7": {
851
+ "n_cal_folds": 10,
852
+ "n_test_folds": 10,
853
+ "models": [
854
+ "chronos",
855
+ "timesfm",
856
+ "arima",
857
+ "prophet"
858
+ ],
859
+ "weights": {
860
+ "equal": {
861
+ "w": [
862
+ 0.25,
863
+ 0.25,
864
+ 0.25,
865
+ 0.25
866
+ ],
867
+ "test_mae": 0.01777263656328388
868
+ },
869
+ "inverse_mae": {
870
+ "w": [
871
+ 0.4380311521257709,
872
+ 0.1895078632684431,
873
+ 0.2934679866590765,
874
+ 0.07899299794670979
875
+ ],
876
+ "test_mae": 0.012544562664192396
877
+ },
878
+ "constrained_mae": {
879
+ "w": [
880
+ 0.9999999999999984,
881
+ 1.0061396160665477e-15,
882
+ 5.846018114041837e-16,
883
+ 0.0
884
+ ],
885
+ "test_mae": 0.008009630047676911
886
+ },
887
+ "constrained_mse": {
888
+ "w": [
889
+ 0.88076958835974,
890
+ 5.551115123125784e-17,
891
+ 0.11923041164026013,
892
+ 5.551115123125784e-17
893
+ ],
894
+ "test_mae": 0.00812923667806015
895
+ }
896
+ },
897
+ "best_individual_on_cal": {
898
+ "model": "chronos",
899
+ "test_mae": 0.008009630047676897
900
+ },
901
+ "winner": {
902
+ "method": "best_individual",
903
+ "test_mae": 0.008009630047676897
904
+ }
905
+ },
906
+ "DEXUSEU_14": {
907
+ "n_cal_folds": 10,
908
+ "n_test_folds": 10,
909
+ "models": [
910
+ "chronos",
911
+ "timesfm",
912
+ "arima",
913
+ "prophet"
914
+ ],
915
+ "weights": {
916
+ "equal": {
917
+ "w": [
918
+ 0.25,
919
+ 0.25,
920
+ 0.25,
921
+ 0.25
922
+ ],
923
+ "test_mae": 0.01766253143684469
924
+ },
925
+ "inverse_mae": {
926
+ "w": [
927
+ 0.3649772970412571,
928
+ 0.20972059927142733,
929
+ 0.2903737730393877,
930
+ 0.13492833064792778
931
+ ],
932
+ "test_mae": 0.015437376589926739
933
+ },
934
+ "constrained_mae": {
935
+ "w": [
936
+ 0.9999999999999998,
937
+ 0.0,
938
+ 0.0,
939
+ 2.1510571102112403e-16
940
+ ],
941
+ "test_mae": 0.01478179445033124
942
+ },
943
+ "constrained_mse": {
944
+ "w": [
945
+ 0.5541512994206012,
946
+ 1.3877787807814457e-16,
947
+ 0.4458487005793988,
948
+ 1.0408340855860843e-17
949
+ ],
950
+ "test_mae": 0.012606685154728608
951
+ }
952
+ },
953
+ "best_individual_on_cal": {
954
+ "model": "chronos",
955
+ "test_mae": 0.014781794450331237
956
+ },
957
+ "winner": {
958
+ "method": "constrained_mse",
959
+ "test_mae": 0.012606685154728608
960
+ }
961
+ },
962
+ "DEXUSEU_28": {
963
+ "n_cal_folds": 10,
964
+ "n_test_folds": 10,
965
+ "models": [
966
+ "chronos",
967
+ "timesfm",
968
+ "arima",
969
+ "prophet"
970
+ ],
971
+ "weights": {
972
+ "equal": {
973
+ "w": [
974
+ 0.25,
975
+ 0.25,
976
+ 0.25,
977
+ 0.25
978
+ ],
979
+ "test_mae": 0.017842508329409604
980
+ },
981
+ "inverse_mae": {
982
+ "w": [
983
+ 0.3562207101529807,
984
+ 0.18924080034829216,
985
+ 0.31700784157235296,
986
+ 0.13753064792637432
987
+ ],
988
+ "test_mae": 0.015970560076149547
989
+ },
990
+ "constrained_mae": {
991
+ "w": [
992
+ 0.9999999999999982,
993
+ 9.43689570931382e-16,
994
+ 0.0,
995
+ 8.049116928532376e-16
996
+ ],
997
+ "test_mae": 0.014453346940792903
998
+ },
999
+ "constrained_mse": {
1000
+ "w": [
1001
+ 0.5446169594084305,
1002
+ 2.7755575615628907e-17,
1003
+ 0.45538304059156953,
1004
+ 0.0
1005
+ ],
1006
+ "test_mae": 0.013183660449898013
1007
+ }
1008
+ },
1009
+ "best_individual_on_cal": {
1010
+ "model": "chronos",
1011
+ "test_mae": 0.014453346940792889
1012
+ },
1013
+ "winner": {
1014
+ "method": "constrained_mse",
1015
+ "test_mae": 0.013183660449898013
1016
+ }
1017
+ },
1018
+ "DEXCHUS_7": {
1019
+ "n_cal_folds": 10,
1020
+ "n_test_folds": 10,
1021
+ "models": [
1022
+ "chronos",
1023
+ "timesfm",
1024
+ "arima",
1025
+ "prophet"
1026
+ ],
1027
+ "weights": {
1028
+ "equal": {
1029
+ "w": [
1030
+ 0.25,
1031
+ 0.25,
1032
+ 0.25,
1033
+ 0.25
1034
+ ],
1035
+ "test_mae": 0.034690690500036904
1036
+ },
1037
+ "inverse_mae": {
1038
+ "w": [
1039
+ 0.30725895677630083,
1040
+ 0.24691376598214834,
1041
+ 0.3943485789337087,
1042
+ 0.05147869830784206
1043
+ ],
1044
+ "test_mae": 0.02117886221826054
1045
+ },
1046
+ "constrained_mae": {
1047
+ "w": [
1048
+ 0.0,
1049
+ 0.0,
1050
+ 0.9999999999999998,
1051
+ 2.3409280156677643e-16
1052
+ ],
1053
+ "test_mae": 0.015762412884263256
1054
+ },
1055
+ "constrained_mse": {
1056
+ "w": [
1057
+ 0.0,
1058
+ 0.040015823687684034,
1059
+ 0.959984176312316,
1060
+ 1.0408340855860841e-17
1061
+ ],
1062
+ "test_mae": 0.016130545137926368
1063
+ }
1064
+ },
1065
+ "best_individual_on_cal": {
1066
+ "model": "arima",
1067
+ "test_mae": 0.015762412884263242
1068
+ },
1069
+ "winner": {
1070
+ "method": "best_individual",
1071
+ "test_mae": 0.015762412884263242
1072
+ }
1073
+ },
1074
+ "DEXCHUS_14": {
1075
+ "n_cal_folds": 10,
1076
+ "n_test_folds": 10,
1077
+ "models": [
1078
+ "chronos",
1079
+ "timesfm",
1080
+ "arima",
1081
+ "prophet"
1082
+ ],
1083
+ "weights": {
1084
+ "equal": {
1085
+ "w": [
1086
+ 0.25,
1087
+ 0.25,
1088
+ 0.25,
1089
+ 0.25
1090
+ ],
1091
+ "test_mae": 0.049119233033837334
1092
+ },
1093
+ "inverse_mae": {
1094
+ "w": [
1095
+ 0.2988178654996703,
1096
+ 0.30220512040404324,
1097
+ 0.27971237870613896,
1098
+ 0.1192646353901474
1099
+ ],
1100
+ "test_mae": 0.04197509948228402
1101
+ },
1102
+ "constrained_mae": {
1103
+ "w": [
1104
+ 0.0,
1105
+ 0.9999999999999992,
1106
+ 3.6082248300317563e-16,
1107
+ 3.4174052476743075e-16
1108
+ ],
1109
+ "test_mae": 0.03187400458960995
1110
+ },
1111
+ "constrained_mse": {
1112
+ "w": [
1113
+ 0.5594517657002177,
1114
+ 0.23577483341396505,
1115
+ 0.2047734008858172,
1116
+ 0.0
1117
+ ],
1118
+ "test_mae": 0.033564545616950006
1119
+ }
1120
+ },
1121
+ "best_individual_on_cal": {
1122
+ "model": "timesfm",
1123
+ "test_mae": 0.03187400458960993
1124
+ },
1125
+ "winner": {
1126
+ "method": "best_individual",
1127
+ "test_mae": 0.03187400458960993
1128
+ }
1129
+ },
1130
+ "DEXCHUS_28": {
1131
+ "n_cal_folds": 10,
1132
+ "n_test_folds": 10,
1133
+ "models": [
1134
+ "chronos",
1135
+ "timesfm",
1136
+ "arima",
1137
+ "prophet"
1138
+ ],
1139
+ "weights": {
1140
+ "equal": {
1141
+ "w": [
1142
+ 0.25,
1143
+ 0.25,
1144
+ 0.25,
1145
+ 0.25
1146
+ ],
1147
+ "test_mae": 0.07622515708177849
1148
+ },
1149
+ "inverse_mae": {
1150
+ "w": [
1151
+ 0.21374276213191848,
1152
+ 0.32878921058258087,
1153
+ 0.27206545754178274,
1154
+ 0.18540256974371785
1155
+ ],
1156
+ "test_mae": 0.07368140063745915
1157
+ },
1158
+ "constrained_mae": {
1159
+ "w": [
1160
+ 3.565258741241218e-17,
1161
+ 0.9999999999999993,
1162
+ 0.0,
1163
+ 6.714758455242072e-16
1164
+ ],
1165
+ "test_mae": 0.05984540049808135
1166
+ },
1167
+ "constrained_mse": {
1168
+ "w": [
1169
+ 0.0,
1170
+ 0.7615511144034006,
1171
+ 0.23844888559659938,
1172
+ 5.308685925196128e-17
1173
+ ],
1174
+ "test_mae": 0.06440512615984152
1175
+ }
1176
+ },
1177
+ "best_individual_on_cal": {
1178
+ "model": "timesfm",
1179
+ "test_mae": 0.059845400498081305
1180
+ },
1181
+ "winner": {
1182
+ "method": "best_individual",
1183
+ "test_mae": 0.059845400498081305
1184
+ }
1185
+ }
1186
+ },
1187
+ "elapsed_s": 0.09606218338012695
1188
  }
FINAL_SUBMIT/receipts/R3_STACKING_V3_POINTLEVEL.json CHANGED
@@ -1,227 +1,227 @@
1
- {
2
- "description": "Per-point Bates-Granger constrained stacking on real forecaster outputs. No synthesized folds.",
3
- "per_target_horizon": {
4
- "DCOILWTICO_h7": {
5
- "n_cal_points": 70,
6
- "n_test_points": 70,
7
- "individual_mae": {
8
- "chronos": 3.006047764369419,
9
- "arima": 3.0841361525087674,
10
- "prophet": 8.557134422551027,
11
- "naive": 2.839285714285714
12
- },
13
- "stacking_mae": {
14
- "equal": 3.381860717562512,
15
- "best_on_cal": 2.839285714285714,
16
- "constrained_mae": 2.839285714285714,
17
- "constrained_mse": 2.839285714285714
18
- },
19
- "weights": {
20
- "constrained_mae": {
21
- "chronos": 0.0,
22
- "arima": 3.8857805861880464e-16,
23
- "prophet": 0.0,
24
- "naive": 0.9999999999999996
25
- },
26
- "constrained_mse": {
27
- "chronos": 1.2281842209915794e-15,
28
- "arima": 1.7069679003611782e-15,
29
- "prophet": 6.824272182231614e-17,
30
- "naive": 0.999999999999997
31
- }
32
- },
33
- "best_single_model": "naive",
34
- "best_single_mae": 2.839285714285714,
35
- "winner_method": "naive",
36
- "winner_mae": 2.839285714285714,
37
- "constrained_beats_best_single": false
38
- },
39
- "DCOILWTICO_h14": {
40
- "n_cal_points": 140,
41
- "n_test_points": 140,
42
- "individual_mae": {
43
- "chronos": 3.797937408447266,
44
- "arima": 3.917782537843266,
45
- "prophet": 9.218187229009528,
46
- "naive": 3.6239285714285714
47
- },
48
- "stacking_mae": {
49
- "equal": 3.9604401984158755,
50
- "best_on_cal": 3.6239285714285714,
51
- "constrained_mae": 3.623928571428571,
52
- "constrained_mse": 3.6994484688718305
53
- },
54
- "weights": {
55
- "constrained_mae": {
56
- "chronos": 1.3877787807814454e-16,
57
- "arima": 0.0,
58
- "prophet": 0.0,
59
- "naive": 0.9999999999999998
60
- },
61
- "constrained_mse": {
62
- "chronos": 3.0753177782116836e-14,
63
- "arima": 0.25973397692659406,
64
- "prophet": 1.0636618946679322e-15,
65
- "naive": 0.7402660230733741
66
- }
67
- },
68
- "best_single_model": "naive",
69
- "best_single_mae": 3.6239285714285714,
70
- "winner_method": "constrained_mae",
71
- "winner_mae": 3.623928571428571,
72
- "constrained_beats_best_single": true
73
- },
74
- "DEXUSEU_h7": {
75
- "n_cal_points": 70,
76
- "n_test_points": 70,
77
- "individual_mae": {
78
- "chronos": 0.00997808286394391,
79
- "arima": 0.00909829887487626,
80
- "prophet": 0.04588529230089117,
81
- "naive": 0.009057142857142856
82
- },
83
- "stacking_mae": {
84
- "equal": 0.013885443002327432,
85
- "best_on_cal": 0.00997808286394391,
86
- "constrained_mae": 0.009495985176023706,
87
- "constrained_mse": 0.013885443002327432
88
- },
89
- "weights": {
90
- "constrained_mae": {
91
- "chronos": 0.3382904222928093,
92
- "arima": 0.2908333034179931,
93
- "prophet": 0.07824807605162067,
94
- "naive": 0.292628198237577
95
- },
96
- "constrained_mse": {
97
- "chronos": 0.25,
98
- "arima": 0.25,
99
- "prophet": 0.25,
100
- "naive": 0.25
101
- }
102
- },
103
- "best_single_model": "naive",
104
- "best_single_mae": 0.009057142857142856,
105
- "winner_method": "naive",
106
- "winner_mae": 0.009057142857142856,
107
- "constrained_beats_best_single": false
108
- },
109
- "DEXUSEU_h14": {
110
- "n_cal_points": 140,
111
- "n_test_points": 140,
112
- "individual_mae": {
113
- "chronos": 0.013727861084256852,
114
- "arima": 0.012013652348349491,
115
- "prophet": 0.04736957874192551,
116
- "naive": 0.01203071428571428
117
- },
118
- "stacking_mae": {
119
- "equal": 0.015656730784239885,
120
- "best_on_cal": 0.012013652348349491,
121
- "constrained_mae": 0.012635021721737227,
122
- "constrained_mse": 0.015656730784239885
123
- },
124
- "weights": {
125
- "constrained_mae": {
126
- "chronos": 0.3173041077741453,
127
- "arima": 0.2850093471133051,
128
- "prophet": 0.10822240332468126,
129
- "naive": 0.28946414178786833
130
- },
131
- "constrained_mse": {
132
- "chronos": 0.25,
133
- "arima": 0.25,
134
- "prophet": 0.25,
135
- "naive": 0.25
136
- }
137
- },
138
- "best_single_model": "arima",
139
- "best_single_mae": 0.012013652348349491,
140
- "winner_method": "arima",
141
- "winner_mae": 0.012013652348349491,
142
- "constrained_beats_best_single": false
143
- },
144
- "DEXCHUS_h7": {
145
- "n_cal_points": 70,
146
- "n_test_points": 70,
147
- "individual_mae": {
148
- "chronos": 0.019519044701712434,
149
- "arima": 0.017992622791365688,
150
- "prophet": 0.11663701396527856,
151
- "naive": 0.01873000000000015
152
- },
153
- "stacking_mae": {
154
- "equal": 0.03595753473515902,
155
- "best_on_cal": 0.019519044701712434,
156
- "constrained_mae": 0.020133491932037322,
157
- "constrained_mse": 0.019334668170698382
158
- },
159
- "weights": {
160
- "constrained_mae": {
161
- "chronos": 0.7133898921965662,
162
- "arima": 0.21870528495965705,
163
- "prophet": 0.06790482284377684,
164
- "naive": 0.0
165
- },
166
- "constrained_mse": {
167
- "chronos": 0.935153684195057,
168
- "arima": 8.998878031629688e-18,
169
- "prophet": 0.008348340456592942,
170
- "naive": 0.056497975348350146
171
- }
172
- },
173
- "best_single_model": "arima",
174
- "best_single_mae": 0.017992622791365688,
175
- "winner_method": "arima",
176
- "winner_mae": 0.017992622791365688,
177
- "constrained_beats_best_single": false
178
- },
179
- "DEXCHUS_h14": {
180
- "n_cal_points": 140,
181
- "n_test_points": 140,
182
- "individual_mae": {
183
- "chronos": 0.03237065534319195,
184
- "arima": 0.03236972869761379,
185
- "prophet": 0.12129274215959333,
186
- "naive": 0.03212142857142869
187
- },
188
- "stacking_mae": {
189
- "equal": 0.043605583896191145,
190
- "best_on_cal": 0.03237065534319195,
191
- "constrained_mae": 0.031424293689945516,
192
- "constrained_mse": 0.034848071305054344
193
- },
194
- "weights": {
195
- "constrained_mae": {
196
- "chronos": 0.6699556648170705,
197
- "arima": 0.251108263144011,
198
- "prophet": 0.07893607203891846,
199
- "naive": 6.03983418880819e-19
200
- },
201
- "constrained_mse": {
202
- "chronos": 0.8500735106653095,
203
- "arima": 0.0,
204
- "prophet": 0.14992648933469047,
205
- "naive": 0.0
206
- }
207
- },
208
- "best_single_model": "naive",
209
- "best_single_mae": 0.03212142857142869,
210
- "winner_method": "constrained_mae",
211
- "winner_mae": 0.031424293689945516,
212
- "constrained_beats_best_single": true
213
- }
214
- },
215
- "wins": {
216
- "constrained": 2,
217
- "best_single": 4,
218
- "equal": 0,
219
- "naive": 0
220
- },
221
- "summary": {
222
- "total_target_horizon_cells": 6,
223
- "constrained_stacking_wins": 2,
224
- "constrained_beats_best_single_cells": 2
225
- },
226
- "elapsed_min": 2.2175209800402325
227
  }
 
1
+ {
2
+ "description": "Per-point Bates-Granger constrained stacking on real forecaster outputs. No synthesized folds.",
3
+ "per_target_horizon": {
4
+ "DCOILWTICO_h7": {
5
+ "n_cal_points": 70,
6
+ "n_test_points": 70,
7
+ "individual_mae": {
8
+ "chronos": 3.006047764369419,
9
+ "arima": 3.0841361525087674,
10
+ "prophet": 8.557134422551027,
11
+ "naive": 2.839285714285714
12
+ },
13
+ "stacking_mae": {
14
+ "equal": 3.381860717562512,
15
+ "best_on_cal": 2.839285714285714,
16
+ "constrained_mae": 2.839285714285714,
17
+ "constrained_mse": 2.839285714285714
18
+ },
19
+ "weights": {
20
+ "constrained_mae": {
21
+ "chronos": 0.0,
22
+ "arima": 3.8857805861880464e-16,
23
+ "prophet": 0.0,
24
+ "naive": 0.9999999999999996
25
+ },
26
+ "constrained_mse": {
27
+ "chronos": 1.2281842209915794e-15,
28
+ "arima": 1.7069679003611782e-15,
29
+ "prophet": 6.824272182231614e-17,
30
+ "naive": 0.999999999999997
31
+ }
32
+ },
33
+ "best_single_model": "naive",
34
+ "best_single_mae": 2.839285714285714,
35
+ "winner_method": "naive",
36
+ "winner_mae": 2.839285714285714,
37
+ "constrained_beats_best_single": false
38
+ },
39
+ "DCOILWTICO_h14": {
40
+ "n_cal_points": 140,
41
+ "n_test_points": 140,
42
+ "individual_mae": {
43
+ "chronos": 3.797937408447266,
44
+ "arima": 3.917782537843266,
45
+ "prophet": 9.218187229009528,
46
+ "naive": 3.6239285714285714
47
+ },
48
+ "stacking_mae": {
49
+ "equal": 3.9604401984158755,
50
+ "best_on_cal": 3.6239285714285714,
51
+ "constrained_mae": 3.623928571428571,
52
+ "constrained_mse": 3.6994484688718305
53
+ },
54
+ "weights": {
55
+ "constrained_mae": {
56
+ "chronos": 1.3877787807814454e-16,
57
+ "arima": 0.0,
58
+ "prophet": 0.0,
59
+ "naive": 0.9999999999999998
60
+ },
61
+ "constrained_mse": {
62
+ "chronos": 3.0753177782116836e-14,
63
+ "arima": 0.25973397692659406,
64
+ "prophet": 1.0636618946679322e-15,
65
+ "naive": 0.7402660230733741
66
+ }
67
+ },
68
+ "best_single_model": "naive",
69
+ "best_single_mae": 3.6239285714285714,
70
+ "winner_method": "constrained_mae",
71
+ "winner_mae": 3.623928571428571,
72
+ "constrained_beats_best_single": true
73
+ },
74
+ "DEXUSEU_h7": {
75
+ "n_cal_points": 70,
76
+ "n_test_points": 70,
77
+ "individual_mae": {
78
+ "chronos": 0.00997808286394391,
79
+ "arima": 0.00909829887487626,
80
+ "prophet": 0.04588529230089117,
81
+ "naive": 0.009057142857142856
82
+ },
83
+ "stacking_mae": {
84
+ "equal": 0.013885443002327432,
85
+ "best_on_cal": 0.00997808286394391,
86
+ "constrained_mae": 0.009495985176023706,
87
+ "constrained_mse": 0.013885443002327432
88
+ },
89
+ "weights": {
90
+ "constrained_mae": {
91
+ "chronos": 0.3382904222928093,
92
+ "arima": 0.2908333034179931,
93
+ "prophet": 0.07824807605162067,
94
+ "naive": 0.292628198237577
95
+ },
96
+ "constrained_mse": {
97
+ "chronos": 0.25,
98
+ "arima": 0.25,
99
+ "prophet": 0.25,
100
+ "naive": 0.25
101
+ }
102
+ },
103
+ "best_single_model": "naive",
104
+ "best_single_mae": 0.009057142857142856,
105
+ "winner_method": "naive",
106
+ "winner_mae": 0.009057142857142856,
107
+ "constrained_beats_best_single": false
108
+ },
109
+ "DEXUSEU_h14": {
110
+ "n_cal_points": 140,
111
+ "n_test_points": 140,
112
+ "individual_mae": {
113
+ "chronos": 0.013727861084256852,
114
+ "arima": 0.012013652348349491,
115
+ "prophet": 0.04736957874192551,
116
+ "naive": 0.01203071428571428
117
+ },
118
+ "stacking_mae": {
119
+ "equal": 0.015656730784239885,
120
+ "best_on_cal": 0.012013652348349491,
121
+ "constrained_mae": 0.012635021721737227,
122
+ "constrained_mse": 0.015656730784239885
123
+ },
124
+ "weights": {
125
+ "constrained_mae": {
126
+ "chronos": 0.3173041077741453,
127
+ "arima": 0.2850093471133051,
128
+ "prophet": 0.10822240332468126,
129
+ "naive": 0.28946414178786833
130
+ },
131
+ "constrained_mse": {
132
+ "chronos": 0.25,
133
+ "arima": 0.25,
134
+ "prophet": 0.25,
135
+ "naive": 0.25
136
+ }
137
+ },
138
+ "best_single_model": "arima",
139
+ "best_single_mae": 0.012013652348349491,
140
+ "winner_method": "arima",
141
+ "winner_mae": 0.012013652348349491,
142
+ "constrained_beats_best_single": false
143
+ },
144
+ "DEXCHUS_h7": {
145
+ "n_cal_points": 70,
146
+ "n_test_points": 70,
147
+ "individual_mae": {
148
+ "chronos": 0.019519044701712434,
149
+ "arima": 0.017992622791365688,
150
+ "prophet": 0.11663701396527856,
151
+ "naive": 0.01873000000000015
152
+ },
153
+ "stacking_mae": {
154
+ "equal": 0.03595753473515902,
155
+ "best_on_cal": 0.019519044701712434,
156
+ "constrained_mae": 0.020133491932037322,
157
+ "constrained_mse": 0.019334668170698382
158
+ },
159
+ "weights": {
160
+ "constrained_mae": {
161
+ "chronos": 0.7133898921965662,
162
+ "arima": 0.21870528495965705,
163
+ "prophet": 0.06790482284377684,
164
+ "naive": 0.0
165
+ },
166
+ "constrained_mse": {
167
+ "chronos": 0.935153684195057,
168
+ "arima": 8.998878031629688e-18,
169
+ "prophet": 0.008348340456592942,
170
+ "naive": 0.056497975348350146
171
+ }
172
+ },
173
+ "best_single_model": "arima",
174
+ "best_single_mae": 0.017992622791365688,
175
+ "winner_method": "arima",
176
+ "winner_mae": 0.017992622791365688,
177
+ "constrained_beats_best_single": false
178
+ },
179
+ "DEXCHUS_h14": {
180
+ "n_cal_points": 140,
181
+ "n_test_points": 140,
182
+ "individual_mae": {
183
+ "chronos": 0.03237065534319195,
184
+ "arima": 0.03236972869761379,
185
+ "prophet": 0.12129274215959333,
186
+ "naive": 0.03212142857142869
187
+ },
188
+ "stacking_mae": {
189
+ "equal": 0.043605583896191145,
190
+ "best_on_cal": 0.03237065534319195,
191
+ "constrained_mae": 0.031424293689945516,
192
+ "constrained_mse": 0.034848071305054344
193
+ },
194
+ "weights": {
195
+ "constrained_mae": {
196
+ "chronos": 0.6699556648170705,
197
+ "arima": 0.251108263144011,
198
+ "prophet": 0.07893607203891846,
199
+ "naive": 6.03983418880819e-19
200
+ },
201
+ "constrained_mse": {
202
+ "chronos": 0.8500735106653095,
203
+ "arima": 0.0,
204
+ "prophet": 0.14992648933469047,
205
+ "naive": 0.0
206
+ }
207
+ },
208
+ "best_single_model": "naive",
209
+ "best_single_mae": 0.03212142857142869,
210
+ "winner_method": "constrained_mae",
211
+ "winner_mae": 0.031424293689945516,
212
+ "constrained_beats_best_single": true
213
+ }
214
+ },
215
+ "wins": {
216
+ "constrained": 2,
217
+ "best_single": 4,
218
+ "equal": 0,
219
+ "naive": 0
220
+ },
221
+ "summary": {
222
+ "total_target_horizon_cells": 6,
223
+ "constrained_stacking_wins": 2,
224
+ "constrained_beats_best_single_cells": 2
225
+ },
226
+ "elapsed_min": 2.2175209800402325
227
  }
FINAL_SUBMIT/receipts/R3_TIMESFM_QUANTILE.json CHANGED
@@ -1,130 +1,130 @@
1
- {
2
- "method": "per-horizon split-conformal wrapper on TimesFM point forecasts",
3
- "comparison": "Chronos-Bolt native quantiles",
4
- "targets": {
5
- "DCOILWTICO": {
6
- "target": "DCOILWTICO",
7
- "n_cal": 20,
8
- "n_test": 20,
9
- "timesfm_conf=0.8": {
10
- "nominal_coverage": 0.8,
11
- "empirical_coverage": 0.7464285714285714,
12
- "mean_width": 11.44973765781948,
13
- "dev_from_nominal": 0.0535714285714286
14
- },
15
- "timesfm_conf=0.9": {
16
- "nominal_coverage": 0.9,
17
- "empirical_coverage": 0.8321428571428573,
18
- "mean_width": 14.322232644217351,
19
- "dev_from_nominal": 0.06785714285714273
20
- },
21
- "timesfm_conf=0.95": {
22
- "nominal_coverage": 0.95,
23
- "empirical_coverage": 0.9,
24
- "mean_width": 17.292571051461362,
25
- "dev_from_nominal": 0.04999999999999993
26
- },
27
- "chronos_native_conf=0.8": {
28
- "nominal_coverage": 0.8,
29
- "empirical_coverage": 0.7107142857142856,
30
- "mean_width": 10.861018967628478,
31
- "dev_from_nominal": 0.08928571428571441
32
- },
33
- "chronos_native_conf=0.9": {
34
- "nominal_coverage": 0.9,
35
- "empirical_coverage": 0.7107142857142856,
36
- "mean_width": 10.861018967628478,
37
- "dev_from_nominal": 0.1892857142857144
38
- },
39
- "chronos_native_conf=0.95": {
40
- "nominal_coverage": 0.95,
41
- "empirical_coverage": 0.7107142857142856,
42
- "mean_width": 10.861018967628478,
43
- "dev_from_nominal": 0.23928571428571432
44
- }
45
- },
46
- "DEXJPUS": {
47
- "target": "DEXJPUS",
48
- "n_cal": 20,
49
- "n_test": 20,
50
- "timesfm_conf=0.8": {
51
- "nominal_coverage": 0.8,
52
- "empirical_coverage": 0.7464285714285714,
53
- "mean_width": 5.831283089773991,
54
- "dev_from_nominal": 0.0535714285714286
55
- },
56
- "timesfm_conf=0.9": {
57
- "nominal_coverage": 0.9,
58
- "empirical_coverage": 0.7928571428571428,
59
- "mean_width": 6.870930001395079,
60
- "dev_from_nominal": 0.1071428571428572
61
- },
62
- "timesfm_conf=0.95": {
63
- "nominal_coverage": 0.95,
64
- "empirical_coverage": 0.8035714285714285,
65
- "mean_width": 7.547866254534036,
66
- "dev_from_nominal": 0.14642857142857146
67
- },
68
- "chronos_native_conf=0.8": {
69
- "nominal_coverage": 0.8,
70
- "empirical_coverage": 0.742857142857143,
71
- "mean_width": 5.904579341411591,
72
- "dev_from_nominal": 0.05714285714285705
73
- },
74
- "chronos_native_conf=0.9": {
75
- "nominal_coverage": 0.9,
76
- "empirical_coverage": 0.742857142857143,
77
- "mean_width": 5.904579341411591,
78
- "dev_from_nominal": 0.15714285714285703
79
- },
80
- "chronos_native_conf=0.95": {
81
- "nominal_coverage": 0.95,
82
- "empirical_coverage": 0.742857142857143,
83
- "mean_width": 5.904579341411591,
84
- "dev_from_nominal": 0.20714285714285696
85
- }
86
- },
87
- "DEXUSEU": {
88
- "target": "DEXUSEU",
89
- "n_cal": 20,
90
- "n_test": 20,
91
- "timesfm_conf=0.8": {
92
- "nominal_coverage": 0.8,
93
- "empirical_coverage": 0.9071428571428573,
94
- "mean_width": 0.06282055849347795,
95
- "dev_from_nominal": 0.1071428571428572
96
- },
97
- "timesfm_conf=0.9": {
98
- "nominal_coverage": 0.9,
99
- "empirical_coverage": 0.9678571428571429,
100
- "mean_width": 0.08470568656921382,
101
- "dev_from_nominal": 0.06785714285714284
102
- },
103
- "timesfm_conf=0.95": {
104
- "nominal_coverage": 0.95,
105
- "empirical_coverage": 0.9821428571428571,
106
- "mean_width": 0.09796196365356444,
107
- "dev_from_nominal": 0.03214285714285714
108
- },
109
- "chronos_native_conf=0.8": {
110
- "nominal_coverage": 0.8,
111
- "empirical_coverage": 0.7357142857142858,
112
- "mean_width": 0.03356509944424033,
113
- "dev_from_nominal": 0.06428571428571428
114
- },
115
- "chronos_native_conf=0.9": {
116
- "nominal_coverage": 0.9,
117
- "empirical_coverage": 0.7357142857142858,
118
- "mean_width": 0.03356509944424033,
119
- "dev_from_nominal": 0.16428571428571426
120
- },
121
- "chronos_native_conf=0.95": {
122
- "nominal_coverage": 0.95,
123
- "empirical_coverage": 0.7357142857142858,
124
- "mean_width": 0.03356509944424033,
125
- "dev_from_nominal": 0.2142857142857142
126
- }
127
- }
128
- },
129
- "elapsed_min": 0.5109713474909464
130
  }
 
1
+ {
2
+ "method": "per-horizon split-conformal wrapper on TimesFM point forecasts",
3
+ "comparison": "Chronos-Bolt native quantiles",
4
+ "targets": {
5
+ "DCOILWTICO": {
6
+ "target": "DCOILWTICO",
7
+ "n_cal": 20,
8
+ "n_test": 20,
9
+ "timesfm_conf=0.8": {
10
+ "nominal_coverage": 0.8,
11
+ "empirical_coverage": 0.7464285714285714,
12
+ "mean_width": 11.44973765781948,
13
+ "dev_from_nominal": 0.0535714285714286
14
+ },
15
+ "timesfm_conf=0.9": {
16
+ "nominal_coverage": 0.9,
17
+ "empirical_coverage": 0.8321428571428573,
18
+ "mean_width": 14.322232644217351,
19
+ "dev_from_nominal": 0.06785714285714273
20
+ },
21
+ "timesfm_conf=0.95": {
22
+ "nominal_coverage": 0.95,
23
+ "empirical_coverage": 0.9,
24
+ "mean_width": 17.292571051461362,
25
+ "dev_from_nominal": 0.04999999999999993
26
+ },
27
+ "chronos_native_conf=0.8": {
28
+ "nominal_coverage": 0.8,
29
+ "empirical_coverage": 0.7107142857142856,
30
+ "mean_width": 10.861018967628478,
31
+ "dev_from_nominal": 0.08928571428571441
32
+ },
33
+ "chronos_native_conf=0.9": {
34
+ "nominal_coverage": 0.9,
35
+ "empirical_coverage": 0.7107142857142856,
36
+ "mean_width": 10.861018967628478,
37
+ "dev_from_nominal": 0.1892857142857144
38
+ },
39
+ "chronos_native_conf=0.95": {
40
+ "nominal_coverage": 0.95,
41
+ "empirical_coverage": 0.7107142857142856,
42
+ "mean_width": 10.861018967628478,
43
+ "dev_from_nominal": 0.23928571428571432
44
+ }
45
+ },
46
+ "DEXJPUS": {
47
+ "target": "DEXJPUS",
48
+ "n_cal": 20,
49
+ "n_test": 20,
50
+ "timesfm_conf=0.8": {
51
+ "nominal_coverage": 0.8,
52
+ "empirical_coverage": 0.7464285714285714,
53
+ "mean_width": 5.831283089773991,
54
+ "dev_from_nominal": 0.0535714285714286
55
+ },
56
+ "timesfm_conf=0.9": {
57
+ "nominal_coverage": 0.9,
58
+ "empirical_coverage": 0.7928571428571428,
59
+ "mean_width": 6.870930001395079,
60
+ "dev_from_nominal": 0.1071428571428572
61
+ },
62
+ "timesfm_conf=0.95": {
63
+ "nominal_coverage": 0.95,
64
+ "empirical_coverage": 0.8035714285714285,
65
+ "mean_width": 7.547866254534036,
66
+ "dev_from_nominal": 0.14642857142857146
67
+ },
68
+ "chronos_native_conf=0.8": {
69
+ "nominal_coverage": 0.8,
70
+ "empirical_coverage": 0.742857142857143,
71
+ "mean_width": 5.904579341411591,
72
+ "dev_from_nominal": 0.05714285714285705
73
+ },
74
+ "chronos_native_conf=0.9": {
75
+ "nominal_coverage": 0.9,
76
+ "empirical_coverage": 0.742857142857143,
77
+ "mean_width": 5.904579341411591,
78
+ "dev_from_nominal": 0.15714285714285703
79
+ },
80
+ "chronos_native_conf=0.95": {
81
+ "nominal_coverage": 0.95,
82
+ "empirical_coverage": 0.742857142857143,
83
+ "mean_width": 5.904579341411591,
84
+ "dev_from_nominal": 0.20714285714285696
85
+ }
86
+ },
87
+ "DEXUSEU": {
88
+ "target": "DEXUSEU",
89
+ "n_cal": 20,
90
+ "n_test": 20,
91
+ "timesfm_conf=0.8": {
92
+ "nominal_coverage": 0.8,
93
+ "empirical_coverage": 0.9071428571428573,
94
+ "mean_width": 0.06282055849347795,
95
+ "dev_from_nominal": 0.1071428571428572
96
+ },
97
+ "timesfm_conf=0.9": {
98
+ "nominal_coverage": 0.9,
99
+ "empirical_coverage": 0.9678571428571429,
100
+ "mean_width": 0.08470568656921382,
101
+ "dev_from_nominal": 0.06785714285714284
102
+ },
103
+ "timesfm_conf=0.95": {
104
+ "nominal_coverage": 0.95,
105
+ "empirical_coverage": 0.9821428571428571,
106
+ "mean_width": 0.09796196365356444,
107
+ "dev_from_nominal": 0.03214285714285714
108
+ },
109
+ "chronos_native_conf=0.8": {
110
+ "nominal_coverage": 0.8,
111
+ "empirical_coverage": 0.7357142857142858,
112
+ "mean_width": 0.03356509944424033,
113
+ "dev_from_nominal": 0.06428571428571428
114
+ },
115
+ "chronos_native_conf=0.9": {
116
+ "nominal_coverage": 0.9,
117
+ "empirical_coverage": 0.7357142857142858,
118
+ "mean_width": 0.03356509944424033,
119
+ "dev_from_nominal": 0.16428571428571426
120
+ },
121
+ "chronos_native_conf=0.95": {
122
+ "nominal_coverage": 0.95,
123
+ "empirical_coverage": 0.7357142857142858,
124
+ "mean_width": 0.03356509944424033,
125
+ "dev_from_nominal": 0.2142857142857142
126
+ }
127
+ }
128
+ },
129
+ "elapsed_min": 0.5109713474909464
130
  }
FINAL_SUBMIT/receipts/R4_DANGEROUS_V2.json CHANGED
The diff for this file is too large to render. See raw diff
 
FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_ABLATION.json CHANGED
@@ -1,397 +1,397 @@
1
- {
2
- "description": "R4 ablation: DeepSeek-R1-Q4 reassigned to devil's-advocate (consulted, not voting). Primary consensus = Qwen-14B + Mistral-Nemo.",
3
- "primary_judges": [
4
- "qwen25-14b-local",
5
- "mistral-nemo-local"
6
- ],
7
- "devils_advocate": "deepseek-r1-local-q4",
8
- "n_scenarios": 26,
9
- "agreement_primary_panel": {
10
- "krippendorff_alpha_ordinal": 0.7499056959637873,
11
- "cohen_weighted_kappa_qwen_vs_mistral": 0.7473841554559043
12
- },
13
- "accuracy_vs_ground_truth": {
14
- "primary_majority_vote": {
15
- "correct": 16,
16
- "total": 26,
17
- "accuracy": 0.6153846153846154
18
- },
19
- "three_judge_majority_vote_ORIGINAL": {
20
- "correct": 18,
21
- "total": 26,
22
- "accuracy": 0.6923076923076923
23
- },
24
- "devils_advocate_deepseek": {
25
- "correct": 8,
26
- "total": 26,
27
- "accuracy": 0.3076923076923077
28
- }
29
- },
30
- "confusion_matrix_primary": [
31
- [
32
- 7,
33
- 0,
34
- 0,
35
- 0
36
- ],
37
- [
38
- 2,
39
- 5,
40
- 0,
41
- 0
42
- ],
43
- [
44
- 0,
45
- 5,
46
- 3,
47
- 1
48
- ],
49
- [
50
- 0,
51
- 0,
52
- 2,
53
- 1
54
- ]
55
- ],
56
- "confusion_matrix_three_judge_ORIGINAL": [
57
- [
58
- 7,
59
- 0,
60
- 0,
61
- 0
62
- ],
63
- [
64
- 2,
65
- 3,
66
- 2,
67
- 0
68
- ],
69
- [
70
- 0,
71
- 2,
72
- 7,
73
- 0
74
- ],
75
- [
76
- 0,
77
- 0,
78
- 2,
79
- 1
80
- ]
81
- ],
82
- "calibration_ece_primary": 0.2894230769230769,
83
- "per_scenario": {
84
- "2011_T\u014dhoku_earthquake_and_tsunami": {
85
- "ground_truth": "CRITICAL",
86
- "primary_panel_ratings": [
87
- 4,
88
- 4
89
- ],
90
- "primary_majority": "CRITICAL",
91
- "devil_rating": "HIGH",
92
- "three_judge_majority": "CRITICAL",
93
- "primary_correct": true,
94
- "devil_correct": false
95
- },
96
- "2020\u20132023_global_chip_shortage": {
97
- "ground_truth": "CRITICAL",
98
- "primary_panel_ratings": [
99
- 3,
100
- 3
101
- ],
102
- "primary_majority": "HIGH",
103
- "devil_rating": "CRITICAL",
104
- "three_judge_majority": "HIGH",
105
- "primary_correct": false,
106
- "devil_correct": true
107
- },
108
- "2021_Suez_Canal_obstruction": {
109
- "ground_truth": "HIGH",
110
- "primary_panel_ratings": [
111
- 3,
112
- 3
113
- ],
114
- "primary_majority": "HIGH",
115
- "devil_rating": "HIGH",
116
- "three_judge_majority": "HIGH",
117
- "primary_correct": true,
118
- "devil_correct": true
119
- },
120
- "Bab-el-Mandeb": {
121
- "ground_truth": "HIGH",
122
- "primary_panel_ratings": [
123
- 2,
124
- 1
125
- ],
126
- "primary_majority": "MEDIUM",
127
- "devil_rating": "HIGH",
128
- "three_judge_majority": "MEDIUM",
129
- "primary_correct": false,
130
- "devil_correct": true
131
- },
132
- "Baltic_Dry_Index": {
133
- "ground_truth": "LOW",
134
- "primary_panel_ratings": [
135
- 1,
136
- 1
137
- ],
138
- "primary_majority": "LOW",
139
- "devil_rating": "HIGH",
140
- "three_judge_majority": "LOW",
141
- "primary_correct": true,
142
- "devil_correct": false
143
- },
144
- "Bullwhip_effect": {
145
- "ground_truth": "MEDIUM",
146
- "primary_panel_ratings": [
147
- 1,
148
- 1
149
- ],
150
- "primary_majority": "LOW",
151
- "devil_rating": "HIGH",
152
- "three_judge_majority": "LOW",
153
- "primary_correct": false,
154
- "devil_correct": false
155
- },
156
- "CHIPS_and_Science_Act": {
157
- "ground_truth": "MEDIUM",
158
- "primary_panel_ratings": [
159
- 1,
160
- 2
161
- ],
162
- "primary_majority": "MEDIUM",
163
- "devil_rating": "HIGH",
164
- "three_judge_majority": "MEDIUM",
165
- "primary_correct": true,
166
- "devil_correct": false
167
- },
168
- "Container_ship": {
169
- "ground_truth": "LOW",
170
- "primary_panel_ratings": [
171
- 1,
172
- 1
173
- ],
174
- "primary_majority": "LOW",
175
- "devil_rating": "HIGH",
176
- "three_judge_majority": "LOW",
177
- "primary_correct": true,
178
- "devil_correct": false
179
- },
180
- "Enterprise_resource_planning": {
181
- "ground_truth": "LOW",
182
- "primary_panel_ratings": [
183
- 1,
184
- 1
185
- ],
186
- "primary_majority": "LOW",
187
- "devil_rating": "MEDIUM",
188
- "three_judge_majority": "LOW",
189
- "primary_correct": true,
190
- "devil_correct": false
191
- },
192
- "Ever_Given": {
193
- "ground_truth": "HIGH",
194
- "primary_panel_ratings": [
195
- 2,
196
- 3
197
- ],
198
- "primary_majority": "MEDIUM",
199
- "devil_rating": "HIGH",
200
- "three_judge_majority": "HIGH",
201
- "primary_correct": false,
202
- "devil_correct": true
203
- },
204
- "Foxconn": {
205
- "ground_truth": "MEDIUM",
206
- "primary_panel_ratings": [
207
- 3,
208
- 2
209
- ],
210
- "primary_majority": "MEDIUM",
211
- "devil_rating": "HIGH",
212
- "three_judge_majority": "HIGH",
213
- "primary_correct": true,
214
- "devil_correct": false
215
- },
216
- "Inventory": {
217
- "ground_truth": "LOW",
218
- "primary_panel_ratings": [
219
- 1,
220
- 1
221
- ],
222
- "primary_majority": "LOW",
223
- "devil_rating": "HIGH",
224
- "three_judge_majority": "LOW",
225
- "primary_correct": true,
226
- "devil_correct": false
227
- },
228
- "Just-in-time_manufacturing": {
229
- "ground_truth": "MEDIUM",
230
- "primary_panel_ratings": [
231
- 1,
232
- 1
233
- ],
234
- "primary_majority": "LOW",
235
- "devil_rating": "HIGH",
236
- "three_judge_majority": "LOW",
237
- "primary_correct": false,
238
- "devil_correct": false
239
- },
240
- "Logistics": {
241
- "ground_truth": "LOW",
242
- "primary_panel_ratings": [
243
- 1,
244
- 1
245
- ],
246
- "primary_majority": "LOW",
247
- "devil_rating": "HIGH",
248
- "three_judge_majority": "LOW",
249
- "primary_correct": true,
250
- "devil_correct": false
251
- },
252
- "Port_of_Los_Angeles": {
253
- "ground_truth": "MEDIUM",
254
- "primary_panel_ratings": [
255
- 2,
256
- 2
257
- ],
258
- "primary_majority": "MEDIUM",
259
- "devil_rating": "HIGH",
260
- "three_judge_majority": "MEDIUM",
261
- "primary_correct": true,
262
- "devil_correct": false
263
- },
264
- "Port_of_Singapore": {
265
- "ground_truth": "MEDIUM",
266
- "primary_panel_ratings": [
267
- 3,
268
- 2
269
- ],
270
- "primary_majority": "MEDIUM",
271
- "devil_rating": "HIGH",
272
- "three_judge_majority": "HIGH",
273
- "primary_correct": true,
274
- "devil_correct": false
275
- },
276
- "Red_Sea_crisis": {
277
- "ground_truth": "CRITICAL",
278
- "primary_panel_ratings": [
279
- 3,
280
- 3
281
- ],
282
- "primary_majority": "HIGH",
283
- "devil_rating": "CRITICAL",
284
- "three_judge_majority": "HIGH",
285
- "primary_correct": false,
286
- "devil_correct": true
287
- },
288
- "Samsung_Electronics": {
289
- "ground_truth": "MEDIUM",
290
- "primary_panel_ratings": [
291
- 2,
292
- 1
293
- ],
294
- "primary_majority": "MEDIUM",
295
- "devil_rating": "HIGH",
296
- "three_judge_majority": "MEDIUM",
297
- "primary_correct": true,
298
- "devil_correct": false
299
- },
300
- "Semiconductor_industry": {
301
- "ground_truth": "HIGH",
302
- "primary_panel_ratings": [
303
- 2,
304
- 1
305
- ],
306
- "primary_majority": "MEDIUM",
307
- "devil_rating": "CRITICAL",
308
- "three_judge_majority": "MEDIUM",
309
- "primary_correct": false,
310
- "devil_correct": false
311
- },
312
- "Strait_of_Hormuz": {
313
- "ground_truth": "HIGH",
314
- "primary_panel_ratings": [
315
- 4,
316
- 3
317
- ],
318
- "primary_majority": "CRITICAL",
319
- "devil_rating": "HIGH",
320
- "three_judge_majority": "HIGH",
321
- "primary_correct": false,
322
- "devil_correct": true
323
- },
324
- "Strait_of_Malacca": {
325
- "ground_truth": "HIGH",
326
- "primary_panel_ratings": [
327
- 3,
328
- 3
329
- ],
330
- "primary_majority": "HIGH",
331
- "devil_rating": "HIGH",
332
- "three_judge_majority": "HIGH",
333
- "primary_correct": true,
334
- "devil_correct": true
335
- },
336
- "Suez_Canal": {
337
- "ground_truth": "HIGH",
338
- "primary_panel_ratings": [
339
- 3,
340
- 1
341
- ],
342
- "primary_majority": "MEDIUM",
343
- "devil_rating": "CRITICAL",
344
- "three_judge_majority": "HIGH",
345
- "primary_correct": false,
346
- "devil_correct": false
347
- },
348
- "Supply_chain_attack": {
349
- "ground_truth": "HIGH",
350
- "primary_panel_ratings": [
351
- 2,
352
- 3
353
- ],
354
- "primary_majority": "MEDIUM",
355
- "devil_rating": "CRITICAL",
356
- "three_judge_majority": "HIGH",
357
- "primary_correct": false,
358
- "devil_correct": false
359
- },
360
- "Supply_chain_management": {
361
- "ground_truth": "LOW",
362
- "primary_panel_ratings": [
363
- 1,
364
- 1
365
- ],
366
- "primary_majority": "LOW",
367
- "devil_rating": "HIGH",
368
- "three_judge_majority": "LOW",
369
- "primary_correct": true,
370
- "devil_correct": false
371
- },
372
- "TSMC": {
373
- "ground_truth": "HIGH",
374
- "primary_panel_ratings": [
375
- 3,
376
- 3
377
- ],
378
- "primary_majority": "HIGH",
379
- "devil_rating": "HIGH",
380
- "three_judge_majority": "HIGH",
381
- "primary_correct": true,
382
- "devil_correct": true
383
- },
384
- "Warehouse": {
385
- "ground_truth": "LOW",
386
- "primary_panel_ratings": [
387
- 1,
388
- 1
389
- ],
390
- "primary_majority": "LOW",
391
- "devil_rating": "MEDIUM",
392
- "three_judge_majority": "LOW",
393
- "primary_correct": true,
394
- "devil_correct": false
395
- }
396
- }
397
  }
 
1
+ {
2
+ "description": "R4 ablation: DeepSeek-R1-Q4 reassigned to devil's-advocate (consulted, not voting). Primary consensus = Qwen-14B + Mistral-Nemo.",
3
+ "primary_judges": [
4
+ "qwen25-14b-local",
5
+ "mistral-nemo-local"
6
+ ],
7
+ "devils_advocate": "deepseek-r1-local-q4",
8
+ "n_scenarios": 26,
9
+ "agreement_primary_panel": {
10
+ "krippendorff_alpha_ordinal": 0.7499056959637873,
11
+ "cohen_weighted_kappa_qwen_vs_mistral": 0.7473841554559043
12
+ },
13
+ "accuracy_vs_ground_truth": {
14
+ "primary_majority_vote": {
15
+ "correct": 16,
16
+ "total": 26,
17
+ "accuracy": 0.6153846153846154
18
+ },
19
+ "three_judge_majority_vote_ORIGINAL": {
20
+ "correct": 18,
21
+ "total": 26,
22
+ "accuracy": 0.6923076923076923
23
+ },
24
+ "devils_advocate_deepseek": {
25
+ "correct": 8,
26
+ "total": 26,
27
+ "accuracy": 0.3076923076923077
28
+ }
29
+ },
30
+ "confusion_matrix_primary": [
31
+ [
32
+ 7,
33
+ 0,
34
+ 0,
35
+ 0
36
+ ],
37
+ [
38
+ 2,
39
+ 5,
40
+ 0,
41
+ 0
42
+ ],
43
+ [
44
+ 0,
45
+ 5,
46
+ 3,
47
+ 1
48
+ ],
49
+ [
50
+ 0,
51
+ 0,
52
+ 2,
53
+ 1
54
+ ]
55
+ ],
56
+ "confusion_matrix_three_judge_ORIGINAL": [
57
+ [
58
+ 7,
59
+ 0,
60
+ 0,
61
+ 0
62
+ ],
63
+ [
64
+ 2,
65
+ 3,
66
+ 2,
67
+ 0
68
+ ],
69
+ [
70
+ 0,
71
+ 2,
72
+ 7,
73
+ 0
74
+ ],
75
+ [
76
+ 0,
77
+ 0,
78
+ 2,
79
+ 1
80
+ ]
81
+ ],
82
+ "calibration_ece_primary": 0.2894230769230769,
83
+ "per_scenario": {
84
+ "2011_T\u014dhoku_earthquake_and_tsunami": {
85
+ "ground_truth": "CRITICAL",
86
+ "primary_panel_ratings": [
87
+ 4,
88
+ 4
89
+ ],
90
+ "primary_majority": "CRITICAL",
91
+ "devil_rating": "HIGH",
92
+ "three_judge_majority": "CRITICAL",
93
+ "primary_correct": true,
94
+ "devil_correct": false
95
+ },
96
+ "2020\u20132023_global_chip_shortage": {
97
+ "ground_truth": "CRITICAL",
98
+ "primary_panel_ratings": [
99
+ 3,
100
+ 3
101
+ ],
102
+ "primary_majority": "HIGH",
103
+ "devil_rating": "CRITICAL",
104
+ "three_judge_majority": "HIGH",
105
+ "primary_correct": false,
106
+ "devil_correct": true
107
+ },
108
+ "2021_Suez_Canal_obstruction": {
109
+ "ground_truth": "HIGH",
110
+ "primary_panel_ratings": [
111
+ 3,
112
+ 3
113
+ ],
114
+ "primary_majority": "HIGH",
115
+ "devil_rating": "HIGH",
116
+ "three_judge_majority": "HIGH",
117
+ "primary_correct": true,
118
+ "devil_correct": true
119
+ },
120
+ "Bab-el-Mandeb": {
121
+ "ground_truth": "HIGH",
122
+ "primary_panel_ratings": [
123
+ 2,
124
+ 1
125
+ ],
126
+ "primary_majority": "MEDIUM",
127
+ "devil_rating": "HIGH",
128
+ "three_judge_majority": "MEDIUM",
129
+ "primary_correct": false,
130
+ "devil_correct": true
131
+ },
132
+ "Baltic_Dry_Index": {
133
+ "ground_truth": "LOW",
134
+ "primary_panel_ratings": [
135
+ 1,
136
+ 1
137
+ ],
138
+ "primary_majority": "LOW",
139
+ "devil_rating": "HIGH",
140
+ "three_judge_majority": "LOW",
141
+ "primary_correct": true,
142
+ "devil_correct": false
143
+ },
144
+ "Bullwhip_effect": {
145
+ "ground_truth": "MEDIUM",
146
+ "primary_panel_ratings": [
147
+ 1,
148
+ 1
149
+ ],
150
+ "primary_majority": "LOW",
151
+ "devil_rating": "HIGH",
152
+ "three_judge_majority": "LOW",
153
+ "primary_correct": false,
154
+ "devil_correct": false
155
+ },
156
+ "CHIPS_and_Science_Act": {
157
+ "ground_truth": "MEDIUM",
158
+ "primary_panel_ratings": [
159
+ 1,
160
+ 2
161
+ ],
162
+ "primary_majority": "MEDIUM",
163
+ "devil_rating": "HIGH",
164
+ "three_judge_majority": "MEDIUM",
165
+ "primary_correct": true,
166
+ "devil_correct": false
167
+ },
168
+ "Container_ship": {
169
+ "ground_truth": "LOW",
170
+ "primary_panel_ratings": [
171
+ 1,
172
+ 1
173
+ ],
174
+ "primary_majority": "LOW",
175
+ "devil_rating": "HIGH",
176
+ "three_judge_majority": "LOW",
177
+ "primary_correct": true,
178
+ "devil_correct": false
179
+ },
180
+ "Enterprise_resource_planning": {
181
+ "ground_truth": "LOW",
182
+ "primary_panel_ratings": [
183
+ 1,
184
+ 1
185
+ ],
186
+ "primary_majority": "LOW",
187
+ "devil_rating": "MEDIUM",
188
+ "three_judge_majority": "LOW",
189
+ "primary_correct": true,
190
+ "devil_correct": false
191
+ },
192
+ "Ever_Given": {
193
+ "ground_truth": "HIGH",
194
+ "primary_panel_ratings": [
195
+ 2,
196
+ 3
197
+ ],
198
+ "primary_majority": "MEDIUM",
199
+ "devil_rating": "HIGH",
200
+ "three_judge_majority": "HIGH",
201
+ "primary_correct": false,
202
+ "devil_correct": true
203
+ },
204
+ "Foxconn": {
205
+ "ground_truth": "MEDIUM",
206
+ "primary_panel_ratings": [
207
+ 3,
208
+ 2
209
+ ],
210
+ "primary_majority": "MEDIUM",
211
+ "devil_rating": "HIGH",
212
+ "three_judge_majority": "HIGH",
213
+ "primary_correct": true,
214
+ "devil_correct": false
215
+ },
216
+ "Inventory": {
217
+ "ground_truth": "LOW",
218
+ "primary_panel_ratings": [
219
+ 1,
220
+ 1
221
+ ],
222
+ "primary_majority": "LOW",
223
+ "devil_rating": "HIGH",
224
+ "three_judge_majority": "LOW",
225
+ "primary_correct": true,
226
+ "devil_correct": false
227
+ },
228
+ "Just-in-time_manufacturing": {
229
+ "ground_truth": "MEDIUM",
230
+ "primary_panel_ratings": [
231
+ 1,
232
+ 1
233
+ ],
234
+ "primary_majority": "LOW",
235
+ "devil_rating": "HIGH",
236
+ "three_judge_majority": "LOW",
237
+ "primary_correct": false,
238
+ "devil_correct": false
239
+ },
240
+ "Logistics": {
241
+ "ground_truth": "LOW",
242
+ "primary_panel_ratings": [
243
+ 1,
244
+ 1
245
+ ],
246
+ "primary_majority": "LOW",
247
+ "devil_rating": "HIGH",
248
+ "three_judge_majority": "LOW",
249
+ "primary_correct": true,
250
+ "devil_correct": false
251
+ },
252
+ "Port_of_Los_Angeles": {
253
+ "ground_truth": "MEDIUM",
254
+ "primary_panel_ratings": [
255
+ 2,
256
+ 2
257
+ ],
258
+ "primary_majority": "MEDIUM",
259
+ "devil_rating": "HIGH",
260
+ "three_judge_majority": "MEDIUM",
261
+ "primary_correct": true,
262
+ "devil_correct": false
263
+ },
264
+ "Port_of_Singapore": {
265
+ "ground_truth": "MEDIUM",
266
+ "primary_panel_ratings": [
267
+ 3,
268
+ 2
269
+ ],
270
+ "primary_majority": "MEDIUM",
271
+ "devil_rating": "HIGH",
272
+ "three_judge_majority": "HIGH",
273
+ "primary_correct": true,
274
+ "devil_correct": false
275
+ },
276
+ "Red_Sea_crisis": {
277
+ "ground_truth": "CRITICAL",
278
+ "primary_panel_ratings": [
279
+ 3,
280
+ 3
281
+ ],
282
+ "primary_majority": "HIGH",
283
+ "devil_rating": "CRITICAL",
284
+ "three_judge_majority": "HIGH",
285
+ "primary_correct": false,
286
+ "devil_correct": true
287
+ },
288
+ "Samsung_Electronics": {
289
+ "ground_truth": "MEDIUM",
290
+ "primary_panel_ratings": [
291
+ 2,
292
+ 1
293
+ ],
294
+ "primary_majority": "MEDIUM",
295
+ "devil_rating": "HIGH",
296
+ "three_judge_majority": "MEDIUM",
297
+ "primary_correct": true,
298
+ "devil_correct": false
299
+ },
300
+ "Semiconductor_industry": {
301
+ "ground_truth": "HIGH",
302
+ "primary_panel_ratings": [
303
+ 2,
304
+ 1
305
+ ],
306
+ "primary_majority": "MEDIUM",
307
+ "devil_rating": "CRITICAL",
308
+ "three_judge_majority": "MEDIUM",
309
+ "primary_correct": false,
310
+ "devil_correct": false
311
+ },
312
+ "Strait_of_Hormuz": {
313
+ "ground_truth": "HIGH",
314
+ "primary_panel_ratings": [
315
+ 4,
316
+ 3
317
+ ],
318
+ "primary_majority": "CRITICAL",
319
+ "devil_rating": "HIGH",
320
+ "three_judge_majority": "HIGH",
321
+ "primary_correct": false,
322
+ "devil_correct": true
323
+ },
324
+ "Strait_of_Malacca": {
325
+ "ground_truth": "HIGH",
326
+ "primary_panel_ratings": [
327
+ 3,
328
+ 3
329
+ ],
330
+ "primary_majority": "HIGH",
331
+ "devil_rating": "HIGH",
332
+ "three_judge_majority": "HIGH",
333
+ "primary_correct": true,
334
+ "devil_correct": true
335
+ },
336
+ "Suez_Canal": {
337
+ "ground_truth": "HIGH",
338
+ "primary_panel_ratings": [
339
+ 3,
340
+ 1
341
+ ],
342
+ "primary_majority": "MEDIUM",
343
+ "devil_rating": "CRITICAL",
344
+ "three_judge_majority": "HIGH",
345
+ "primary_correct": false,
346
+ "devil_correct": false
347
+ },
348
+ "Supply_chain_attack": {
349
+ "ground_truth": "HIGH",
350
+ "primary_panel_ratings": [
351
+ 2,
352
+ 3
353
+ ],
354
+ "primary_majority": "MEDIUM",
355
+ "devil_rating": "CRITICAL",
356
+ "three_judge_majority": "HIGH",
357
+ "primary_correct": false,
358
+ "devil_correct": false
359
+ },
360
+ "Supply_chain_management": {
361
+ "ground_truth": "LOW",
362
+ "primary_panel_ratings": [
363
+ 1,
364
+ 1
365
+ ],
366
+ "primary_majority": "LOW",
367
+ "devil_rating": "HIGH",
368
+ "three_judge_majority": "LOW",
369
+ "primary_correct": true,
370
+ "devil_correct": false
371
+ },
372
+ "TSMC": {
373
+ "ground_truth": "HIGH",
374
+ "primary_panel_ratings": [
375
+ 3,
376
+ 3
377
+ ],
378
+ "primary_majority": "HIGH",
379
+ "devil_rating": "HIGH",
380
+ "three_judge_majority": "HIGH",
381
+ "primary_correct": true,
382
+ "devil_correct": true
383
+ },
384
+ "Warehouse": {
385
+ "ground_truth": "LOW",
386
+ "primary_panel_ratings": [
387
+ 1,
388
+ 1
389
+ ],
390
+ "primary_majority": "LOW",
391
+ "devil_rating": "MEDIUM",
392
+ "three_judge_majority": "LOW",
393
+ "primary_correct": true,
394
+ "devil_correct": false
395
+ }
396
+ }
397
  }
FINAL_SUBMIT/receipts/R4_DANGEROUS_V2_LIVE.json CHANGED
@@ -1,64 +1,64 @@
1
- {
2
- "scenario": "Red_Sea_crisis",
3
- "ground_truth": "CRITICAL",
4
- "per_judge": {
5
- "qwen25-14b-local": {
6
- "risk_level": "CRITICAL",
7
- "parsed": {
8
- "risk_level": "CRITICAL",
9
- "confidence": 0.9,
10
- "primary_vulnerabilities": [
11
- "Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.",
12
- "Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.",
13
- "Disruption of global trade routes through the Suez Canal."
14
- ],
15
- "mitigations": [
16
- "Enhanced naval patrols by international coalitions to protect shipping lanes.",
17
- "Development of alternative shipping routes or rerouting strategies for high-risk areas.",
18
- "Strengthened diplomatic efforts to negotiate a long-term ceasefire and resolution."
19
- ],
20
- "reasoning_one_line": "Resumption of Houthi attacks on Israel amid the Iran war poses severe threats to global maritime security and trade stability.",
21
- "time_sensitivity": "VOLATILE"
22
- },
23
- "correct": true,
24
- "latency_s": 21.539926052093506,
25
- "raw_preview": "{\n \"risk_level\": \"CRITICAL\",\n \"confidence\": 0.9,\n \"primary_vulnerabilities\": [\n \"Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.\",\n \"Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.\",\n \"Disruption of global trade routes through the Suez Canal.\"\n ],\n \"mitigations\": [\n \"Enhanced naval patrols by i"
26
- },
27
- "mistral-nemo-local": {
28
- "risk_level": "HIGH",
29
- "parsed": {
30
- "risk_level": "HIGH",
31
- "confidence": 0.95,
32
- "primary_vulnerabilities": [
33
- "Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade",
34
- "Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)",
35
- "Potential targeting of international merchant vessels, regardless of affiliation"
36
- ],
37
- "mitigations": [
38
- "Strengthening maritime security cooperation and escort missions in the Red Sea (Operation Prosperity Guardian)",
39
- "Diplomatic efforts to broker a ceasefire and political solution in Yemen",
40
- "Diversion of shipping routes around South Africa for vessels at high risk"
41
- ],
42
- "reasoning_one_line": "Resumption of Houthi attacks amidst regional conflict escalation poses significant risk to maritime traffic and global trade.",
43
- "time_sensitivity": "VOLATILE"
44
- },
45
- "correct": false,
46
- "latency_s": 25.940397024154663,
47
- "raw_preview": "{\n \"risk_level\": \"HIGH\",\n \"confidence\": 0.95,\n \"primary_vulnerabilities\": [\n \"Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade\",\n \"Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)\",\n \"Potential targeting of international merchant vessels, regardless of affiliation\"\n ],\n \"mitigations\": [\n \"Str"
48
- }
49
- },
50
- "devil": {
51
- "model": "deepseek-r1-local-q4",
52
- "risk_level": "PARSE_FAIL",
53
- "correct": false,
54
- "latency_s": 30.340745210647583,
55
- "raw_preview": " Include the following sections:\n- Current Situation: Description of the conflict's current state.\n- Threats: List and describe each threat type (e.g., direct, indirect).\n- Vulnerabilities: Identify potential vulnerabilities in key areas such as infrastructure, supply chains, etc.\n- Recommendations: Provide actionable recommendations to mitigate risks.\n\nPlease make sure that your JSON is properly "
56
- },
57
- "summary": {
58
- "primary_panel_all_correct": false,
59
- "primary_correct_count": "1/2",
60
- "three_judge_correct_count": "1/3",
61
- "consensus_primary": "CRITICAL",
62
- "ground_truth": "CRITICAL"
63
- }
64
  }
 
1
+ {
2
+ "scenario": "Red_Sea_crisis",
3
+ "ground_truth": "CRITICAL",
4
+ "per_judge": {
5
+ "qwen25-14b-local": {
6
+ "risk_level": "CRITICAL",
7
+ "parsed": {
8
+ "risk_level": "CRITICAL",
9
+ "confidence": 0.9,
10
+ "primary_vulnerabilities": [
11
+ "Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.",
12
+ "Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.",
13
+ "Disruption of global trade routes through the Suez Canal."
14
+ ],
15
+ "mitigations": [
16
+ "Enhanced naval patrols by international coalitions to protect shipping lanes.",
17
+ "Development of alternative shipping routes or rerouting strategies for high-risk areas.",
18
+ "Strengthened diplomatic efforts to negotiate a long-term ceasefire and resolution."
19
+ ],
20
+ "reasoning_one_line": "Resumption of Houthi attacks on Israel amid the Iran war poses severe threats to global maritime security and trade stability.",
21
+ "time_sensitivity": "VOLATILE"
22
+ },
23
+ "correct": true,
24
+ "latency_s": 21.539926052093506,
25
+ "raw_preview": "{\n \"risk_level\": \"CRITICAL\",\n \"confidence\": 0.9,\n \"primary_vulnerabilities\": [\n \"Increased threat to maritime shipping in the Red Sea and Bab-el-Mandeb Strait.\",\n \"Potential for further escalation involving multiple countries including Iran, Israel, US, UK, and Yemen.\",\n \"Disruption of global trade routes through the Suez Canal.\"\n ],\n \"mitigations\": [\n \"Enhanced naval patrols by i"
26
+ },
27
+ "mistral-nemo-local": {
28
+ "risk_level": "HIGH",
29
+ "parsed": {
30
+ "risk_level": "HIGH",
31
+ "confidence": 0.95,
32
+ "primary_vulnerabilities": [
33
+ "Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade",
34
+ "Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)",
35
+ "Potential targeting of international merchant vessels, regardless of affiliation"
36
+ ],
37
+ "mitigations": [
38
+ "Strengthening maritime security cooperation and escort missions in the Red Sea (Operation Prosperity Guardian)",
39
+ "Diplomatic efforts to broker a ceasefire and political solution in Yemen",
40
+ "Diversion of shipping routes around South Africa for vessels at high risk"
41
+ ],
42
+ "reasoning_one_line": "Resumption of Houthi attacks amidst regional conflict escalation poses significant risk to maritime traffic and global trade.",
43
+ "time_sensitivity": "VOLATILE"
44
+ },
45
+ "correct": false,
46
+ "latency_s": 25.940397024154663,
47
+ "raw_preview": "{\n \"risk_level\": \"HIGH\",\n \"confidence\": 0.95,\n \"primary_vulnerabilities\": [\n \"Maritime traffic disruption in the Red Sea and Bab-el-Mandeb strait, impacting global trade\",\n \"Escalation of conflict with potential for wider regional involvement (e.g., Iran, Saudi Arabia)\",\n \"Potential targeting of international merchant vessels, regardless of affiliation\"\n ],\n \"mitigations\": [\n \"Str"
48
+ }
49
+ },
50
+ "devil": {
51
+ "model": "deepseek-r1-local-q4",
52
+ "risk_level": "PARSE_FAIL",
53
+ "correct": false,
54
+ "latency_s": 30.340745210647583,
55
+ "raw_preview": " Include the following sections:\n- Current Situation: Description of the conflict's current state.\n- Threats: List and describe each threat type (e.g., direct, indirect).\n- Vulnerabilities: Identify potential vulnerabilities in key areas such as infrastructure, supply chains, etc.\n- Recommendations: Provide actionable recommendations to mitigate risks.\n\nPlease make sure that your JSON is properly "
56
+ },
57
+ "summary": {
58
+ "primary_panel_all_correct": false,
59
+ "primary_correct_count": "1/2",
60
+ "three_judge_correct_count": "1/3",
61
+ "consensus_primary": "CRITICAL",
62
+ "ground_truth": "CRITICAL"
63
+ }
64
  }
FINAL_SUBMIT/receipts/R4_FRONTIER_PANEL_V2.json CHANGED
The diff for this file is too large to render. See raw diff
 
FINAL_SUBMIT/receipts/R5_BEIR_MANUAL.json CHANGED
@@ -1,1023 +1,1023 @@
1
- {
2
- "task": "SupplyMind-crisis-retrieval-BEIR-style",
3
- "task_description": "Manual BEIR-style retrieval eval on 26 Wikipedia crisis articles + 20 real supply-chain queries. Metrics match the public MTEB retrieval leaderboard (nDCG@10, R@10, P@10). This is an out-of-domain task (supply chain, not medical), but numbers provide a directional check that our embedders are consistent with their published leaderboard performance.",
4
- "our_results": {
5
- "mxbai-embed-large-v1": {
6
- "embedder": "mxbai-embed-large-v1",
7
- "mean_ndcg@10": 0.9597824382702198,
8
- "mean_recall@10": 1.0,
9
- "mean_precision@10": 0.12000000000000002,
10
- "corpus_encoding_s": 12.996914148330688,
11
- "n_queries": 20,
12
- "per_query": {
13
- "q1": {
14
- "query": "What was the magnitude of the 2011 Tohoku earthquake?",
15
- "gold": [
16
- "2011_T\u014dhoku_earthquake_and_tsunami"
17
- ],
18
- "top5": [
19
- "2011_T\u014dhoku_earthquake_and_tsunami",
20
- "Ever_Given",
21
- "2020\u20132023_global_chip_shortage",
22
- "Container_ship",
23
- "Warehouse"
24
- ],
25
- "ndcg@10": 1.0,
26
- "recall@10": 1.0,
27
- "precision@10": 0.1
28
- },
29
- "q2": {
30
- "query": "How long was the Suez Canal blocked in 2021?",
31
- "gold": [
32
- "2021_Suez_Canal_obstruction",
33
- "Ever_Given"
34
- ],
35
- "top5": [
36
- "2021_Suez_Canal_obstruction",
37
- "Suez_Canal",
38
- "Ever_Given",
39
- "Red_Sea_crisis",
40
- "Bab-el-Mandeb"
41
- ],
42
- "ndcg@10": 0.9197207891481876,
43
- "recall@10": 1.0,
44
- "precision@10": 0.2
45
- },
46
- "q3": {
47
- "query": "What caused the global semiconductor shortage?",
48
- "gold": [
49
- "2020\u20132023_global_chip_shortage"
50
- ],
51
- "top5": [
52
- "2020\u20132023_global_chip_shortage",
53
- "Semiconductor_industry",
54
- "TSMC",
55
- "Bullwhip_effect",
56
- "CHIPS_and_Science_Act"
57
- ],
58
- "ndcg@10": 1.0,
59
- "recall@10": 1.0,
60
- "precision@10": 0.1
61
- },
62
- "q4": {
63
- "query": "Why is the Strait of Hormuz strategically important?",
64
- "gold": [
65
- "Strait_of_Hormuz"
66
- ],
67
- "top5": [
68
- "Strait_of_Hormuz",
69
- "Strait_of_Malacca",
70
- "Bab-el-Mandeb",
71
- "Suez_Canal",
72
- "Port_of_Singapore"
73
- ],
74
- "ndcg@10": 1.0,
75
- "recall@10": 1.0,
76
- "precision@10": 0.1
77
- },
78
- "q5": {
79
- "query": "How do Houthis threaten Red Sea shipping?",
80
- "gold": [
81
- "Red_Sea_crisis",
82
- "Bab-el-Mandeb"
83
- ],
84
- "top5": [
85
- "Red_Sea_crisis",
86
- "2021_Suez_Canal_obstruction",
87
- "Bab-el-Mandeb",
88
- "Strait_of_Hormuz",
89
- "Suez_Canal"
90
- ],
91
- "ndcg@10": 0.9197207891481876,
92
- "recall@10": 1.0,
93
- "precision@10": 0.2
94
- },
95
- "q6": {
96
- "query": "Which foundry dominates advanced chip production?",
97
- "gold": [
98
- "TSMC",
99
- "Semiconductor_industry"
100
- ],
101
- "top5": [
102
- "TSMC",
103
- "Semiconductor_industry",
104
- "Foxconn",
105
- "CHIPS_and_Science_Act",
106
- "2020\u20132023_global_chip_shortage"
107
- ],
108
- "ndcg@10": 1.0,
109
- "recall@10": 1.0,
110
- "precision@10": 0.2
111
- },
112
- "q7": {
113
- "query": "What is the bullwhip effect?",
114
- "gold": [
115
- "Bullwhip_effect"
116
- ],
117
- "top5": [
118
- "Bullwhip_effect",
119
- "Inventory",
120
- "Supply_chain_management",
121
- "Supply_chain_attack",
122
- "2020\u20132023_global_chip_shortage"
123
- ],
124
- "ndcg@10": 1.0,
125
- "recall@10": 1.0,
126
- "precision@10": 0.1
127
- },
128
- "q8": {
129
- "query": "Which port congested during 2021 supply chain crisis?",
130
- "gold": [
131
- "Port_of_Los_Angeles"
132
- ],
133
- "top5": [
134
- "2021_Suez_Canal_obstruction",
135
- "2020\u20132023_global_chip_shortage",
136
- "Ever_Given",
137
- "Port_of_Singapore",
138
- "Container_ship"
139
- ],
140
- "ndcg@10": 0.3562071871080222,
141
- "recall@10": 1.0,
142
- "precision@10": 0.1
143
- },
144
- "q9": {
145
- "query": "What is the just-in-time manufacturing philosophy?",
146
- "gold": [
147
- "Just-in-time_manufacturing"
148
- ],
149
- "top5": [
150
- "Just-in-time_manufacturing",
151
- "Inventory",
152
- "Supply_chain_management",
153
- "Logistics",
154
- "Enterprise_resource_planning"
155
- ],
156
- "ndcg@10": 1.0,
157
- "recall@10": 1.0,
158
- "precision@10": 0.1
159
- },
160
- "q10": {
161
- "query": "What does the CHIPS Act allocate?",
162
- "gold": [
163
- "CHIPS_and_Science_Act"
164
- ],
165
- "top5": [
166
- "CHIPS_and_Science_Act",
167
- "2020\u20132023_global_chip_shortage",
168
- "Semiconductor_industry",
169
- "TSMC",
170
- "Inventory"
171
- ],
172
- "ndcg@10": 1.0,
173
- "recall@10": 1.0,
174
- "precision@10": 0.1
175
- },
176
- "q11": {
177
- "query": "Who is Foxconn's primary customer?",
178
- "gold": [
179
- "Foxconn"
180
- ],
181
- "top5": [
182
- "Foxconn",
183
- "Semiconductor_industry",
184
- "TSMC",
185
- "Bullwhip_effect",
186
- "Samsung_Electronics"
187
- ],
188
- "ndcg@10": 1.0,
189
- "recall@10": 1.0,
190
- "precision@10": 0.1
191
- },
192
- "q12": {
193
- "query": "Why did the Ever Given run aground?",
194
- "gold": [
195
- "Ever_Given",
196
- "2021_Suez_Canal_obstruction"
197
- ],
198
- "top5": [
199
- "Ever_Given",
200
- "2021_Suez_Canal_obstruction",
201
- "Container_ship",
202
- "2011_T\u014dhoku_earthquake_and_tsunami",
203
- "Suez_Canal"
204
- ],
205
- "ndcg@10": 1.0,
206
- "recall@10": 1.0,
207
- "precision@10": 0.2
208
- },
209
- "q13": {
210
- "query": "What is safety stock?",
211
- "gold": [
212
- "Inventory"
213
- ],
214
- "top5": [
215
- "Inventory",
216
- "Container_ship",
217
- "Just-in-time_manufacturing",
218
- "Bullwhip_effect",
219
- "Warehouse"
220
- ],
221
- "ndcg@10": 1.0,
222
- "recall@10": 1.0,
223
- "precision@10": 0.1
224
- },
225
- "q14": {
226
- "query": "What is a supply chain attack?",
227
- "gold": [
228
- "Supply_chain_attack"
229
- ],
230
- "top5": [
231
- "Supply_chain_attack",
232
- "Supply_chain_management",
233
- "Bullwhip_effect",
234
- "Logistics",
235
- "Inventory"
236
- ],
237
- "ndcg@10": 1.0,
238
- "recall@10": 1.0,
239
- "precision@10": 0.1
240
- },
241
- "q15": {
242
- "query": "How busy is the Port of Singapore?",
243
- "gold": [
244
- "Port_of_Singapore"
245
- ],
246
- "top5": [
247
- "Port_of_Singapore",
248
- "Strait_of_Malacca",
249
- "Port_of_Los_Angeles",
250
- "2021_Suez_Canal_obstruction",
251
- "Container_ship"
252
- ],
253
- "ndcg@10": 1.0,
254
- "recall@10": 1.0,
255
- "precision@10": 0.1
256
- },
257
- "q16": {
258
- "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
259
- "gold": [
260
- "Strait_of_Malacca"
261
- ],
262
- "top5": [
263
- "Strait_of_Malacca",
264
- "Strait_of_Hormuz",
265
- "Bab-el-Mandeb",
266
- "Port_of_Singapore",
267
- "Suez_Canal"
268
- ],
269
- "ndcg@10": 1.0,
270
- "recall@10": 1.0,
271
- "precision@10": 0.1
272
- },
273
- "q17": {
274
- "query": "Which industry does the Baltic Dry Index track?",
275
- "gold": [
276
- "Baltic_Dry_Index"
277
- ],
278
- "top5": [
279
- "Baltic_Dry_Index",
280
- "Semiconductor_industry",
281
- "Inventory",
282
- "Container_ship",
283
- "2020\u20132023_global_chip_shortage"
284
- ],
285
- "ndcg@10": 1.0,
286
- "recall@10": 1.0,
287
- "precision@10": 0.1
288
- },
289
- "q18": {
290
- "query": "What function does a warehouse serve?",
291
- "gold": [
292
- "Warehouse"
293
- ],
294
- "top5": [
295
- "Warehouse",
296
- "Inventory",
297
- "Logistics",
298
- "Container_ship",
299
- "Supply_chain_management"
300
- ],
301
- "ndcg@10": 1.0,
302
- "recall@10": 1.0,
303
- "precision@10": 0.1
304
- },
305
- "q19": {
306
- "query": "What is a container ship's TEU?",
307
- "gold": [
308
- "Container_ship"
309
- ],
310
- "top5": [
311
- "Container_ship",
312
- "Ever_Given",
313
- "2021_Suez_Canal_obstruction",
314
- "Port_of_Singapore",
315
- "Port_of_Los_Angeles"
316
- ],
317
- "ndcg@10": 1.0,
318
- "recall@10": 1.0,
319
- "precision@10": 0.1
320
- },
321
- "q20": {
322
- "query": "What software replaces accounting + inventory + HR systems?",
323
- "gold": [
324
- "Enterprise_resource_planning"
325
- ],
326
- "top5": [
327
- "Enterprise_resource_planning",
328
- "Inventory",
329
- "Just-in-time_manufacturing",
330
- "Supply_chain_management",
331
- "Logistics"
332
- ],
333
- "ndcg@10": 1.0,
334
- "recall@10": 1.0,
335
- "precision@10": 0.1
336
- }
337
- }
338
- },
339
- "bge-m3": {
340
- "embedder": "bge-m3",
341
- "mean_ndcg@10": 0.967519867361079,
342
- "mean_recall@10": 1.0,
343
- "mean_precision@10": 0.12000000000000002,
344
- "corpus_encoding_s": 43.88751459121704,
345
- "n_queries": 20,
346
- "per_query": {
347
- "q1": {
348
- "query": "What was the magnitude of the 2011 Tohoku earthquake?",
349
- "gold": [
350
- "2011_T\u014dhoku_earthquake_and_tsunami"
351
- ],
352
- "top5": [
353
- "2011_T\u014dhoku_earthquake_and_tsunami",
354
- "Foxconn",
355
- "Bab-el-Mandeb",
356
- "Ever_Given",
357
- "2020\u20132023_global_chip_shortage"
358
- ],
359
- "ndcg@10": 1.0,
360
- "recall@10": 1.0,
361
- "precision@10": 0.1
362
- },
363
- "q2": {
364
- "query": "How long was the Suez Canal blocked in 2021?",
365
- "gold": [
366
- "2021_Suez_Canal_obstruction",
367
- "Ever_Given"
368
- ],
369
- "top5": [
370
- "2021_Suez_Canal_obstruction",
371
- "Suez_Canal",
372
- "Ever_Given",
373
- "Bab-el-Mandeb",
374
- "2020\u20132023_global_chip_shortage"
375
- ],
376
- "ndcg@10": 0.9197207891481876,
377
- "recall@10": 1.0,
378
- "precision@10": 0.2
379
- },
380
- "q3": {
381
- "query": "What caused the global semiconductor shortage?",
382
- "gold": [
383
- "2020\u20132023_global_chip_shortage"
384
- ],
385
- "top5": [
386
- "2020\u20132023_global_chip_shortage",
387
- "Semiconductor_industry",
388
- "TSMC",
389
- "Samsung_Electronics",
390
- "Foxconn"
391
- ],
392
- "ndcg@10": 1.0,
393
- "recall@10": 1.0,
394
- "precision@10": 0.1
395
- },
396
- "q4": {
397
- "query": "Why is the Strait of Hormuz strategically important?",
398
- "gold": [
399
- "Strait_of_Hormuz"
400
- ],
401
- "top5": [
402
- "Strait_of_Hormuz",
403
- "Bab-el-Mandeb",
404
- "Strait_of_Malacca",
405
- "Suez_Canal",
406
- "Red_Sea_crisis"
407
- ],
408
- "ndcg@10": 1.0,
409
- "recall@10": 1.0,
410
- "precision@10": 0.1
411
- },
412
- "q5": {
413
- "query": "How do Houthis threaten Red Sea shipping?",
414
- "gold": [
415
- "Red_Sea_crisis",
416
- "Bab-el-Mandeb"
417
- ],
418
- "top5": [
419
- "Red_Sea_crisis",
420
- "Bab-el-Mandeb",
421
- "Suez_Canal",
422
- "2021_Suez_Canal_obstruction",
423
- "Ever_Given"
424
- ],
425
- "ndcg@10": 1.0,
426
- "recall@10": 1.0,
427
- "precision@10": 0.2
428
- },
429
- "q6": {
430
- "query": "Which foundry dominates advanced chip production?",
431
- "gold": [
432
- "TSMC",
433
- "Semiconductor_industry"
434
- ],
435
- "top5": [
436
- "Semiconductor_industry",
437
- "TSMC",
438
- "Foxconn",
439
- "2020\u20132023_global_chip_shortage",
440
- "Samsung_Electronics"
441
- ],
442
- "ndcg@10": 1.0,
443
- "recall@10": 1.0,
444
- "precision@10": 0.2
445
- },
446
- "q7": {
447
- "query": "What is the bullwhip effect?",
448
- "gold": [
449
- "Bullwhip_effect"
450
- ],
451
- "top5": [
452
- "Bullwhip_effect",
453
- "2020\u20132023_global_chip_shortage",
454
- "Baltic_Dry_Index",
455
- "Bab-el-Mandeb",
456
- "Just-in-time_manufacturing"
457
- ],
458
- "ndcg@10": 1.0,
459
- "recall@10": 1.0,
460
- "precision@10": 0.1
461
- },
462
- "q8": {
463
- "query": "Which port congested during 2021 supply chain crisis?",
464
- "gold": [
465
- "Port_of_Los_Angeles"
466
- ],
467
- "top5": [
468
- "2020\u20132023_global_chip_shortage",
469
- "2021_Suez_Canal_obstruction",
470
- "Ever_Given",
471
- "Port_of_Los_Angeles",
472
- "Bab-el-Mandeb"
473
- ],
474
- "ndcg@10": 0.43067655807339306,
475
- "recall@10": 1.0,
476
- "precision@10": 0.1
477
- },
478
- "q9": {
479
- "query": "What is the just-in-time manufacturing philosophy?",
480
- "gold": [
481
- "Just-in-time_manufacturing"
482
- ],
483
- "top5": [
484
- "Just-in-time_manufacturing",
485
- "Inventory",
486
- "Supply_chain_management",
487
- "Foxconn",
488
- "Logistics"
489
- ],
490
- "ndcg@10": 1.0,
491
- "recall@10": 1.0,
492
- "precision@10": 0.1
493
- },
494
- "q10": {
495
- "query": "What does the CHIPS Act allocate?",
496
- "gold": [
497
- "CHIPS_and_Science_Act"
498
- ],
499
- "top5": [
500
- "CHIPS_and_Science_Act",
501
- "2020\u20132023_global_chip_shortage",
502
- "TSMC",
503
- "Foxconn",
504
- "Supply_chain_attack"
505
- ],
506
- "ndcg@10": 1.0,
507
- "recall@10": 1.0,
508
- "precision@10": 0.1
509
- },
510
- "q11": {
511
- "query": "Who is Foxconn's primary customer?",
512
- "gold": [
513
- "Foxconn"
514
- ],
515
- "top5": [
516
- "Foxconn",
517
- "TSMC",
518
- "Semiconductor_industry",
519
- "Ever_Given",
520
- "2021_Suez_Canal_obstruction"
521
- ],
522
- "ndcg@10": 1.0,
523
- "recall@10": 1.0,
524
- "precision@10": 0.1
525
- },
526
- "q12": {
527
- "query": "Why did the Ever Given run aground?",
528
- "gold": [
529
- "Ever_Given",
530
- "2021_Suez_Canal_obstruction"
531
- ],
532
- "top5": [
533
- "Ever_Given",
534
- "2021_Suez_Canal_obstruction",
535
- "2011_T\u014dhoku_earthquake_and_tsunami",
536
- "Bab-el-Mandeb",
537
- "2020\u20132023_global_chip_shortage"
538
- ],
539
- "ndcg@10": 1.0,
540
- "recall@10": 1.0,
541
- "precision@10": 0.2
542
- },
543
- "q13": {
544
- "query": "What is safety stock?",
545
- "gold": [
546
- "Inventory"
547
- ],
548
- "top5": [
549
- "Inventory",
550
- "Supply_chain_attack",
551
- "TSMC",
552
- "Warehouse",
553
- "Port_of_Singapore"
554
- ],
555
- "ndcg@10": 1.0,
556
- "recall@10": 1.0,
557
- "precision@10": 0.1
558
- },
559
- "q14": {
560
- "query": "What is a supply chain attack?",
561
- "gold": [
562
- "Supply_chain_attack"
563
- ],
564
- "top5": [
565
- "Supply_chain_attack",
566
- "Supply_chain_management",
567
- "Bullwhip_effect",
568
- "2020\u20132023_global_chip_shortage",
569
- "Logistics"
570
- ],
571
- "ndcg@10": 1.0,
572
- "recall@10": 1.0,
573
- "precision@10": 0.1
574
- },
575
- "q15": {
576
- "query": "How busy is the Port of Singapore?",
577
- "gold": [
578
- "Port_of_Singapore"
579
- ],
580
- "top5": [
581
- "Port_of_Singapore",
582
- "Port_of_Los_Angeles",
583
- "Strait_of_Malacca",
584
- "2021_Suez_Canal_obstruction",
585
- "Container_ship"
586
- ],
587
- "ndcg@10": 1.0,
588
- "recall@10": 1.0,
589
- "precision@10": 0.1
590
- },
591
- "q16": {
592
- "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
593
- "gold": [
594
- "Strait_of_Malacca"
595
- ],
596
- "top5": [
597
- "Strait_of_Malacca",
598
- "Bab-el-Mandeb",
599
- "Strait_of_Hormuz",
600
- "Port_of_Singapore",
601
- "Suez_Canal"
602
- ],
603
- "ndcg@10": 1.0,
604
- "recall@10": 1.0,
605
- "precision@10": 0.1
606
- },
607
- "q17": {
608
- "query": "Which industry does the Baltic Dry Index track?",
609
- "gold": [
610
- "Baltic_Dry_Index"
611
- ],
612
- "top5": [
613
- "Baltic_Dry_Index",
614
- "Inventory",
615
- "2020\u20132023_global_chip_shortage",
616
- "Semiconductor_industry",
617
- "Logistics"
618
- ],
619
- "ndcg@10": 1.0,
620
- "recall@10": 1.0,
621
- "precision@10": 0.1
622
- },
623
- "q18": {
624
- "query": "What function does a warehouse serve?",
625
- "gold": [
626
- "Warehouse"
627
- ],
628
- "top5": [
629
- "Warehouse",
630
- "Inventory",
631
- "Logistics",
632
- "Container_ship",
633
- "Port_of_Singapore"
634
- ],
635
- "ndcg@10": 1.0,
636
- "recall@10": 1.0,
637
- "precision@10": 0.1
638
- },
639
- "q19": {
640
- "query": "What is a container ship's TEU?",
641
- "gold": [
642
- "Container_ship"
643
- ],
644
- "top5": [
645
- "Container_ship",
646
- "Ever_Given",
647
- "2021_Suez_Canal_obstruction",
648
- "Baltic_Dry_Index",
649
- "Port_of_Singapore"
650
- ],
651
- "ndcg@10": 1.0,
652
- "recall@10": 1.0,
653
- "precision@10": 0.1
654
- },
655
- "q20": {
656
- "query": "What software replaces accounting + inventory + HR systems?",
657
- "gold": [
658
- "Enterprise_resource_planning"
659
- ],
660
- "top5": [
661
- "Enterprise_resource_planning",
662
- "Inventory",
663
- "Supply_chain_attack",
664
- "Just-in-time_manufacturing",
665
- "Foxconn"
666
- ],
667
- "ndcg@10": 1.0,
668
- "recall@10": 1.0,
669
- "precision@10": 0.1
670
- }
671
- }
672
- },
673
- "snowflake-arctic-l": {
674
- "embedder": "snowflake-arctic-l",
675
- "mean_ndcg@10": 0.9709860394574094,
676
- "mean_recall@10": 1.0,
677
- "mean_precision@10": 0.12000000000000002,
678
- "corpus_encoding_s": 40.3898344039917,
679
- "n_queries": 20,
680
- "per_query": {
681
- "q1": {
682
- "query": "What was the magnitude of the 2011 Tohoku earthquake?",
683
- "gold": [
684
- "2011_T\u014dhoku_earthquake_and_tsunami"
685
- ],
686
- "top5": [
687
- "2011_T\u014dhoku_earthquake_and_tsunami",
688
- "Ever_Given",
689
- "2021_Suez_Canal_obstruction",
690
- "Samsung_Electronics",
691
- "Suez_Canal"
692
- ],
693
- "ndcg@10": 1.0,
694
- "recall@10": 1.0,
695
- "precision@10": 0.1
696
- },
697
- "q2": {
698
- "query": "How long was the Suez Canal blocked in 2021?",
699
- "gold": [
700
- "2021_Suez_Canal_obstruction",
701
- "Ever_Given"
702
- ],
703
- "top5": [
704
- "2021_Suez_Canal_obstruction",
705
- "Suez_Canal",
706
- "Ever_Given",
707
- "Red_Sea_crisis",
708
- "Bab-el-Mandeb"
709
- ],
710
- "ndcg@10": 0.9197207891481876,
711
- "recall@10": 1.0,
712
- "precision@10": 0.2
713
- },
714
- "q3": {
715
- "query": "What caused the global semiconductor shortage?",
716
- "gold": [
717
- "2020\u20132023_global_chip_shortage"
718
- ],
719
- "top5": [
720
- "2020\u20132023_global_chip_shortage",
721
- "Semiconductor_industry",
722
- "TSMC",
723
- "Supply_chain_attack",
724
- "Foxconn"
725
- ],
726
- "ndcg@10": 1.0,
727
- "recall@10": 1.0,
728
- "precision@10": 0.1
729
- },
730
- "q4": {
731
- "query": "Why is the Strait of Hormuz strategically important?",
732
- "gold": [
733
- "Strait_of_Hormuz"
734
- ],
735
- "top5": [
736
- "Strait_of_Hormuz",
737
- "Strait_of_Malacca",
738
- "Bab-el-Mandeb",
739
- "Suez_Canal",
740
- "Red_Sea_crisis"
741
- ],
742
- "ndcg@10": 1.0,
743
- "recall@10": 1.0,
744
- "precision@10": 0.1
745
- },
746
- "q5": {
747
- "query": "How do Houthis threaten Red Sea shipping?",
748
- "gold": [
749
- "Red_Sea_crisis",
750
- "Bab-el-Mandeb"
751
- ],
752
- "top5": [
753
- "Red_Sea_crisis",
754
- "Bab-el-Mandeb",
755
- "Strait_of_Hormuz",
756
- "Suez_Canal",
757
- "2021_Suez_Canal_obstruction"
758
- ],
759
- "ndcg@10": 1.0,
760
- "recall@10": 1.0,
761
- "precision@10": 0.2
762
- },
763
- "q6": {
764
- "query": "Which foundry dominates advanced chip production?",
765
- "gold": [
766
- "TSMC",
767
- "Semiconductor_industry"
768
- ],
769
- "top5": [
770
- "Semiconductor_industry",
771
- "TSMC",
772
- "2020\u20132023_global_chip_shortage",
773
- "Foxconn",
774
- "CHIPS_and_Science_Act"
775
- ],
776
- "ndcg@10": 1.0,
777
- "recall@10": 1.0,
778
- "precision@10": 0.2
779
- },
780
- "q7": {
781
- "query": "What is the bullwhip effect?",
782
- "gold": [
783
- "Bullwhip_effect"
784
- ],
785
- "top5": [
786
- "Bullwhip_effect",
787
- "Just-in-time_manufacturing",
788
- "Baltic_Dry_Index",
789
- "Inventory",
790
- "Bab-el-Mandeb"
791
- ],
792
- "ndcg@10": 1.0,
793
- "recall@10": 1.0,
794
- "precision@10": 0.1
795
- },
796
- "q8": {
797
- "query": "Which port congested during 2021 supply chain crisis?",
798
- "gold": [
799
- "Port_of_Los_Angeles"
800
- ],
801
- "top5": [
802
- "2020\u20132023_global_chip_shortage",
803
- "2021_Suez_Canal_obstruction",
804
- "Port_of_Los_Angeles",
805
- "Ever_Given",
806
- "Supply_chain_attack"
807
- ],
808
- "ndcg@10": 0.5,
809
- "recall@10": 1.0,
810
- "precision@10": 0.1
811
- },
812
- "q9": {
813
- "query": "What is the just-in-time manufacturing philosophy?",
814
- "gold": [
815
- "Just-in-time_manufacturing"
816
- ],
817
- "top5": [
818
- "Just-in-time_manufacturing",
819
- "Supply_chain_management",
820
- "Inventory",
821
- "Logistics",
822
- "Semiconductor_industry"
823
- ],
824
- "ndcg@10": 1.0,
825
- "recall@10": 1.0,
826
- "precision@10": 0.1
827
- },
828
- "q10": {
829
- "query": "What does the CHIPS Act allocate?",
830
- "gold": [
831
- "CHIPS_and_Science_Act"
832
- ],
833
- "top5": [
834
- "CHIPS_and_Science_Act",
835
- "2020\u20132023_global_chip_shortage",
836
- "Semiconductor_industry",
837
- "TSMC",
838
- "Supply_chain_attack"
839
- ],
840
- "ndcg@10": 1.0,
841
- "recall@10": 1.0,
842
- "precision@10": 0.1
843
- },
844
- "q11": {
845
- "query": "Who is Foxconn's primary customer?",
846
- "gold": [
847
- "Foxconn"
848
- ],
849
- "top5": [
850
- "Foxconn",
851
- "TSMC",
852
- "Semiconductor_industry",
853
- "2020\u20132023_global_chip_shortage",
854
- "Supply_chain_management"
855
- ],
856
- "ndcg@10": 1.0,
857
- "recall@10": 1.0,
858
- "precision@10": 0.1
859
- },
860
- "q12": {
861
- "query": "Why did the Ever Given run aground?",
862
- "gold": [
863
- "Ever_Given",
864
- "2021_Suez_Canal_obstruction"
865
- ],
866
- "top5": [
867
- "Ever_Given",
868
- "2021_Suez_Canal_obstruction",
869
- "Bab-el-Mandeb",
870
- "Strait_of_Hormuz",
871
- "Container_ship"
872
- ],
873
- "ndcg@10": 1.0,
874
- "recall@10": 1.0,
875
- "precision@10": 0.2
876
- },
877
- "q13": {
878
- "query": "What is safety stock?",
879
- "gold": [
880
- "Inventory"
881
- ],
882
- "top5": [
883
- "Inventory",
884
- "Supply_chain_attack",
885
- "Bullwhip_effect",
886
- "Logistics",
887
- "Baltic_Dry_Index"
888
- ],
889
- "ndcg@10": 1.0,
890
- "recall@10": 1.0,
891
- "precision@10": 0.1
892
- },
893
- "q14": {
894
- "query": "What is a supply chain attack?",
895
- "gold": [
896
- "Supply_chain_attack"
897
- ],
898
- "top5": [
899
- "Supply_chain_attack",
900
- "Supply_chain_management",
901
- "Bullwhip_effect",
902
- "Logistics",
903
- "2020\u20132023_global_chip_shortage"
904
- ],
905
- "ndcg@10": 1.0,
906
- "recall@10": 1.0,
907
- "precision@10": 0.1
908
- },
909
- "q15": {
910
- "query": "How busy is the Port of Singapore?",
911
- "gold": [
912
- "Port_of_Singapore"
913
- ],
914
- "top5": [
915
- "Port_of_Singapore",
916
- "Strait_of_Malacca",
917
- "Port_of_Los_Angeles",
918
- "Container_ship",
919
- "2021_Suez_Canal_obstruction"
920
- ],
921
- "ndcg@10": 1.0,
922
- "recall@10": 1.0,
923
- "precision@10": 0.1
924
- },
925
- "q16": {
926
- "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
927
- "gold": [
928
- "Strait_of_Malacca"
929
- ],
930
- "top5": [
931
- "Strait_of_Malacca",
932
- "Strait_of_Hormuz",
933
- "Bab-el-Mandeb",
934
- "Port_of_Singapore",
935
- "Suez_Canal"
936
- ],
937
- "ndcg@10": 1.0,
938
- "recall@10": 1.0,
939
- "precision@10": 0.1
940
- },
941
- "q17": {
942
- "query": "Which industry does the Baltic Dry Index track?",
943
- "gold": [
944
- "Baltic_Dry_Index"
945
- ],
946
- "top5": [
947
- "Baltic_Dry_Index",
948
- "Inventory",
949
- "Logistics",
950
- "Semiconductor_industry",
951
- "Enterprise_resource_planning"
952
- ],
953
- "ndcg@10": 1.0,
954
- "recall@10": 1.0,
955
- "precision@10": 0.1
956
- },
957
- "q18": {
958
- "query": "What function does a warehouse serve?",
959
- "gold": [
960
- "Warehouse"
961
- ],
962
- "top5": [
963
- "Warehouse",
964
- "Inventory",
965
- "Logistics",
966
- "Supply_chain_management",
967
- "Enterprise_resource_planning"
968
- ],
969
- "ndcg@10": 1.0,
970
- "recall@10": 1.0,
971
- "precision@10": 0.1
972
- },
973
- "q19": {
974
- "query": "What is a container ship's TEU?",
975
- "gold": [
976
- "Container_ship"
977
- ],
978
- "top5": [
979
- "Container_ship",
980
- "Ever_Given",
981
- "Inventory",
982
- "2021_Suez_Canal_obstruction",
983
- "Baltic_Dry_Index"
984
- ],
985
- "ndcg@10": 1.0,
986
- "recall@10": 1.0,
987
- "precision@10": 0.1
988
- },
989
- "q20": {
990
- "query": "What software replaces accounting + inventory + HR systems?",
991
- "gold": [
992
- "Enterprise_resource_planning"
993
- ],
994
- "top5": [
995
- "Enterprise_resource_planning",
996
- "Inventory",
997
- "Supply_chain_management",
998
- "Logistics",
999
- "Supply_chain_attack"
1000
- ],
1001
- "ndcg@10": 1.0,
1002
- "recall@10": 1.0,
1003
- "precision@10": 0.1
1004
- }
1005
- }
1006
- }
1007
- },
1008
- "public_ref_nfcorpus": {
1009
- "mxbai-embed-large-v1": {
1010
- "ndcg@10_nfcorpus": 0.386,
1011
- "source": "MTEB retrieval leaderboard 2024"
1012
- },
1013
- "bge-m3": {
1014
- "ndcg@10_nfcorpus": 0.357,
1015
- "source": "BGE-M3 paper + MTEB"
1016
- },
1017
- "snowflake-arctic-l": {
1018
- "ndcg@10_nfcorpus": 0.348,
1019
- "source": "Snowflake Arctic paper"
1020
- }
1021
- },
1022
- "elapsed_min": 1.861957597732544
1023
  }
 
1
+ {
2
+ "task": "SupplyMind-crisis-retrieval-BEIR-style",
3
+ "task_description": "Manual BEIR-style retrieval eval on 26 Wikipedia crisis articles + 20 real supply-chain queries. Metrics match the public MTEB retrieval leaderboard (nDCG@10, R@10, P@10). This is an out-of-domain task (supply chain, not medical), but numbers provide a directional check that our embedders are consistent with their published leaderboard performance.",
4
+ "our_results": {
5
+ "mxbai-embed-large-v1": {
6
+ "embedder": "mxbai-embed-large-v1",
7
+ "mean_ndcg@10": 0.9597824382702198,
8
+ "mean_recall@10": 1.0,
9
+ "mean_precision@10": 0.12000000000000002,
10
+ "corpus_encoding_s": 12.996914148330688,
11
+ "n_queries": 20,
12
+ "per_query": {
13
+ "q1": {
14
+ "query": "What was the magnitude of the 2011 Tohoku earthquake?",
15
+ "gold": [
16
+ "2011_T\u014dhoku_earthquake_and_tsunami"
17
+ ],
18
+ "top5": [
19
+ "2011_T\u014dhoku_earthquake_and_tsunami",
20
+ "Ever_Given",
21
+ "2020\u20132023_global_chip_shortage",
22
+ "Container_ship",
23
+ "Warehouse"
24
+ ],
25
+ "ndcg@10": 1.0,
26
+ "recall@10": 1.0,
27
+ "precision@10": 0.1
28
+ },
29
+ "q2": {
30
+ "query": "How long was the Suez Canal blocked in 2021?",
31
+ "gold": [
32
+ "2021_Suez_Canal_obstruction",
33
+ "Ever_Given"
34
+ ],
35
+ "top5": [
36
+ "2021_Suez_Canal_obstruction",
37
+ "Suez_Canal",
38
+ "Ever_Given",
39
+ "Red_Sea_crisis",
40
+ "Bab-el-Mandeb"
41
+ ],
42
+ "ndcg@10": 0.9197207891481876,
43
+ "recall@10": 1.0,
44
+ "precision@10": 0.2
45
+ },
46
+ "q3": {
47
+ "query": "What caused the global semiconductor shortage?",
48
+ "gold": [
49
+ "2020\u20132023_global_chip_shortage"
50
+ ],
51
+ "top5": [
52
+ "2020\u20132023_global_chip_shortage",
53
+ "Semiconductor_industry",
54
+ "TSMC",
55
+ "Bullwhip_effect",
56
+ "CHIPS_and_Science_Act"
57
+ ],
58
+ "ndcg@10": 1.0,
59
+ "recall@10": 1.0,
60
+ "precision@10": 0.1
61
+ },
62
+ "q4": {
63
+ "query": "Why is the Strait of Hormuz strategically important?",
64
+ "gold": [
65
+ "Strait_of_Hormuz"
66
+ ],
67
+ "top5": [
68
+ "Strait_of_Hormuz",
69
+ "Strait_of_Malacca",
70
+ "Bab-el-Mandeb",
71
+ "Suez_Canal",
72
+ "Port_of_Singapore"
73
+ ],
74
+ "ndcg@10": 1.0,
75
+ "recall@10": 1.0,
76
+ "precision@10": 0.1
77
+ },
78
+ "q5": {
79
+ "query": "How do Houthis threaten Red Sea shipping?",
80
+ "gold": [
81
+ "Red_Sea_crisis",
82
+ "Bab-el-Mandeb"
83
+ ],
84
+ "top5": [
85
+ "Red_Sea_crisis",
86
+ "2021_Suez_Canal_obstruction",
87
+ "Bab-el-Mandeb",
88
+ "Strait_of_Hormuz",
89
+ "Suez_Canal"
90
+ ],
91
+ "ndcg@10": 0.9197207891481876,
92
+ "recall@10": 1.0,
93
+ "precision@10": 0.2
94
+ },
95
+ "q6": {
96
+ "query": "Which foundry dominates advanced chip production?",
97
+ "gold": [
98
+ "TSMC",
99
+ "Semiconductor_industry"
100
+ ],
101
+ "top5": [
102
+ "TSMC",
103
+ "Semiconductor_industry",
104
+ "Foxconn",
105
+ "CHIPS_and_Science_Act",
106
+ "2020\u20132023_global_chip_shortage"
107
+ ],
108
+ "ndcg@10": 1.0,
109
+ "recall@10": 1.0,
110
+ "precision@10": 0.2
111
+ },
112
+ "q7": {
113
+ "query": "What is the bullwhip effect?",
114
+ "gold": [
115
+ "Bullwhip_effect"
116
+ ],
117
+ "top5": [
118
+ "Bullwhip_effect",
119
+ "Inventory",
120
+ "Supply_chain_management",
121
+ "Supply_chain_attack",
122
+ "2020\u20132023_global_chip_shortage"
123
+ ],
124
+ "ndcg@10": 1.0,
125
+ "recall@10": 1.0,
126
+ "precision@10": 0.1
127
+ },
128
+ "q8": {
129
+ "query": "Which port congested during 2021 supply chain crisis?",
130
+ "gold": [
131
+ "Port_of_Los_Angeles"
132
+ ],
133
+ "top5": [
134
+ "2021_Suez_Canal_obstruction",
135
+ "2020\u20132023_global_chip_shortage",
136
+ "Ever_Given",
137
+ "Port_of_Singapore",
138
+ "Container_ship"
139
+ ],
140
+ "ndcg@10": 0.3562071871080222,
141
+ "recall@10": 1.0,
142
+ "precision@10": 0.1
143
+ },
144
+ "q9": {
145
+ "query": "What is the just-in-time manufacturing philosophy?",
146
+ "gold": [
147
+ "Just-in-time_manufacturing"
148
+ ],
149
+ "top5": [
150
+ "Just-in-time_manufacturing",
151
+ "Inventory",
152
+ "Supply_chain_management",
153
+ "Logistics",
154
+ "Enterprise_resource_planning"
155
+ ],
156
+ "ndcg@10": 1.0,
157
+ "recall@10": 1.0,
158
+ "precision@10": 0.1
159
+ },
160
+ "q10": {
161
+ "query": "What does the CHIPS Act allocate?",
162
+ "gold": [
163
+ "CHIPS_and_Science_Act"
164
+ ],
165
+ "top5": [
166
+ "CHIPS_and_Science_Act",
167
+ "2020\u20132023_global_chip_shortage",
168
+ "Semiconductor_industry",
169
+ "TSMC",
170
+ "Inventory"
171
+ ],
172
+ "ndcg@10": 1.0,
173
+ "recall@10": 1.0,
174
+ "precision@10": 0.1
175
+ },
176
+ "q11": {
177
+ "query": "Who is Foxconn's primary customer?",
178
+ "gold": [
179
+ "Foxconn"
180
+ ],
181
+ "top5": [
182
+ "Foxconn",
183
+ "Semiconductor_industry",
184
+ "TSMC",
185
+ "Bullwhip_effect",
186
+ "Samsung_Electronics"
187
+ ],
188
+ "ndcg@10": 1.0,
189
+ "recall@10": 1.0,
190
+ "precision@10": 0.1
191
+ },
192
+ "q12": {
193
+ "query": "Why did the Ever Given run aground?",
194
+ "gold": [
195
+ "Ever_Given",
196
+ "2021_Suez_Canal_obstruction"
197
+ ],
198
+ "top5": [
199
+ "Ever_Given",
200
+ "2021_Suez_Canal_obstruction",
201
+ "Container_ship",
202
+ "2011_T\u014dhoku_earthquake_and_tsunami",
203
+ "Suez_Canal"
204
+ ],
205
+ "ndcg@10": 1.0,
206
+ "recall@10": 1.0,
207
+ "precision@10": 0.2
208
+ },
209
+ "q13": {
210
+ "query": "What is safety stock?",
211
+ "gold": [
212
+ "Inventory"
213
+ ],
214
+ "top5": [
215
+ "Inventory",
216
+ "Container_ship",
217
+ "Just-in-time_manufacturing",
218
+ "Bullwhip_effect",
219
+ "Warehouse"
220
+ ],
221
+ "ndcg@10": 1.0,
222
+ "recall@10": 1.0,
223
+ "precision@10": 0.1
224
+ },
225
+ "q14": {
226
+ "query": "What is a supply chain attack?",
227
+ "gold": [
228
+ "Supply_chain_attack"
229
+ ],
230
+ "top5": [
231
+ "Supply_chain_attack",
232
+ "Supply_chain_management",
233
+ "Bullwhip_effect",
234
+ "Logistics",
235
+ "Inventory"
236
+ ],
237
+ "ndcg@10": 1.0,
238
+ "recall@10": 1.0,
239
+ "precision@10": 0.1
240
+ },
241
+ "q15": {
242
+ "query": "How busy is the Port of Singapore?",
243
+ "gold": [
244
+ "Port_of_Singapore"
245
+ ],
246
+ "top5": [
247
+ "Port_of_Singapore",
248
+ "Strait_of_Malacca",
249
+ "Port_of_Los_Angeles",
250
+ "2021_Suez_Canal_obstruction",
251
+ "Container_ship"
252
+ ],
253
+ "ndcg@10": 1.0,
254
+ "recall@10": 1.0,
255
+ "precision@10": 0.1
256
+ },
257
+ "q16": {
258
+ "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
259
+ "gold": [
260
+ "Strait_of_Malacca"
261
+ ],
262
+ "top5": [
263
+ "Strait_of_Malacca",
264
+ "Strait_of_Hormuz",
265
+ "Bab-el-Mandeb",
266
+ "Port_of_Singapore",
267
+ "Suez_Canal"
268
+ ],
269
+ "ndcg@10": 1.0,
270
+ "recall@10": 1.0,
271
+ "precision@10": 0.1
272
+ },
273
+ "q17": {
274
+ "query": "Which industry does the Baltic Dry Index track?",
275
+ "gold": [
276
+ "Baltic_Dry_Index"
277
+ ],
278
+ "top5": [
279
+ "Baltic_Dry_Index",
280
+ "Semiconductor_industry",
281
+ "Inventory",
282
+ "Container_ship",
283
+ "2020\u20132023_global_chip_shortage"
284
+ ],
285
+ "ndcg@10": 1.0,
286
+ "recall@10": 1.0,
287
+ "precision@10": 0.1
288
+ },
289
+ "q18": {
290
+ "query": "What function does a warehouse serve?",
291
+ "gold": [
292
+ "Warehouse"
293
+ ],
294
+ "top5": [
295
+ "Warehouse",
296
+ "Inventory",
297
+ "Logistics",
298
+ "Container_ship",
299
+ "Supply_chain_management"
300
+ ],
301
+ "ndcg@10": 1.0,
302
+ "recall@10": 1.0,
303
+ "precision@10": 0.1
304
+ },
305
+ "q19": {
306
+ "query": "What is a container ship's TEU?",
307
+ "gold": [
308
+ "Container_ship"
309
+ ],
310
+ "top5": [
311
+ "Container_ship",
312
+ "Ever_Given",
313
+ "2021_Suez_Canal_obstruction",
314
+ "Port_of_Singapore",
315
+ "Port_of_Los_Angeles"
316
+ ],
317
+ "ndcg@10": 1.0,
318
+ "recall@10": 1.0,
319
+ "precision@10": 0.1
320
+ },
321
+ "q20": {
322
+ "query": "What software replaces accounting + inventory + HR systems?",
323
+ "gold": [
324
+ "Enterprise_resource_planning"
325
+ ],
326
+ "top5": [
327
+ "Enterprise_resource_planning",
328
+ "Inventory",
329
+ "Just-in-time_manufacturing",
330
+ "Supply_chain_management",
331
+ "Logistics"
332
+ ],
333
+ "ndcg@10": 1.0,
334
+ "recall@10": 1.0,
335
+ "precision@10": 0.1
336
+ }
337
+ }
338
+ },
339
+ "bge-m3": {
340
+ "embedder": "bge-m3",
341
+ "mean_ndcg@10": 0.967519867361079,
342
+ "mean_recall@10": 1.0,
343
+ "mean_precision@10": 0.12000000000000002,
344
+ "corpus_encoding_s": 43.88751459121704,
345
+ "n_queries": 20,
346
+ "per_query": {
347
+ "q1": {
348
+ "query": "What was the magnitude of the 2011 Tohoku earthquake?",
349
+ "gold": [
350
+ "2011_T\u014dhoku_earthquake_and_tsunami"
351
+ ],
352
+ "top5": [
353
+ "2011_T\u014dhoku_earthquake_and_tsunami",
354
+ "Foxconn",
355
+ "Bab-el-Mandeb",
356
+ "Ever_Given",
357
+ "2020\u20132023_global_chip_shortage"
358
+ ],
359
+ "ndcg@10": 1.0,
360
+ "recall@10": 1.0,
361
+ "precision@10": 0.1
362
+ },
363
+ "q2": {
364
+ "query": "How long was the Suez Canal blocked in 2021?",
365
+ "gold": [
366
+ "2021_Suez_Canal_obstruction",
367
+ "Ever_Given"
368
+ ],
369
+ "top5": [
370
+ "2021_Suez_Canal_obstruction",
371
+ "Suez_Canal",
372
+ "Ever_Given",
373
+ "Bab-el-Mandeb",
374
+ "2020\u20132023_global_chip_shortage"
375
+ ],
376
+ "ndcg@10": 0.9197207891481876,
377
+ "recall@10": 1.0,
378
+ "precision@10": 0.2
379
+ },
380
+ "q3": {
381
+ "query": "What caused the global semiconductor shortage?",
382
+ "gold": [
383
+ "2020\u20132023_global_chip_shortage"
384
+ ],
385
+ "top5": [
386
+ "2020\u20132023_global_chip_shortage",
387
+ "Semiconductor_industry",
388
+ "TSMC",
389
+ "Samsung_Electronics",
390
+ "Foxconn"
391
+ ],
392
+ "ndcg@10": 1.0,
393
+ "recall@10": 1.0,
394
+ "precision@10": 0.1
395
+ },
396
+ "q4": {
397
+ "query": "Why is the Strait of Hormuz strategically important?",
398
+ "gold": [
399
+ "Strait_of_Hormuz"
400
+ ],
401
+ "top5": [
402
+ "Strait_of_Hormuz",
403
+ "Bab-el-Mandeb",
404
+ "Strait_of_Malacca",
405
+ "Suez_Canal",
406
+ "Red_Sea_crisis"
407
+ ],
408
+ "ndcg@10": 1.0,
409
+ "recall@10": 1.0,
410
+ "precision@10": 0.1
411
+ },
412
+ "q5": {
413
+ "query": "How do Houthis threaten Red Sea shipping?",
414
+ "gold": [
415
+ "Red_Sea_crisis",
416
+ "Bab-el-Mandeb"
417
+ ],
418
+ "top5": [
419
+ "Red_Sea_crisis",
420
+ "Bab-el-Mandeb",
421
+ "Suez_Canal",
422
+ "2021_Suez_Canal_obstruction",
423
+ "Ever_Given"
424
+ ],
425
+ "ndcg@10": 1.0,
426
+ "recall@10": 1.0,
427
+ "precision@10": 0.2
428
+ },
429
+ "q6": {
430
+ "query": "Which foundry dominates advanced chip production?",
431
+ "gold": [
432
+ "TSMC",
433
+ "Semiconductor_industry"
434
+ ],
435
+ "top5": [
436
+ "Semiconductor_industry",
437
+ "TSMC",
438
+ "Foxconn",
439
+ "2020\u20132023_global_chip_shortage",
440
+ "Samsung_Electronics"
441
+ ],
442
+ "ndcg@10": 1.0,
443
+ "recall@10": 1.0,
444
+ "precision@10": 0.2
445
+ },
446
+ "q7": {
447
+ "query": "What is the bullwhip effect?",
448
+ "gold": [
449
+ "Bullwhip_effect"
450
+ ],
451
+ "top5": [
452
+ "Bullwhip_effect",
453
+ "2020\u20132023_global_chip_shortage",
454
+ "Baltic_Dry_Index",
455
+ "Bab-el-Mandeb",
456
+ "Just-in-time_manufacturing"
457
+ ],
458
+ "ndcg@10": 1.0,
459
+ "recall@10": 1.0,
460
+ "precision@10": 0.1
461
+ },
462
+ "q8": {
463
+ "query": "Which port congested during 2021 supply chain crisis?",
464
+ "gold": [
465
+ "Port_of_Los_Angeles"
466
+ ],
467
+ "top5": [
468
+ "2020\u20132023_global_chip_shortage",
469
+ "2021_Suez_Canal_obstruction",
470
+ "Ever_Given",
471
+ "Port_of_Los_Angeles",
472
+ "Bab-el-Mandeb"
473
+ ],
474
+ "ndcg@10": 0.43067655807339306,
475
+ "recall@10": 1.0,
476
+ "precision@10": 0.1
477
+ },
478
+ "q9": {
479
+ "query": "What is the just-in-time manufacturing philosophy?",
480
+ "gold": [
481
+ "Just-in-time_manufacturing"
482
+ ],
483
+ "top5": [
484
+ "Just-in-time_manufacturing",
485
+ "Inventory",
486
+ "Supply_chain_management",
487
+ "Foxconn",
488
+ "Logistics"
489
+ ],
490
+ "ndcg@10": 1.0,
491
+ "recall@10": 1.0,
492
+ "precision@10": 0.1
493
+ },
494
+ "q10": {
495
+ "query": "What does the CHIPS Act allocate?",
496
+ "gold": [
497
+ "CHIPS_and_Science_Act"
498
+ ],
499
+ "top5": [
500
+ "CHIPS_and_Science_Act",
501
+ "2020\u20132023_global_chip_shortage",
502
+ "TSMC",
503
+ "Foxconn",
504
+ "Supply_chain_attack"
505
+ ],
506
+ "ndcg@10": 1.0,
507
+ "recall@10": 1.0,
508
+ "precision@10": 0.1
509
+ },
510
+ "q11": {
511
+ "query": "Who is Foxconn's primary customer?",
512
+ "gold": [
513
+ "Foxconn"
514
+ ],
515
+ "top5": [
516
+ "Foxconn",
517
+ "TSMC",
518
+ "Semiconductor_industry",
519
+ "Ever_Given",
520
+ "2021_Suez_Canal_obstruction"
521
+ ],
522
+ "ndcg@10": 1.0,
523
+ "recall@10": 1.0,
524
+ "precision@10": 0.1
525
+ },
526
+ "q12": {
527
+ "query": "Why did the Ever Given run aground?",
528
+ "gold": [
529
+ "Ever_Given",
530
+ "2021_Suez_Canal_obstruction"
531
+ ],
532
+ "top5": [
533
+ "Ever_Given",
534
+ "2021_Suez_Canal_obstruction",
535
+ "2011_T\u014dhoku_earthquake_and_tsunami",
536
+ "Bab-el-Mandeb",
537
+ "2020\u20132023_global_chip_shortage"
538
+ ],
539
+ "ndcg@10": 1.0,
540
+ "recall@10": 1.0,
541
+ "precision@10": 0.2
542
+ },
543
+ "q13": {
544
+ "query": "What is safety stock?",
545
+ "gold": [
546
+ "Inventory"
547
+ ],
548
+ "top5": [
549
+ "Inventory",
550
+ "Supply_chain_attack",
551
+ "TSMC",
552
+ "Warehouse",
553
+ "Port_of_Singapore"
554
+ ],
555
+ "ndcg@10": 1.0,
556
+ "recall@10": 1.0,
557
+ "precision@10": 0.1
558
+ },
559
+ "q14": {
560
+ "query": "What is a supply chain attack?",
561
+ "gold": [
562
+ "Supply_chain_attack"
563
+ ],
564
+ "top5": [
565
+ "Supply_chain_attack",
566
+ "Supply_chain_management",
567
+ "Bullwhip_effect",
568
+ "2020\u20132023_global_chip_shortage",
569
+ "Logistics"
570
+ ],
571
+ "ndcg@10": 1.0,
572
+ "recall@10": 1.0,
573
+ "precision@10": 0.1
574
+ },
575
+ "q15": {
576
+ "query": "How busy is the Port of Singapore?",
577
+ "gold": [
578
+ "Port_of_Singapore"
579
+ ],
580
+ "top5": [
581
+ "Port_of_Singapore",
582
+ "Port_of_Los_Angeles",
583
+ "Strait_of_Malacca",
584
+ "2021_Suez_Canal_obstruction",
585
+ "Container_ship"
586
+ ],
587
+ "ndcg@10": 1.0,
588
+ "recall@10": 1.0,
589
+ "precision@10": 0.1
590
+ },
591
+ "q16": {
592
+ "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
593
+ "gold": [
594
+ "Strait_of_Malacca"
595
+ ],
596
+ "top5": [
597
+ "Strait_of_Malacca",
598
+ "Bab-el-Mandeb",
599
+ "Strait_of_Hormuz",
600
+ "Port_of_Singapore",
601
+ "Suez_Canal"
602
+ ],
603
+ "ndcg@10": 1.0,
604
+ "recall@10": 1.0,
605
+ "precision@10": 0.1
606
+ },
607
+ "q17": {
608
+ "query": "Which industry does the Baltic Dry Index track?",
609
+ "gold": [
610
+ "Baltic_Dry_Index"
611
+ ],
612
+ "top5": [
613
+ "Baltic_Dry_Index",
614
+ "Inventory",
615
+ "2020\u20132023_global_chip_shortage",
616
+ "Semiconductor_industry",
617
+ "Logistics"
618
+ ],
619
+ "ndcg@10": 1.0,
620
+ "recall@10": 1.0,
621
+ "precision@10": 0.1
622
+ },
623
+ "q18": {
624
+ "query": "What function does a warehouse serve?",
625
+ "gold": [
626
+ "Warehouse"
627
+ ],
628
+ "top5": [
629
+ "Warehouse",
630
+ "Inventory",
631
+ "Logistics",
632
+ "Container_ship",
633
+ "Port_of_Singapore"
634
+ ],
635
+ "ndcg@10": 1.0,
636
+ "recall@10": 1.0,
637
+ "precision@10": 0.1
638
+ },
639
+ "q19": {
640
+ "query": "What is a container ship's TEU?",
641
+ "gold": [
642
+ "Container_ship"
643
+ ],
644
+ "top5": [
645
+ "Container_ship",
646
+ "Ever_Given",
647
+ "2021_Suez_Canal_obstruction",
648
+ "Baltic_Dry_Index",
649
+ "Port_of_Singapore"
650
+ ],
651
+ "ndcg@10": 1.0,
652
+ "recall@10": 1.0,
653
+ "precision@10": 0.1
654
+ },
655
+ "q20": {
656
+ "query": "What software replaces accounting + inventory + HR systems?",
657
+ "gold": [
658
+ "Enterprise_resource_planning"
659
+ ],
660
+ "top5": [
661
+ "Enterprise_resource_planning",
662
+ "Inventory",
663
+ "Supply_chain_attack",
664
+ "Just-in-time_manufacturing",
665
+ "Foxconn"
666
+ ],
667
+ "ndcg@10": 1.0,
668
+ "recall@10": 1.0,
669
+ "precision@10": 0.1
670
+ }
671
+ }
672
+ },
673
+ "snowflake-arctic-l": {
674
+ "embedder": "snowflake-arctic-l",
675
+ "mean_ndcg@10": 0.9709860394574094,
676
+ "mean_recall@10": 1.0,
677
+ "mean_precision@10": 0.12000000000000002,
678
+ "corpus_encoding_s": 40.3898344039917,
679
+ "n_queries": 20,
680
+ "per_query": {
681
+ "q1": {
682
+ "query": "What was the magnitude of the 2011 Tohoku earthquake?",
683
+ "gold": [
684
+ "2011_T\u014dhoku_earthquake_and_tsunami"
685
+ ],
686
+ "top5": [
687
+ "2011_T\u014dhoku_earthquake_and_tsunami",
688
+ "Ever_Given",
689
+ "2021_Suez_Canal_obstruction",
690
+ "Samsung_Electronics",
691
+ "Suez_Canal"
692
+ ],
693
+ "ndcg@10": 1.0,
694
+ "recall@10": 1.0,
695
+ "precision@10": 0.1
696
+ },
697
+ "q2": {
698
+ "query": "How long was the Suez Canal blocked in 2021?",
699
+ "gold": [
700
+ "2021_Suez_Canal_obstruction",
701
+ "Ever_Given"
702
+ ],
703
+ "top5": [
704
+ "2021_Suez_Canal_obstruction",
705
+ "Suez_Canal",
706
+ "Ever_Given",
707
+ "Red_Sea_crisis",
708
+ "Bab-el-Mandeb"
709
+ ],
710
+ "ndcg@10": 0.9197207891481876,
711
+ "recall@10": 1.0,
712
+ "precision@10": 0.2
713
+ },
714
+ "q3": {
715
+ "query": "What caused the global semiconductor shortage?",
716
+ "gold": [
717
+ "2020\u20132023_global_chip_shortage"
718
+ ],
719
+ "top5": [
720
+ "2020\u20132023_global_chip_shortage",
721
+ "Semiconductor_industry",
722
+ "TSMC",
723
+ "Supply_chain_attack",
724
+ "Foxconn"
725
+ ],
726
+ "ndcg@10": 1.0,
727
+ "recall@10": 1.0,
728
+ "precision@10": 0.1
729
+ },
730
+ "q4": {
731
+ "query": "Why is the Strait of Hormuz strategically important?",
732
+ "gold": [
733
+ "Strait_of_Hormuz"
734
+ ],
735
+ "top5": [
736
+ "Strait_of_Hormuz",
737
+ "Strait_of_Malacca",
738
+ "Bab-el-Mandeb",
739
+ "Suez_Canal",
740
+ "Red_Sea_crisis"
741
+ ],
742
+ "ndcg@10": 1.0,
743
+ "recall@10": 1.0,
744
+ "precision@10": 0.1
745
+ },
746
+ "q5": {
747
+ "query": "How do Houthis threaten Red Sea shipping?",
748
+ "gold": [
749
+ "Red_Sea_crisis",
750
+ "Bab-el-Mandeb"
751
+ ],
752
+ "top5": [
753
+ "Red_Sea_crisis",
754
+ "Bab-el-Mandeb",
755
+ "Strait_of_Hormuz",
756
+ "Suez_Canal",
757
+ "2021_Suez_Canal_obstruction"
758
+ ],
759
+ "ndcg@10": 1.0,
760
+ "recall@10": 1.0,
761
+ "precision@10": 0.2
762
+ },
763
+ "q6": {
764
+ "query": "Which foundry dominates advanced chip production?",
765
+ "gold": [
766
+ "TSMC",
767
+ "Semiconductor_industry"
768
+ ],
769
+ "top5": [
770
+ "Semiconductor_industry",
771
+ "TSMC",
772
+ "2020\u20132023_global_chip_shortage",
773
+ "Foxconn",
774
+ "CHIPS_and_Science_Act"
775
+ ],
776
+ "ndcg@10": 1.0,
777
+ "recall@10": 1.0,
778
+ "precision@10": 0.2
779
+ },
780
+ "q7": {
781
+ "query": "What is the bullwhip effect?",
782
+ "gold": [
783
+ "Bullwhip_effect"
784
+ ],
785
+ "top5": [
786
+ "Bullwhip_effect",
787
+ "Just-in-time_manufacturing",
788
+ "Baltic_Dry_Index",
789
+ "Inventory",
790
+ "Bab-el-Mandeb"
791
+ ],
792
+ "ndcg@10": 1.0,
793
+ "recall@10": 1.0,
794
+ "precision@10": 0.1
795
+ },
796
+ "q8": {
797
+ "query": "Which port congested during 2021 supply chain crisis?",
798
+ "gold": [
799
+ "Port_of_Los_Angeles"
800
+ ],
801
+ "top5": [
802
+ "2020\u20132023_global_chip_shortage",
803
+ "2021_Suez_Canal_obstruction",
804
+ "Port_of_Los_Angeles",
805
+ "Ever_Given",
806
+ "Supply_chain_attack"
807
+ ],
808
+ "ndcg@10": 0.5,
809
+ "recall@10": 1.0,
810
+ "precision@10": 0.1
811
+ },
812
+ "q9": {
813
+ "query": "What is the just-in-time manufacturing philosophy?",
814
+ "gold": [
815
+ "Just-in-time_manufacturing"
816
+ ],
817
+ "top5": [
818
+ "Just-in-time_manufacturing",
819
+ "Supply_chain_management",
820
+ "Inventory",
821
+ "Logistics",
822
+ "Semiconductor_industry"
823
+ ],
824
+ "ndcg@10": 1.0,
825
+ "recall@10": 1.0,
826
+ "precision@10": 0.1
827
+ },
828
+ "q10": {
829
+ "query": "What does the CHIPS Act allocate?",
830
+ "gold": [
831
+ "CHIPS_and_Science_Act"
832
+ ],
833
+ "top5": [
834
+ "CHIPS_and_Science_Act",
835
+ "2020\u20132023_global_chip_shortage",
836
+ "Semiconductor_industry",
837
+ "TSMC",
838
+ "Supply_chain_attack"
839
+ ],
840
+ "ndcg@10": 1.0,
841
+ "recall@10": 1.0,
842
+ "precision@10": 0.1
843
+ },
844
+ "q11": {
845
+ "query": "Who is Foxconn's primary customer?",
846
+ "gold": [
847
+ "Foxconn"
848
+ ],
849
+ "top5": [
850
+ "Foxconn",
851
+ "TSMC",
852
+ "Semiconductor_industry",
853
+ "2020\u20132023_global_chip_shortage",
854
+ "Supply_chain_management"
855
+ ],
856
+ "ndcg@10": 1.0,
857
+ "recall@10": 1.0,
858
+ "precision@10": 0.1
859
+ },
860
+ "q12": {
861
+ "query": "Why did the Ever Given run aground?",
862
+ "gold": [
863
+ "Ever_Given",
864
+ "2021_Suez_Canal_obstruction"
865
+ ],
866
+ "top5": [
867
+ "Ever_Given",
868
+ "2021_Suez_Canal_obstruction",
869
+ "Bab-el-Mandeb",
870
+ "Strait_of_Hormuz",
871
+ "Container_ship"
872
+ ],
873
+ "ndcg@10": 1.0,
874
+ "recall@10": 1.0,
875
+ "precision@10": 0.2
876
+ },
877
+ "q13": {
878
+ "query": "What is safety stock?",
879
+ "gold": [
880
+ "Inventory"
881
+ ],
882
+ "top5": [
883
+ "Inventory",
884
+ "Supply_chain_attack",
885
+ "Bullwhip_effect",
886
+ "Logistics",
887
+ "Baltic_Dry_Index"
888
+ ],
889
+ "ndcg@10": 1.0,
890
+ "recall@10": 1.0,
891
+ "precision@10": 0.1
892
+ },
893
+ "q14": {
894
+ "query": "What is a supply chain attack?",
895
+ "gold": [
896
+ "Supply_chain_attack"
897
+ ],
898
+ "top5": [
899
+ "Supply_chain_attack",
900
+ "Supply_chain_management",
901
+ "Bullwhip_effect",
902
+ "Logistics",
903
+ "2020\u20132023_global_chip_shortage"
904
+ ],
905
+ "ndcg@10": 1.0,
906
+ "recall@10": 1.0,
907
+ "precision@10": 0.1
908
+ },
909
+ "q15": {
910
+ "query": "How busy is the Port of Singapore?",
911
+ "gold": [
912
+ "Port_of_Singapore"
913
+ ],
914
+ "top5": [
915
+ "Port_of_Singapore",
916
+ "Strait_of_Malacca",
917
+ "Port_of_Los_Angeles",
918
+ "Container_ship",
919
+ "2021_Suez_Canal_obstruction"
920
+ ],
921
+ "ndcg@10": 1.0,
922
+ "recall@10": 1.0,
923
+ "precision@10": 0.1
924
+ },
925
+ "q16": {
926
+ "query": "Which strait is a narrow Indonesia-Malaysia chokepoint?",
927
+ "gold": [
928
+ "Strait_of_Malacca"
929
+ ],
930
+ "top5": [
931
+ "Strait_of_Malacca",
932
+ "Strait_of_Hormuz",
933
+ "Bab-el-Mandeb",
934
+ "Port_of_Singapore",
935
+ "Suez_Canal"
936
+ ],
937
+ "ndcg@10": 1.0,
938
+ "recall@10": 1.0,
939
+ "precision@10": 0.1
940
+ },
941
+ "q17": {
942
+ "query": "Which industry does the Baltic Dry Index track?",
943
+ "gold": [
944
+ "Baltic_Dry_Index"
945
+ ],
946
+ "top5": [
947
+ "Baltic_Dry_Index",
948
+ "Inventory",
949
+ "Logistics",
950
+ "Semiconductor_industry",
951
+ "Enterprise_resource_planning"
952
+ ],
953
+ "ndcg@10": 1.0,
954
+ "recall@10": 1.0,
955
+ "precision@10": 0.1
956
+ },
957
+ "q18": {
958
+ "query": "What function does a warehouse serve?",
959
+ "gold": [
960
+ "Warehouse"
961
+ ],
962
+ "top5": [
963
+ "Warehouse",
964
+ "Inventory",
965
+ "Logistics",
966
+ "Supply_chain_management",
967
+ "Enterprise_resource_planning"
968
+ ],
969
+ "ndcg@10": 1.0,
970
+ "recall@10": 1.0,
971
+ "precision@10": 0.1
972
+ },
973
+ "q19": {
974
+ "query": "What is a container ship's TEU?",
975
+ "gold": [
976
+ "Container_ship"
977
+ ],
978
+ "top5": [
979
+ "Container_ship",
980
+ "Ever_Given",
981
+ "Inventory",
982
+ "2021_Suez_Canal_obstruction",
983
+ "Baltic_Dry_Index"
984
+ ],
985
+ "ndcg@10": 1.0,
986
+ "recall@10": 1.0,
987
+ "precision@10": 0.1
988
+ },
989
+ "q20": {
990
+ "query": "What software replaces accounting + inventory + HR systems?",
991
+ "gold": [
992
+ "Enterprise_resource_planning"
993
+ ],
994
+ "top5": [
995
+ "Enterprise_resource_planning",
996
+ "Inventory",
997
+ "Supply_chain_management",
998
+ "Logistics",
999
+ "Supply_chain_attack"
1000
+ ],
1001
+ "ndcg@10": 1.0,
1002
+ "recall@10": 1.0,
1003
+ "precision@10": 0.1
1004
+ }
1005
+ }
1006
+ }
1007
+ },
1008
+ "public_ref_nfcorpus": {
1009
+ "mxbai-embed-large-v1": {
1010
+ "ndcg@10_nfcorpus": 0.386,
1011
+ "source": "MTEB retrieval leaderboard 2024"
1012
+ },
1013
+ "bge-m3": {
1014
+ "ndcg@10_nfcorpus": 0.357,
1015
+ "source": "BGE-M3 paper + MTEB"
1016
+ },
1017
+ "snowflake-arctic-l": {
1018
+ "ndcg@10_nfcorpus": 0.348,
1019
+ "source": "Snowflake Arctic paper"
1020
+ }
1021
+ },
1022
+ "elapsed_min": 1.861957597732544
1023
  }
FINAL_SUBMIT/receipts/R5_GRANITE.json CHANGED
The diff for this file is too large to render. See raw diff
 
FINAL_SUBMIT/receipts/R5_GRANITE_HARD.json CHANGED
The diff for this file is too large to render. See raw diff
 
FINAL_SUBMIT/receipts/R6_ALGO_COMPARISON.json CHANGED
@@ -1,72 +1,72 @@
1
- {
2
- "task": "easy_typhoon_response",
3
- "training_timesteps": 100000,
4
- "eval_episodes": 50,
5
- "per_algorithm": {
6
- "MaskablePPO": {
7
- "algorithm": "MaskablePPO",
8
- "n_episodes": 50,
9
- "reward_mean": 1.2005000000000001,
10
- "reward_std": 0.19939637032804786,
11
- "reward_min": 0.643,
12
- "reward_max": 1.3435000000000004,
13
- "length_mean": 20.0,
14
- "violations_mean": 0.0,
15
- "invalid_action_picks_mean_per_ep": 0.0
16
- },
17
- "PPO": {
18
- "algorithm": "PPO",
19
- "n_episodes": 50,
20
- "reward_mean": 0.9470000000000001,
21
- "reward_std": 0.1244727781484771,
22
- "reward_min": 0.5895,
23
- "reward_max": 1.0760000000000003,
24
- "length_mean": 20.0,
25
- "violations_mean": 0.0,
26
- "invalid_action_picks_mean_per_ep": 13.64
27
- },
28
- "A2C": {
29
- "algorithm": "A2C",
30
- "n_episodes": 50,
31
- "reward_mean": 0.8738700000000001,
32
- "reward_std": 0.11796597221232909,
33
- "reward_min": 0.5359999999999999,
34
- "reward_max": 0.9690000000000002,
35
- "length_mean": 20.0,
36
- "violations_mean": 0.0,
37
- "invalid_action_picks_mean_per_ep": 13.88
38
- },
39
- "RecurrentPPO": {
40
- "algorithm": "RecurrentPPO",
41
- "n_episodes": 50,
42
- "reward_mean": 1.0806900000000002,
43
- "reward_std": 0.19626869694375626,
44
- "reward_min": 0.7499999999999999,
45
- "reward_max": 1.3470000000000004,
46
- "length_mean": 20.0,
47
- "violations_mean": 0.0,
48
- "invalid_action_picks_mean_per_ep": 14.86
49
- }
50
- },
51
- "train_times_min": {
52
- "MaskablePPO": 10.99298940896988,
53
- "PPO": 8.347426931063334,
54
- "A2C": 9.913969707489013,
55
- "RecurrentPPO": 16.337928581237794
56
- },
57
- "maskable_vs_others": {
58
- "PPO": {
59
- "reward_delta": -0.25350000000000006,
60
- "maskable_lift_pct": 26.768743400211196
61
- },
62
- "A2C": {
63
- "reward_delta": -0.32663,
64
- "maskable_lift_pct": 37.377413116367414
65
- },
66
- "RecurrentPPO": {
67
- "reward_delta": -0.11980999999999997,
68
- "maskable_lift_pct": 11.08643551804865
69
- }
70
- },
71
- "elapsed_min": 45.86821995576223
72
  }
 
1
+ {
2
+ "task": "easy_typhoon_response",
3
+ "training_timesteps": 100000,
4
+ "eval_episodes": 50,
5
+ "per_algorithm": {
6
+ "MaskablePPO": {
7
+ "algorithm": "MaskablePPO",
8
+ "n_episodes": 50,
9
+ "reward_mean": 1.2005000000000001,
10
+ "reward_std": 0.19939637032804786,
11
+ "reward_min": 0.643,
12
+ "reward_max": 1.3435000000000004,
13
+ "length_mean": 20.0,
14
+ "violations_mean": 0.0,
15
+ "invalid_action_picks_mean_per_ep": 0.0
16
+ },
17
+ "PPO": {
18
+ "algorithm": "PPO",
19
+ "n_episodes": 50,
20
+ "reward_mean": 0.9470000000000001,
21
+ "reward_std": 0.1244727781484771,
22
+ "reward_min": 0.5895,
23
+ "reward_max": 1.0760000000000003,
24
+ "length_mean": 20.0,
25
+ "violations_mean": 0.0,
26
+ "invalid_action_picks_mean_per_ep": 13.64
27
+ },
28
+ "A2C": {
29
+ "algorithm": "A2C",
30
+ "n_episodes": 50,
31
+ "reward_mean": 0.8738700000000001,
32
+ "reward_std": 0.11796597221232909,
33
+ "reward_min": 0.5359999999999999,
34
+ "reward_max": 0.9690000000000002,
35
+ "length_mean": 20.0,
36
+ "violations_mean": 0.0,
37
+ "invalid_action_picks_mean_per_ep": 13.88
38
+ },
39
+ "RecurrentPPO": {
40
+ "algorithm": "RecurrentPPO",
41
+ "n_episodes": 50,
42
+ "reward_mean": 1.0806900000000002,
43
+ "reward_std": 0.19626869694375626,
44
+ "reward_min": 0.7499999999999999,
45
+ "reward_max": 1.3470000000000004,
46
+ "length_mean": 20.0,
47
+ "violations_mean": 0.0,
48
+ "invalid_action_picks_mean_per_ep": 14.86
49
+ }
50
+ },
51
+ "train_times_min": {
52
+ "MaskablePPO": 10.99298940896988,
53
+ "PPO": 8.347426931063334,
54
+ "A2C": 9.913969707489013,
55
+ "RecurrentPPO": 16.337928581237794
56
+ },
57
+ "maskable_vs_others": {
58
+ "PPO": {
59
+ "reward_delta": -0.25350000000000006,
60
+ "maskable_lift_pct": 26.768743400211196
61
+ },
62
+ "A2C": {
63
+ "reward_delta": -0.32663,
64
+ "maskable_lift_pct": 37.377413116367414
65
+ },
66
+ "RecurrentPPO": {
67
+ "reward_delta": -0.11980999999999997,
68
+ "maskable_lift_pct": 11.08643551804865
69
+ }
70
+ },
71
+ "elapsed_min": 45.86821995576223
72
  }
FINAL_SUBMIT/receipts/R6_AQUA_REGIA_V2.json CHANGED
@@ -1,860 +1,860 @@
1
- {
2
- "targets": [
3
- "DCOILWTICO",
4
- "DEXJPUS",
5
- "DEXUSEU",
6
- "DEXCHUS",
7
- "DEXKOUS"
8
- ],
9
- "horizon": 14,
10
- "confs": [
11
- 0.8,
12
- 0.9,
13
- 0.95
14
- ],
15
- "n_cal": 30,
16
- "n_test": 30,
17
- "results": {
18
- "DCOILWTICO": {
19
- "arima": {
20
- "forecaster": "arima",
21
- "n_cal": 30,
22
- "n_test": 30,
23
- "conf=0.8": {
24
- "nominal_coverage": 0.8,
25
- "bare_coverage_mean": 0.8095238095238094,
26
- "bare_width_mean": 10.867942261555571,
27
- "perhorizon_coverage_mean": 0.6857142857142856,
28
- "perhorizon_width_mean": 7.990994504643288,
29
- "pooled_coverage_mean": 0.6785714285714285,
30
- "pooled_width_mean": 8.029568159989491,
31
- "q_per_horizon": [
32
- 2.0917427692512547,
33
- 2.414564146929898,
34
- 3.49864771255762,
35
- 3.783403014989574,
36
- 3.6514825270864293,
37
- 3.410638918826429,
38
- 3.6483267386695672,
39
- 4.291356370865486,
40
- 4.148100512774434,
41
- 4.765242660767733,
42
- 4.798738782538393,
43
- 4.648753353034714,
44
- 5.111777984600735,
45
- 5.674186039610767
46
- ],
47
- "q_pooled": 4.014784079994747
48
- },
49
- "conf=0.9": {
50
- "nominal_coverage": 0.9,
51
- "bare_coverage_mean": 0.9214285714285715,
52
- "bare_width_mean": 13.948852880392929,
53
- "perhorizon_coverage_mean": 0.7809523809523811,
54
- "perhorizon_width_mean": 10.031165041917506,
55
- "pooled_coverage_mean": 0.7738095238095238,
56
- "pooled_width_mean": 10.167074585069713,
57
- "q_per_horizon": [
58
- 2.300277140003125,
59
- 4.097940221459595,
60
- 4.076376633492892,
61
- 4.703831136719856,
62
- 4.842398951063927,
63
- 5.337677242975467,
64
- 4.359396527417836,
65
- 6.151868291801264,
66
- 5.051950062063291,
67
- 5.854070590337393,
68
- 5.368481950759772,
69
- 5.284114635080698,
70
- 6.431339982770957,
71
- 6.3584319274764525
72
- ],
73
- "q_pooled": 5.0835372925348565
74
- },
75
- "conf=0.95": {
76
- "nominal_coverage": 0.95,
77
- "bare_coverage_mean": 0.9452380952380951,
78
- "bare_width_mean": 16.621083373775793,
79
- "perhorizon_coverage_mean": 0.9261904761904761,
80
- "perhorizon_width_mean": 14.611219531249459,
81
- "pooled_coverage_mean": 0.838095238095238,
82
- "pooled_width_mean": 12.16250013730463,
83
- "q_per_horizon": [
84
- 3.0531114213612582,
85
- 5.059338828648023,
86
- 5.697604686526287,
87
- 7.146009479872129,
88
- 5.3182905673299175,
89
- 7.39090190741959,
90
- 6.856329650125417,
91
- 7.199424687832007,
92
- 6.523429069811058,
93
- 6.548845442730201,
94
- 9.62406528058468,
95
- 8.603787092463286,
96
- 11.553679176235391,
97
- 11.703719427806988
98
- ],
99
- "q_pooled": 6.0812500686523165
100
- }
101
- },
102
- "chronos": {
103
- "forecaster": "chronos",
104
- "n_cal": 30,
105
- "n_test": 30,
106
- "conf=0.8": {
107
- "nominal_coverage": 0.8,
108
- "bare_coverage_mean": 0.7809523809523807,
109
- "bare_width_mean": 11.050525585810343,
110
- "perhorizon_coverage_mean": 0.6547619047619048,
111
- "perhorizon_width_mean": 8.338129283360074,
112
- "pooled_coverage_mean": 0.6452380952380952,
113
- "pooled_width_mean": 8.036834106445315,
114
- "q_per_horizon": [
115
- 2.1229774475097685,
116
- 2.4522241210937494,
117
- 3.261205139160154,
118
- 3.9071347045898435,
119
- 3.614091110229495,
120
- 3.6567034912109406,
121
- 3.993652496337887,
122
- 4.4286404418945295,
123
- 4.545238494873047,
124
- 5.274034423828127,
125
- 5.24025115966797,
126
- 4.8420919799804665,
127
- 5.316376342773438,
128
- 5.71228363037109
129
- ],
130
- "q_pooled": 4.018417053222656
131
- },
132
- "conf=0.9": {
133
- "nominal_coverage": 0.9,
134
- "bare_coverage_mean": 0.7809523809523807,
135
- "bare_width_mean": 11.050525585810343,
136
- "perhorizon_coverage_mean": 0.7880952380952381,
137
- "perhorizon_width_mean": 11.069673222133089,
138
- "pooled_coverage_mean": 0.769047619047619,
139
- "pooled_width_mean": 10.63275268554687,
140
- "q_per_horizon": [
141
- 2.555929565429693,
142
- 3.5912300109863295,
143
- 4.3903402709960915,
144
- 5.24416809082031,
145
- 4.982480926513674,
146
- 5.137361450195314,
147
- 5.586841278076172,
148
- 6.765305328369138,
149
- 6.67245574951172,
150
- 5.990972595214842,
151
- 5.718290405273436,
152
- 5.943902282714845,
153
- 7.989523162841799,
154
- 6.918911437988278
155
- ],
156
- "q_pooled": 5.316376342773438
157
- },
158
- "conf=0.95": {
159
- "nominal_coverage": 0.95,
160
- "bare_coverage_mean": 0.7809523809523807,
161
- "bare_width_mean": 11.050525585810343,
162
- "perhorizon_coverage_mean": 0.9261904761904761,
163
- "perhorizon_width_mean": 16.372548740931915,
164
- "pooled_coverage_mean": 0.8547619047619047,
165
- "pooled_width_mean": 13.761851806640617,
166
- "q_per_horizon": [
167
- 4.500623779296873,
168
- 5.796702575683597,
169
- 4.578687438964849,
170
- 5.983569641113277,
171
- 7.369260253906248,
172
- 8.649095764160151,
173
- 8.18119262695312,
174
- 9.151351928710938,
175
- 8.256888427734381,
176
- 8.666538696289066,
177
- 10.109675750732421,
178
- 9.065566864013675,
179
- 12.079234161376952,
180
- 12.219453277587888
181
- ],
182
- "q_pooled": 6.8809259033203105
183
- }
184
- }
185
- },
186
- "DEXJPUS": {
187
- "arima": {
188
- "forecaster": "arima",
189
- "n_cal": 30,
190
- "n_test": 30,
191
- "conf=0.8": {
192
- "nominal_coverage": 0.8,
193
- "bare_coverage_mean": 0.6357142857142856,
194
- "bare_width_mean": 4.436568793595841,
195
- "perhorizon_coverage_mean": 0.45238095238095233,
196
- "perhorizon_width_mean": 2.8685092642157013,
197
- "pooled_coverage_mean": 0.4928571428571428,
198
- "pooled_width_mean": 2.791173769264077,
199
- "q_per_horizon": [
200
- 0.495163456754355,
201
- 0.8623131555344372,
202
- 0.8897926642558076,
203
- 1.1482011742546945,
204
- 1.28795516679331,
205
- 1.6477655987067266,
206
- 1.7443474583408118,
207
- 1.5384895904415004,
208
- 1.803162688834604,
209
- 1.7685075068830685,
210
- 1.7186420091775432,
211
- 1.5470661555772267,
212
- 1.888659928991629,
213
- 1.7394982949641928
214
- ],
215
- "q_pooled": 1.3955868846320385
216
- },
217
- "conf=0.9": {
218
- "nominal_coverage": 0.9,
219
- "bare_coverage_mean": 0.7738095238095236,
220
- "bare_width_mean": 5.694274399535953,
221
- "perhorizon_coverage_mean": 0.5761904761904761,
222
- "perhorizon_width_mean": 3.798189452444865,
223
- "pooled_coverage_mean": 0.5809523809523809,
224
- "pooled_width_mean": 3.8189608293080823,
225
- "q_per_horizon": [
226
- 0.602618663621783,
227
- 1.5464872564533323,
228
- 1.410577522130609,
229
- 2.006457013067674,
230
- 1.9326982798289691,
231
- 1.871741039728505,
232
- 1.8724724170933484,
233
- 2.0184353738183205,
234
- 2.057205707305812,
235
- 2.300998677577681,
236
- 2.4584763121956854,
237
- 2.2610349692604643,
238
- 2.141044083930069,
239
- 2.1070788511018037
240
- ],
241
- "q_pooled": 1.9094804146540412
242
- },
243
- "conf=0.95": {
244
- "nominal_coverage": 0.95,
245
- "bare_coverage_mean": 0.8738095238095237,
246
- "bare_width_mean": 6.7851464460479765,
247
- "perhorizon_coverage_mean": 0.8023809523809523,
248
- "perhorizon_width_mean": 6.101635459825262,
249
- "pooled_coverage_mean": 0.6571428571428571,
250
- "pooled_width_mean": 4.601997355155362,
251
- "q_per_horizon": [
252
- 0.9380858484970958,
253
- 2.323515167056655,
254
- 1.946219636173069,
255
- 2.2116051075864647,
256
- 2.7206754280723686,
257
- 3.562227529556367,
258
- 3.502961358052417,
259
- 3.5922479170316564,
260
- 4.142317883234554,
261
- 4.062380770386838,
262
- 3.5722844723094056,
263
- 3.2623018774721544,
264
- 3.212317495709044,
265
- 3.6623077276387335
266
- ],
267
- "q_pooled": 2.300998677577681
268
- }
269
- },
270
- "chronos": {
271
- "forecaster": "chronos",
272
- "n_cal": 30,
273
- "n_test": 30,
274
- "conf=0.8": {
275
- "nominal_coverage": 0.8,
276
- "bare_coverage_mean": 0.7309523809523808,
277
- "bare_width_mean": 5.977349718411763,
278
- "perhorizon_coverage_mean": 0.47380952380952385,
279
- "perhorizon_width_mean": 3.038026166643411,
280
- "pooled_coverage_mean": 0.49761904761904757,
281
- "pooled_width_mean": 2.8918725585937466,
282
- "q_per_horizon": [
283
- 0.5868325805664085,
284
- 0.8268566894531233,
285
- 0.8645288085937466,
286
- 1.1490182495117125,
287
- 1.4187112426757835,
288
- 1.667842102050784,
289
- 1.8516342163085966,
290
- 1.6831582641601557,
291
- 1.5933966064453102,
292
- 1.7942288208007824,
293
- 2.1771484374999943,
294
- 1.8165200805664057,
295
- 1.8638430786132858,
296
- 1.9724639892578182
297
- ],
298
- "q_pooled": 1.4459362792968733
299
- },
300
- "conf=0.9": {
301
- "nominal_coverage": 0.9,
302
- "bare_coverage_mean": 0.7309523809523808,
303
- "bare_width_mean": 5.977349718411763,
304
- "perhorizon_coverage_mean": 0.6071428571428572,
305
- "perhorizon_width_mean": 4.111253226143984,
306
- "pooled_coverage_mean": 0.6023809523809524,
307
- "pooled_width_mean": 4.0517645263671795,
308
- "q_per_horizon": [
309
- 0.7398001098632818,
310
- 1.542530517578129,
311
- 1.4136145019531199,
312
- 2.0581530761718767,
313
- 1.8112579345703068,
314
- 2.3215438842773466,
315
- 2.0993005371093716,
316
- 2.064953918457036,
317
- 2.4423132324218813,
318
- 2.698671264648439,
319
- 2.4562600708007807,
320
- 2.32724975585937,
321
- 2.5256872558593813,
322
- 2.277436523437501
323
- ],
324
- "q_pooled": 2.0258822631835898
325
- },
326
- "conf=0.95": {
327
- "nominal_coverage": 0.95,
328
- "bare_coverage_mean": 0.7309523809523808,
329
- "bare_width_mean": 5.977349718411763,
330
- "perhorizon_coverage_mean": 0.7190476190476188,
331
- "perhorizon_width_mean": 5.96463936941964,
332
- "pooled_coverage_mean": 0.6809523809523809,
333
- "pooled_width_mean": 5.0513745117187625,
334
- "q_per_horizon": [
335
- 0.930439453125004,
336
- 2.665478515624997,
337
- 1.9302044677734358,
338
- 2.0884591674804653,
339
- 2.7411437988281193,
340
- 3.6284613037109352,
341
- 3.513445739746089,
342
- 3.5274569702148426,
343
- 4.001575012207027,
344
- 3.9003729248046852,
345
- 3.2779876708984403,
346
- 3.0333639526367193,
347
- 3.0030249023437534,
348
- 3.511061706542975
349
- ],
350
- "q_pooled": 2.5256872558593813
351
- }
352
- }
353
- },
354
- "DEXUSEU": {
355
- "arima": {
356
- "forecaster": "arima",
357
- "n_cal": 30,
358
- "n_test": 30,
359
- "conf=0.8": {
360
- "nominal_coverage": 0.8,
361
- "bare_coverage_mean": 0.8595238095238095,
362
- "bare_width_mean": 0.037255051394705835,
363
- "perhorizon_coverage_mean": 0.811904761904762,
364
- "perhorizon_width_mean": 0.03243267317446737,
365
- "pooled_coverage_mean": 0.8166666666666665,
366
- "pooled_width_mean": 0.031645107249388627,
367
- "q_per_horizon": [
368
- 0.006537154478817753,
369
- 0.007333177556922088,
370
- 0.012312774872748289,
371
- 0.014043924961390397,
372
- 0.016017799097016727,
373
- 0.015644421534730224,
374
- 0.016336252170641608,
375
- 0.016122979608933496,
376
- 0.01964457489050009,
377
- 0.02072169154979453,
378
- 0.024118006869554565,
379
- 0.018656617879449167,
380
- 0.017769218599013037,
381
- 0.021770118151759554
382
- ],
383
- "q_pooled": 0.015822553624694313
384
- },
385
- "conf=0.9": {
386
- "nominal_coverage": 0.9,
387
- "bare_coverage_mean": 0.9142857142857144,
388
- "bare_width_mean": 0.047816340798432555,
389
- "perhorizon_coverage_mean": 0.8904761904761905,
390
- "perhorizon_width_mean": 0.04285578362084427,
391
- "pooled_coverage_mean": 0.8809523809523809,
392
- "pooled_width_mean": 0.041073044538626924,
393
- "q_per_horizon": [
394
- 0.006761841674864266,
395
- 0.01182171512244512,
396
- 0.015822553624694313,
397
- 0.02093465874643763,
398
- 0.019889187414578124,
399
- 0.01963882946285489,
400
- 0.02190089656490879,
401
- 0.021692702530445862,
402
- 0.024590684771490512,
403
- 0.024756601121440625,
404
- 0.02609594060524123,
405
- 0.02889462135779275,
406
- 0.02689529861576956,
407
- 0.030294953732946217
408
- ],
409
- "q_pooled": 0.020536522269313462
410
- },
411
- "conf=0.95": {
412
- "nominal_coverage": 0.95,
413
- "bare_coverage_mean": 0.9380952380952381,
414
- "bare_width_mean": 0.05697668430905675,
415
- "perhorizon_coverage_mean": 0.9404761904761906,
416
- "perhorizon_width_mean": 0.05919364307194989,
417
- "pooled_coverage_mean": 0.9119047619047618,
418
- "pooled_width_mean": 0.05176715217769701,
419
- "q_per_horizon": [
420
- 0.011752772972313252,
421
- 0.01247253748338717,
422
- 0.01748801536532918,
423
- 0.02383577073487353,
424
- 0.02364315675893547,
425
- 0.02218707632552186,
426
- 0.03203504055001494,
427
- 0.030332454296178923,
428
- 0.03750274950896193,
429
- 0.03613221732608629,
430
- 0.039232376756770826,
431
- 0.04010448928765342,
432
- 0.04080440634480942,
433
- 0.046832437792812875
434
- ],
435
- "q_pooled": 0.025883576088848503
436
- }
437
- },
438
- "chronos": {
439
- "forecaster": "chronos",
440
- "n_cal": 30,
441
- "n_test": 30,
442
- "conf=0.8": {
443
- "nominal_coverage": 0.8,
444
- "bare_coverage_mean": 0.8,
445
- "bare_width_mean": 0.03301220412055651,
446
- "perhorizon_coverage_mean": 0.8071428571428574,
447
- "perhorizon_width_mean": 0.03432217042105538,
448
- "pooled_coverage_mean": 0.8000000000000002,
449
- "pooled_width_mean": 0.03300358161926287,
450
- "q_per_horizon": [
451
- 0.004584144783019939,
452
- 0.007060681152343706,
453
- 0.01243185882568354,
454
- 0.01602103652954101,
455
- 0.01641003990173351,
456
- 0.015545682907104563,
457
- 0.018368010711669935,
458
- 0.01898662319183342,
459
- 0.022148969459533596,
460
- 0.02255078582763681,
461
- 0.023978458976745554,
462
- 0.020319693946838413,
463
- 0.017313012123107985,
464
- 0.024536194610595752
465
- ],
466
- "q_pooled": 0.016501790809631434
467
- },
468
- "conf=0.9": {
469
- "nominal_coverage": 0.9,
470
- "bare_coverage_mean": 0.8,
471
- "bare_width_mean": 0.03301220412055651,
472
- "perhorizon_coverage_mean": 0.9190476190476191,
473
- "perhorizon_width_mean": 0.05077633157457622,
474
- "pooled_coverage_mean": 0.8904761904761905,
475
- "pooled_width_mean": 0.04548504829406719,
476
- "q_per_horizon": [
477
- 0.008554865837097081,
478
- 0.00971177463531503,
479
- 0.01530143814086915,
480
- 0.01911055355072011,
481
- 0.01780367832183849,
482
- 0.021554478836059543,
483
- 0.026538812255859412,
484
- 0.027544754409789984,
485
- 0.028936708450317372,
486
- 0.03478273067474369,
487
- 0.0382537099838256,
488
- 0.03136329650878911,
489
- 0.0327265468597413,
490
- 0.04325097255706778
491
- ],
492
- "q_pooled": 0.022742524147033594
493
- },
494
- "conf=0.95": {
495
- "nominal_coverage": 0.95,
496
- "bare_coverage_mean": 0.8,
497
- "bare_width_mean": 0.03301220412055651,
498
- "perhorizon_coverage_mean": 0.9404761904761905,
499
- "perhorizon_width_mean": 0.0633313385554722,
500
- "pooled_coverage_mean": 0.9547619047619046,
501
- "pooled_width_mean": 0.06135401725769052,
502
- "q_per_horizon": [
503
- 0.011944815063476666,
504
- 0.01392391796112058,
505
- 0.017532272148132355,
506
- 0.022742524147033594,
507
- 0.02558988399505613,
508
- 0.02623647480010982,
509
- 0.03067700862884526,
510
- 0.034072942352294966,
511
- 0.04179227085113535,
512
- 0.0389519283294677,
513
- 0.042779201126098565,
514
- 0.04429976444244388,
515
- 0.044917986869811966,
516
- 0.04785837917327873
517
- ],
518
- "q_pooled": 0.03067700862884526
519
- }
520
- }
521
- },
522
- "DEXCHUS": {
523
- "arima": {
524
- "forecaster": "arima",
525
- "n_cal": 30,
526
- "n_test": 30,
527
- "conf=0.8": {
528
- "nominal_coverage": 0.8,
529
- "bare_coverage_mean": 0.8309523809523809,
530
- "bare_width_mean": 0.12023258914287749,
531
- "perhorizon_coverage_mean": 0.8,
532
- "perhorizon_width_mean": 0.10379373004234645,
533
- "pooled_coverage_mean": 0.7833333333333333,
534
- "pooled_width_mean": 0.0905579673492376,
535
- "q_per_horizon": [
536
- 0.01913552539082275,
537
- 0.021503803498270635,
538
- 0.03202273363733443,
539
- 0.04471228016293516,
540
- 0.04595743067166769,
541
- 0.057142529866381686,
542
- 0.041567074905930035,
543
- 0.05922440211999547,
544
- 0.06055238630005544,
545
- 0.06195863987337091,
546
- 0.07735612435271388,
547
- 0.07482211423245033,
548
- 0.0613510301071134,
549
- 0.06925003517738304
550
- ],
551
- "q_pooled": 0.0452789836746188
552
- },
553
- "conf=0.9": {
554
- "nominal_coverage": 0.9,
555
- "bare_coverage_mean": 0.8761904761904763,
556
- "bare_width_mean": 0.1543168575080998,
557
- "perhorizon_coverage_mean": 0.8857142857142858,
558
- "perhorizon_width_mean": 0.1694623051285068,
559
- "pooled_coverage_mean": 0.8833333333333333,
560
- "pooled_width_mean": 0.14964422846490066,
561
- "q_per_horizon": [
562
- 0.026065770883445083,
563
- 0.03663070092160048,
564
- 0.04814005922096687,
565
- 0.05434837199719045,
566
- 0.06341843160370875,
567
- 0.06742875148755179,
568
- 0.08909509445192665,
569
- 0.09169474000207156,
570
- 0.11607218346504666,
571
- 0.12686121412365825,
572
- 0.11025109977698122,
573
- 0.12555183014476246,
574
- 0.11555182580724122,
575
- 0.11512606201339626
576
- ],
577
- "q_pooled": 0.07482211423245033
578
- },
579
- "conf=0.95": {
580
- "nominal_coverage": 0.95,
581
- "bare_coverage_mean": 0.9142857142857144,
582
- "bare_width_mean": 0.18387987719237844,
583
- "perhorizon_coverage_mean": 0.9523809523809524,
584
- "perhorizon_width_mean": 0.2451580685008066,
585
- "pooled_coverage_mean": 0.9285714285714286,
586
- "pooled_width_mean": 0.22228302327474836,
587
- "q_per_horizon": [
588
- 0.032681838125458995,
589
- 0.07173662444320072,
590
- 0.06519382424998543,
591
- 0.06079908928748701,
592
- 0.09872806564422376,
593
- 0.10867467864500302,
594
- 0.11114151163737418,
595
- 0.14390234892072673,
596
- 0.14109477023066574,
597
- 0.1721305319733375,
598
- 0.17782669739203882,
599
- 0.18559857212707964,
600
- 0.17849914242157627,
601
- 0.16809878440748793
602
- ],
603
- "q_pooled": 0.11114151163737418
604
- }
605
- },
606
- "chronos": {
607
- "forecaster": "chronos",
608
- "n_cal": 30,
609
- "n_test": 30,
610
- "conf=0.8": {
611
- "nominal_coverage": 0.8,
612
- "bare_coverage_mean": 0.8428571428571429,
613
- "bare_width_mean": 0.11959348532060782,
614
- "perhorizon_coverage_mean": 0.7833333333333333,
615
- "perhorizon_width_mean": 0.10019261191231878,
616
- "pooled_coverage_mean": 0.8,
617
- "pooled_width_mean": 0.09779591979980395,
618
- "q_per_horizon": [
619
- 0.025188607788085626,
620
- 0.02532754745483423,
621
- 0.03890764770507804,
622
- 0.043802440643310625,
623
- 0.04915690460205102,
624
- 0.04680775070190446,
625
- 0.03916668243408239,
626
- 0.04809946746826199,
627
- 0.0576093139648437,
628
- 0.06108116531372065,
629
- 0.05864996337890638,
630
- 0.06179137878417951,
631
- 0.0701272941589357,
632
- 0.0756321189880369
633
- ],
634
- "q_pooled": 0.04889795989990198
635
- },
636
- "conf=0.9": {
637
- "nominal_coverage": 0.9,
638
- "bare_coverage_mean": 0.8428571428571429,
639
- "bare_width_mean": 0.11959348532060782,
640
- "perhorizon_coverage_mean": 0.869047619047619,
641
- "perhorizon_width_mean": 0.16607914559500545,
642
- "pooled_coverage_mean": 0.861904761904762,
643
- "pooled_width_mean": 0.1402545883178714,
644
- "q_per_horizon": [
645
- 0.030081840515136626,
646
- 0.04935519256591814,
647
- 0.046391881561278936,
648
- 0.050782734680176134,
649
- 0.06024611434936489,
650
- 0.06782592163085965,
651
- 0.08113353042602522,
652
- 0.09840077590942364,
653
- 0.11880251922607421,
654
- 0.12758038635253932,
655
- 0.10697886581420857,
656
- 0.12221163177490268,
657
- 0.10586601409912078,
658
- 0.09689661026000973
659
- ],
660
- "q_pooled": 0.0701272941589357
661
- },
662
- "conf=0.95": {
663
- "nominal_coverage": 0.95,
664
- "bare_coverage_mean": 0.8428571428571429,
665
- "bare_width_mean": 0.11959348532060782,
666
- "perhorizon_coverage_mean": 0.9214285714285714,
667
- "perhorizon_width_mean": 0.22292400338309162,
668
- "pooled_coverage_mean": 0.9095238095238095,
669
- "pooled_width_mean": 0.2085365203857421,
670
- "q_per_horizon": [
671
- 0.03159678268432575,
672
- 0.07481312255859418,
673
- 0.07034568023681675,
674
- 0.05222851562499997,
675
- 0.070854161071777,
676
- 0.09303555068969693,
677
- 0.08751402359008775,
678
- 0.13737474822998053,
679
- 0.1317485343933109,
680
- 0.15814713668823277,
681
- 0.1641494514465336,
682
- 0.1720175582885739,
683
- 0.16296061859130884,
684
- 0.15368213958740196
685
- ],
686
- "q_pooled": 0.10426826019287105
687
- }
688
- }
689
- },
690
- "DEXKOUS": {
691
- "arima": {
692
- "forecaster": "arima",
693
- "n_cal": 30,
694
- "n_test": 30,
695
- "conf=0.8": {
696
- "nominal_coverage": 0.8,
697
- "bare_coverage_mean": 0.7071428571428572,
698
- "bare_width_mean": 41.40702231782995,
699
- "perhorizon_coverage_mean": 0.6809523809523808,
700
- "perhorizon_width_mean": 40.33834903476961,
701
- "pooled_coverage_mean": 0.738095238095238,
702
- "pooled_width_mean": 40.174430225697506,
703
- "q_per_horizon": [
704
- 6.019828757339383,
705
- 9.23651622262787,
706
- 11.885457212575375,
707
- 14.301239776206785,
708
- 16.538830978627857,
709
- 21.11794087612452,
710
- 21.007107424806236,
711
- 22.089443667480282,
712
- 22.26134568228099,
713
- 25.115703414253176,
714
- 26.282158971560648,
715
- 28.31230917980338,
716
- 28.622331265376488,
717
- 29.57822981432423
718
- ],
719
- "q_pooled": 20.087215112848753
720
- },
721
- "conf=0.9": {
722
- "nominal_coverage": 0.9,
723
- "bare_coverage_mean": 0.8023809523809522,
724
- "bare_width_mean": 53.145337785764546,
725
- "perhorizon_coverage_mean": 0.7476190476190475,
726
- "perhorizon_width_mean": 47.514067959856646,
727
- "pooled_coverage_mean": 0.8166666666666665,
728
- "pooled_width_mean": 51.703697664495394,
729
- "q_per_horizon": [
730
- 7.042854649616629,
731
- 11.217728114270585,
732
- 13.051289508962782,
733
- 17.974908318198914,
734
- 22.696578397519033,
735
- 24.786648186653792,
736
- 23.205692899009136,
737
- 25.439228843483306,
738
- 28.745883742858496,
739
- 27.649073917800933,
740
- 32.25531441260455,
741
- 33.39915882237847,
742
- 32.317174372199815,
743
- 32.81694153344006
744
- ],
745
- "q_pooled": 25.851848832247697
746
- },
747
- "conf=0.95": {
748
- "nominal_coverage": 0.95,
749
- "bare_coverage_mean": 0.8952380952380953,
750
- "bare_width_mean": 63.326575872509096,
751
- "perhorizon_coverage_mean": 0.8833333333333332,
752
- "perhorizon_width_mean": 62.3317263081943,
753
- "pooled_coverage_mean": 0.861904761904762,
754
- "pooled_width_mean": 63.003314010262784,
755
- "q_per_horizon": [
756
- 12.416104342710696,
757
- 13.332090802595758,
758
- 20.658854986845654,
759
- 37.144614564726226,
760
- 31.230195571947434,
761
- 31.501657005131392,
762
- 31.466225645210898,
763
- 32.67178752649829,
764
- 41.05990019882688,
765
- 37.85425421989498,
766
- 37.08859079038166,
767
- 35.26046070337611,
768
- 40.538744747242845,
769
- 34.098603051971395
770
- ],
771
- "q_pooled": 31.501657005131392
772
- }
773
- },
774
- "chronos": {
775
- "forecaster": "chronos",
776
- "n_cal": 30,
777
- "n_test": 30,
778
- "conf=0.8": {
779
- "nominal_coverage": 0.8,
780
- "bare_coverage_mean": 0.7476190476190475,
781
- "bare_width_mean": 47.698866081237796,
782
- "perhorizon_coverage_mean": 0.669047619047619,
783
- "perhorizon_width_mean": 42.05718540736606,
784
- "pooled_coverage_mean": 0.7452380952380951,
785
- "pooled_width_mean": 43.94189453125,
786
- "q_per_horizon": [
787
- 6.6086572265624,
788
- 8.688681640624964,
789
- 11.395966796874973,
790
- 12.880576171874964,
791
- 17.0732275390626,
792
- 19.5968017578125,
793
- 19.40576171875,
794
- 24.150083007812555,
795
- 24.586870117187573,
796
- 26.251137695312536,
797
- 27.594218749999982,
798
- 32.349785156249936,
799
- 31.7150732421876,
800
- 32.103457031249945
801
- ],
802
- "q_pooled": 21.970947265625
803
- },
804
- "conf=0.9": {
805
- "nominal_coverage": 0.9,
806
- "bare_coverage_mean": 0.7476190476190475,
807
- "bare_width_mean": 47.698866081237796,
808
- "perhorizon_coverage_mean": 0.7714285714285712,
809
- "perhorizon_width_mean": 49.80674665178569,
810
- "pooled_coverage_mean": 0.8357142857142856,
811
- "pooled_width_mean": 56.23533203124998,
812
- "q_per_horizon": [
813
- 8.360268554687536,
814
- 12.467915039062518,
815
- 14.159082031249909,
816
- 18.2329248046874,
817
- 23.688662109374945,
818
- 25.474423828125055,
819
- 24.956616210937455,
820
- 26.577456054687445,
821
- 28.821977539062573,
822
- 30.2672265624999,
823
- 33.08205566406241,
824
- 33.05286621093751,
825
- 33.24584472656261,
826
- 36.25990722656252
827
- ],
828
- "q_pooled": 28.11766601562499
829
- },
830
- "conf=0.95": {
831
- "nominal_coverage": 0.95,
832
- "bare_coverage_mean": 0.7476190476190475,
833
- "bare_width_mean": 47.698866081237796,
834
- "perhorizon_coverage_mean": 0.8738095238095237,
835
- "perhorizon_width_mean": 65.5785993303571,
836
- "pooled_coverage_mean": 0.8666666666666666,
837
- "pooled_width_mean": 66.16411132812482,
838
- "q_per_horizon": [
839
- 14.446508789062591,
840
- 15.035361328124964,
841
- 21.486127929687427,
842
- 38.963662109375036,
843
- 33.86973144531248,
844
- 34.60525878906242,
845
- 33.86685546874992,
846
- 33.722353515624945,
847
- 41.170214843750045,
848
- 36.77112792968751,
849
- 37.77993652343753,
850
- 39.08779296874991,
851
- 39.80886230468741,
852
- 38.4364013671875
853
- ],
854
- "q_pooled": 33.08205566406241
855
- }
856
- }
857
- }
858
- },
859
- "elapsed_min": 1.141351056098938
860
  }
 
1
+ {
2
+ "targets": [
3
+ "DCOILWTICO",
4
+ "DEXJPUS",
5
+ "DEXUSEU",
6
+ "DEXCHUS",
7
+ "DEXKOUS"
8
+ ],
9
+ "horizon": 14,
10
+ "confs": [
11
+ 0.8,
12
+ 0.9,
13
+ 0.95
14
+ ],
15
+ "n_cal": 30,
16
+ "n_test": 30,
17
+ "results": {
18
+ "DCOILWTICO": {
19
+ "arima": {
20
+ "forecaster": "arima",
21
+ "n_cal": 30,
22
+ "n_test": 30,
23
+ "conf=0.8": {
24
+ "nominal_coverage": 0.8,
25
+ "bare_coverage_mean": 0.8095238095238094,
26
+ "bare_width_mean": 10.867942261555571,
27
+ "perhorizon_coverage_mean": 0.6857142857142856,
28
+ "perhorizon_width_mean": 7.990994504643288,
29
+ "pooled_coverage_mean": 0.6785714285714285,
30
+ "pooled_width_mean": 8.029568159989491,
31
+ "q_per_horizon": [
32
+ 2.0917427692512547,
33
+ 2.414564146929898,
34
+ 3.49864771255762,
35
+ 3.783403014989574,
36
+ 3.6514825270864293,
37
+ 3.410638918826429,
38
+ 3.6483267386695672,
39
+ 4.291356370865486,
40
+ 4.148100512774434,
41
+ 4.765242660767733,
42
+ 4.798738782538393,
43
+ 4.648753353034714,
44
+ 5.111777984600735,
45
+ 5.674186039610767
46
+ ],
47
+ "q_pooled": 4.014784079994747
48
+ },
49
+ "conf=0.9": {
50
+ "nominal_coverage": 0.9,
51
+ "bare_coverage_mean": 0.9214285714285715,
52
+ "bare_width_mean": 13.948852880392929,
53
+ "perhorizon_coverage_mean": 0.7809523809523811,
54
+ "perhorizon_width_mean": 10.031165041917506,
55
+ "pooled_coverage_mean": 0.7738095238095238,
56
+ "pooled_width_mean": 10.167074585069713,
57
+ "q_per_horizon": [
58
+ 2.300277140003125,
59
+ 4.097940221459595,
60
+ 4.076376633492892,
61
+ 4.703831136719856,
62
+ 4.842398951063927,
63
+ 5.337677242975467,
64
+ 4.359396527417836,
65
+ 6.151868291801264,
66
+ 5.051950062063291,
67
+ 5.854070590337393,
68
+ 5.368481950759772,
69
+ 5.284114635080698,
70
+ 6.431339982770957,
71
+ 6.3584319274764525
72
+ ],
73
+ "q_pooled": 5.0835372925348565
74
+ },
75
+ "conf=0.95": {
76
+ "nominal_coverage": 0.95,
77
+ "bare_coverage_mean": 0.9452380952380951,
78
+ "bare_width_mean": 16.621083373775793,
79
+ "perhorizon_coverage_mean": 0.9261904761904761,
80
+ "perhorizon_width_mean": 14.611219531249459,
81
+ "pooled_coverage_mean": 0.838095238095238,
82
+ "pooled_width_mean": 12.16250013730463,
83
+ "q_per_horizon": [
84
+ 3.0531114213612582,
85
+ 5.059338828648023,
86
+ 5.697604686526287,
87
+ 7.146009479872129,
88
+ 5.3182905673299175,
89
+ 7.39090190741959,
90
+ 6.856329650125417,
91
+ 7.199424687832007,
92
+ 6.523429069811058,
93
+ 6.548845442730201,
94
+ 9.62406528058468,
95
+ 8.603787092463286,
96
+ 11.553679176235391,
97
+ 11.703719427806988
98
+ ],
99
+ "q_pooled": 6.0812500686523165
100
+ }
101
+ },
102
+ "chronos": {
103
+ "forecaster": "chronos",
104
+ "n_cal": 30,
105
+ "n_test": 30,
106
+ "conf=0.8": {
107
+ "nominal_coverage": 0.8,
108
+ "bare_coverage_mean": 0.7809523809523807,
109
+ "bare_width_mean": 11.050525585810343,
110
+ "perhorizon_coverage_mean": 0.6547619047619048,
111
+ "perhorizon_width_mean": 8.338129283360074,
112
+ "pooled_coverage_mean": 0.6452380952380952,
113
+ "pooled_width_mean": 8.036834106445315,
114
+ "q_per_horizon": [
115
+ 2.1229774475097685,
116
+ 2.4522241210937494,
117
+ 3.261205139160154,
118
+ 3.9071347045898435,
119
+ 3.614091110229495,
120
+ 3.6567034912109406,
121
+ 3.993652496337887,
122
+ 4.4286404418945295,
123
+ 4.545238494873047,
124
+ 5.274034423828127,
125
+ 5.24025115966797,
126
+ 4.8420919799804665,
127
+ 5.316376342773438,
128
+ 5.71228363037109
129
+ ],
130
+ "q_pooled": 4.018417053222656
131
+ },
132
+ "conf=0.9": {
133
+ "nominal_coverage": 0.9,
134
+ "bare_coverage_mean": 0.7809523809523807,
135
+ "bare_width_mean": 11.050525585810343,
136
+ "perhorizon_coverage_mean": 0.7880952380952381,
137
+ "perhorizon_width_mean": 11.069673222133089,
138
+ "pooled_coverage_mean": 0.769047619047619,
139
+ "pooled_width_mean": 10.63275268554687,
140
+ "q_per_horizon": [
141
+ 2.555929565429693,
142
+ 3.5912300109863295,
143
+ 4.3903402709960915,
144
+ 5.24416809082031,
145
+ 4.982480926513674,
146
+ 5.137361450195314,
147
+ 5.586841278076172,
148
+ 6.765305328369138,
149
+ 6.67245574951172,
150
+ 5.990972595214842,
151
+ 5.718290405273436,
152
+ 5.943902282714845,
153
+ 7.989523162841799,
154
+ 6.918911437988278
155
+ ],
156
+ "q_pooled": 5.316376342773438
157
+ },
158
+ "conf=0.95": {
159
+ "nominal_coverage": 0.95,
160
+ "bare_coverage_mean": 0.7809523809523807,
161
+ "bare_width_mean": 11.050525585810343,
162
+ "perhorizon_coverage_mean": 0.9261904761904761,
163
+ "perhorizon_width_mean": 16.372548740931915,
164
+ "pooled_coverage_mean": 0.8547619047619047,
165
+ "pooled_width_mean": 13.761851806640617,
166
+ "q_per_horizon": [
167
+ 4.500623779296873,
168
+ 5.796702575683597,
169
+ 4.578687438964849,
170
+ 5.983569641113277,
171
+ 7.369260253906248,
172
+ 8.649095764160151,
173
+ 8.18119262695312,
174
+ 9.151351928710938,
175
+ 8.256888427734381,
176
+ 8.666538696289066,
177
+ 10.109675750732421,
178
+ 9.065566864013675,
179
+ 12.079234161376952,
180
+ 12.219453277587888
181
+ ],
182
+ "q_pooled": 6.8809259033203105
183
+ }
184
+ }
185
+ },
186
+ "DEXJPUS": {
187
+ "arima": {
188
+ "forecaster": "arima",
189
+ "n_cal": 30,
190
+ "n_test": 30,
191
+ "conf=0.8": {
192
+ "nominal_coverage": 0.8,
193
+ "bare_coverage_mean": 0.6357142857142856,
194
+ "bare_width_mean": 4.436568793595841,
195
+ "perhorizon_coverage_mean": 0.45238095238095233,
196
+ "perhorizon_width_mean": 2.8685092642157013,
197
+ "pooled_coverage_mean": 0.4928571428571428,
198
+ "pooled_width_mean": 2.791173769264077,
199
+ "q_per_horizon": [
200
+ 0.495163456754355,
201
+ 0.8623131555344372,
202
+ 0.8897926642558076,
203
+ 1.1482011742546945,
204
+ 1.28795516679331,
205
+ 1.6477655987067266,
206
+ 1.7443474583408118,
207
+ 1.5384895904415004,
208
+ 1.803162688834604,
209
+ 1.7685075068830685,
210
+ 1.7186420091775432,
211
+ 1.5470661555772267,
212
+ 1.888659928991629,
213
+ 1.7394982949641928
214
+ ],
215
+ "q_pooled": 1.3955868846320385
216
+ },
217
+ "conf=0.9": {
218
+ "nominal_coverage": 0.9,
219
+ "bare_coverage_mean": 0.7738095238095236,
220
+ "bare_width_mean": 5.694274399535953,
221
+ "perhorizon_coverage_mean": 0.5761904761904761,
222
+ "perhorizon_width_mean": 3.798189452444865,
223
+ "pooled_coverage_mean": 0.5809523809523809,
224
+ "pooled_width_mean": 3.8189608293080823,
225
+ "q_per_horizon": [
226
+ 0.602618663621783,
227
+ 1.5464872564533323,
228
+ 1.410577522130609,
229
+ 2.006457013067674,
230
+ 1.9326982798289691,
231
+ 1.871741039728505,
232
+ 1.8724724170933484,
233
+ 2.0184353738183205,
234
+ 2.057205707305812,
235
+ 2.300998677577681,
236
+ 2.4584763121956854,
237
+ 2.2610349692604643,
238
+ 2.141044083930069,
239
+ 2.1070788511018037
240
+ ],
241
+ "q_pooled": 1.9094804146540412
242
+ },
243
+ "conf=0.95": {
244
+ "nominal_coverage": 0.95,
245
+ "bare_coverage_mean": 0.8738095238095237,
246
+ "bare_width_mean": 6.7851464460479765,
247
+ "perhorizon_coverage_mean": 0.8023809523809523,
248
+ "perhorizon_width_mean": 6.101635459825262,
249
+ "pooled_coverage_mean": 0.6571428571428571,
250
+ "pooled_width_mean": 4.601997355155362,
251
+ "q_per_horizon": [
252
+ 0.9380858484970958,
253
+ 2.323515167056655,
254
+ 1.946219636173069,
255
+ 2.2116051075864647,
256
+ 2.7206754280723686,
257
+ 3.562227529556367,
258
+ 3.502961358052417,
259
+ 3.5922479170316564,
260
+ 4.142317883234554,
261
+ 4.062380770386838,
262
+ 3.5722844723094056,
263
+ 3.2623018774721544,
264
+ 3.212317495709044,
265
+ 3.6623077276387335
266
+ ],
267
+ "q_pooled": 2.300998677577681
268
+ }
269
+ },
270
+ "chronos": {
271
+ "forecaster": "chronos",
272
+ "n_cal": 30,
273
+ "n_test": 30,
274
+ "conf=0.8": {
275
+ "nominal_coverage": 0.8,
276
+ "bare_coverage_mean": 0.7309523809523808,
277
+ "bare_width_mean": 5.977349718411763,
278
+ "perhorizon_coverage_mean": 0.47380952380952385,
279
+ "perhorizon_width_mean": 3.038026166643411,
280
+ "pooled_coverage_mean": 0.49761904761904757,
281
+ "pooled_width_mean": 2.8918725585937466,
282
+ "q_per_horizon": [
283
+ 0.5868325805664085,
284
+ 0.8268566894531233,
285
+ 0.8645288085937466,
286
+ 1.1490182495117125,
287
+ 1.4187112426757835,
288
+ 1.667842102050784,
289
+ 1.8516342163085966,
290
+ 1.6831582641601557,
291
+ 1.5933966064453102,
292
+ 1.7942288208007824,
293
+ 2.1771484374999943,
294
+ 1.8165200805664057,
295
+ 1.8638430786132858,
296
+ 1.9724639892578182
297
+ ],
298
+ "q_pooled": 1.4459362792968733
299
+ },
300
+ "conf=0.9": {
301
+ "nominal_coverage": 0.9,
302
+ "bare_coverage_mean": 0.7309523809523808,
303
+ "bare_width_mean": 5.977349718411763,
304
+ "perhorizon_coverage_mean": 0.6071428571428572,
305
+ "perhorizon_width_mean": 4.111253226143984,
306
+ "pooled_coverage_mean": 0.6023809523809524,
307
+ "pooled_width_mean": 4.0517645263671795,
308
+ "q_per_horizon": [
309
+ 0.7398001098632818,
310
+ 1.542530517578129,
311
+ 1.4136145019531199,
312
+ 2.0581530761718767,
313
+ 1.8112579345703068,
314
+ 2.3215438842773466,
315
+ 2.0993005371093716,
316
+ 2.064953918457036,
317
+ 2.4423132324218813,
318
+ 2.698671264648439,
319
+ 2.4562600708007807,
320
+ 2.32724975585937,
321
+ 2.5256872558593813,
322
+ 2.277436523437501
323
+ ],
324
+ "q_pooled": 2.0258822631835898
325
+ },
326
+ "conf=0.95": {
327
+ "nominal_coverage": 0.95,
328
+ "bare_coverage_mean": 0.7309523809523808,
329
+ "bare_width_mean": 5.977349718411763,
330
+ "perhorizon_coverage_mean": 0.7190476190476188,
331
+ "perhorizon_width_mean": 5.96463936941964,
332
+ "pooled_coverage_mean": 0.6809523809523809,
333
+ "pooled_width_mean": 5.0513745117187625,
334
+ "q_per_horizon": [
335
+ 0.930439453125004,
336
+ 2.665478515624997,
337
+ 1.9302044677734358,
338
+ 2.0884591674804653,
339
+ 2.7411437988281193,
340
+ 3.6284613037109352,
341
+ 3.513445739746089,
342
+ 3.5274569702148426,
343
+ 4.001575012207027,
344
+ 3.9003729248046852,
345
+ 3.2779876708984403,
346
+ 3.0333639526367193,
347
+ 3.0030249023437534,
348
+ 3.511061706542975
349
+ ],
350
+ "q_pooled": 2.5256872558593813
351
+ }
352
+ }
353
+ },
354
+ "DEXUSEU": {
355
+ "arima": {
356
+ "forecaster": "arima",
357
+ "n_cal": 30,
358
+ "n_test": 30,
359
+ "conf=0.8": {
360
+ "nominal_coverage": 0.8,
361
+ "bare_coverage_mean": 0.8595238095238095,
362
+ "bare_width_mean": 0.037255051394705835,
363
+ "perhorizon_coverage_mean": 0.811904761904762,
364
+ "perhorizon_width_mean": 0.03243267317446737,
365
+ "pooled_coverage_mean": 0.8166666666666665,
366
+ "pooled_width_mean": 0.031645107249388627,
367
+ "q_per_horizon": [
368
+ 0.006537154478817753,
369
+ 0.007333177556922088,
370
+ 0.012312774872748289,
371
+ 0.014043924961390397,
372
+ 0.016017799097016727,
373
+ 0.015644421534730224,
374
+ 0.016336252170641608,
375
+ 0.016122979608933496,
376
+ 0.01964457489050009,
377
+ 0.02072169154979453,
378
+ 0.024118006869554565,
379
+ 0.018656617879449167,
380
+ 0.017769218599013037,
381
+ 0.021770118151759554
382
+ ],
383
+ "q_pooled": 0.015822553624694313
384
+ },
385
+ "conf=0.9": {
386
+ "nominal_coverage": 0.9,
387
+ "bare_coverage_mean": 0.9142857142857144,
388
+ "bare_width_mean": 0.047816340798432555,
389
+ "perhorizon_coverage_mean": 0.8904761904761905,
390
+ "perhorizon_width_mean": 0.04285578362084427,
391
+ "pooled_coverage_mean": 0.8809523809523809,
392
+ "pooled_width_mean": 0.041073044538626924,
393
+ "q_per_horizon": [
394
+ 0.006761841674864266,
395
+ 0.01182171512244512,
396
+ 0.015822553624694313,
397
+ 0.02093465874643763,
398
+ 0.019889187414578124,
399
+ 0.01963882946285489,
400
+ 0.02190089656490879,
401
+ 0.021692702530445862,
402
+ 0.024590684771490512,
403
+ 0.024756601121440625,
404
+ 0.02609594060524123,
405
+ 0.02889462135779275,
406
+ 0.02689529861576956,
407
+ 0.030294953732946217
408
+ ],
409
+ "q_pooled": 0.020536522269313462
410
+ },
411
+ "conf=0.95": {
412
+ "nominal_coverage": 0.95,
413
+ "bare_coverage_mean": 0.9380952380952381,
414
+ "bare_width_mean": 0.05697668430905675,
415
+ "perhorizon_coverage_mean": 0.9404761904761906,
416
+ "perhorizon_width_mean": 0.05919364307194989,
417
+ "pooled_coverage_mean": 0.9119047619047618,
418
+ "pooled_width_mean": 0.05176715217769701,
419
+ "q_per_horizon": [
420
+ 0.011752772972313252,
421
+ 0.01247253748338717,
422
+ 0.01748801536532918,
423
+ 0.02383577073487353,
424
+ 0.02364315675893547,
425
+ 0.02218707632552186,
426
+ 0.03203504055001494,
427
+ 0.030332454296178923,
428
+ 0.03750274950896193,
429
+ 0.03613221732608629,
430
+ 0.039232376756770826,
431
+ 0.04010448928765342,
432
+ 0.04080440634480942,
433
+ 0.046832437792812875
434
+ ],
435
+ "q_pooled": 0.025883576088848503
436
+ }
437
+ },
438
+ "chronos": {
439
+ "forecaster": "chronos",
440
+ "n_cal": 30,
441
+ "n_test": 30,
442
+ "conf=0.8": {
443
+ "nominal_coverage": 0.8,
444
+ "bare_coverage_mean": 0.8,
445
+ "bare_width_mean": 0.03301220412055651,
446
+ "perhorizon_coverage_mean": 0.8071428571428574,
447
+ "perhorizon_width_mean": 0.03432217042105538,
448
+ "pooled_coverage_mean": 0.8000000000000002,
449
+ "pooled_width_mean": 0.03300358161926287,
450
+ "q_per_horizon": [
451
+ 0.004584144783019939,
452
+ 0.007060681152343706,
453
+ 0.01243185882568354,
454
+ 0.01602103652954101,
455
+ 0.01641003990173351,
456
+ 0.015545682907104563,
457
+ 0.018368010711669935,
458
+ 0.01898662319183342,
459
+ 0.022148969459533596,
460
+ 0.02255078582763681,
461
+ 0.023978458976745554,
462
+ 0.020319693946838413,
463
+ 0.017313012123107985,
464
+ 0.024536194610595752
465
+ ],
466
+ "q_pooled": 0.016501790809631434
467
+ },
468
+ "conf=0.9": {
469
+ "nominal_coverage": 0.9,
470
+ "bare_coverage_mean": 0.8,
471
+ "bare_width_mean": 0.03301220412055651,
472
+ "perhorizon_coverage_mean": 0.9190476190476191,
473
+ "perhorizon_width_mean": 0.05077633157457622,
474
+ "pooled_coverage_mean": 0.8904761904761905,
475
+ "pooled_width_mean": 0.04548504829406719,
476
+ "q_per_horizon": [
477
+ 0.008554865837097081,
478
+ 0.00971177463531503,
479
+ 0.01530143814086915,
480
+ 0.01911055355072011,
481
+ 0.01780367832183849,
482
+ 0.021554478836059543,
483
+ 0.026538812255859412,
484
+ 0.027544754409789984,
485
+ 0.028936708450317372,
486
+ 0.03478273067474369,
487
+ 0.0382537099838256,
488
+ 0.03136329650878911,
489
+ 0.0327265468597413,
490
+ 0.04325097255706778
491
+ ],
492
+ "q_pooled": 0.022742524147033594
493
+ },
494
+ "conf=0.95": {
495
+ "nominal_coverage": 0.95,
496
+ "bare_coverage_mean": 0.8,
497
+ "bare_width_mean": 0.03301220412055651,
498
+ "perhorizon_coverage_mean": 0.9404761904761905,
499
+ "perhorizon_width_mean": 0.0633313385554722,
500
+ "pooled_coverage_mean": 0.9547619047619046,
501
+ "pooled_width_mean": 0.06135401725769052,
502
+ "q_per_horizon": [
503
+ 0.011944815063476666,
504
+ 0.01392391796112058,
505
+ 0.017532272148132355,
506
+ 0.022742524147033594,
507
+ 0.02558988399505613,
508
+ 0.02623647480010982,
509
+ 0.03067700862884526,
510
+ 0.034072942352294966,
511
+ 0.04179227085113535,
512
+ 0.0389519283294677,
513
+ 0.042779201126098565,
514
+ 0.04429976444244388,
515
+ 0.044917986869811966,
516
+ 0.04785837917327873
517
+ ],
518
+ "q_pooled": 0.03067700862884526
519
+ }
520
+ }
521
+ },
522
+ "DEXCHUS": {
523
+ "arima": {
524
+ "forecaster": "arima",
525
+ "n_cal": 30,
526
+ "n_test": 30,
527
+ "conf=0.8": {
528
+ "nominal_coverage": 0.8,
529
+ "bare_coverage_mean": 0.8309523809523809,
530
+ "bare_width_mean": 0.12023258914287749,
531
+ "perhorizon_coverage_mean": 0.8,
532
+ "perhorizon_width_mean": 0.10379373004234645,
533
+ "pooled_coverage_mean": 0.7833333333333333,
534
+ "pooled_width_mean": 0.0905579673492376,
535
+ "q_per_horizon": [
536
+ 0.01913552539082275,
537
+ 0.021503803498270635,
538
+ 0.03202273363733443,
539
+ 0.04471228016293516,
540
+ 0.04595743067166769,
541
+ 0.057142529866381686,
542
+ 0.041567074905930035,
543
+ 0.05922440211999547,
544
+ 0.06055238630005544,
545
+ 0.06195863987337091,
546
+ 0.07735612435271388,
547
+ 0.07482211423245033,
548
+ 0.0613510301071134,
549
+ 0.06925003517738304
550
+ ],
551
+ "q_pooled": 0.0452789836746188
552
+ },
553
+ "conf=0.9": {
554
+ "nominal_coverage": 0.9,
555
+ "bare_coverage_mean": 0.8761904761904763,
556
+ "bare_width_mean": 0.1543168575080998,
557
+ "perhorizon_coverage_mean": 0.8857142857142858,
558
+ "perhorizon_width_mean": 0.1694623051285068,
559
+ "pooled_coverage_mean": 0.8833333333333333,
560
+ "pooled_width_mean": 0.14964422846490066,
561
+ "q_per_horizon": [
562
+ 0.026065770883445083,
563
+ 0.03663070092160048,
564
+ 0.04814005922096687,
565
+ 0.05434837199719045,
566
+ 0.06341843160370875,
567
+ 0.06742875148755179,
568
+ 0.08909509445192665,
569
+ 0.09169474000207156,
570
+ 0.11607218346504666,
571
+ 0.12686121412365825,
572
+ 0.11025109977698122,
573
+ 0.12555183014476246,
574
+ 0.11555182580724122,
575
+ 0.11512606201339626
576
+ ],
577
+ "q_pooled": 0.07482211423245033
578
+ },
579
+ "conf=0.95": {
580
+ "nominal_coverage": 0.95,
581
+ "bare_coverage_mean": 0.9142857142857144,
582
+ "bare_width_mean": 0.18387987719237844,
583
+ "perhorizon_coverage_mean": 0.9523809523809524,
584
+ "perhorizon_width_mean": 0.2451580685008066,
585
+ "pooled_coverage_mean": 0.9285714285714286,
586
+ "pooled_width_mean": 0.22228302327474836,
587
+ "q_per_horizon": [
588
+ 0.032681838125458995,
589
+ 0.07173662444320072,
590
+ 0.06519382424998543,
591
+ 0.06079908928748701,
592
+ 0.09872806564422376,
593
+ 0.10867467864500302,
594
+ 0.11114151163737418,
595
+ 0.14390234892072673,
596
+ 0.14109477023066574,
597
+ 0.1721305319733375,
598
+ 0.17782669739203882,
599
+ 0.18559857212707964,
600
+ 0.17849914242157627,
601
+ 0.16809878440748793
602
+ ],
603
+ "q_pooled": 0.11114151163737418
604
+ }
605
+ },
606
+ "chronos": {
607
+ "forecaster": "chronos",
608
+ "n_cal": 30,
609
+ "n_test": 30,
610
+ "conf=0.8": {
611
+ "nominal_coverage": 0.8,
612
+ "bare_coverage_mean": 0.8428571428571429,
613
+ "bare_width_mean": 0.11959348532060782,
614
+ "perhorizon_coverage_mean": 0.7833333333333333,
615
+ "perhorizon_width_mean": 0.10019261191231878,
616
+ "pooled_coverage_mean": 0.8,
617
+ "pooled_width_mean": 0.09779591979980395,
618
+ "q_per_horizon": [
619
+ 0.025188607788085626,
620
+ 0.02532754745483423,
621
+ 0.03890764770507804,
622
+ 0.043802440643310625,
623
+ 0.04915690460205102,
624
+ 0.04680775070190446,
625
+ 0.03916668243408239,
626
+ 0.04809946746826199,
627
+ 0.0576093139648437,
628
+ 0.06108116531372065,
629
+ 0.05864996337890638,
630
+ 0.06179137878417951,
631
+ 0.0701272941589357,
632
+ 0.0756321189880369
633
+ ],
634
+ "q_pooled": 0.04889795989990198
635
+ },
636
+ "conf=0.9": {
637
+ "nominal_coverage": 0.9,
638
+ "bare_coverage_mean": 0.8428571428571429,
639
+ "bare_width_mean": 0.11959348532060782,
640
+ "perhorizon_coverage_mean": 0.869047619047619,
641
+ "perhorizon_width_mean": 0.16607914559500545,
642
+ "pooled_coverage_mean": 0.861904761904762,
643
+ "pooled_width_mean": 0.1402545883178714,
644
+ "q_per_horizon": [
645
+ 0.030081840515136626,
646
+ 0.04935519256591814,
647
+ 0.046391881561278936,
648
+ 0.050782734680176134,
649
+ 0.06024611434936489,
650
+ 0.06782592163085965,
651
+ 0.08113353042602522,
652
+ 0.09840077590942364,
653
+ 0.11880251922607421,
654
+ 0.12758038635253932,
655
+ 0.10697886581420857,
656
+ 0.12221163177490268,
657
+ 0.10586601409912078,
658
+ 0.09689661026000973
659
+ ],
660
+ "q_pooled": 0.0701272941589357
661
+ },
662
+ "conf=0.95": {
663
+ "nominal_coverage": 0.95,
664
+ "bare_coverage_mean": 0.8428571428571429,
665
+ "bare_width_mean": 0.11959348532060782,
666
+ "perhorizon_coverage_mean": 0.9214285714285714,
667
+ "perhorizon_width_mean": 0.22292400338309162,
668
+ "pooled_coverage_mean": 0.9095238095238095,
669
+ "pooled_width_mean": 0.2085365203857421,
670
+ "q_per_horizon": [
671
+ 0.03159678268432575,
672
+ 0.07481312255859418,
673
+ 0.07034568023681675,
674
+ 0.05222851562499997,
675
+ 0.070854161071777,
676
+ 0.09303555068969693,
677
+ 0.08751402359008775,
678
+ 0.13737474822998053,
679
+ 0.1317485343933109,
680
+ 0.15814713668823277,
681
+ 0.1641494514465336,
682
+ 0.1720175582885739,
683
+ 0.16296061859130884,
684
+ 0.15368213958740196
685
+ ],
686
+ "q_pooled": 0.10426826019287105
687
+ }
688
+ }
689
+ },
690
+ "DEXKOUS": {
691
+ "arima": {
692
+ "forecaster": "arima",
693
+ "n_cal": 30,
694
+ "n_test": 30,
695
+ "conf=0.8": {
696
+ "nominal_coverage": 0.8,
697
+ "bare_coverage_mean": 0.7071428571428572,
698
+ "bare_width_mean": 41.40702231782995,
699
+ "perhorizon_coverage_mean": 0.6809523809523808,
700
+ "perhorizon_width_mean": 40.33834903476961,
701
+ "pooled_coverage_mean": 0.738095238095238,
702
+ "pooled_width_mean": 40.174430225697506,
703
+ "q_per_horizon": [
704
+ 6.019828757339383,
705
+ 9.23651622262787,
706
+ 11.885457212575375,
707
+ 14.301239776206785,
708
+ 16.538830978627857,
709
+ 21.11794087612452,
710
+ 21.007107424806236,
711
+ 22.089443667480282,
712
+ 22.26134568228099,
713
+ 25.115703414253176,
714
+ 26.282158971560648,
715
+ 28.31230917980338,
716
+ 28.622331265376488,
717
+ 29.57822981432423
718
+ ],
719
+ "q_pooled": 20.087215112848753
720
+ },
721
+ "conf=0.9": {
722
+ "nominal_coverage": 0.9,
723
+ "bare_coverage_mean": 0.8023809523809522,
724
+ "bare_width_mean": 53.145337785764546,
725
+ "perhorizon_coverage_mean": 0.7476190476190475,
726
+ "perhorizon_width_mean": 47.514067959856646,
727
+ "pooled_coverage_mean": 0.8166666666666665,
728
+ "pooled_width_mean": 51.703697664495394,
729
+ "q_per_horizon": [
730
+ 7.042854649616629,
731
+ 11.217728114270585,
732
+ 13.051289508962782,
733
+ 17.974908318198914,
734
+ 22.696578397519033,
735
+ 24.786648186653792,
736
+ 23.205692899009136,
737
+ 25.439228843483306,
738
+ 28.745883742858496,
739
+ 27.649073917800933,
740
+ 32.25531441260455,
741
+ 33.39915882237847,
742
+ 32.317174372199815,
743
+ 32.81694153344006
744
+ ],
745
+ "q_pooled": 25.851848832247697
746
+ },
747
+ "conf=0.95": {
748
+ "nominal_coverage": 0.95,
749
+ "bare_coverage_mean": 0.8952380952380953,
750
+ "bare_width_mean": 63.326575872509096,
751
+ "perhorizon_coverage_mean": 0.8833333333333332,
752
+ "perhorizon_width_mean": 62.3317263081943,
753
+ "pooled_coverage_mean": 0.861904761904762,
754
+ "pooled_width_mean": 63.003314010262784,
755
+ "q_per_horizon": [
756
+ 12.416104342710696,
757
+ 13.332090802595758,
758
+ 20.658854986845654,
759
+ 37.144614564726226,
760
+ 31.230195571947434,
761
+ 31.501657005131392,
762
+ 31.466225645210898,
763
+ 32.67178752649829,
764
+ 41.05990019882688,
765
+ 37.85425421989498,
766
+ 37.08859079038166,
767
+ 35.26046070337611,
768
+ 40.538744747242845,
769
+ 34.098603051971395
770
+ ],
771
+ "q_pooled": 31.501657005131392
772
+ }
773
+ },
774
+ "chronos": {
775
+ "forecaster": "chronos",
776
+ "n_cal": 30,
777
+ "n_test": 30,
778
+ "conf=0.8": {
779
+ "nominal_coverage": 0.8,
780
+ "bare_coverage_mean": 0.7476190476190475,
781
+ "bare_width_mean": 47.698866081237796,
782
+ "perhorizon_coverage_mean": 0.669047619047619,
783
+ "perhorizon_width_mean": 42.05718540736606,
784
+ "pooled_coverage_mean": 0.7452380952380951,
785
+ "pooled_width_mean": 43.94189453125,
786
+ "q_per_horizon": [
787
+ 6.6086572265624,
788
+ 8.688681640624964,
789
+ 11.395966796874973,
790
+ 12.880576171874964,
791
+ 17.0732275390626,
792
+ 19.5968017578125,
793
+ 19.40576171875,
794
+ 24.150083007812555,
795
+ 24.586870117187573,
796
+ 26.251137695312536,
797
+ 27.594218749999982,
798
+ 32.349785156249936,
799
+ 31.7150732421876,
800
+ 32.103457031249945
801
+ ],
802
+ "q_pooled": 21.970947265625
803
+ },
804
+ "conf=0.9": {
805
+ "nominal_coverage": 0.9,
806
+ "bare_coverage_mean": 0.7476190476190475,
807
+ "bare_width_mean": 47.698866081237796,
808
+ "perhorizon_coverage_mean": 0.7714285714285712,
809
+ "perhorizon_width_mean": 49.80674665178569,
810
+ "pooled_coverage_mean": 0.8357142857142856,
811
+ "pooled_width_mean": 56.23533203124998,
812
+ "q_per_horizon": [
813
+ 8.360268554687536,
814
+ 12.467915039062518,
815
+ 14.159082031249909,
816
+ 18.2329248046874,
817
+ 23.688662109374945,
818
+ 25.474423828125055,
819
+ 24.956616210937455,
820
+ 26.577456054687445,
821
+ 28.821977539062573,
822
+ 30.2672265624999,
823
+ 33.08205566406241,
824
+ 33.05286621093751,
825
+ 33.24584472656261,
826
+ 36.25990722656252
827
+ ],
828
+ "q_pooled": 28.11766601562499
829
+ },
830
+ "conf=0.95": {
831
+ "nominal_coverage": 0.95,
832
+ "bare_coverage_mean": 0.7476190476190475,
833
+ "bare_width_mean": 47.698866081237796,
834
+ "perhorizon_coverage_mean": 0.8738095238095237,
835
+ "perhorizon_width_mean": 65.5785993303571,
836
+ "pooled_coverage_mean": 0.8666666666666666,
837
+ "pooled_width_mean": 66.16411132812482,
838
+ "q_per_horizon": [
839
+ 14.446508789062591,
840
+ 15.035361328124964,
841
+ 21.486127929687427,
842
+ 38.963662109375036,
843
+ 33.86973144531248,
844
+ 34.60525878906242,
845
+ 33.86685546874992,
846
+ 33.722353515624945,
847
+ 41.170214843750045,
848
+ 36.77112792968751,
849
+ 37.77993652343753,
850
+ 39.08779296874991,
851
+ 39.80886230468741,
852
+ 38.4364013671875
853
+ ],
854
+ "q_pooled": 33.08205566406241
855
+ }
856
+ }
857
+ }
858
+ },
859
+ "elapsed_min": 1.141351056098938
860
  }
FINAL_SUBMIT/receipts/R6_GETHSEMANE.json CHANGED
@@ -1,122 +1,122 @@
1
- {
2
- "tasks": {
3
- "easy_typhoon_response": {
4
- "ppo_v3": {
5
- "policy": "ppo_v3",
6
- "n_episodes": 50,
7
- "reward_mean": 1.2005000000000001,
8
- "reward_std": 0.19939637032804786,
9
- "reward_min": 0.643,
10
- "reward_max": 1.3435000000000004,
11
- "length_mean": 20.0,
12
- "violations_mean": 0.0,
13
- "violations_max": 0,
14
- "train_time_s": 389.36543345451355,
15
- "total_timesteps": 100000
16
- },
17
- "random": {
18
- "policy": "random",
19
- "n_episodes": 50,
20
- "reward_mean": 0.7797316807490356,
21
- "reward_std": 0.12419262667905032,
22
- "reward_min": 0.5059697476286091,
23
- "reward_max": 1.009169047501108,
24
- "length_mean": 20.0,
25
- "violations_mean": 0.0,
26
- "violations_max": 0
27
- },
28
- "greedy": {
29
- "policy": "greedy",
30
- "n_episodes": 50,
31
- "reward_mean": 0.9803400000000001,
32
- "reward_std": 0.0062695215128429176,
33
- "reward_min": 0.964,
34
- "reward_max": 0.9894999999999999,
35
- "length_mean": 20.0,
36
- "violations_mean": 0.0,
37
- "violations_max": 0
38
- }
39
- },
40
- "medium_multi_front": {
41
- "ppo_v3": {
42
- "policy": "ppo_v3",
43
- "n_episodes": 50,
44
- "reward_mean": 2.774816094381805,
45
- "reward_std": 0.2510891195507745,
46
- "reward_min": 2.2131947145395343,
47
- "reward_max": 3.1306422226861352,
48
- "length_mean": 44.76,
49
- "violations_mean": 0.0,
50
- "violations_max": 0,
51
- "train_time_s": 1028.4124627113342,
52
- "total_timesteps": 100000
53
- },
54
- "random": {
55
- "policy": "random",
56
- "n_episodes": 50,
57
- "reward_mean": -1.1101909893619986,
58
- "reward_std": 0.8109045133638636,
59
- "reward_min": -2.3839605638376136,
60
- "reward_max": 0.6624458826285525,
61
- "length_mean": 44.84,
62
- "violations_mean": 0.0,
63
- "violations_max": 0
64
- },
65
- "greedy": {
66
- "policy": "greedy",
67
- "n_episodes": 50,
68
- "reward_mean": -1.7960883333333333,
69
- "reward_std": 0.08206659628009437,
70
- "reward_min": -1.9960833333333332,
71
- "reward_max": -1.6348333333333334,
72
- "length_mean": 44.76,
73
- "violations_mean": 0.0,
74
- "violations_max": 0
75
- }
76
- },
77
- "hard_cascading_crisis": {
78
- "ppo_v3": {
79
- "policy": "ppo_v3",
80
- "n_episodes": 50,
81
- "reward_mean": 2.67403629887518,
82
- "reward_std": 0.7949077297864112,
83
- "reward_min": 0.44374348685637904,
84
- "reward_max": 3.4482740553083278,
85
- "length_mean": 56.06,
86
- "violations_mean": 0.0,
87
- "violations_max": 0,
88
- "train_time_s": 1359.914410352707,
89
- "total_timesteps": 100000
90
- },
91
- "random": {
92
- "policy": "random",
93
- "n_episodes": 50,
94
- "reward_mean": -1.222005001736981,
95
- "reward_std": 0.853497432761393,
96
- "reward_min": -3.8651570083150526,
97
- "reward_max": 0.6500552441714463,
98
- "length_mean": 56.06,
99
- "violations_mean": 0.0,
100
- "violations_max": 0
101
- },
102
- "greedy": {
103
- "policy": "greedy",
104
- "n_episodes": 50,
105
- "reward_mean": -1.4125516666666666,
106
- "reward_std": 0.4515386177313937,
107
- "reward_min": -2.3674999999999997,
108
- "reward_max": -0.4405833333333334,
109
- "length_mean": 56.06,
110
- "violations_mean": 0.0,
111
- "violations_max": 0
112
- }
113
- }
114
- },
115
- "baselines": {},
116
- "config": {
117
- "timesteps_per_task": 100000,
118
- "eval_episodes": 50,
119
- "seed": 42
120
- },
121
- "elapsed_min": 48.6515386501948
122
  }
 
1
+ {
2
+ "tasks": {
3
+ "easy_typhoon_response": {
4
+ "ppo_v3": {
5
+ "policy": "ppo_v3",
6
+ "n_episodes": 50,
7
+ "reward_mean": 1.2005000000000001,
8
+ "reward_std": 0.19939637032804786,
9
+ "reward_min": 0.643,
10
+ "reward_max": 1.3435000000000004,
11
+ "length_mean": 20.0,
12
+ "violations_mean": 0.0,
13
+ "violations_max": 0,
14
+ "train_time_s": 389.36543345451355,
15
+ "total_timesteps": 100000
16
+ },
17
+ "random": {
18
+ "policy": "random",
19
+ "n_episodes": 50,
20
+ "reward_mean": 0.7797316807490356,
21
+ "reward_std": 0.12419262667905032,
22
+ "reward_min": 0.5059697476286091,
23
+ "reward_max": 1.009169047501108,
24
+ "length_mean": 20.0,
25
+ "violations_mean": 0.0,
26
+ "violations_max": 0
27
+ },
28
+ "greedy": {
29
+ "policy": "greedy",
30
+ "n_episodes": 50,
31
+ "reward_mean": 0.9803400000000001,
32
+ "reward_std": 0.0062695215128429176,
33
+ "reward_min": 0.964,
34
+ "reward_max": 0.9894999999999999,
35
+ "length_mean": 20.0,
36
+ "violations_mean": 0.0,
37
+ "violations_max": 0
38
+ }
39
+ },
40
+ "medium_multi_front": {
41
+ "ppo_v3": {
42
+ "policy": "ppo_v3",
43
+ "n_episodes": 50,
44
+ "reward_mean": 2.774816094381805,
45
+ "reward_std": 0.2510891195507745,
46
+ "reward_min": 2.2131947145395343,
47
+ "reward_max": 3.1306422226861352,
48
+ "length_mean": 44.76,
49
+ "violations_mean": 0.0,
50
+ "violations_max": 0,
51
+ "train_time_s": 1028.4124627113342,
52
+ "total_timesteps": 100000
53
+ },
54
+ "random": {
55
+ "policy": "random",
56
+ "n_episodes": 50,
57
+ "reward_mean": -1.1101909893619986,
58
+ "reward_std": 0.8109045133638636,
59
+ "reward_min": -2.3839605638376136,
60
+ "reward_max": 0.6624458826285525,
61
+ "length_mean": 44.84,
62
+ "violations_mean": 0.0,
63
+ "violations_max": 0
64
+ },
65
+ "greedy": {
66
+ "policy": "greedy",
67
+ "n_episodes": 50,
68
+ "reward_mean": -1.7960883333333333,
69
+ "reward_std": 0.08206659628009437,
70
+ "reward_min": -1.9960833333333332,
71
+ "reward_max": -1.6348333333333334,
72
+ "length_mean": 44.76,
73
+ "violations_mean": 0.0,
74
+ "violations_max": 0
75
+ }
76
+ },
77
+ "hard_cascading_crisis": {
78
+ "ppo_v3": {
79
+ "policy": "ppo_v3",
80
+ "n_episodes": 50,
81
+ "reward_mean": 2.67403629887518,
82
+ "reward_std": 0.7949077297864112,
83
+ "reward_min": 0.44374348685637904,
84
+ "reward_max": 3.4482740553083278,
85
+ "length_mean": 56.06,
86
+ "violations_mean": 0.0,
87
+ "violations_max": 0,
88
+ "train_time_s": 1359.914410352707,
89
+ "total_timesteps": 100000
90
+ },
91
+ "random": {
92
+ "policy": "random",
93
+ "n_episodes": 50,
94
+ "reward_mean": -1.222005001736981,
95
+ "reward_std": 0.853497432761393,
96
+ "reward_min": -3.8651570083150526,
97
+ "reward_max": 0.6500552441714463,
98
+ "length_mean": 56.06,
99
+ "violations_mean": 0.0,
100
+ "violations_max": 0
101
+ },
102
+ "greedy": {
103
+ "policy": "greedy",
104
+ "n_episodes": 50,
105
+ "reward_mean": -1.4125516666666666,
106
+ "reward_std": 0.4515386177313937,
107
+ "reward_min": -2.3674999999999997,
108
+ "reward_max": -0.4405833333333334,
109
+ "length_mean": 56.06,
110
+ "violations_mean": 0.0,
111
+ "violations_max": 0
112
+ }
113
+ }
114
+ },
115
+ "baselines": {},
116
+ "config": {
117
+ "timesteps_per_task": 100000,
118
+ "eval_episodes": 50,
119
+ "seed": 42
120
+ },
121
+ "elapsed_min": 48.6515386501948
122
  }
FINAL_SUBMIT/receipts/R6_GETHSEMANE_ONNX_EXPORT.json CHANGED
@@ -1,25 +1,25 @@
1
- {
2
- "exports": [
3
- {
4
- "task": "easy_typhoon_response",
5
- "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\v3_arcadia\\checkpoints\\gethsemane\\ppo_easy_typhoon_response.onnx",
6
- "size_mb": 0.970768,
7
- "verified": true,
8
- "max_diff": 1.9073486328125e-06
9
- },
10
- {
11
- "task": "medium_multi_front",
12
- "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\v3_arcadia\\checkpoints\\gethsemane\\ppo_medium_multi_front.onnx",
13
- "size_mb": 0.970768,
14
- "verified": true,
15
- "max_diff": 1.9073486328125e-06
16
- },
17
- {
18
- "task": "hard_cascading_crisis",
19
- "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\v3_arcadia\\checkpoints\\gethsemane\\ppo_hard_cascading_crisis.onnx",
20
- "size_mb": 0.970768,
21
- "verified": true,
22
- "max_diff": 1.430511474609375e-06
23
- }
24
- ]
25
  }
 
1
+ {
2
+ "exports": [
3
+ {
4
+ "task": "easy_typhoon_response",
5
+ "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\versions\\v3_arcadia\\checkpoints\\gethsemane\\ppo_easy_typhoon_response.onnx",
6
+ "size_mb": 0.970768,
7
+ "verified": true,
8
+ "max_diff": 1.9073486328125e-06
9
+ },
10
+ {
11
+ "task": "medium_multi_front",
12
+ "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\versions\\v3_arcadia\\checkpoints\\gethsemane\\ppo_medium_multi_front.onnx",
13
+ "size_mb": 0.970768,
14
+ "verified": true,
15
+ "max_diff": 1.9073486328125e-06
16
+ },
17
+ {
18
+ "task": "hard_cascading_crisis",
19
+ "onnx_path": "C:\\Users\\Dell\\Desktop\\Sleep-Token\\versions\\v3_arcadia\\checkpoints\\gethsemane\\ppo_hard_cascading_crisis.onnx",
20
+ "size_mb": 0.970768,
21
+ "verified": true,
22
+ "max_diff": 1.430511474609375e-06
23
+ }
24
+ ]
25
  }
FINAL_SUBMIT/receipts/R6_PROVIDER_V2.json CHANGED
@@ -1,330 +1,330 @@
1
- {
2
- "task": "arrival_time_regression",
3
- "task_description": "Predict expected disruption arrival time (continuous) per node, given noisy per-edge lead-times and random source nodes. Non-trivial: requires GNN to learn Dijkstra-like aggregation through the graph.",
4
- "lead_time_noise_sigma_relative": 0.2,
5
- "graphs": {
6
- "easy": {
7
- "n_nodes": 12,
8
- "n_edges": 12,
9
- "gnn_mae": 9.20589906692505,
10
- "mlp_mae": 17.712093811035157,
11
- "one_hop_mean_mae": 29.553308786787092,
12
- "improvement_vs_mlp_pct": 48.0247837147887,
13
- "improvement_vs_1hop_pct": 68.84985321494395,
14
- "gnn_loss_curve": [
15
- 983.6469454498291,
16
- 694.3125346450805,
17
- 594.0063958816528,
18
- 548.9563833961487,
19
- 495.32008571624755,
20
- 420.9683524398804,
21
- 364.7742200584412,
22
- 329.68193370532987,
23
- 308.9609826283455,
24
- 305.6601629691124,
25
- 298.6861881341934,
26
- 287.8384048962593,
27
- 303.22127193498613,
28
- 291.6199851961136,
29
- 292.3526881427765,
30
- 286.59378911590574,
31
- 297.95547390937804,
32
- 277.4495716457367,
33
- 278.5004913520813,
34
- 273.5950565481186,
35
- 280.847659828186,
36
- 269.8950548853874,
37
- 268.0327960948944,
38
- 272.2881185493469,
39
- 271.73518936920163,
40
- 266.2893534479141,
41
- 268.7633232383728,
42
- 263.14099113464357,
43
- 261.69743074321747,
44
- 262.2134785709381
45
- ],
46
- "gnn_test_mae_curve": [
47
- 15.625262084007263,
48
- 17.273250563144686,
49
- 15.69198014497757,
50
- 15.216868221759796,
51
- 13.83246925830841,
52
- 12.072544195652007,
53
- 12.047622272968292,
54
- 10.346303402781487,
55
- 10.991831306219101,
56
- 9.730522887706757,
57
- 9.387227172255516,
58
- 12.727755947113037,
59
- 10.449746668934822,
60
- 10.917218554019929,
61
- 9.83320654630661,
62
- 11.56927591919899,
63
- 9.640368175506591,
64
- 9.518106588125228,
65
- 9.238331428766251,
66
- 10.004606694579124,
67
- 9.601016719341278,
68
- 10.924803348779678,
69
- 9.062952963709831,
70
- 11.125388493537903,
71
- 8.51151149213314,
72
- 8.760705815553665,
73
- 8.83567961215973,
74
- 8.716645919680595,
75
- 9.704761312007903,
76
- 9.20589906692505
77
- ],
78
- "mlp_test_mae_curve": [
79
- 16.517573373317717,
80
- 17.61745592355728,
81
- 17.478831689357758,
82
- 17.963374128341673,
83
- 17.317361807823183,
84
- 17.35558673620224,
85
- 19.272147517204285,
86
- 17.29823645591736,
87
- 18.360565376281738,
88
- 16.33169244527817,
89
- 16.291482293605803,
90
- 20.00996126651764,
91
- 17.24092205762863,
92
- 17.935992388725282,
93
- 18.476314017772676,
94
- 20.500635390281676,
95
- 17.64075089454651,
96
- 19.23261556148529,
97
- 17.159917891025543,
98
- 18.033056726455687,
99
- 17.04588686466217,
100
- 17.51567750453949,
101
- 16.925300316810606,
102
- 19.993932852745058,
103
- 17.863101620674133,
104
- 17.46893537759781,
105
- 17.768136410713197,
106
- 17.399936029911043,
107
- 17.271209075450898,
108
- 17.712093811035157
109
- ]
110
- },
111
- "medium": {
112
- "n_nodes": 25,
113
- "n_edges": 29,
114
- "gnn_mae": 14.05237404346466,
115
- "mlp_mae": 27.562243633270263,
116
- "one_hop_mean_mae": 23.25141793220304,
117
- "improvement_vs_mlp_pct": 49.01585578286486,
118
- "improvement_vs_1hop_pct": 39.56336734198809,
119
- "gnn_loss_curve": [
120
- 1455.8575012207032,
121
- 1070.794164489746,
122
- 978.3833621215821,
123
- 878.4453280944824,
124
- 759.8914498443603,
125
- 676.4201901473999,
126
- 592.9840587463378,
127
- 593.9022348022461,
128
- 580.474338684082,
129
- 548.8776502380371,
130
- 535.7356602172852,
131
- 524.7076401443481,
132
- 517.5761855316163,
133
- 503.14428115844726,
134
- 504.31373574829104,
135
- 482.12416637420654,
136
- 491.71681065368654,
137
- 476.0351883163452,
138
- 475.84812075042726,
139
- 469.6501838378906,
140
- 473.09340254211423,
141
- 468.5468386917114,
142
- 457.8393885040283,
143
- 461.61461613464354,
144
- 450.00589713287354,
145
- 444.84376406097414,
146
- 448.23634549713137,
147
- 441.89026587677,
148
- 436.69793469238283,
149
- 434.4493161087036
150
- ],
151
- "gnn_test_mae_curve": [
152
- 26.63341254234314,
153
- 23.634564056396485,
154
- 23.186181049346924,
155
- 21.077601199150084,
156
- 21.637806577682497,
157
- 17.98971748828888,
158
- 16.306520526409148,
159
- 17.966433074474335,
160
- 17.40695864200592,
161
- 15.116412845849991,
162
- 15.247849924564362,
163
- 14.415206160545349,
164
- 15.09439873456955,
165
- 14.077203586101533,
166
- 16.387850997447966,
167
- 16.519536385536195,
168
- 15.912737758159638,
169
- 15.685167801380157,
170
- 15.163068435192109,
171
- 15.200627043247223,
172
- 15.001122550964356,
173
- 14.351007792949677,
174
- 15.44103235244751,
175
- 13.403649566173554,
176
- 17.10527836084366,
177
- 14.323340699672698,
178
- 14.384661407470704,
179
- 14.556273880004882,
180
- 13.85397144317627,
181
- 14.05237404346466
182
- ],
183
- "mlp_test_mae_curve": [
184
- 27.1725799369812,
185
- 26.40243914604187,
186
- 27.289838228225708,
187
- 26.334666624069214,
188
- 28.48377342224121,
189
- 26.199828100204467,
190
- 29.151524686813353,
191
- 28.400241794586183,
192
- 26.501172218322754,
193
- 27.04287679672241,
194
- 27.969863624572753,
195
- 26.34369418144226,
196
- 28.614215364456175,
197
- 26.348094720840454,
198
- 27.199346466064455,
199
- 26.72101284980774,
200
- 26.492710275650026,
201
- 28.792157373428346,
202
- 25.963287801742553,
203
- 27.035139274597167,
204
- 26.07756766319275,
205
- 27.420557165145873,
206
- 28.615666379928587,
207
- 26.438606796264647,
208
- 26.199908666610717,
209
- 26.585446147918702,
210
- 26.246847848892212,
211
- 26.238035287857056,
212
- 26.170038957595825,
213
- 27.562243633270263
214
- ]
215
- },
216
- "hard": {
217
- "n_nodes": 40,
218
- "n_edges": 47,
219
- "gnn_mae": 10.347342171669005,
220
- "mlp_mae": 28.483039016723634,
221
- "one_hop_mean_mae": 16.03428017649916,
222
- "improvement_vs_mlp_pct": 63.67191659010252,
223
- "improvement_vs_1hop_pct": 35.46737329166347,
224
- "gnn_loss_curve": [
225
- 1519.987557739258,
226
- 1021.7450046386718,
227
- 815.2417454833984,
228
- 709.5358395690918,
229
- 634.4188123474121,
230
- 560.8865319213867,
231
- 506.78174713134763,
232
- 475.7871089630127,
233
- 451.54362382507327,
234
- 442.535458694458,
235
- 425.76794429016115,
236
- 416.6028264923096,
237
- 416.2537903900147,
238
- 416.3216004333496,
239
- 405.91741243743894,
240
- 401.3154751739502,
241
- 403.56236766052245,
242
- 399.83712251281736,
243
- 397.13397619628904,
244
- 396.69007269287107,
245
- 389.8687892990112,
246
- 386.671229675293,
247
- 390.19565746307376,
248
- 387.47164192962646,
249
- 384.5350112533569,
250
- 385.34569120025634,
251
- 381.3625469284058,
252
- 380.5953342590332,
253
- 376.2190606918335,
254
- 378.44821893310547
255
- ],
256
- "gnn_test_mae_curve": [
257
- 25.89111141204834,
258
- 22.817488927841186,
259
- 19.102868838310243,
260
- 21.260897178649902,
261
- 16.00875702381134,
262
- 15.999692721366882,
263
- 14.555557656288148,
264
- 13.622318716049195,
265
- 13.0450461602211,
266
- 13.296297969818115,
267
- 12.376682465076447,
268
- 13.256674709320068,
269
- 11.923482534885407,
270
- 11.381103422641754,
271
- 13.629612107276916,
272
- 13.775573563575744,
273
- 12.455035951137543,
274
- 13.674895765781402,
275
- 12.645530993938445,
276
- 12.839997906684875,
277
- 12.782445096969605,
278
- 11.498445341587066,
279
- 12.44089034318924,
280
- 10.853419225215912,
281
- 11.889822478294372,
282
- 11.540131111145019,
283
- 12.30764417886734,
284
- 10.73738386631012,
285
- 10.981562974452972,
286
- 10.347342171669005
287
- ],
288
- "mlp_test_mae_curve": [
289
- 28.691825714111328,
290
- 29.088216686248778,
291
- 27.926491804122925,
292
- 32.548833179473874,
293
- 28.55751530647278,
294
- 27.89367533683777,
295
- 28.729960765838623,
296
- 29.485910148620604,
297
- 28.418713645935057,
298
- 29.061994075775146,
299
- 27.86555823326111,
300
- 27.882053699493408,
301
- 28.62539842605591,
302
- 28.374376544952394,
303
- 27.627659730911255,
304
- 29.199770755767823,
305
- 26.9179744720459,
306
- 29.280858907699585,
307
- 28.915042276382447,
308
- 28.664446725845337,
309
- 28.888797369003296,
310
- 29.49649586677551,
311
- 29.45292121887207,
312
- 28.840624055862428,
313
- 27.16323224067688,
314
- 27.801621007919312,
315
- 28.310747117996215,
316
- 28.82351138114929,
317
- 30.00698434829712,
318
- 28.483039016723634
319
- ]
320
- }
321
- },
322
- "config": {
323
- "n_train": 500,
324
- "n_test": 200,
325
- "hidden": 64,
326
- "epochs": 30,
327
- "lr": 0.003
328
- },
329
- "elapsed_min": 4.006023410956065
330
  }
 
1
+ {
2
+ "task": "arrival_time_regression",
3
+ "task_description": "Predict expected disruption arrival time (continuous) per node, given noisy per-edge lead-times and random source nodes. Non-trivial: requires GNN to learn Dijkstra-like aggregation through the graph.",
4
+ "lead_time_noise_sigma_relative": 0.2,
5
+ "graphs": {
6
+ "easy": {
7
+ "n_nodes": 12,
8
+ "n_edges": 12,
9
+ "gnn_mae": 9.20589906692505,
10
+ "mlp_mae": 17.712093811035157,
11
+ "one_hop_mean_mae": 29.553308786787092,
12
+ "improvement_vs_mlp_pct": 48.0247837147887,
13
+ "improvement_vs_1hop_pct": 68.84985321494395,
14
+ "gnn_loss_curve": [
15
+ 983.6469454498291,
16
+ 694.3125346450805,
17
+ 594.0063958816528,
18
+ 548.9563833961487,
19
+ 495.32008571624755,
20
+ 420.9683524398804,
21
+ 364.7742200584412,
22
+ 329.68193370532987,
23
+ 308.9609826283455,
24
+ 305.6601629691124,
25
+ 298.6861881341934,
26
+ 287.8384048962593,
27
+ 303.22127193498613,
28
+ 291.6199851961136,
29
+ 292.3526881427765,
30
+ 286.59378911590574,
31
+ 297.95547390937804,
32
+ 277.4495716457367,
33
+ 278.5004913520813,
34
+ 273.5950565481186,
35
+ 280.847659828186,
36
+ 269.8950548853874,
37
+ 268.0327960948944,
38
+ 272.2881185493469,
39
+ 271.73518936920163,
40
+ 266.2893534479141,
41
+ 268.7633232383728,
42
+ 263.14099113464357,
43
+ 261.69743074321747,
44
+ 262.2134785709381
45
+ ],
46
+ "gnn_test_mae_curve": [
47
+ 15.625262084007263,
48
+ 17.273250563144686,
49
+ 15.69198014497757,
50
+ 15.216868221759796,
51
+ 13.83246925830841,
52
+ 12.072544195652007,
53
+ 12.047622272968292,
54
+ 10.346303402781487,
55
+ 10.991831306219101,
56
+ 9.730522887706757,
57
+ 9.387227172255516,
58
+ 12.727755947113037,
59
+ 10.449746668934822,
60
+ 10.917218554019929,
61
+ 9.83320654630661,
62
+ 11.56927591919899,
63
+ 9.640368175506591,
64
+ 9.518106588125228,
65
+ 9.238331428766251,
66
+ 10.004606694579124,
67
+ 9.601016719341278,
68
+ 10.924803348779678,
69
+ 9.062952963709831,
70
+ 11.125388493537903,
71
+ 8.51151149213314,
72
+ 8.760705815553665,
73
+ 8.83567961215973,
74
+ 8.716645919680595,
75
+ 9.704761312007903,
76
+ 9.20589906692505
77
+ ],
78
+ "mlp_test_mae_curve": [
79
+ 16.517573373317717,
80
+ 17.61745592355728,
81
+ 17.478831689357758,
82
+ 17.963374128341673,
83
+ 17.317361807823183,
84
+ 17.35558673620224,
85
+ 19.272147517204285,
86
+ 17.29823645591736,
87
+ 18.360565376281738,
88
+ 16.33169244527817,
89
+ 16.291482293605803,
90
+ 20.00996126651764,
91
+ 17.24092205762863,
92
+ 17.935992388725282,
93
+ 18.476314017772676,
94
+ 20.500635390281676,
95
+ 17.64075089454651,
96
+ 19.23261556148529,
97
+ 17.159917891025543,
98
+ 18.033056726455687,
99
+ 17.04588686466217,
100
+ 17.51567750453949,
101
+ 16.925300316810606,
102
+ 19.993932852745058,
103
+ 17.863101620674133,
104
+ 17.46893537759781,
105
+ 17.768136410713197,
106
+ 17.399936029911043,
107
+ 17.271209075450898,
108
+ 17.712093811035157
109
+ ]
110
+ },
111
+ "medium": {
112
+ "n_nodes": 25,
113
+ "n_edges": 29,
114
+ "gnn_mae": 14.05237404346466,
115
+ "mlp_mae": 27.562243633270263,
116
+ "one_hop_mean_mae": 23.25141793220304,
117
+ "improvement_vs_mlp_pct": 49.01585578286486,
118
+ "improvement_vs_1hop_pct": 39.56336734198809,
119
+ "gnn_loss_curve": [
120
+ 1455.8575012207032,
121
+ 1070.794164489746,
122
+ 978.3833621215821,
123
+ 878.4453280944824,
124
+ 759.8914498443603,
125
+ 676.4201901473999,
126
+ 592.9840587463378,
127
+ 593.9022348022461,
128
+ 580.474338684082,
129
+ 548.8776502380371,
130
+ 535.7356602172852,
131
+ 524.7076401443481,
132
+ 517.5761855316163,
133
+ 503.14428115844726,
134
+ 504.31373574829104,
135
+ 482.12416637420654,
136
+ 491.71681065368654,
137
+ 476.0351883163452,
138
+ 475.84812075042726,
139
+ 469.6501838378906,
140
+ 473.09340254211423,
141
+ 468.5468386917114,
142
+ 457.8393885040283,
143
+ 461.61461613464354,
144
+ 450.00589713287354,
145
+ 444.84376406097414,
146
+ 448.23634549713137,
147
+ 441.89026587677,
148
+ 436.69793469238283,
149
+ 434.4493161087036
150
+ ],
151
+ "gnn_test_mae_curve": [
152
+ 26.63341254234314,
153
+ 23.634564056396485,
154
+ 23.186181049346924,
155
+ 21.077601199150084,
156
+ 21.637806577682497,
157
+ 17.98971748828888,
158
+ 16.306520526409148,
159
+ 17.966433074474335,
160
+ 17.40695864200592,
161
+ 15.116412845849991,
162
+ 15.247849924564362,
163
+ 14.415206160545349,
164
+ 15.09439873456955,
165
+ 14.077203586101533,
166
+ 16.387850997447966,
167
+ 16.519536385536195,
168
+ 15.912737758159638,
169
+ 15.685167801380157,
170
+ 15.163068435192109,
171
+ 15.200627043247223,
172
+ 15.001122550964356,
173
+ 14.351007792949677,
174
+ 15.44103235244751,
175
+ 13.403649566173554,
176
+ 17.10527836084366,
177
+ 14.323340699672698,
178
+ 14.384661407470704,
179
+ 14.556273880004882,
180
+ 13.85397144317627,
181
+ 14.05237404346466
182
+ ],
183
+ "mlp_test_mae_curve": [
184
+ 27.1725799369812,
185
+ 26.40243914604187,
186
+ 27.289838228225708,
187
+ 26.334666624069214,
188
+ 28.48377342224121,
189
+ 26.199828100204467,
190
+ 29.151524686813353,
191
+ 28.400241794586183,
192
+ 26.501172218322754,
193
+ 27.04287679672241,
194
+ 27.969863624572753,
195
+ 26.34369418144226,
196
+ 28.614215364456175,
197
+ 26.348094720840454,
198
+ 27.199346466064455,
199
+ 26.72101284980774,
200
+ 26.492710275650026,
201
+ 28.792157373428346,
202
+ 25.963287801742553,
203
+ 27.035139274597167,
204
+ 26.07756766319275,
205
+ 27.420557165145873,
206
+ 28.615666379928587,
207
+ 26.438606796264647,
208
+ 26.199908666610717,
209
+ 26.585446147918702,
210
+ 26.246847848892212,
211
+ 26.238035287857056,
212
+ 26.170038957595825,
213
+ 27.562243633270263
214
+ ]
215
+ },
216
+ "hard": {
217
+ "n_nodes": 40,
218
+ "n_edges": 47,
219
+ "gnn_mae": 10.347342171669005,
220
+ "mlp_mae": 28.483039016723634,
221
+ "one_hop_mean_mae": 16.03428017649916,
222
+ "improvement_vs_mlp_pct": 63.67191659010252,
223
+ "improvement_vs_1hop_pct": 35.46737329166347,
224
+ "gnn_loss_curve": [
225
+ 1519.987557739258,
226
+ 1021.7450046386718,
227
+ 815.2417454833984,
228
+ 709.5358395690918,
229
+ 634.4188123474121,
230
+ 560.8865319213867,
231
+ 506.78174713134763,
232
+ 475.7871089630127,
233
+ 451.54362382507327,
234
+ 442.535458694458,
235
+ 425.76794429016115,
236
+ 416.6028264923096,
237
+ 416.2537903900147,
238
+ 416.3216004333496,
239
+ 405.91741243743894,
240
+ 401.3154751739502,
241
+ 403.56236766052245,
242
+ 399.83712251281736,
243
+ 397.13397619628904,
244
+ 396.69007269287107,
245
+ 389.8687892990112,
246
+ 386.671229675293,
247
+ 390.19565746307376,
248
+ 387.47164192962646,
249
+ 384.5350112533569,
250
+ 385.34569120025634,
251
+ 381.3625469284058,
252
+ 380.5953342590332,
253
+ 376.2190606918335,
254
+ 378.44821893310547
255
+ ],
256
+ "gnn_test_mae_curve": [
257
+ 25.89111141204834,
258
+ 22.817488927841186,
259
+ 19.102868838310243,
260
+ 21.260897178649902,
261
+ 16.00875702381134,
262
+ 15.999692721366882,
263
+ 14.555557656288148,
264
+ 13.622318716049195,
265
+ 13.0450461602211,
266
+ 13.296297969818115,
267
+ 12.376682465076447,
268
+ 13.256674709320068,
269
+ 11.923482534885407,
270
+ 11.381103422641754,
271
+ 13.629612107276916,
272
+ 13.775573563575744,
273
+ 12.455035951137543,
274
+ 13.674895765781402,
275
+ 12.645530993938445,
276
+ 12.839997906684875,
277
+ 12.782445096969605,
278
+ 11.498445341587066,
279
+ 12.44089034318924,
280
+ 10.853419225215912,
281
+ 11.889822478294372,
282
+ 11.540131111145019,
283
+ 12.30764417886734,
284
+ 10.73738386631012,
285
+ 10.981562974452972,
286
+ 10.347342171669005
287
+ ],
288
+ "mlp_test_mae_curve": [
289
+ 28.691825714111328,
290
+ 29.088216686248778,
291
+ 27.926491804122925,
292
+ 32.548833179473874,
293
+ 28.55751530647278,
294
+ 27.89367533683777,
295
+ 28.729960765838623,
296
+ 29.485910148620604,
297
+ 28.418713645935057,
298
+ 29.061994075775146,
299
+ 27.86555823326111,
300
+ 27.882053699493408,
301
+ 28.62539842605591,
302
+ 28.374376544952394,
303
+ 27.627659730911255,
304
+ 29.199770755767823,
305
+ 26.9179744720459,
306
+ 29.280858907699585,
307
+ 28.915042276382447,
308
+ 28.664446725845337,
309
+ 28.888797369003296,
310
+ 29.49649586677551,
311
+ 29.45292121887207,
312
+ 28.840624055862428,
313
+ 27.16323224067688,
314
+ 27.801621007919312,
315
+ 28.310747117996215,
316
+ 28.82351138114929,
317
+ 30.00698434829712,
318
+ 28.483039016723634
319
+ ]
320
+ }
321
+ },
322
+ "config": {
323
+ "n_train": 500,
324
+ "n_test": 200,
325
+ "hidden": 64,
326
+ "epochs": 30,
327
+ "lr": 0.003
328
+ },
329
+ "elapsed_min": 4.006023410956065
330
  }
FINAL_SUBMIT/receipts/R6_PROVIDER_v1_F1.json CHANGED
@@ -1,1756 +1,1756 @@
1
- {
2
- "graphs": {
3
- "easy": {
4
- "n_nodes": 12,
5
- "n_edges": 10,
6
- "gnn_final": {
7
- "acc": 1.0,
8
- "precision": 1.0,
9
- "recall": 1.0,
10
- "f1": 1.0
11
- },
12
- "baseline_direct_neighbors": {
13
- "acc": 0.8258333333333333,
14
- "precision": 1.0,
15
- "recall": 0.6352530541012217,
16
- "f1": 0.7769477054429028
17
- },
18
- "improvement_f1_pp": 22.305229455709718,
19
- "train_loss_curve": [
20
- 0.10601958807871187,
21
- 0.00014574478766241308,
22
- 2.1336230871288145e-05,
23
- 5.904760447787133e-06,
24
- 0.014828034023753519,
25
- 0.0001365676538936252,
26
- 2.800940909035432e-05,
27
- 7.873948834791846e-06,
28
- 2.40824965675521e-06,
29
- 7.439197035413468e-07,
30
- 2.349434055591839e-07,
31
- 8.035365056026132e-08,
32
- 1.866763376779131e-08,
33
- 6.7128299592450774e-09,
34
- 3.606812599319898e-09,
35
- 2.4320182903440704e-09,
36
- 1.5445408799196548e-09,
37
- 0.03198392186360504,
38
- 1.3277981027858794e-05,
39
- 7.040849976128097e-06,
40
- 2.0380432214083175e-06,
41
- 5.154616233541851e-07,
42
- 0.017213296287886225,
43
- 0.00023569030925164338,
44
- 2.4805963813645227e-05,
45
- 6.058055528068272e-06,
46
- 1.8203820033098038e-06,
47
- 6.043328515907098e-07,
48
- 2.1225388103874568e-07,
49
- 7.437462508802039e-08,
50
- 1.902343076246039e-08,
51
- 6.527784956639485e-09,
52
- 3.3294667175720776e-09,
53
- 1.9615958442567566e-09,
54
- 0.010902570914775889,
55
- 2.806348171776314e-05,
56
- 7.667120790626038e-06,
57
- 2.582107717285551e-06,
58
- 9.129105348027232e-07,
59
- 3.106581481139294e-07,
60
- 1.0230859844032431e-07,
61
- 2.725160428237702e-08,
62
- 8.880124408068363e-09,
63
- 4.4200613740675046e-09,
64
- 2.8600379247657045e-09,
65
- 2.2151315261330923e-09,
66
- 1.7114610773887693e-09,
67
- 1.4000422095074408e-09,
68
- 1.0463116296276038e-09,
69
- 6.4079628731738e-10,
70
- 0.02516633728286725,
71
- 0.00012813284900565014,
72
- 2.3232634050379803e-05,
73
- 7.066120872802589e-06,
74
- 2.311430617913936e-06,
75
- 7.920952698295068e-07,
76
- 2.5278086959691613e-07,
77
- 7.818242851037627e-08,
78
- 1.983640248580842e-08,
79
- 7.863145182916767e-09,
80
- 5.0701508055233275e-09,
81
- 4.364776342121379e-09,
82
- 3.937454630286758e-09,
83
- 2.518706138457294e-09,
84
- 1.9815549914984234e-09,
85
- 0.018349960519401222,
86
- 7.85511791638533e-05,
87
- 2.0063992723006376e-05,
88
- 6.210748974664104e-06,
89
- 1.9043317207399904e-06,
90
- 6.112533347568437e-07,
91
- 2.0612900407184615e-07,
92
- 6.247272126631417e-08,
93
- 1.5818333928198573e-08,
94
- 5.678499110562204e-09,
95
- 2.927658185385007e-09,
96
- 2.2895658619235268e-09,
97
- 1.9812523096841366e-09,
98
- 1.418338779821114e-09,
99
- 9.94527561841937e-10
100
- ],
101
- "test_metric_curve": [
102
- {
103
- "acc": 1.0,
104
- "precision": 1.0,
105
- "recall": 1.0,
106
- "f1": 1.0
107
- },
108
- {
109
- "acc": 1.0,
110
- "precision": 1.0,
111
- "recall": 1.0,
112
- "f1": 1.0
113
- },
114
- {
115
- "acc": 1.0,
116
- "precision": 1.0,
117
- "recall": 1.0,
118
- "f1": 1.0
119
- },
120
- {
121
- "acc": 1.0,
122
- "precision": 1.0,
123
- "recall": 1.0,
124
- "f1": 1.0
125
- },
126
- {
127
- "acc": 1.0,
128
- "precision": 1.0,
129
- "recall": 1.0,
130
- "f1": 1.0
131
- },
132
- {
133
- "acc": 1.0,
134
- "precision": 1.0,
135
- "recall": 1.0,
136
- "f1": 1.0
137
- },
138
- {
139
- "acc": 1.0,
140
- "precision": 1.0,
141
- "recall": 1.0,
142
- "f1": 1.0
143
- },
144
- {
145
- "acc": 1.0,
146
- "precision": 1.0,
147
- "recall": 1.0,
148
- "f1": 1.0
149
- },
150
- {
151
- "acc": 1.0,
152
- "precision": 1.0,
153
- "recall": 1.0,
154
- "f1": 1.0
155
- },
156
- {
157
- "acc": 1.0,
158
- "precision": 1.0,
159
- "recall": 1.0,
160
- "f1": 1.0
161
- },
162
- {
163
- "acc": 1.0,
164
- "precision": 1.0,
165
- "recall": 1.0,
166
- "f1": 1.0
167
- },
168
- {
169
- "acc": 1.0,
170
- "precision": 1.0,
171
- "recall": 1.0,
172
- "f1": 1.0
173
- },
174
- {
175
- "acc": 1.0,
176
- "precision": 1.0,
177
- "recall": 1.0,
178
- "f1": 1.0
179
- },
180
- {
181
- "acc": 1.0,
182
- "precision": 1.0,
183
- "recall": 1.0,
184
- "f1": 1.0
185
- },
186
- {
187
- "acc": 1.0,
188
- "precision": 1.0,
189
- "recall": 1.0,
190
- "f1": 1.0
191
- },
192
- {
193
- "acc": 1.0,
194
- "precision": 1.0,
195
- "recall": 1.0,
196
- "f1": 1.0
197
- },
198
- {
199
- "acc": 1.0,
200
- "precision": 1.0,
201
- "recall": 1.0,
202
- "f1": 1.0
203
- },
204
- {
205
- "acc": 1.0,
206
- "precision": 1.0,
207
- "recall": 1.0,
208
- "f1": 1.0
209
- },
210
- {
211
- "acc": 1.0,
212
- "precision": 1.0,
213
- "recall": 1.0,
214
- "f1": 1.0
215
- },
216
- {
217
- "acc": 1.0,
218
- "precision": 1.0,
219
- "recall": 1.0,
220
- "f1": 1.0
221
- },
222
- {
223
- "acc": 1.0,
224
- "precision": 1.0,
225
- "recall": 1.0,
226
- "f1": 1.0
227
- },
228
- {
229
- "acc": 1.0,
230
- "precision": 1.0,
231
- "recall": 1.0,
232
- "f1": 1.0
233
- },
234
- {
235
- "acc": 1.0,
236
- "precision": 1.0,
237
- "recall": 1.0,
238
- "f1": 1.0
239
- },
240
- {
241
- "acc": 1.0,
242
- "precision": 1.0,
243
- "recall": 1.0,
244
- "f1": 1.0
245
- },
246
- {
247
- "acc": 1.0,
248
- "precision": 1.0,
249
- "recall": 1.0,
250
- "f1": 1.0
251
- },
252
- {
253
- "acc": 1.0,
254
- "precision": 1.0,
255
- "recall": 1.0,
256
- "f1": 1.0
257
- },
258
- {
259
- "acc": 1.0,
260
- "precision": 1.0,
261
- "recall": 1.0,
262
- "f1": 1.0
263
- },
264
- {
265
- "acc": 1.0,
266
- "precision": 1.0,
267
- "recall": 1.0,
268
- "f1": 1.0
269
- },
270
- {
271
- "acc": 1.0,
272
- "precision": 1.0,
273
- "recall": 1.0,
274
- "f1": 1.0
275
- },
276
- {
277
- "acc": 1.0,
278
- "precision": 1.0,
279
- "recall": 1.0,
280
- "f1": 1.0
281
- },
282
- {
283
- "acc": 1.0,
284
- "precision": 1.0,
285
- "recall": 1.0,
286
- "f1": 1.0
287
- },
288
- {
289
- "acc": 1.0,
290
- "precision": 1.0,
291
- "recall": 1.0,
292
- "f1": 1.0
293
- },
294
- {
295
- "acc": 1.0,
296
- "precision": 1.0,
297
- "recall": 1.0,
298
- "f1": 1.0
299
- },
300
- {
301
- "acc": 1.0,
302
- "precision": 1.0,
303
- "recall": 1.0,
304
- "f1": 1.0
305
- },
306
- {
307
- "acc": 1.0,
308
- "precision": 1.0,
309
- "recall": 1.0,
310
- "f1": 1.0
311
- },
312
- {
313
- "acc": 1.0,
314
- "precision": 1.0,
315
- "recall": 1.0,
316
- "f1": 1.0
317
- },
318
- {
319
- "acc": 1.0,
320
- "precision": 1.0,
321
- "recall": 1.0,
322
- "f1": 1.0
323
- },
324
- {
325
- "acc": 1.0,
326
- "precision": 1.0,
327
- "recall": 1.0,
328
- "f1": 1.0
329
- },
330
- {
331
- "acc": 1.0,
332
- "precision": 1.0,
333
- "recall": 1.0,
334
- "f1": 1.0
335
- },
336
- {
337
- "acc": 1.0,
338
- "precision": 1.0,
339
- "recall": 1.0,
340
- "f1": 1.0
341
- },
342
- {
343
- "acc": 1.0,
344
- "precision": 1.0,
345
- "recall": 1.0,
346
- "f1": 1.0
347
- },
348
- {
349
- "acc": 1.0,
350
- "precision": 1.0,
351
- "recall": 1.0,
352
- "f1": 1.0
353
- },
354
- {
355
- "acc": 1.0,
356
- "precision": 1.0,
357
- "recall": 1.0,
358
- "f1": 1.0
359
- },
360
- {
361
- "acc": 1.0,
362
- "precision": 1.0,
363
- "recall": 1.0,
364
- "f1": 1.0
365
- },
366
- {
367
- "acc": 1.0,
368
- "precision": 1.0,
369
- "recall": 1.0,
370
- "f1": 1.0
371
- },
372
- {
373
- "acc": 1.0,
374
- "precision": 1.0,
375
- "recall": 1.0,
376
- "f1": 1.0
377
- },
378
- {
379
- "acc": 1.0,
380
- "precision": 1.0,
381
- "recall": 1.0,
382
- "f1": 1.0
383
- },
384
- {
385
- "acc": 1.0,
386
- "precision": 1.0,
387
- "recall": 1.0,
388
- "f1": 1.0
389
- },
390
- {
391
- "acc": 1.0,
392
- "precision": 1.0,
393
- "recall": 1.0,
394
- "f1": 1.0
395
- },
396
- {
397
- "acc": 1.0,
398
- "precision": 1.0,
399
- "recall": 1.0,
400
- "f1": 1.0
401
- },
402
- {
403
- "acc": 1.0,
404
- "precision": 1.0,
405
- "recall": 1.0,
406
- "f1": 1.0
407
- },
408
- {
409
- "acc": 1.0,
410
- "precision": 1.0,
411
- "recall": 1.0,
412
- "f1": 1.0
413
- },
414
- {
415
- "acc": 1.0,
416
- "precision": 1.0,
417
- "recall": 1.0,
418
- "f1": 1.0
419
- },
420
- {
421
- "acc": 1.0,
422
- "precision": 1.0,
423
- "recall": 1.0,
424
- "f1": 1.0
425
- },
426
- {
427
- "acc": 1.0,
428
- "precision": 1.0,
429
- "recall": 1.0,
430
- "f1": 1.0
431
- },
432
- {
433
- "acc": 1.0,
434
- "precision": 1.0,
435
- "recall": 1.0,
436
- "f1": 1.0
437
- },
438
- {
439
- "acc": 1.0,
440
- "precision": 1.0,
441
- "recall": 1.0,
442
- "f1": 1.0
443
- },
444
- {
445
- "acc": 1.0,
446
- "precision": 1.0,
447
- "recall": 1.0,
448
- "f1": 1.0
449
- },
450
- {
451
- "acc": 1.0,
452
- "precision": 1.0,
453
- "recall": 1.0,
454
- "f1": 1.0
455
- },
456
- {
457
- "acc": 1.0,
458
- "precision": 1.0,
459
- "recall": 1.0,
460
- "f1": 1.0
461
- },
462
- {
463
- "acc": 1.0,
464
- "precision": 1.0,
465
- "recall": 1.0,
466
- "f1": 1.0
467
- },
468
- {
469
- "acc": 1.0,
470
- "precision": 1.0,
471
- "recall": 1.0,
472
- "f1": 1.0
473
- },
474
- {
475
- "acc": 1.0,
476
- "precision": 1.0,
477
- "recall": 1.0,
478
- "f1": 1.0
479
- },
480
- {
481
- "acc": 1.0,
482
- "precision": 1.0,
483
- "recall": 1.0,
484
- "f1": 1.0
485
- },
486
- {
487
- "acc": 1.0,
488
- "precision": 1.0,
489
- "recall": 1.0,
490
- "f1": 1.0
491
- },
492
- {
493
- "acc": 1.0,
494
- "precision": 1.0,
495
- "recall": 1.0,
496
- "f1": 1.0
497
- },
498
- {
499
- "acc": 1.0,
500
- "precision": 1.0,
501
- "recall": 1.0,
502
- "f1": 1.0
503
- },
504
- {
505
- "acc": 1.0,
506
- "precision": 1.0,
507
- "recall": 1.0,
508
- "f1": 1.0
509
- },
510
- {
511
- "acc": 1.0,
512
- "precision": 1.0,
513
- "recall": 1.0,
514
- "f1": 1.0
515
- },
516
- {
517
- "acc": 1.0,
518
- "precision": 1.0,
519
- "recall": 1.0,
520
- "f1": 1.0
521
- },
522
- {
523
- "acc": 1.0,
524
- "precision": 1.0,
525
- "recall": 1.0,
526
- "f1": 1.0
527
- },
528
- {
529
- "acc": 1.0,
530
- "precision": 1.0,
531
- "recall": 1.0,
532
- "f1": 1.0
533
- },
534
- {
535
- "acc": 1.0,
536
- "precision": 1.0,
537
- "recall": 1.0,
538
- "f1": 1.0
539
- },
540
- {
541
- "acc": 1.0,
542
- "precision": 1.0,
543
- "recall": 1.0,
544
- "f1": 1.0
545
- },
546
- {
547
- "acc": 1.0,
548
- "precision": 1.0,
549
- "recall": 1.0,
550
- "f1": 1.0
551
- },
552
- {
553
- "acc": 1.0,
554
- "precision": 1.0,
555
- "recall": 1.0,
556
- "f1": 1.0
557
- },
558
- {
559
- "acc": 1.0,
560
- "precision": 1.0,
561
- "recall": 1.0,
562
- "f1": 1.0
563
- },
564
- {
565
- "acc": 1.0,
566
- "precision": 1.0,
567
- "recall": 1.0,
568
- "f1": 1.0
569
- },
570
- {
571
- "acc": 1.0,
572
- "precision": 1.0,
573
- "recall": 1.0,
574
- "f1": 1.0
575
- },
576
- {
577
- "acc": 1.0,
578
- "precision": 1.0,
579
- "recall": 1.0,
580
- "f1": 1.0
581
- }
582
- ]
583
- },
584
- "medium": {
585
- "n_nodes": 25,
586
- "n_edges": 27,
587
- "gnn_final": {
588
- "acc": 0.9914,
589
- "precision": 0.982778750729714,
590
- "recall": 0.9920447849145551,
591
- "f1": 0.9873900293255131
592
- },
593
- "baseline_direct_neighbors": {
594
- "acc": 0.8301,
595
- "precision": 1.0,
596
- "recall": 0.4994107248084856,
597
- "f1": 0.6661426606405974
598
- },
599
- "improvement_f1_pp": 32.124736868491574,
600
- "train_loss_curve": [
601
- 0.18512494587464606,
602
- 0.05774239192842651,
603
- 0.04035148839658183,
604
- 0.03685507851154424,
605
- 0.034016887983169666,
606
- 0.03193854558186021,
607
- 0.030314448321928544,
608
- 0.028890588828011224,
609
- 0.02627120438580584,
610
- 0.02676936000857496,
611
- 0.02735587336003725,
612
- 0.024704556535801756,
613
- 0.023389738032454397,
614
- 0.02484239745095036,
615
- 0.022598365899086623,
616
- 0.022097759216314333,
617
- 0.021880711925624425,
618
- 0.023672257099118552,
619
- 0.021815840122002862,
620
- 0.021538631150760885,
621
- 0.021590486920307173,
622
- 0.020993219244996,
623
- 0.021660113581202914,
624
- 0.02028199757042485,
625
- 0.021449406110984975,
626
- 0.02049649202735325,
627
- 0.02005596899437715,
628
- 0.02060316097080978,
629
- 0.02082035162168178,
630
- 0.020935066080168856,
631
- 0.0209964800781561,
632
- 0.019652295691733542,
633
- 0.020470858438760543,
634
- 0.020456047435481396,
635
- 0.020529603496513553,
636
- 0.019996260003822708,
637
- 0.021328506347361064,
638
- 0.019778630244522907,
639
- 0.01971426555108731,
640
- 0.019847191254493045,
641
- 0.01984119418810368,
642
- 0.02021396374486143,
643
- 0.01946370021810413,
644
- 0.019111871498224214,
645
- 0.019667785586758944,
646
- 0.021675049597691873,
647
- 0.01897557202284267,
648
- 0.01971483370839516,
649
- 0.01965866965101487,
650
- 0.01936112277971507,
651
- 0.01895255452432814,
652
- 0.02035098125927439,
653
- 0.01909720691408324,
654
- 0.019500281907226687,
655
- 0.019117790717674256,
656
- 0.018927754213147425,
657
- 0.020313845976115717,
658
- 0.019341792678655486,
659
- 0.01890229735773205,
660
- 0.019833170414518056,
661
- 0.01948640772390163,
662
- 0.019305320678627013,
663
- 0.019213381035159603,
664
- 0.020478221997059808,
665
- 0.01936127331570382,
666
- 0.019158014420631225,
667
- 0.019090143173694583,
668
- 0.020291763241906225,
669
- 0.01900654871721499,
670
- 0.019815083033949698,
671
- 0.019103285589502736,
672
- 0.018360809753397392,
673
- 0.019985065603578676,
674
- 0.01858524212906661,
675
- 0.02056734084818314,
676
- 0.01856864124721938,
677
- 0.01852369899036554,
678
- 0.018906581267301003,
679
- 0.01927234342475787,
680
- 0.018721831301170885
681
- ],
682
- "test_metric_curve": [
683
- {
684
- "acc": 0.9816,
685
- "precision": 0.9819819819819819,
686
- "recall": 0.9634649381261049,
687
- "f1": 0.9726353361094586
688
- },
689
- {
690
- "acc": 0.9885,
691
- "precision": 0.9742551345096905,
692
- "recall": 0.9923394225103123,
693
- "f1": 0.9832141293241862
694
- },
695
- {
696
- "acc": 0.988,
697
- "precision": 0.9720299884659747,
698
- "recall": 0.993223335297584,
699
- "f1": 0.9825123870591663
700
- },
701
- {
702
- "acc": 0.9892,
703
- "precision": 0.986094674556213,
704
- "recall": 0.9820271066588097,
705
- "f1": 0.9840566873339238
706
- },
707
- {
708
- "acc": 0.9916,
709
- "precision": 0.9825072886297376,
710
- "recall": 0.9929286977018268,
711
- "f1": 0.9876905041031652
712
- },
713
- {
714
- "acc": 0.9913,
715
- "precision": 0.9824919754887657,
716
- "recall": 0.9920447849145551,
717
- "f1": 0.9872452719542588
718
- },
719
- {
720
- "acc": 0.9909,
721
- "precision": 0.9847373055474024,
722
- "recall": 0.9885091337654685,
723
- "f1": 0.9866196147625349
724
- },
725
- {
726
- "acc": 0.9857,
727
- "precision": 0.9954282231027126,
728
- "recall": 0.9622863877430761,
729
- "f1": 0.9785767790262172
730
- },
731
- {
732
- "acc": 0.9882,
733
- "precision": 0.9761627906976744,
734
- "recall": 0.9893930465527401,
735
- "f1": 0.9827333918642083
736
- },
737
- {
738
- "acc": 0.9912,
739
- "precision": 0.9833333333333333,
740
- "recall": 0.9908662345315262,
741
- "f1": 0.9870854123862635
742
- },
743
- {
744
- "acc": 0.9911,
745
- "precision": 0.9864586399764498,
746
- "recall": 0.9873305833824396,
747
- "f1": 0.9868944190840818
748
- },
749
- {
750
- "acc": 0.9842,
751
- "precision": 0.997539975399754,
752
- "recall": 0.9558043606364172,
753
- "f1": 0.9762263015347576
754
- },
755
- {
756
- "acc": 0.9872,
757
- "precision": 0.9936517533252721,
758
- "recall": 0.9684737772539777,
759
- "f1": 0.9809012235153686
760
- },
761
- {
762
- "acc": 0.9919,
763
- "precision": 0.9825225750072822,
764
- "recall": 0.9938126104890984,
765
- "f1": 0.9881353449538597
766
- },
767
- {
768
- "acc": 0.9905,
769
- "precision": 0.9864346800353878,
770
- "recall": 0.9855627578078963,
771
- "f1": 0.9859985261606485
772
- },
773
- {
774
- "acc": 0.9903,
775
- "precision": 0.9867139061116031,
776
- "recall": 0.9846788450206246,
777
- "f1": 0.9856953251732783
778
- },
779
- {
780
- "acc": 0.9912,
781
- "precision": 0.9833333333333333,
782
- "recall": 0.9908662345315262,
783
- "f1": 0.9870854123862635
784
- },
785
- {
786
- "acc": 0.9917,
787
- "precision": 0.9827938174394867,
788
- "recall": 0.9929286977018268,
789
- "f1": 0.9878352630807563
790
- },
791
- {
792
- "acc": 0.9914,
793
- "precision": 0.9822157434402332,
794
- "recall": 0.9926340601060696,
795
- "f1": 0.9873974208675265
796
- },
797
- {
798
- "acc": 0.9914,
799
- "precision": 0.9833430742255991,
800
- "recall": 0.9914555097230406,
801
- "f1": 0.9873826291079812
802
- },
803
- {
804
- "acc": 0.9908,
805
- "precision": 0.986446670595168,
806
- "recall": 0.986446670595168,
807
- "f1": 0.986446670595168
808
- },
809
- {
810
- "acc": 0.9908,
811
- "precision": 0.986446670595168,
812
- "recall": 0.986446670595168,
813
- "f1": 0.986446670595168
814
- },
815
- {
816
- "acc": 0.9909,
817
- "precision": 0.9858781994704324,
818
- "recall": 0.9873305833824396,
819
- "f1": 0.9866038569115266
820
- },
821
- {
822
- "acc": 0.9912,
823
- "precision": 0.9833333333333333,
824
- "recall": 0.9908662345315262,
825
- "f1": 0.9870854123862635
826
- },
827
- {
828
- "acc": 0.9915,
829
- "precision": 0.9827837758972863,
830
- "recall": 0.9923394225103123,
831
- "f1": 0.9875384840932414
832
- },
833
- {
834
- "acc": 0.9907,
835
- "precision": 0.9873043991733097,
836
- "recall": 0.985268120212139,
837
- "f1": 0.9862852086712873
838
- },
839
- {
840
- "acc": 0.9919,
841
- "precision": 0.9825225750072822,
842
- "recall": 0.9938126104890984,
843
- "f1": 0.9881353449538597
844
- },
845
- {
846
- "acc": 0.9914,
847
- "precision": 0.982778750729714,
848
- "recall": 0.9920447849145551,
849
- "f1": 0.9873900293255131
850
- },
851
- {
852
- "acc": 0.9916,
853
- "precision": 0.9777713625866051,
854
- "recall": 0.9979375368296994,
855
- "f1": 0.9877515310586177
856
- },
857
- {
858
- "acc": 0.9901,
859
- "precision": 0.9869937924918711,
860
- "recall": 0.983794932233353,
861
- "f1": 0.9853917662682603
862
- },
863
- {
864
- "acc": 0.9914,
865
- "precision": 0.982778750729714,
866
- "recall": 0.9920447849145551,
867
- "f1": 0.9873900293255131
868
- },
869
- {
870
- "acc": 0.9904,
871
- "precision": 0.9872931442080378,
872
- "recall": 0.9843842074248674,
873
- "f1": 0.9858365299498378
874
- },
875
- {
876
- "acc": 0.9914,
877
- "precision": 0.982778750729714,
878
- "recall": 0.9920447849145551,
879
- "f1": 0.9873900293255131
880
- },
881
- {
882
- "acc": 0.9887,
883
- "precision": 0.993680409268733,
884
- "recall": 0.9728933411903359,
885
- "f1": 0.9831770135477147
886
- },
887
- {
888
- "acc": 0.9912,
889
- "precision": 0.9833333333333333,
890
- "recall": 0.9908662345315262,
891
- "f1": 0.9870854123862635
892
- },
893
- {
894
- "acc": 0.9913,
895
- "precision": 0.983338205203157,
896
- "recall": 0.9911608721272834,
897
- "f1": 0.9872340425531914
898
- },
899
- {
900
- "acc": 0.9915,
901
- "precision": 0.9827837758972863,
902
- "recall": 0.9923394225103123,
903
- "f1": 0.9875384840932414
904
- },
905
- {
906
- "acc": 0.991,
907
- "precision": 0.9858823529411764,
908
- "recall": 0.9876252209781968,
909
- "f1": 0.986753017368266
910
- },
911
- {
912
- "acc": 0.9905,
913
- "precision": 0.9870091526424565,
914
- "recall": 0.9849734826163818,
915
- "f1": 0.9859902669222829
916
- },
917
- {
918
- "acc": 0.9912,
919
- "precision": 0.9830508474576272,
920
- "recall": 0.9911608721272834,
921
- "f1": 0.9870892018779343
922
- },
923
- {
924
- "acc": 0.9911,
925
- "precision": 0.9822001750802452,
926
- "recall": 0.9917501473187978,
927
- "f1": 0.9869520598152763
928
- },
929
- {
930
- "acc": 0.9901,
931
- "precision": 0.9887273805992287,
932
- "recall": 0.9820271066588097,
933
- "f1": 0.9853658536585367
934
- },
935
- {
936
- "acc": 0.9914,
937
- "precision": 0.982778750729714,
938
- "recall": 0.9920447849145551,
939
- "f1": 0.9873900293255131
940
- },
941
- {
942
- "acc": 0.9907,
943
- "precision": 0.9833089311859443,
944
- "recall": 0.9893930465527401,
945
- "f1": 0.9863416066970185
946
- },
947
- {
948
- "acc": 0.9914,
949
- "precision": 0.982778750729714,
950
- "recall": 0.9920447849145551,
951
- "f1": 0.9873900293255131
952
- },
953
- {
954
- "acc": 0.9908,
955
- "precision": 0.986446670595168,
956
- "recall": 0.986446670595168,
957
- "f1": 0.986446670595168
958
- },
959
- {
960
- "acc": 0.991,
961
- "precision": 0.9833235810415447,
962
- "recall": 0.9902769593400118,
963
- "f1": 0.9867880211391661
964
- },
965
- {
966
- "acc": 0.9912,
967
- "precision": 0.9833333333333333,
968
- "recall": 0.9908662345315262,
969
- "f1": 0.9870854123862635
970
- },
971
- {
972
- "acc": 0.9912,
973
- "precision": 0.9824868651488616,
974
- "recall": 0.9917501473187978,
975
- "f1": 0.9870967741935485
976
- },
977
- {
978
- "acc": 0.9909,
979
- "precision": 0.9838851450336947,
980
- "recall": 0.9893930465527401,
981
- "f1": 0.9866314088438372
982
- },
983
- {
984
- "acc": 0.9911,
985
- "precision": 0.9833284586136297,
986
- "recall": 0.990571596935769,
987
- "f1": 0.9869367385879936
988
- },
989
- {
990
- "acc": 0.9913,
991
- "precision": 0.9836209417958467,
992
- "recall": 0.9908662345315262,
993
- "f1": 0.9872302950242183
994
- },
995
- {
996
- "acc": 0.9914,
997
- "precision": 0.982778750729714,
998
- "recall": 0.9920447849145551,
999
- "f1": 0.9873900293255131
1000
- },
1001
- {
1002
- "acc": 0.991,
1003
- "precision": 0.9858823529411764,
1004
- "recall": 0.9876252209781968,
1005
- "f1": 0.986753017368266
1006
- },
1007
- {
1008
- "acc": 0.9912,
1009
- "precision": 0.9830508474576272,
1010
- "recall": 0.9911608721272834,
1011
- "f1": 0.9870892018779343
1012
- },
1013
- {
1014
- "acc": 0.9914,
1015
- "precision": 0.982778750729714,
1016
- "recall": 0.9920447849145551,
1017
- "f1": 0.9873900293255131
1018
- },
1019
- {
1020
- "acc": 0.9899,
1021
- "precision": 0.9875629256736749,
1022
- "recall": 0.9826163818503241,
1023
- "f1": 0.9850834440998375
1024
- },
1025
- {
1026
- "acc": 0.9908,
1027
- "precision": 0.986446670595168,
1028
- "recall": 0.986446670595168,
1029
- "f1": 0.986446670595168
1030
- },
1031
- {
1032
- "acc": 0.9915,
1033
- "precision": 0.9819399941741916,
1034
- "recall": 0.993223335297584,
1035
- "f1": 0.9875494360626923
1036
- },
1037
- {
1038
- "acc": 0.9914,
1039
- "precision": 0.982778750729714,
1040
- "recall": 0.9920447849145551,
1041
- "f1": 0.9873900293255131
1042
- },
1043
- {
1044
- "acc": 0.9906,
1045
- "precision": 0.987012987012987,
1046
- "recall": 0.985268120212139,
1047
- "f1": 0.9861397817752875
1048
- },
1049
- {
1050
- "acc": 0.9908,
1051
- "precision": 0.986446670595168,
1052
- "recall": 0.986446670595168,
1053
- "f1": 0.986446670595168
1054
- },
1055
- {
1056
- "acc": 0.991,
1057
- "precision": 0.9833235810415447,
1058
- "recall": 0.9902769593400118,
1059
- "f1": 0.9867880211391661
1060
- },
1061
- {
1062
- "acc": 0.9907,
1063
- "precision": 0.9864426760978485,
1064
- "recall": 0.9861520329994107,
1065
- "f1": 0.9862973331368794
1066
- },
1067
- {
1068
- "acc": 0.9912,
1069
- "precision": 0.9824868651488616,
1070
- "recall": 0.9917501473187978,
1071
- "f1": 0.9870967741935485
1072
- },
1073
- {
1074
- "acc": 0.9911,
1075
- "precision": 0.9833284586136297,
1076
- "recall": 0.990571596935769,
1077
- "f1": 0.9869367385879936
1078
- },
1079
- {
1080
- "acc": 0.9908,
1081
- "precision": 0.986446670595168,
1082
- "recall": 0.986446670595168,
1083
- "f1": 0.986446670595168
1084
- },
1085
- {
1086
- "acc": 0.9914,
1087
- "precision": 0.982778750729714,
1088
- "recall": 0.9920447849145551,
1089
- "f1": 0.9873900293255131
1090
- },
1091
- {
1092
- "acc": 0.9914,
1093
- "precision": 0.982778750729714,
1094
- "recall": 0.9920447849145551,
1095
- "f1": 0.9873900293255131
1096
- },
1097
- {
1098
- "acc": 0.9916,
1099
- "precision": 0.9825072886297376,
1100
- "recall": 0.9929286977018268,
1101
- "f1": 0.9876905041031652
1102
- },
1103
- {
1104
- "acc": 0.9914,
1105
- "precision": 0.982778750729714,
1106
- "recall": 0.9920447849145551,
1107
- "f1": 0.9873900293255131
1108
- },
1109
- {
1110
- "acc": 0.9914,
1111
- "precision": 0.982778750729714,
1112
- "recall": 0.9920447849145551,
1113
- "f1": 0.9873900293255131
1114
- },
1115
- {
1116
- "acc": 0.9913,
1117
- "precision": 0.9824919754887657,
1118
- "recall": 0.9920447849145551,
1119
- "f1": 0.9872452719542588
1120
- },
1121
- {
1122
- "acc": 0.9915,
1123
- "precision": 0.9827837758972863,
1124
- "recall": 0.9923394225103123,
1125
- "f1": 0.9875384840932414
1126
- },
1127
- {
1128
- "acc": 0.9916,
1129
- "precision": 0.9827887981330222,
1130
- "recall": 0.9926340601060696,
1131
- "f1": 0.9876868953386104
1132
- },
1133
- {
1134
- "acc": 0.9912,
1135
- "precision": 0.982768691588785,
1136
- "recall": 0.9914555097230406,
1137
- "f1": 0.9870929891463771
1138
- },
1139
- {
1140
- "acc": 0.9909,
1141
- "precision": 0.9833187006145742,
1142
- "recall": 0.9899823217442546,
1143
- "f1": 0.986639260020555
1144
- },
1145
- {
1146
- "acc": 0.9904,
1147
- "precision": 0.987005316007088,
1148
- "recall": 0.9846788450206246,
1149
- "f1": 0.9858407079646017
1150
- },
1151
- {
1152
- "acc": 0.9912,
1153
- "precision": 0.982768691588785,
1154
- "recall": 0.9914555097230406,
1155
- "f1": 0.9870929891463771
1156
- },
1157
- {
1158
- "acc": 0.9914,
1159
- "precision": 0.982778750729714,
1160
- "recall": 0.9920447849145551,
1161
- "f1": 0.9873900293255131
1162
- }
1163
- ]
1164
- },
1165
- "hard": {
1166
- "n_nodes": 40,
1167
- "n_edges": 44,
1168
- "gnn_final": {
1169
- "acc": 0.984,
1170
- "precision": 0.9533980582524272,
1171
- "recall": 0.9750354609929078,
1172
- "f1": 0.9640953716690043
1173
- },
1174
- "baseline_direct_neighbors": {
1175
- "acc": 0.88875,
1176
- "precision": 1.0,
1177
- "recall": 0.4950354609929078,
1178
- "f1": 0.6622390891840607
1179
- },
1180
- "improvement_f1_pp": 30.185628248494357,
1181
- "train_loss_curve": [
1182
- 0.15102637716173195,
1183
- 0.052633647776499856,
1184
- 0.04379157433440559,
1185
- 0.04003102573152864,
1186
- 0.03876525610721728,
1187
- 0.0369047760956164,
1188
- 0.036530632421345216,
1189
- 0.035830124779022296,
1190
- 0.0349417570647056,
1191
- 0.035263367522318734,
1192
- 0.03485661885762238,
1193
- 0.03493121563128079,
1194
- 0.032977926293009656,
1195
- 0.03394761107103841,
1196
- 0.033683306101149356,
1197
- 0.033089775294763965,
1198
- 0.0335856751325955,
1199
- 0.03272933466515315,
1200
- 0.032765767610715556,
1201
- 0.032717534617419004,
1202
- 0.03298612758413583,
1203
- 0.03169301031356008,
1204
- 0.0323142114428847,
1205
- 0.03186470089994691,
1206
- 0.032041587697027356,
1207
- 0.03211515340814367,
1208
- 0.032251973500227904,
1209
- 0.031999882343730864,
1210
- 0.03164813786187369,
1211
- 0.03160676156320551,
1212
- 0.031426732700598224,
1213
- 0.031241096474510413,
1214
- 0.03162557367896079,
1215
- 0.03154335625256863,
1216
- 0.03165931336190261,
1217
- 0.03097459732750576,
1218
- 0.03131493923773814,
1219
- 0.0311658642354123,
1220
- 0.030633534374135706,
1221
- 0.031252258909702506,
1222
- 0.030825211223787848,
1223
- 0.03053342323340803,
1224
- 0.030733022628217442,
1225
- 0.030747544990059397,
1226
- 0.030629911747484584,
1227
- 0.030457735169680745,
1228
- 0.03058615475141687,
1229
- 0.030597560634826552,
1230
- 0.030619746312839653,
1231
- 0.03066707000986935,
1232
- 0.03048766604950197,
1233
- 0.030287153372872126,
1234
- 0.0303783905812179,
1235
- 0.030595246432494606,
1236
- 0.03037994001944753,
1237
- 0.030246819483697437,
1238
- 0.03012882444020579,
1239
- 0.03024448805347947,
1240
- 0.030449683469725642,
1241
- 0.03048290506813919,
1242
- 0.030136575797458136,
1243
- 0.02994714516170643,
1244
- 0.030466000927322056,
1245
- 0.03019473605195526,
1246
- 0.02987939404982535,
1247
- 0.030137449657182513,
1248
- 0.030104370625325828,
1249
- 0.030588962311178875,
1250
- 0.029767145353838714,
1251
- 0.030284092916966984,
1252
- 0.03002391016312413,
1253
- 0.02992785992539757,
1254
- 0.030997538813613574,
1255
- 0.029848512160238896,
1256
- 0.030022954882957493,
1257
- 0.030052907403214705,
1258
- 0.02975074222330568,
1259
- 0.029870129619877842,
1260
- 0.02968558935528563,
1261
- 0.029977637300933564
1262
- ],
1263
- "test_metric_curve": [
1264
- {
1265
- "acc": 0.978625,
1266
- "precision": 0.9395194697597349,
1267
- "recall": 0.9651063829787234,
1268
- "f1": 0.9521410579345089
1269
- },
1270
- {
1271
- "acc": 0.9813125,
1272
- "precision": 0.9460730088495575,
1273
- "recall": 0.9704964539007093,
1274
- "f1": 0.9581291135695281
1275
- },
1276
- {
1277
- "acc": 0.982,
1278
- "precision": 0.9607173356105893,
1279
- "recall": 0.9574468085106383,
1280
- "f1": 0.959079283887468
1281
- },
1282
- {
1283
- "acc": 0.9805625,
1284
- "precision": 0.9649884259259259,
1285
- "recall": 0.9460992907801419,
1286
- "f1": 0.9554505085231342
1287
- },
1288
- {
1289
- "acc": 0.98225,
1290
- "precision": 0.952274630198158,
1291
- "recall": 0.9679432624113475,
1292
- "f1": 0.9600450196961171
1293
- },
1294
- {
1295
- "acc": 0.98225,
1296
- "precision": 0.9639278557114228,
1297
- "recall": 0.955177304964539,
1298
- "f1": 0.9595326303790253
1299
- },
1300
- {
1301
- "acc": 0.982375,
1302
- "precision": 0.9543289436817035,
1303
- "recall": 0.9662411347517731,
1304
- "f1": 0.9602480969833662
1305
- },
1306
- {
1307
- "acc": 0.98375,
1308
- "precision": 0.9543556916225995,
1309
- "recall": 0.9727659574468085,
1310
- "f1": 0.9634728856420341
1311
- },
1312
- {
1313
- "acc": 0.98125,
1314
- "precision": 0.9680696661828737,
1315
- "recall": 0.9460992907801419,
1316
- "f1": 0.9569583931133429
1317
- },
1318
- {
1319
- "acc": 0.983,
1320
- "precision": 0.965379113018598,
1321
- "recall": 0.9571631205673758,
1322
- "f1": 0.9612535612535612
1323
- },
1324
- {
1325
- "acc": 0.984375,
1326
- "precision": 0.9593267882187938,
1327
- "recall": 0.9702127659574468,
1328
- "f1": 0.9647390691114245
1329
- },
1330
- {
1331
- "acc": 0.9836875,
1332
- "precision": 0.9633730834752982,
1333
- "recall": 0.9625531914893617,
1334
- "f1": 0.9629629629629629
1335
- },
1336
- {
1337
- "acc": 0.98425,
1338
- "precision": 0.9507022858716607,
1339
- "recall": 0.979290780141844,
1340
- "f1": 0.9647847959754053
1341
- },
1342
- {
1343
- "acc": 0.983,
1344
- "precision": 0.9651129539605376,
1345
- "recall": 0.9574468085106383,
1346
- "f1": 0.9612645969809172
1347
- },
1348
- {
1349
- "acc": 0.9840625,
1350
- "precision": 0.9587542087542088,
1351
- "recall": 0.9693617021276596,
1352
- "f1": 0.9640287769784174
1353
- },
1354
- {
1355
- "acc": 0.9835625,
1356
- "precision": 0.966,
1357
- "recall": 0.9591489361702128,
1358
- "f1": 0.9625622775800712
1359
- },
1360
- {
1361
- "acc": 0.9839375,
1362
- "precision": 0.9600225225225225,
1363
- "recall": 0.9673758865248226,
1364
- "f1": 0.963685177335029
1365
- },
1366
- {
1367
- "acc": 0.98425,
1368
- "precision": 0.9405114401076716,
1369
- "recall": 0.9912056737588653,
1370
- "f1": 0.9651933701657459
1371
- },
1372
- {
1373
- "acc": 0.9814375,
1374
- "precision": 0.9686411149825784,
1375
- "recall": 0.9463829787234043,
1376
- "f1": 0.9573826947912182
1377
- },
1378
- {
1379
- "acc": 0.9831875,
1380
- "precision": 0.955512031337437,
1381
- "recall": 0.9687943262411347,
1382
- "f1": 0.9621073390618397
1383
- },
1384
- {
1385
- "acc": 0.9836875,
1386
- "precision": 0.9515771997786386,
1387
- "recall": 0.9756028368794326,
1388
- "f1": 0.9634402577391792
1389
- },
1390
- {
1391
- "acc": 0.9860625,
1392
- "precision": 0.9565818584070797,
1393
- "recall": 0.9812765957446808,
1394
- "f1": 0.9687718806889791
1395
- },
1396
- {
1397
- "acc": 0.9835625,
1398
- "precision": 0.9505524861878453,
1399
- "recall": 0.9761702127659575,
1400
- "f1": 0.9631910426871939
1401
- },
1402
- {
1403
- "acc": 0.9853125,
1404
- "precision": 0.9472539423599783,
1405
- "recall": 0.9883687943262411,
1406
- "f1": 0.9673747049840344
1407
- },
1408
- {
1409
- "acc": 0.9860625,
1410
- "precision": 0.9479110146500271,
1411
- "recall": 0.9912056737588653,
1412
- "f1": 0.9690750242684788
1413
- },
1414
- {
1415
- "acc": 0.982875,
1416
- "precision": 0.9645613032294942,
1417
- "recall": 0.9574468085106383,
1418
- "f1": 0.960990888382688
1419
- },
1420
- {
1421
- "acc": 0.9843125,
1422
- "precision": 0.9606077658975802,
1423
- "recall": 0.9685106382978723,
1424
- "f1": 0.9645430145500776
1425
- },
1426
- {
1427
- "acc": 0.9840625,
1428
- "precision": 0.9501651982378855,
1429
- "recall": 0.9790070921985815,
1430
- "f1": 0.9643705463182898
1431
- },
1432
- {
1433
- "acc": 0.983375,
1434
- "precision": 0.9568264648163723,
1435
- "recall": 0.9682269503546099,
1436
- "f1": 0.9624929498025946
1437
- },
1438
- {
1439
- "acc": 0.98375,
1440
- "precision": 0.9505934308584046,
1441
- "recall": 0.9770212765957447,
1442
- "f1": 0.9636261891438165
1443
- },
1444
- {
1445
- "acc": 0.9845,
1446
- "precision": 0.9555184876285794,
1447
- "recall": 0.9750354609929078,
1448
- "f1": 0.9651783206964335
1449
- },
1450
- {
1451
- "acc": 0.9830625,
1452
- "precision": 0.9557422969187676,
1453
- "recall": 0.9679432624113475,
1454
- "f1": 0.9618040873854828
1455
- },
1456
- {
1457
- "acc": 0.983375,
1458
- "precision": 0.9555493430248811,
1459
- "recall": 0.969645390070922,
1460
- "f1": 0.9625457617572516
1461
- },
1462
- {
1463
- "acc": 0.984,
1464
- "precision": 0.9511454595638973,
1465
- "recall": 0.9775886524822694,
1466
- "f1": 0.9641857862339116
1467
- },
1468
- {
1469
- "acc": 0.9845625,
1470
- "precision": 0.9611705120990434,
1471
- "recall": 0.9690780141843972,
1472
- "f1": 0.9651080661110327
1473
- },
1474
- {
1475
- "acc": 0.984625,
1476
- "precision": 0.9565580618212197,
1477
- "recall": 0.9744680851063829,
1478
- "f1": 0.9654300168634065
1479
- },
1480
- {
1481
- "acc": 0.9846875,
1482
- "precision": 0.9563160823594881,
1483
- "recall": 0.9750354609929078,
1484
- "f1": 0.9655850540806294
1485
- },
1486
- {
1487
- "acc": 0.9856875,
1488
- "precision": 0.9461288576069301,
1489
- "recall": 0.9914893617021276,
1490
- "f1": 0.9682781548690954
1491
- },
1492
- {
1493
- "acc": 0.9841875,
1494
- "precision": 0.9631936579841449,
1495
- "recall": 0.9651063829787234,
1496
- "f1": 0.9641490718435596
1497
- },
1498
- {
1499
- "acc": 0.98475,
1500
- "precision": 0.9560745065332221,
1501
- "recall": 0.9756028368794326,
1502
- "f1": 0.9657399606852007
1503
- },
1504
- {
1505
- "acc": 0.9836875,
1506
- "precision": 0.9558659217877095,
1507
- "recall": 0.9707801418439717,
1508
- "f1": 0.963265306122449
1509
- },
1510
- {
1511
- "acc": 0.9854375,
1512
- "precision": 0.9497267759562842,
1513
- "recall": 0.9860992907801418,
1514
- "f1": 0.967571329157968
1515
- },
1516
- {
1517
- "acc": 0.9844375,
1518
- "precision": 0.9502473886750962,
1519
- "recall": 0.9807092198581561,
1520
- "f1": 0.9652380287588997
1521
- },
1522
- {
1523
- "acc": 0.9844375,
1524
- "precision": 0.9601123595505618,
1525
- "recall": 0.969645390070922,
1526
- "f1": 0.9648553281580804
1527
- },
1528
- {
1529
- "acc": 0.98475,
1530
- "precision": 0.957345971563981,
1531
- "recall": 0.9741843971631206,
1532
- "f1": 0.9656917885264341
1533
- },
1534
- {
1535
- "acc": 0.983625,
1536
- "precision": 0.9543302701197438,
1537
- "recall": 0.9721985815602837,
1538
- "f1": 0.9631815626756605
1539
- },
1540
- {
1541
- "acc": 0.9839375,
1542
- "precision": 0.9526315789473684,
1543
- "recall": 0.9756028368794326,
1544
- "f1": 0.9639803784162578
1545
- },
1546
- {
1547
- "acc": 0.9833125,
1548
- "precision": 0.9509966777408638,
1549
- "recall": 0.9744680851063829,
1550
- "f1": 0.962589323245061
1551
- },
1552
- {
1553
- "acc": 0.98425,
1554
- "precision": 0.9499587572174869,
1555
- "recall": 0.9801418439716312,
1556
- "f1": 0.9648142976822116
1557
- },
1558
- {
1559
- "acc": 0.984375,
1560
- "precision": 0.9590692458648724,
1561
- "recall": 0.9704964539007093,
1562
- "f1": 0.9647490129723633
1563
- },
1564
- {
1565
- "acc": 0.9838125,
1566
- "precision": 0.9528563505268997,
1567
- "recall": 0.9747517730496454,
1568
- "f1": 0.9636797083158043
1569
- },
1570
- {
1571
- "acc": 0.9848125,
1572
- "precision": 0.9553274139844617,
1573
- "recall": 0.9767375886524823,
1574
- "f1": 0.965913872913452
1575
- },
1576
- {
1577
- "acc": 0.9836875,
1578
- "precision": 0.9551031790295594,
1579
- "recall": 0.9716312056737588,
1580
- "f1": 0.963296301504711
1581
- },
1582
- {
1583
- "acc": 0.9845,
1584
- "precision": 0.9429575560962422,
1585
- "recall": 0.9895035460992908,
1586
- "f1": 0.965669988925803
1587
- },
1588
- {
1589
- "acc": 0.982375,
1590
- "precision": 0.9589583923011605,
1591
- "recall": 0.9611347517730496,
1592
- "f1": 0.9600453386228394
1593
- },
1594
- {
1595
- "acc": 0.984375,
1596
- "precision": 0.962439988703756,
1597
- "recall": 0.9668085106382979,
1598
- "f1": 0.9646193037078971
1599
- },
1600
- {
1601
- "acc": 0.985625,
1602
- "precision": 0.9517411571154374,
1603
- "recall": 0.9846808510638297,
1604
- "f1": 0.967930842163971
1605
- },
1606
- {
1607
- "acc": 0.98325,
1608
- "precision": 0.9596387242449901,
1609
- "recall": 0.9645390070921985,
1610
- "f1": 0.9620826259196378
1611
- },
1612
- {
1613
- "acc": 0.984,
1614
- "precision": 0.9647426784191072,
1615
- "recall": 0.9625531914893617,
1616
- "f1": 0.9636466912808862
1617
- },
1618
- {
1619
- "acc": 0.984875,
1620
- "precision": 0.9586476669460743,
1621
- "recall": 0.9733333333333334,
1622
- "f1": 0.9659346846846848
1623
- },
1624
- {
1625
- "acc": 0.9850625,
1626
- "precision": 0.9581706636921361,
1627
- "recall": 0.9747517730496454,
1628
- "f1": 0.9663900998453102
1629
- },
1630
- {
1631
- "acc": 0.9836875,
1632
- "precision": 0.9493392070484582,
1633
- "recall": 0.9781560283687943,
1634
- "f1": 0.9635322062316614
1635
- },
1636
- {
1637
- "acc": 0.983125,
1638
- "precision": 0.9575484959235311,
1639
- "recall": 0.9662411347517731,
1640
- "f1": 0.9618751765038125
1641
- },
1642
- {
1643
- "acc": 0.98425,
1644
- "precision": 0.9492176777381279,
1645
- "recall": 0.9809929078014185,
1646
- "f1": 0.9648437500000001
1647
- },
1648
- {
1649
- "acc": 0.9826875,
1650
- "precision": 0.9672036823935558,
1651
- "recall": 0.953758865248227,
1652
- "f1": 0.960434223682331
1653
- },
1654
- {
1655
- "acc": 0.9845,
1656
- "precision": 0.961679346294731,
1657
- "recall": 0.9682269503546099,
1658
- "f1": 0.964942041277919
1659
- },
1660
- {
1661
- "acc": 0.9845,
1662
- "precision": 0.960900140646976,
1663
- "recall": 0.9690780141843972,
1664
- "f1": 0.9649717514124294
1665
- },
1666
- {
1667
- "acc": 0.984125,
1668
- "precision": 0.9623975120158327,
1669
- "recall": 0.9656737588652482,
1670
- "f1": 0.9640328518833192
1671
- },
1672
- {
1673
- "acc": 0.984875,
1674
- "precision": 0.9571150097465887,
1675
- "recall": 0.9750354609929078,
1676
- "f1": 0.9659921304103429
1677
- },
1678
- {
1679
- "acc": 0.984625,
1680
- "precision": 0.9598877980364656,
1681
- "recall": 0.9707801418439717,
1682
- "f1": 0.9653032440056418
1683
- },
1684
- {
1685
- "acc": 0.98375,
1686
- "precision": 0.9546087440824282,
1687
- "recall": 0.9724822695035461,
1688
- "f1": 0.9634626194491286
1689
- },
1690
- {
1691
- "acc": 0.984125,
1692
- "precision": 0.9501789154968345,
1693
- "recall": 0.979290780141844,
1694
- "f1": 0.9645152277172394
1695
- },
1696
- {
1697
- "acc": 0.9849375,
1698
- "precision": 0.9607182940516273,
1699
- "recall": 0.9713475177304964,
1700
- "f1": 0.9660036676541119
1701
- },
1702
- {
1703
- "acc": 0.984875,
1704
- "precision": 0.956606397774687,
1705
- "recall": 0.9756028368794326,
1706
- "f1": 0.9660112359550562
1707
- },
1708
- {
1709
- "acc": 0.984625,
1710
- "precision": 0.9570671870643992,
1711
- "recall": 0.9739007092198582,
1712
- "f1": 0.9654105736782902
1713
- },
1714
- {
1715
- "acc": 0.9849375,
1716
- "precision": 0.9584031267448353,
1717
- "recall": 0.9739007092198582,
1718
- "f1": 0.9660897706486562
1719
- },
1720
- {
1721
- "acc": 0.98375,
1722
- "precision": 0.9523413688002217,
1723
- "recall": 0.9750354609929078,
1724
- "f1": 0.9635548079618728
1725
- },
1726
- {
1727
- "acc": 0.984,
1728
- "precision": 0.9536497363308354,
1729
- "recall": 0.9747517730496454,
1730
- "f1": 0.9640852974186307
1731
- },
1732
- {
1733
- "acc": 0.98375,
1734
- "precision": 0.9505934308584046,
1735
- "recall": 0.9770212765957447,
1736
- "f1": 0.9636261891438165
1737
- },
1738
- {
1739
- "acc": 0.984,
1740
- "precision": 0.9533980582524272,
1741
- "recall": 0.9750354609929078,
1742
- "f1": 0.9640953716690043
1743
- }
1744
- ]
1745
- }
1746
- },
1747
- "config": {
1748
- "n_train": 2000,
1749
- "n_test": 400,
1750
- "hidden_dim": 64,
1751
- "epochs": 80,
1752
- "lr": 0.002,
1753
- "max_hops": 3
1754
- },
1755
- "elapsed_min": 21.402417866388955
1756
  }
 
1
+ {
2
+ "graphs": {
3
+ "easy": {
4
+ "n_nodes": 12,
5
+ "n_edges": 10,
6
+ "gnn_final": {
7
+ "acc": 1.0,
8
+ "precision": 1.0,
9
+ "recall": 1.0,
10
+ "f1": 1.0
11
+ },
12
+ "baseline_direct_neighbors": {
13
+ "acc": 0.8258333333333333,
14
+ "precision": 1.0,
15
+ "recall": 0.6352530541012217,
16
+ "f1": 0.7769477054429028
17
+ },
18
+ "improvement_f1_pp": 22.305229455709718,
19
+ "train_loss_curve": [
20
+ 0.10601958807871187,
21
+ 0.00014574478766241308,
22
+ 2.1336230871288145e-05,
23
+ 5.904760447787133e-06,
24
+ 0.014828034023753519,
25
+ 0.0001365676538936252,
26
+ 2.800940909035432e-05,
27
+ 7.873948834791846e-06,
28
+ 2.40824965675521e-06,
29
+ 7.439197035413468e-07,
30
+ 2.349434055591839e-07,
31
+ 8.035365056026132e-08,
32
+ 1.866763376779131e-08,
33
+ 6.7128299592450774e-09,
34
+ 3.606812599319898e-09,
35
+ 2.4320182903440704e-09,
36
+ 1.5445408799196548e-09,
37
+ 0.03198392186360504,
38
+ 1.3277981027858794e-05,
39
+ 7.040849976128097e-06,
40
+ 2.0380432214083175e-06,
41
+ 5.154616233541851e-07,
42
+ 0.017213296287886225,
43
+ 0.00023569030925164338,
44
+ 2.4805963813645227e-05,
45
+ 6.058055528068272e-06,
46
+ 1.8203820033098038e-06,
47
+ 6.043328515907098e-07,
48
+ 2.1225388103874568e-07,
49
+ 7.437462508802039e-08,
50
+ 1.902343076246039e-08,
51
+ 6.527784956639485e-09,
52
+ 3.3294667175720776e-09,
53
+ 1.9615958442567566e-09,
54
+ 0.010902570914775889,
55
+ 2.806348171776314e-05,
56
+ 7.667120790626038e-06,
57
+ 2.582107717285551e-06,
58
+ 9.129105348027232e-07,
59
+ 3.106581481139294e-07,
60
+ 1.0230859844032431e-07,
61
+ 2.725160428237702e-08,
62
+ 8.880124408068363e-09,
63
+ 4.4200613740675046e-09,
64
+ 2.8600379247657045e-09,
65
+ 2.2151315261330923e-09,
66
+ 1.7114610773887693e-09,
67
+ 1.4000422095074408e-09,
68
+ 1.0463116296276038e-09,
69
+ 6.4079628731738e-10,
70
+ 0.02516633728286725,
71
+ 0.00012813284900565014,
72
+ 2.3232634050379803e-05,
73
+ 7.066120872802589e-06,
74
+ 2.311430617913936e-06,
75
+ 7.920952698295068e-07,
76
+ 2.5278086959691613e-07,
77
+ 7.818242851037627e-08,
78
+ 1.983640248580842e-08,
79
+ 7.863145182916767e-09,
80
+ 5.0701508055233275e-09,
81
+ 4.364776342121379e-09,
82
+ 3.937454630286758e-09,
83
+ 2.518706138457294e-09,
84
+ 1.9815549914984234e-09,
85
+ 0.018349960519401222,
86
+ 7.85511791638533e-05,
87
+ 2.0063992723006376e-05,
88
+ 6.210748974664104e-06,
89
+ 1.9043317207399904e-06,
90
+ 6.112533347568437e-07,
91
+ 2.0612900407184615e-07,
92
+ 6.247272126631417e-08,
93
+ 1.5818333928198573e-08,
94
+ 5.678499110562204e-09,
95
+ 2.927658185385007e-09,
96
+ 2.2895658619235268e-09,
97
+ 1.9812523096841366e-09,
98
+ 1.418338779821114e-09,
99
+ 9.94527561841937e-10
100
+ ],
101
+ "test_metric_curve": [
102
+ {
103
+ "acc": 1.0,
104
+ "precision": 1.0,
105
+ "recall": 1.0,
106
+ "f1": 1.0
107
+ },
108
+ {
109
+ "acc": 1.0,
110
+ "precision": 1.0,
111
+ "recall": 1.0,
112
+ "f1": 1.0
113
+ },
114
+ {
115
+ "acc": 1.0,
116
+ "precision": 1.0,
117
+ "recall": 1.0,
118
+ "f1": 1.0
119
+ },
120
+ {
121
+ "acc": 1.0,
122
+ "precision": 1.0,
123
+ "recall": 1.0,
124
+ "f1": 1.0
125
+ },
126
+ {
127
+ "acc": 1.0,
128
+ "precision": 1.0,
129
+ "recall": 1.0,
130
+ "f1": 1.0
131
+ },
132
+ {
133
+ "acc": 1.0,
134
+ "precision": 1.0,
135
+ "recall": 1.0,
136
+ "f1": 1.0
137
+ },
138
+ {
139
+ "acc": 1.0,
140
+ "precision": 1.0,
141
+ "recall": 1.0,
142
+ "f1": 1.0
143
+ },
144
+ {
145
+ "acc": 1.0,
146
+ "precision": 1.0,
147
+ "recall": 1.0,
148
+ "f1": 1.0
149
+ },
150
+ {
151
+ "acc": 1.0,
152
+ "precision": 1.0,
153
+ "recall": 1.0,
154
+ "f1": 1.0
155
+ },
156
+ {
157
+ "acc": 1.0,
158
+ "precision": 1.0,
159
+ "recall": 1.0,
160
+ "f1": 1.0
161
+ },
162
+ {
163
+ "acc": 1.0,
164
+ "precision": 1.0,
165
+ "recall": 1.0,
166
+ "f1": 1.0
167
+ },
168
+ {
169
+ "acc": 1.0,
170
+ "precision": 1.0,
171
+ "recall": 1.0,
172
+ "f1": 1.0
173
+ },
174
+ {
175
+ "acc": 1.0,
176
+ "precision": 1.0,
177
+ "recall": 1.0,
178
+ "f1": 1.0
179
+ },
180
+ {
181
+ "acc": 1.0,
182
+ "precision": 1.0,
183
+ "recall": 1.0,
184
+ "f1": 1.0
185
+ },
186
+ {
187
+ "acc": 1.0,
188
+ "precision": 1.0,
189
+ "recall": 1.0,
190
+ "f1": 1.0
191
+ },
192
+ {
193
+ "acc": 1.0,
194
+ "precision": 1.0,
195
+ "recall": 1.0,
196
+ "f1": 1.0
197
+ },
198
+ {
199
+ "acc": 1.0,
200
+ "precision": 1.0,
201
+ "recall": 1.0,
202
+ "f1": 1.0
203
+ },
204
+ {
205
+ "acc": 1.0,
206
+ "precision": 1.0,
207
+ "recall": 1.0,
208
+ "f1": 1.0
209
+ },
210
+ {
211
+ "acc": 1.0,
212
+ "precision": 1.0,
213
+ "recall": 1.0,
214
+ "f1": 1.0
215
+ },
216
+ {
217
+ "acc": 1.0,
218
+ "precision": 1.0,
219
+ "recall": 1.0,
220
+ "f1": 1.0
221
+ },
222
+ {
223
+ "acc": 1.0,
224
+ "precision": 1.0,
225
+ "recall": 1.0,
226
+ "f1": 1.0
227
+ },
228
+ {
229
+ "acc": 1.0,
230
+ "precision": 1.0,
231
+ "recall": 1.0,
232
+ "f1": 1.0
233
+ },
234
+ {
235
+ "acc": 1.0,
236
+ "precision": 1.0,
237
+ "recall": 1.0,
238
+ "f1": 1.0
239
+ },
240
+ {
241
+ "acc": 1.0,
242
+ "precision": 1.0,
243
+ "recall": 1.0,
244
+ "f1": 1.0
245
+ },
246
+ {
247
+ "acc": 1.0,
248
+ "precision": 1.0,
249
+ "recall": 1.0,
250
+ "f1": 1.0
251
+ },
252
+ {
253
+ "acc": 1.0,
254
+ "precision": 1.0,
255
+ "recall": 1.0,
256
+ "f1": 1.0
257
+ },
258
+ {
259
+ "acc": 1.0,
260
+ "precision": 1.0,
261
+ "recall": 1.0,
262
+ "f1": 1.0
263
+ },
264
+ {
265
+ "acc": 1.0,
266
+ "precision": 1.0,
267
+ "recall": 1.0,
268
+ "f1": 1.0
269
+ },
270
+ {
271
+ "acc": 1.0,
272
+ "precision": 1.0,
273
+ "recall": 1.0,
274
+ "f1": 1.0
275
+ },
276
+ {
277
+ "acc": 1.0,
278
+ "precision": 1.0,
279
+ "recall": 1.0,
280
+ "f1": 1.0
281
+ },
282
+ {
283
+ "acc": 1.0,
284
+ "precision": 1.0,
285
+ "recall": 1.0,
286
+ "f1": 1.0
287
+ },
288
+ {
289
+ "acc": 1.0,
290
+ "precision": 1.0,
291
+ "recall": 1.0,
292
+ "f1": 1.0
293
+ },
294
+ {
295
+ "acc": 1.0,
296
+ "precision": 1.0,
297
+ "recall": 1.0,
298
+ "f1": 1.0
299
+ },
300
+ {
301
+ "acc": 1.0,
302
+ "precision": 1.0,
303
+ "recall": 1.0,
304
+ "f1": 1.0
305
+ },
306
+ {
307
+ "acc": 1.0,
308
+ "precision": 1.0,
309
+ "recall": 1.0,
310
+ "f1": 1.0
311
+ },
312
+ {
313
+ "acc": 1.0,
314
+ "precision": 1.0,
315
+ "recall": 1.0,
316
+ "f1": 1.0
317
+ },
318
+ {
319
+ "acc": 1.0,
320
+ "precision": 1.0,
321
+ "recall": 1.0,
322
+ "f1": 1.0
323
+ },
324
+ {
325
+ "acc": 1.0,
326
+ "precision": 1.0,
327
+ "recall": 1.0,
328
+ "f1": 1.0
329
+ },
330
+ {
331
+ "acc": 1.0,
332
+ "precision": 1.0,
333
+ "recall": 1.0,
334
+ "f1": 1.0
335
+ },
336
+ {
337
+ "acc": 1.0,
338
+ "precision": 1.0,
339
+ "recall": 1.0,
340
+ "f1": 1.0
341
+ },
342
+ {
343
+ "acc": 1.0,
344
+ "precision": 1.0,
345
+ "recall": 1.0,
346
+ "f1": 1.0
347
+ },
348
+ {
349
+ "acc": 1.0,
350
+ "precision": 1.0,
351
+ "recall": 1.0,
352
+ "f1": 1.0
353
+ },
354
+ {
355
+ "acc": 1.0,
356
+ "precision": 1.0,
357
+ "recall": 1.0,
358
+ "f1": 1.0
359
+ },
360
+ {
361
+ "acc": 1.0,
362
+ "precision": 1.0,
363
+ "recall": 1.0,
364
+ "f1": 1.0
365
+ },
366
+ {
367
+ "acc": 1.0,
368
+ "precision": 1.0,
369
+ "recall": 1.0,
370
+ "f1": 1.0
371
+ },
372
+ {
373
+ "acc": 1.0,
374
+ "precision": 1.0,
375
+ "recall": 1.0,
376
+ "f1": 1.0
377
+ },
378
+ {
379
+ "acc": 1.0,
380
+ "precision": 1.0,
381
+ "recall": 1.0,
382
+ "f1": 1.0
383
+ },
384
+ {
385
+ "acc": 1.0,
386
+ "precision": 1.0,
387
+ "recall": 1.0,
388
+ "f1": 1.0
389
+ },
390
+ {
391
+ "acc": 1.0,
392
+ "precision": 1.0,
393
+ "recall": 1.0,
394
+ "f1": 1.0
395
+ },
396
+ {
397
+ "acc": 1.0,
398
+ "precision": 1.0,
399
+ "recall": 1.0,
400
+ "f1": 1.0
401
+ },
402
+ {
403
+ "acc": 1.0,
404
+ "precision": 1.0,
405
+ "recall": 1.0,
406
+ "f1": 1.0
407
+ },
408
+ {
409
+ "acc": 1.0,
410
+ "precision": 1.0,
411
+ "recall": 1.0,
412
+ "f1": 1.0
413
+ },
414
+ {
415
+ "acc": 1.0,
416
+ "precision": 1.0,
417
+ "recall": 1.0,
418
+ "f1": 1.0
419
+ },
420
+ {
421
+ "acc": 1.0,
422
+ "precision": 1.0,
423
+ "recall": 1.0,
424
+ "f1": 1.0
425
+ },
426
+ {
427
+ "acc": 1.0,
428
+ "precision": 1.0,
429
+ "recall": 1.0,
430
+ "f1": 1.0
431
+ },
432
+ {
433
+ "acc": 1.0,
434
+ "precision": 1.0,
435
+ "recall": 1.0,
436
+ "f1": 1.0
437
+ },
438
+ {
439
+ "acc": 1.0,
440
+ "precision": 1.0,
441
+ "recall": 1.0,
442
+ "f1": 1.0
443
+ },
444
+ {
445
+ "acc": 1.0,
446
+ "precision": 1.0,
447
+ "recall": 1.0,
448
+ "f1": 1.0
449
+ },
450
+ {
451
+ "acc": 1.0,
452
+ "precision": 1.0,
453
+ "recall": 1.0,
454
+ "f1": 1.0
455
+ },
456
+ {
457
+ "acc": 1.0,
458
+ "precision": 1.0,
459
+ "recall": 1.0,
460
+ "f1": 1.0
461
+ },
462
+ {
463
+ "acc": 1.0,
464
+ "precision": 1.0,
465
+ "recall": 1.0,
466
+ "f1": 1.0
467
+ },
468
+ {
469
+ "acc": 1.0,
470
+ "precision": 1.0,
471
+ "recall": 1.0,
472
+ "f1": 1.0
473
+ },
474
+ {
475
+ "acc": 1.0,
476
+ "precision": 1.0,
477
+ "recall": 1.0,
478
+ "f1": 1.0
479
+ },
480
+ {
481
+ "acc": 1.0,
482
+ "precision": 1.0,
483
+ "recall": 1.0,
484
+ "f1": 1.0
485
+ },
486
+ {
487
+ "acc": 1.0,
488
+ "precision": 1.0,
489
+ "recall": 1.0,
490
+ "f1": 1.0
491
+ },
492
+ {
493
+ "acc": 1.0,
494
+ "precision": 1.0,
495
+ "recall": 1.0,
496
+ "f1": 1.0
497
+ },
498
+ {
499
+ "acc": 1.0,
500
+ "precision": 1.0,
501
+ "recall": 1.0,
502
+ "f1": 1.0
503
+ },
504
+ {
505
+ "acc": 1.0,
506
+ "precision": 1.0,
507
+ "recall": 1.0,
508
+ "f1": 1.0
509
+ },
510
+ {
511
+ "acc": 1.0,
512
+ "precision": 1.0,
513
+ "recall": 1.0,
514
+ "f1": 1.0
515
+ },
516
+ {
517
+ "acc": 1.0,
518
+ "precision": 1.0,
519
+ "recall": 1.0,
520
+ "f1": 1.0
521
+ },
522
+ {
523
+ "acc": 1.0,
524
+ "precision": 1.0,
525
+ "recall": 1.0,
526
+ "f1": 1.0
527
+ },
528
+ {
529
+ "acc": 1.0,
530
+ "precision": 1.0,
531
+ "recall": 1.0,
532
+ "f1": 1.0
533
+ },
534
+ {
535
+ "acc": 1.0,
536
+ "precision": 1.0,
537
+ "recall": 1.0,
538
+ "f1": 1.0
539
+ },
540
+ {
541
+ "acc": 1.0,
542
+ "precision": 1.0,
543
+ "recall": 1.0,
544
+ "f1": 1.0
545
+ },
546
+ {
547
+ "acc": 1.0,
548
+ "precision": 1.0,
549
+ "recall": 1.0,
550
+ "f1": 1.0
551
+ },
552
+ {
553
+ "acc": 1.0,
554
+ "precision": 1.0,
555
+ "recall": 1.0,
556
+ "f1": 1.0
557
+ },
558
+ {
559
+ "acc": 1.0,
560
+ "precision": 1.0,
561
+ "recall": 1.0,
562
+ "f1": 1.0
563
+ },
564
+ {
565
+ "acc": 1.0,
566
+ "precision": 1.0,
567
+ "recall": 1.0,
568
+ "f1": 1.0
569
+ },
570
+ {
571
+ "acc": 1.0,
572
+ "precision": 1.0,
573
+ "recall": 1.0,
574
+ "f1": 1.0
575
+ },
576
+ {
577
+ "acc": 1.0,
578
+ "precision": 1.0,
579
+ "recall": 1.0,
580
+ "f1": 1.0
581
+ }
582
+ ]
583
+ },
584
+ "medium": {
585
+ "n_nodes": 25,
586
+ "n_edges": 27,
587
+ "gnn_final": {
588
+ "acc": 0.9914,
589
+ "precision": 0.982778750729714,
590
+ "recall": 0.9920447849145551,
591
+ "f1": 0.9873900293255131
592
+ },
593
+ "baseline_direct_neighbors": {
594
+ "acc": 0.8301,
595
+ "precision": 1.0,
596
+ "recall": 0.4994107248084856,
597
+ "f1": 0.6661426606405974
598
+ },
599
+ "improvement_f1_pp": 32.124736868491574,
600
+ "train_loss_curve": [
601
+ 0.18512494587464606,
602
+ 0.05774239192842651,
603
+ 0.04035148839658183,
604
+ 0.03685507851154424,
605
+ 0.034016887983169666,
606
+ 0.03193854558186021,
607
+ 0.030314448321928544,
608
+ 0.028890588828011224,
609
+ 0.02627120438580584,
610
+ 0.02676936000857496,
611
+ 0.02735587336003725,
612
+ 0.024704556535801756,
613
+ 0.023389738032454397,
614
+ 0.02484239745095036,
615
+ 0.022598365899086623,
616
+ 0.022097759216314333,
617
+ 0.021880711925624425,
618
+ 0.023672257099118552,
619
+ 0.021815840122002862,
620
+ 0.021538631150760885,
621
+ 0.021590486920307173,
622
+ 0.020993219244996,
623
+ 0.021660113581202914,
624
+ 0.02028199757042485,
625
+ 0.021449406110984975,
626
+ 0.02049649202735325,
627
+ 0.02005596899437715,
628
+ 0.02060316097080978,
629
+ 0.02082035162168178,
630
+ 0.020935066080168856,
631
+ 0.0209964800781561,
632
+ 0.019652295691733542,
633
+ 0.020470858438760543,
634
+ 0.020456047435481396,
635
+ 0.020529603496513553,
636
+ 0.019996260003822708,
637
+ 0.021328506347361064,
638
+ 0.019778630244522907,
639
+ 0.01971426555108731,
640
+ 0.019847191254493045,
641
+ 0.01984119418810368,
642
+ 0.02021396374486143,
643
+ 0.01946370021810413,
644
+ 0.019111871498224214,
645
+ 0.019667785586758944,
646
+ 0.021675049597691873,
647
+ 0.01897557202284267,
648
+ 0.01971483370839516,
649
+ 0.01965866965101487,
650
+ 0.01936112277971507,
651
+ 0.01895255452432814,
652
+ 0.02035098125927439,
653
+ 0.01909720691408324,
654
+ 0.019500281907226687,
655
+ 0.019117790717674256,
656
+ 0.018927754213147425,
657
+ 0.020313845976115717,
658
+ 0.019341792678655486,
659
+ 0.01890229735773205,
660
+ 0.019833170414518056,
661
+ 0.01948640772390163,
662
+ 0.019305320678627013,
663
+ 0.019213381035159603,
664
+ 0.020478221997059808,
665
+ 0.01936127331570382,
666
+ 0.019158014420631225,
667
+ 0.019090143173694583,
668
+ 0.020291763241906225,
669
+ 0.01900654871721499,
670
+ 0.019815083033949698,
671
+ 0.019103285589502736,
672
+ 0.018360809753397392,
673
+ 0.019985065603578676,
674
+ 0.01858524212906661,
675
+ 0.02056734084818314,
676
+ 0.01856864124721938,
677
+ 0.01852369899036554,
678
+ 0.018906581267301003,
679
+ 0.01927234342475787,
680
+ 0.018721831301170885
681
+ ],
682
+ "test_metric_curve": [
683
+ {
684
+ "acc": 0.9816,
685
+ "precision": 0.9819819819819819,
686
+ "recall": 0.9634649381261049,
687
+ "f1": 0.9726353361094586
688
+ },
689
+ {
690
+ "acc": 0.9885,
691
+ "precision": 0.9742551345096905,
692
+ "recall": 0.9923394225103123,
693
+ "f1": 0.9832141293241862
694
+ },
695
+ {
696
+ "acc": 0.988,
697
+ "precision": 0.9720299884659747,
698
+ "recall": 0.993223335297584,
699
+ "f1": 0.9825123870591663
700
+ },
701
+ {
702
+ "acc": 0.9892,
703
+ "precision": 0.986094674556213,
704
+ "recall": 0.9820271066588097,
705
+ "f1": 0.9840566873339238
706
+ },
707
+ {
708
+ "acc": 0.9916,
709
+ "precision": 0.9825072886297376,
710
+ "recall": 0.9929286977018268,
711
+ "f1": 0.9876905041031652
712
+ },
713
+ {
714
+ "acc": 0.9913,
715
+ "precision": 0.9824919754887657,
716
+ "recall": 0.9920447849145551,
717
+ "f1": 0.9872452719542588
718
+ },
719
+ {
720
+ "acc": 0.9909,
721
+ "precision": 0.9847373055474024,
722
+ "recall": 0.9885091337654685,
723
+ "f1": 0.9866196147625349
724
+ },
725
+ {
726
+ "acc": 0.9857,
727
+ "precision": 0.9954282231027126,
728
+ "recall": 0.9622863877430761,
729
+ "f1": 0.9785767790262172
730
+ },
731
+ {
732
+ "acc": 0.9882,
733
+ "precision": 0.9761627906976744,
734
+ "recall": 0.9893930465527401,
735
+ "f1": 0.9827333918642083
736
+ },
737
+ {
738
+ "acc": 0.9912,
739
+ "precision": 0.9833333333333333,
740
+ "recall": 0.9908662345315262,
741
+ "f1": 0.9870854123862635
742
+ },
743
+ {
744
+ "acc": 0.9911,
745
+ "precision": 0.9864586399764498,
746
+ "recall": 0.9873305833824396,
747
+ "f1": 0.9868944190840818
748
+ },
749
+ {
750
+ "acc": 0.9842,
751
+ "precision": 0.997539975399754,
752
+ "recall": 0.9558043606364172,
753
+ "f1": 0.9762263015347576
754
+ },
755
+ {
756
+ "acc": 0.9872,
757
+ "precision": 0.9936517533252721,
758
+ "recall": 0.9684737772539777,
759
+ "f1": 0.9809012235153686
760
+ },
761
+ {
762
+ "acc": 0.9919,
763
+ "precision": 0.9825225750072822,
764
+ "recall": 0.9938126104890984,
765
+ "f1": 0.9881353449538597
766
+ },
767
+ {
768
+ "acc": 0.9905,
769
+ "precision": 0.9864346800353878,
770
+ "recall": 0.9855627578078963,
771
+ "f1": 0.9859985261606485
772
+ },
773
+ {
774
+ "acc": 0.9903,
775
+ "precision": 0.9867139061116031,
776
+ "recall": 0.9846788450206246,
777
+ "f1": 0.9856953251732783
778
+ },
779
+ {
780
+ "acc": 0.9912,
781
+ "precision": 0.9833333333333333,
782
+ "recall": 0.9908662345315262,
783
+ "f1": 0.9870854123862635
784
+ },
785
+ {
786
+ "acc": 0.9917,
787
+ "precision": 0.9827938174394867,
788
+ "recall": 0.9929286977018268,
789
+ "f1": 0.9878352630807563
790
+ },
791
+ {
792
+ "acc": 0.9914,
793
+ "precision": 0.9822157434402332,
794
+ "recall": 0.9926340601060696,
795
+ "f1": 0.9873974208675265
796
+ },
797
+ {
798
+ "acc": 0.9914,
799
+ "precision": 0.9833430742255991,
800
+ "recall": 0.9914555097230406,
801
+ "f1": 0.9873826291079812
802
+ },
803
+ {
804
+ "acc": 0.9908,
805
+ "precision": 0.986446670595168,
806
+ "recall": 0.986446670595168,
807
+ "f1": 0.986446670595168
808
+ },
809
+ {
810
+ "acc": 0.9908,
811
+ "precision": 0.986446670595168,
812
+ "recall": 0.986446670595168,
813
+ "f1": 0.986446670595168
814
+ },
815
+ {
816
+ "acc": 0.9909,
817
+ "precision": 0.9858781994704324,
818
+ "recall": 0.9873305833824396,
819
+ "f1": 0.9866038569115266
820
+ },
821
+ {
822
+ "acc": 0.9912,
823
+ "precision": 0.9833333333333333,
824
+ "recall": 0.9908662345315262,
825
+ "f1": 0.9870854123862635
826
+ },
827
+ {
828
+ "acc": 0.9915,
829
+ "precision": 0.9827837758972863,
830
+ "recall": 0.9923394225103123,
831
+ "f1": 0.9875384840932414
832
+ },
833
+ {
834
+ "acc": 0.9907,
835
+ "precision": 0.9873043991733097,
836
+ "recall": 0.985268120212139,
837
+ "f1": 0.9862852086712873
838
+ },
839
+ {
840
+ "acc": 0.9919,
841
+ "precision": 0.9825225750072822,
842
+ "recall": 0.9938126104890984,
843
+ "f1": 0.9881353449538597
844
+ },
845
+ {
846
+ "acc": 0.9914,
847
+ "precision": 0.982778750729714,
848
+ "recall": 0.9920447849145551,
849
+ "f1": 0.9873900293255131
850
+ },
851
+ {
852
+ "acc": 0.9916,
853
+ "precision": 0.9777713625866051,
854
+ "recall": 0.9979375368296994,
855
+ "f1": 0.9877515310586177
856
+ },
857
+ {
858
+ "acc": 0.9901,
859
+ "precision": 0.9869937924918711,
860
+ "recall": 0.983794932233353,
861
+ "f1": 0.9853917662682603
862
+ },
863
+ {
864
+ "acc": 0.9914,
865
+ "precision": 0.982778750729714,
866
+ "recall": 0.9920447849145551,
867
+ "f1": 0.9873900293255131
868
+ },
869
+ {
870
+ "acc": 0.9904,
871
+ "precision": 0.9872931442080378,
872
+ "recall": 0.9843842074248674,
873
+ "f1": 0.9858365299498378
874
+ },
875
+ {
876
+ "acc": 0.9914,
877
+ "precision": 0.982778750729714,
878
+ "recall": 0.9920447849145551,
879
+ "f1": 0.9873900293255131
880
+ },
881
+ {
882
+ "acc": 0.9887,
883
+ "precision": 0.993680409268733,
884
+ "recall": 0.9728933411903359,
885
+ "f1": 0.9831770135477147
886
+ },
887
+ {
888
+ "acc": 0.9912,
889
+ "precision": 0.9833333333333333,
890
+ "recall": 0.9908662345315262,
891
+ "f1": 0.9870854123862635
892
+ },
893
+ {
894
+ "acc": 0.9913,
895
+ "precision": 0.983338205203157,
896
+ "recall": 0.9911608721272834,
897
+ "f1": 0.9872340425531914
898
+ },
899
+ {
900
+ "acc": 0.9915,
901
+ "precision": 0.9827837758972863,
902
+ "recall": 0.9923394225103123,
903
+ "f1": 0.9875384840932414
904
+ },
905
+ {
906
+ "acc": 0.991,
907
+ "precision": 0.9858823529411764,
908
+ "recall": 0.9876252209781968,
909
+ "f1": 0.986753017368266
910
+ },
911
+ {
912
+ "acc": 0.9905,
913
+ "precision": 0.9870091526424565,
914
+ "recall": 0.9849734826163818,
915
+ "f1": 0.9859902669222829
916
+ },
917
+ {
918
+ "acc": 0.9912,
919
+ "precision": 0.9830508474576272,
920
+ "recall": 0.9911608721272834,
921
+ "f1": 0.9870892018779343
922
+ },
923
+ {
924
+ "acc": 0.9911,
925
+ "precision": 0.9822001750802452,
926
+ "recall": 0.9917501473187978,
927
+ "f1": 0.9869520598152763
928
+ },
929
+ {
930
+ "acc": 0.9901,
931
+ "precision": 0.9887273805992287,
932
+ "recall": 0.9820271066588097,
933
+ "f1": 0.9853658536585367
934
+ },
935
+ {
936
+ "acc": 0.9914,
937
+ "precision": 0.982778750729714,
938
+ "recall": 0.9920447849145551,
939
+ "f1": 0.9873900293255131
940
+ },
941
+ {
942
+ "acc": 0.9907,
943
+ "precision": 0.9833089311859443,
944
+ "recall": 0.9893930465527401,
945
+ "f1": 0.9863416066970185
946
+ },
947
+ {
948
+ "acc": 0.9914,
949
+ "precision": 0.982778750729714,
950
+ "recall": 0.9920447849145551,
951
+ "f1": 0.9873900293255131
952
+ },
953
+ {
954
+ "acc": 0.9908,
955
+ "precision": 0.986446670595168,
956
+ "recall": 0.986446670595168,
957
+ "f1": 0.986446670595168
958
+ },
959
+ {
960
+ "acc": 0.991,
961
+ "precision": 0.9833235810415447,
962
+ "recall": 0.9902769593400118,
963
+ "f1": 0.9867880211391661
964
+ },
965
+ {
966
+ "acc": 0.9912,
967
+ "precision": 0.9833333333333333,
968
+ "recall": 0.9908662345315262,
969
+ "f1": 0.9870854123862635
970
+ },
971
+ {
972
+ "acc": 0.9912,
973
+ "precision": 0.9824868651488616,
974
+ "recall": 0.9917501473187978,
975
+ "f1": 0.9870967741935485
976
+ },
977
+ {
978
+ "acc": 0.9909,
979
+ "precision": 0.9838851450336947,
980
+ "recall": 0.9893930465527401,
981
+ "f1": 0.9866314088438372
982
+ },
983
+ {
984
+ "acc": 0.9911,
985
+ "precision": 0.9833284586136297,
986
+ "recall": 0.990571596935769,
987
+ "f1": 0.9869367385879936
988
+ },
989
+ {
990
+ "acc": 0.9913,
991
+ "precision": 0.9836209417958467,
992
+ "recall": 0.9908662345315262,
993
+ "f1": 0.9872302950242183
994
+ },
995
+ {
996
+ "acc": 0.9914,
997
+ "precision": 0.982778750729714,
998
+ "recall": 0.9920447849145551,
999
+ "f1": 0.9873900293255131
1000
+ },
1001
+ {
1002
+ "acc": 0.991,
1003
+ "precision": 0.9858823529411764,
1004
+ "recall": 0.9876252209781968,
1005
+ "f1": 0.986753017368266
1006
+ },
1007
+ {
1008
+ "acc": 0.9912,
1009
+ "precision": 0.9830508474576272,
1010
+ "recall": 0.9911608721272834,
1011
+ "f1": 0.9870892018779343
1012
+ },
1013
+ {
1014
+ "acc": 0.9914,
1015
+ "precision": 0.982778750729714,
1016
+ "recall": 0.9920447849145551,
1017
+ "f1": 0.9873900293255131
1018
+ },
1019
+ {
1020
+ "acc": 0.9899,
1021
+ "precision": 0.9875629256736749,
1022
+ "recall": 0.9826163818503241,
1023
+ "f1": 0.9850834440998375
1024
+ },
1025
+ {
1026
+ "acc": 0.9908,
1027
+ "precision": 0.986446670595168,
1028
+ "recall": 0.986446670595168,
1029
+ "f1": 0.986446670595168
1030
+ },
1031
+ {
1032
+ "acc": 0.9915,
1033
+ "precision": 0.9819399941741916,
1034
+ "recall": 0.993223335297584,
1035
+ "f1": 0.9875494360626923
1036
+ },
1037
+ {
1038
+ "acc": 0.9914,
1039
+ "precision": 0.982778750729714,
1040
+ "recall": 0.9920447849145551,
1041
+ "f1": 0.9873900293255131
1042
+ },
1043
+ {
1044
+ "acc": 0.9906,
1045
+ "precision": 0.987012987012987,
1046
+ "recall": 0.985268120212139,
1047
+ "f1": 0.9861397817752875
1048
+ },
1049
+ {
1050
+ "acc": 0.9908,
1051
+ "precision": 0.986446670595168,
1052
+ "recall": 0.986446670595168,
1053
+ "f1": 0.986446670595168
1054
+ },
1055
+ {
1056
+ "acc": 0.991,
1057
+ "precision": 0.9833235810415447,
1058
+ "recall": 0.9902769593400118,
1059
+ "f1": 0.9867880211391661
1060
+ },
1061
+ {
1062
+ "acc": 0.9907,
1063
+ "precision": 0.9864426760978485,
1064
+ "recall": 0.9861520329994107,
1065
+ "f1": 0.9862973331368794
1066
+ },
1067
+ {
1068
+ "acc": 0.9912,
1069
+ "precision": 0.9824868651488616,
1070
+ "recall": 0.9917501473187978,
1071
+ "f1": 0.9870967741935485
1072
+ },
1073
+ {
1074
+ "acc": 0.9911,
1075
+ "precision": 0.9833284586136297,
1076
+ "recall": 0.990571596935769,
1077
+ "f1": 0.9869367385879936
1078
+ },
1079
+ {
1080
+ "acc": 0.9908,
1081
+ "precision": 0.986446670595168,
1082
+ "recall": 0.986446670595168,
1083
+ "f1": 0.986446670595168
1084
+ },
1085
+ {
1086
+ "acc": 0.9914,
1087
+ "precision": 0.982778750729714,
1088
+ "recall": 0.9920447849145551,
1089
+ "f1": 0.9873900293255131
1090
+ },
1091
+ {
1092
+ "acc": 0.9914,
1093
+ "precision": 0.982778750729714,
1094
+ "recall": 0.9920447849145551,
1095
+ "f1": 0.9873900293255131
1096
+ },
1097
+ {
1098
+ "acc": 0.9916,
1099
+ "precision": 0.9825072886297376,
1100
+ "recall": 0.9929286977018268,
1101
+ "f1": 0.9876905041031652
1102
+ },
1103
+ {
1104
+ "acc": 0.9914,
1105
+ "precision": 0.982778750729714,
1106
+ "recall": 0.9920447849145551,
1107
+ "f1": 0.9873900293255131
1108
+ },
1109
+ {
1110
+ "acc": 0.9914,
1111
+ "precision": 0.982778750729714,
1112
+ "recall": 0.9920447849145551,
1113
+ "f1": 0.9873900293255131
1114
+ },
1115
+ {
1116
+ "acc": 0.9913,
1117
+ "precision": 0.9824919754887657,
1118
+ "recall": 0.9920447849145551,
1119
+ "f1": 0.9872452719542588
1120
+ },
1121
+ {
1122
+ "acc": 0.9915,
1123
+ "precision": 0.9827837758972863,
1124
+ "recall": 0.9923394225103123,
1125
+ "f1": 0.9875384840932414
1126
+ },
1127
+ {
1128
+ "acc": 0.9916,
1129
+ "precision": 0.9827887981330222,
1130
+ "recall": 0.9926340601060696,
1131
+ "f1": 0.9876868953386104
1132
+ },
1133
+ {
1134
+ "acc": 0.9912,
1135
+ "precision": 0.982768691588785,
1136
+ "recall": 0.9914555097230406,
1137
+ "f1": 0.9870929891463771
1138
+ },
1139
+ {
1140
+ "acc": 0.9909,
1141
+ "precision": 0.9833187006145742,
1142
+ "recall": 0.9899823217442546,
1143
+ "f1": 0.986639260020555
1144
+ },
1145
+ {
1146
+ "acc": 0.9904,
1147
+ "precision": 0.987005316007088,
1148
+ "recall": 0.9846788450206246,
1149
+ "f1": 0.9858407079646017
1150
+ },
1151
+ {
1152
+ "acc": 0.9912,
1153
+ "precision": 0.982768691588785,
1154
+ "recall": 0.9914555097230406,
1155
+ "f1": 0.9870929891463771
1156
+ },
1157
+ {
1158
+ "acc": 0.9914,
1159
+ "precision": 0.982778750729714,
1160
+ "recall": 0.9920447849145551,
1161
+ "f1": 0.9873900293255131
1162
+ }
1163
+ ]
1164
+ },
1165
+ "hard": {
1166
+ "n_nodes": 40,
1167
+ "n_edges": 44,
1168
+ "gnn_final": {
1169
+ "acc": 0.984,
1170
+ "precision": 0.9533980582524272,
1171
+ "recall": 0.9750354609929078,
1172
+ "f1": 0.9640953716690043
1173
+ },
1174
+ "baseline_direct_neighbors": {
1175
+ "acc": 0.88875,
1176
+ "precision": 1.0,
1177
+ "recall": 0.4950354609929078,
1178
+ "f1": 0.6622390891840607
1179
+ },
1180
+ "improvement_f1_pp": 30.185628248494357,
1181
+ "train_loss_curve": [
1182
+ 0.15102637716173195,
1183
+ 0.052633647776499856,
1184
+ 0.04379157433440559,
1185
+ 0.04003102573152864,
1186
+ 0.03876525610721728,
1187
+ 0.0369047760956164,
1188
+ 0.036530632421345216,
1189
+ 0.035830124779022296,
1190
+ 0.0349417570647056,
1191
+ 0.035263367522318734,
1192
+ 0.03485661885762238,
1193
+ 0.03493121563128079,
1194
+ 0.032977926293009656,
1195
+ 0.03394761107103841,
1196
+ 0.033683306101149356,
1197
+ 0.033089775294763965,
1198
+ 0.0335856751325955,
1199
+ 0.03272933466515315,
1200
+ 0.032765767610715556,
1201
+ 0.032717534617419004,
1202
+ 0.03298612758413583,
1203
+ 0.03169301031356008,
1204
+ 0.0323142114428847,
1205
+ 0.03186470089994691,
1206
+ 0.032041587697027356,
1207
+ 0.03211515340814367,
1208
+ 0.032251973500227904,
1209
+ 0.031999882343730864,
1210
+ 0.03164813786187369,
1211
+ 0.03160676156320551,
1212
+ 0.031426732700598224,
1213
+ 0.031241096474510413,
1214
+ 0.03162557367896079,
1215
+ 0.03154335625256863,
1216
+ 0.03165931336190261,
1217
+ 0.03097459732750576,
1218
+ 0.03131493923773814,
1219
+ 0.0311658642354123,
1220
+ 0.030633534374135706,
1221
+ 0.031252258909702506,
1222
+ 0.030825211223787848,
1223
+ 0.03053342323340803,
1224
+ 0.030733022628217442,
1225
+ 0.030747544990059397,
1226
+ 0.030629911747484584,
1227
+ 0.030457735169680745,
1228
+ 0.03058615475141687,
1229
+ 0.030597560634826552,
1230
+ 0.030619746312839653,
1231
+ 0.03066707000986935,
1232
+ 0.03048766604950197,
1233
+ 0.030287153372872126,
1234
+ 0.0303783905812179,
1235
+ 0.030595246432494606,
1236
+ 0.03037994001944753,
1237
+ 0.030246819483697437,
1238
+ 0.03012882444020579,
1239
+ 0.03024448805347947,
1240
+ 0.030449683469725642,
1241
+ 0.03048290506813919,
1242
+ 0.030136575797458136,
1243
+ 0.02994714516170643,
1244
+ 0.030466000927322056,
1245
+ 0.03019473605195526,
1246
+ 0.02987939404982535,
1247
+ 0.030137449657182513,
1248
+ 0.030104370625325828,
1249
+ 0.030588962311178875,
1250
+ 0.029767145353838714,
1251
+ 0.030284092916966984,
1252
+ 0.03002391016312413,
1253
+ 0.02992785992539757,
1254
+ 0.030997538813613574,
1255
+ 0.029848512160238896,
1256
+ 0.030022954882957493,
1257
+ 0.030052907403214705,
1258
+ 0.02975074222330568,
1259
+ 0.029870129619877842,
1260
+ 0.02968558935528563,
1261
+ 0.029977637300933564
1262
+ ],
1263
+ "test_metric_curve": [
1264
+ {
1265
+ "acc": 0.978625,
1266
+ "precision": 0.9395194697597349,
1267
+ "recall": 0.9651063829787234,
1268
+ "f1": 0.9521410579345089
1269
+ },
1270
+ {
1271
+ "acc": 0.9813125,
1272
+ "precision": 0.9460730088495575,
1273
+ "recall": 0.9704964539007093,
1274
+ "f1": 0.9581291135695281
1275
+ },
1276
+ {
1277
+ "acc": 0.982,
1278
+ "precision": 0.9607173356105893,
1279
+ "recall": 0.9574468085106383,
1280
+ "f1": 0.959079283887468
1281
+ },
1282
+ {
1283
+ "acc": 0.9805625,
1284
+ "precision": 0.9649884259259259,
1285
+ "recall": 0.9460992907801419,
1286
+ "f1": 0.9554505085231342
1287
+ },
1288
+ {
1289
+ "acc": 0.98225,
1290
+ "precision": 0.952274630198158,
1291
+ "recall": 0.9679432624113475,
1292
+ "f1": 0.9600450196961171
1293
+ },
1294
+ {
1295
+ "acc": 0.98225,
1296
+ "precision": 0.9639278557114228,
1297
+ "recall": 0.955177304964539,
1298
+ "f1": 0.9595326303790253
1299
+ },
1300
+ {
1301
+ "acc": 0.982375,
1302
+ "precision": 0.9543289436817035,
1303
+ "recall": 0.9662411347517731,
1304
+ "f1": 0.9602480969833662
1305
+ },
1306
+ {
1307
+ "acc": 0.98375,
1308
+ "precision": 0.9543556916225995,
1309
+ "recall": 0.9727659574468085,
1310
+ "f1": 0.9634728856420341
1311
+ },
1312
+ {
1313
+ "acc": 0.98125,
1314
+ "precision": 0.9680696661828737,
1315
+ "recall": 0.9460992907801419,
1316
+ "f1": 0.9569583931133429
1317
+ },
1318
+ {
1319
+ "acc": 0.983,
1320
+ "precision": 0.965379113018598,
1321
+ "recall": 0.9571631205673758,
1322
+ "f1": 0.9612535612535612
1323
+ },
1324
+ {
1325
+ "acc": 0.984375,
1326
+ "precision": 0.9593267882187938,
1327
+ "recall": 0.9702127659574468,
1328
+ "f1": 0.9647390691114245
1329
+ },
1330
+ {
1331
+ "acc": 0.9836875,
1332
+ "precision": 0.9633730834752982,
1333
+ "recall": 0.9625531914893617,
1334
+ "f1": 0.9629629629629629
1335
+ },
1336
+ {
1337
+ "acc": 0.98425,
1338
+ "precision": 0.9507022858716607,
1339
+ "recall": 0.979290780141844,
1340
+ "f1": 0.9647847959754053
1341
+ },
1342
+ {
1343
+ "acc": 0.983,
1344
+ "precision": 0.9651129539605376,
1345
+ "recall": 0.9574468085106383,
1346
+ "f1": 0.9612645969809172
1347
+ },
1348
+ {
1349
+ "acc": 0.9840625,
1350
+ "precision": 0.9587542087542088,
1351
+ "recall": 0.9693617021276596,
1352
+ "f1": 0.9640287769784174
1353
+ },
1354
+ {
1355
+ "acc": 0.9835625,
1356
+ "precision": 0.966,
1357
+ "recall": 0.9591489361702128,
1358
+ "f1": 0.9625622775800712
1359
+ },
1360
+ {
1361
+ "acc": 0.9839375,
1362
+ "precision": 0.9600225225225225,
1363
+ "recall": 0.9673758865248226,
1364
+ "f1": 0.963685177335029
1365
+ },
1366
+ {
1367
+ "acc": 0.98425,
1368
+ "precision": 0.9405114401076716,
1369
+ "recall": 0.9912056737588653,
1370
+ "f1": 0.9651933701657459
1371
+ },
1372
+ {
1373
+ "acc": 0.9814375,
1374
+ "precision": 0.9686411149825784,
1375
+ "recall": 0.9463829787234043,
1376
+ "f1": 0.9573826947912182
1377
+ },
1378
+ {
1379
+ "acc": 0.9831875,
1380
+ "precision": 0.955512031337437,
1381
+ "recall": 0.9687943262411347,
1382
+ "f1": 0.9621073390618397
1383
+ },
1384
+ {
1385
+ "acc": 0.9836875,
1386
+ "precision": 0.9515771997786386,
1387
+ "recall": 0.9756028368794326,
1388
+ "f1": 0.9634402577391792
1389
+ },
1390
+ {
1391
+ "acc": 0.9860625,
1392
+ "precision": 0.9565818584070797,
1393
+ "recall": 0.9812765957446808,
1394
+ "f1": 0.9687718806889791
1395
+ },
1396
+ {
1397
+ "acc": 0.9835625,
1398
+ "precision": 0.9505524861878453,
1399
+ "recall": 0.9761702127659575,
1400
+ "f1": 0.9631910426871939
1401
+ },
1402
+ {
1403
+ "acc": 0.9853125,
1404
+ "precision": 0.9472539423599783,
1405
+ "recall": 0.9883687943262411,
1406
+ "f1": 0.9673747049840344
1407
+ },
1408
+ {
1409
+ "acc": 0.9860625,
1410
+ "precision": 0.9479110146500271,
1411
+ "recall": 0.9912056737588653,
1412
+ "f1": 0.9690750242684788
1413
+ },
1414
+ {
1415
+ "acc": 0.982875,
1416
+ "precision": 0.9645613032294942,
1417
+ "recall": 0.9574468085106383,
1418
+ "f1": 0.960990888382688
1419
+ },
1420
+ {
1421
+ "acc": 0.9843125,
1422
+ "precision": 0.9606077658975802,
1423
+ "recall": 0.9685106382978723,
1424
+ "f1": 0.9645430145500776
1425
+ },
1426
+ {
1427
+ "acc": 0.9840625,
1428
+ "precision": 0.9501651982378855,
1429
+ "recall": 0.9790070921985815,
1430
+ "f1": 0.9643705463182898
1431
+ },
1432
+ {
1433
+ "acc": 0.983375,
1434
+ "precision": 0.9568264648163723,
1435
+ "recall": 0.9682269503546099,
1436
+ "f1": 0.9624929498025946
1437
+ },
1438
+ {
1439
+ "acc": 0.98375,
1440
+ "precision": 0.9505934308584046,
1441
+ "recall": 0.9770212765957447,
1442
+ "f1": 0.9636261891438165
1443
+ },
1444
+ {
1445
+ "acc": 0.9845,
1446
+ "precision": 0.9555184876285794,
1447
+ "recall": 0.9750354609929078,
1448
+ "f1": 0.9651783206964335
1449
+ },
1450
+ {
1451
+ "acc": 0.9830625,
1452
+ "precision": 0.9557422969187676,
1453
+ "recall": 0.9679432624113475,
1454
+ "f1": 0.9618040873854828
1455
+ },
1456
+ {
1457
+ "acc": 0.983375,
1458
+ "precision": 0.9555493430248811,
1459
+ "recall": 0.969645390070922,
1460
+ "f1": 0.9625457617572516
1461
+ },
1462
+ {
1463
+ "acc": 0.984,
1464
+ "precision": 0.9511454595638973,
1465
+ "recall": 0.9775886524822694,
1466
+ "f1": 0.9641857862339116
1467
+ },
1468
+ {
1469
+ "acc": 0.9845625,
1470
+ "precision": 0.9611705120990434,
1471
+ "recall": 0.9690780141843972,
1472
+ "f1": 0.9651080661110327
1473
+ },
1474
+ {
1475
+ "acc": 0.984625,
1476
+ "precision": 0.9565580618212197,
1477
+ "recall": 0.9744680851063829,
1478
+ "f1": 0.9654300168634065
1479
+ },
1480
+ {
1481
+ "acc": 0.9846875,
1482
+ "precision": 0.9563160823594881,
1483
+ "recall": 0.9750354609929078,
1484
+ "f1": 0.9655850540806294
1485
+ },
1486
+ {
1487
+ "acc": 0.9856875,
1488
+ "precision": 0.9461288576069301,
1489
+ "recall": 0.9914893617021276,
1490
+ "f1": 0.9682781548690954
1491
+ },
1492
+ {
1493
+ "acc": 0.9841875,
1494
+ "precision": 0.9631936579841449,
1495
+ "recall": 0.9651063829787234,
1496
+ "f1": 0.9641490718435596
1497
+ },
1498
+ {
1499
+ "acc": 0.98475,
1500
+ "precision": 0.9560745065332221,
1501
+ "recall": 0.9756028368794326,
1502
+ "f1": 0.9657399606852007
1503
+ },
1504
+ {
1505
+ "acc": 0.9836875,
1506
+ "precision": 0.9558659217877095,
1507
+ "recall": 0.9707801418439717,
1508
+ "f1": 0.963265306122449
1509
+ },
1510
+ {
1511
+ "acc": 0.9854375,
1512
+ "precision": 0.9497267759562842,
1513
+ "recall": 0.9860992907801418,
1514
+ "f1": 0.967571329157968
1515
+ },
1516
+ {
1517
+ "acc": 0.9844375,
1518
+ "precision": 0.9502473886750962,
1519
+ "recall": 0.9807092198581561,
1520
+ "f1": 0.9652380287588997
1521
+ },
1522
+ {
1523
+ "acc": 0.9844375,
1524
+ "precision": 0.9601123595505618,
1525
+ "recall": 0.969645390070922,
1526
+ "f1": 0.9648553281580804
1527
+ },
1528
+ {
1529
+ "acc": 0.98475,
1530
+ "precision": 0.957345971563981,
1531
+ "recall": 0.9741843971631206,
1532
+ "f1": 0.9656917885264341
1533
+ },
1534
+ {
1535
+ "acc": 0.983625,
1536
+ "precision": 0.9543302701197438,
1537
+ "recall": 0.9721985815602837,
1538
+ "f1": 0.9631815626756605
1539
+ },
1540
+ {
1541
+ "acc": 0.9839375,
1542
+ "precision": 0.9526315789473684,
1543
+ "recall": 0.9756028368794326,
1544
+ "f1": 0.9639803784162578
1545
+ },
1546
+ {
1547
+ "acc": 0.9833125,
1548
+ "precision": 0.9509966777408638,
1549
+ "recall": 0.9744680851063829,
1550
+ "f1": 0.962589323245061
1551
+ },
1552
+ {
1553
+ "acc": 0.98425,
1554
+ "precision": 0.9499587572174869,
1555
+ "recall": 0.9801418439716312,
1556
+ "f1": 0.9648142976822116
1557
+ },
1558
+ {
1559
+ "acc": 0.984375,
1560
+ "precision": 0.9590692458648724,
1561
+ "recall": 0.9704964539007093,
1562
+ "f1": 0.9647490129723633
1563
+ },
1564
+ {
1565
+ "acc": 0.9838125,
1566
+ "precision": 0.9528563505268997,
1567
+ "recall": 0.9747517730496454,
1568
+ "f1": 0.9636797083158043
1569
+ },
1570
+ {
1571
+ "acc": 0.9848125,
1572
+ "precision": 0.9553274139844617,
1573
+ "recall": 0.9767375886524823,
1574
+ "f1": 0.965913872913452
1575
+ },
1576
+ {
1577
+ "acc": 0.9836875,
1578
+ "precision": 0.9551031790295594,
1579
+ "recall": 0.9716312056737588,
1580
+ "f1": 0.963296301504711
1581
+ },
1582
+ {
1583
+ "acc": 0.9845,
1584
+ "precision": 0.9429575560962422,
1585
+ "recall": 0.9895035460992908,
1586
+ "f1": 0.965669988925803
1587
+ },
1588
+ {
1589
+ "acc": 0.982375,
1590
+ "precision": 0.9589583923011605,
1591
+ "recall": 0.9611347517730496,
1592
+ "f1": 0.9600453386228394
1593
+ },
1594
+ {
1595
+ "acc": 0.984375,
1596
+ "precision": 0.962439988703756,
1597
+ "recall": 0.9668085106382979,
1598
+ "f1": 0.9646193037078971
1599
+ },
1600
+ {
1601
+ "acc": 0.985625,
1602
+ "precision": 0.9517411571154374,
1603
+ "recall": 0.9846808510638297,
1604
+ "f1": 0.967930842163971
1605
+ },
1606
+ {
1607
+ "acc": 0.98325,
1608
+ "precision": 0.9596387242449901,
1609
+ "recall": 0.9645390070921985,
1610
+ "f1": 0.9620826259196378
1611
+ },
1612
+ {
1613
+ "acc": 0.984,
1614
+ "precision": 0.9647426784191072,
1615
+ "recall": 0.9625531914893617,
1616
+ "f1": 0.9636466912808862
1617
+ },
1618
+ {
1619
+ "acc": 0.984875,
1620
+ "precision": 0.9586476669460743,
1621
+ "recall": 0.9733333333333334,
1622
+ "f1": 0.9659346846846848
1623
+ },
1624
+ {
1625
+ "acc": 0.9850625,
1626
+ "precision": 0.9581706636921361,
1627
+ "recall": 0.9747517730496454,
1628
+ "f1": 0.9663900998453102
1629
+ },
1630
+ {
1631
+ "acc": 0.9836875,
1632
+ "precision": 0.9493392070484582,
1633
+ "recall": 0.9781560283687943,
1634
+ "f1": 0.9635322062316614
1635
+ },
1636
+ {
1637
+ "acc": 0.983125,
1638
+ "precision": 0.9575484959235311,
1639
+ "recall": 0.9662411347517731,
1640
+ "f1": 0.9618751765038125
1641
+ },
1642
+ {
1643
+ "acc": 0.98425,
1644
+ "precision": 0.9492176777381279,
1645
+ "recall": 0.9809929078014185,
1646
+ "f1": 0.9648437500000001
1647
+ },
1648
+ {
1649
+ "acc": 0.9826875,
1650
+ "precision": 0.9672036823935558,
1651
+ "recall": 0.953758865248227,
1652
+ "f1": 0.960434223682331
1653
+ },
1654
+ {
1655
+ "acc": 0.9845,
1656
+ "precision": 0.961679346294731,
1657
+ "recall": 0.9682269503546099,
1658
+ "f1": 0.964942041277919
1659
+ },
1660
+ {
1661
+ "acc": 0.9845,
1662
+ "precision": 0.960900140646976,
1663
+ "recall": 0.9690780141843972,
1664
+ "f1": 0.9649717514124294
1665
+ },
1666
+ {
1667
+ "acc": 0.984125,
1668
+ "precision": 0.9623975120158327,
1669
+ "recall": 0.9656737588652482,
1670
+ "f1": 0.9640328518833192
1671
+ },
1672
+ {
1673
+ "acc": 0.984875,
1674
+ "precision": 0.9571150097465887,
1675
+ "recall": 0.9750354609929078,
1676
+ "f1": 0.9659921304103429
1677
+ },
1678
+ {
1679
+ "acc": 0.984625,
1680
+ "precision": 0.9598877980364656,
1681
+ "recall": 0.9707801418439717,
1682
+ "f1": 0.9653032440056418
1683
+ },
1684
+ {
1685
+ "acc": 0.98375,
1686
+ "precision": 0.9546087440824282,
1687
+ "recall": 0.9724822695035461,
1688
+ "f1": 0.9634626194491286
1689
+ },
1690
+ {
1691
+ "acc": 0.984125,
1692
+ "precision": 0.9501789154968345,
1693
+ "recall": 0.979290780141844,
1694
+ "f1": 0.9645152277172394
1695
+ },
1696
+ {
1697
+ "acc": 0.9849375,
1698
+ "precision": 0.9607182940516273,
1699
+ "recall": 0.9713475177304964,
1700
+ "f1": 0.9660036676541119
1701
+ },
1702
+ {
1703
+ "acc": 0.984875,
1704
+ "precision": 0.956606397774687,
1705
+ "recall": 0.9756028368794326,
1706
+ "f1": 0.9660112359550562
1707
+ },
1708
+ {
1709
+ "acc": 0.984625,
1710
+ "precision": 0.9570671870643992,
1711
+ "recall": 0.9739007092198582,
1712
+ "f1": 0.9654105736782902
1713
+ },
1714
+ {
1715
+ "acc": 0.9849375,
1716
+ "precision": 0.9584031267448353,
1717
+ "recall": 0.9739007092198582,
1718
+ "f1": 0.9660897706486562
1719
+ },
1720
+ {
1721
+ "acc": 0.98375,
1722
+ "precision": 0.9523413688002217,
1723
+ "recall": 0.9750354609929078,
1724
+ "f1": 0.9635548079618728
1725
+ },
1726
+ {
1727
+ "acc": 0.984,
1728
+ "precision": 0.9536497363308354,
1729
+ "recall": 0.9747517730496454,
1730
+ "f1": 0.9640852974186307
1731
+ },
1732
+ {
1733
+ "acc": 0.98375,
1734
+ "precision": 0.9505934308584046,
1735
+ "recall": 0.9770212765957447,
1736
+ "f1": 0.9636261891438165
1737
+ },
1738
+ {
1739
+ "acc": 0.984,
1740
+ "precision": 0.9533980582524272,
1741
+ "recall": 0.9750354609929078,
1742
+ "f1": 0.9640953716690043
1743
+ }
1744
+ ]
1745
+ }
1746
+ },
1747
+ "config": {
1748
+ "n_train": 2000,
1749
+ "n_test": 400,
1750
+ "hidden_dim": 64,
1751
+ "epochs": 80,
1752
+ "lr": 0.002,
1753
+ "max_hops": 3
1754
+ },
1755
+ "elapsed_min": 21.402417866388955
1756
  }
FINAL_SUBMIT/receipts/ablation_matrix.json CHANGED
@@ -1,95 +1,95 @@
1
- {
2
- "framework": "leave-one-out reward ablation per RL guide \u00a77-8",
3
- "n_episodes_per_trial": 100,
4
- "baseline": {
5
- "disabled": "none",
6
- "mean_return": 0.6742,
7
- "solve_rate": 0.27,
8
- "n_episodes": 100
9
- },
10
- "ablations": [
11
- {
12
- "disabled": "green_credit",
13
- "mean_return": 0.2152,
14
- "solve_rate": 0.27,
15
- "n_episodes": 100,
16
- "delta_mean_return": -0.459,
17
- "pct_change": -68.08
18
- },
19
- {
20
- "disabled": "yellow_credit",
21
- "mean_return": 0.613,
22
- "solve_rate": 0.27,
23
- "n_episodes": 100,
24
- "delta_mean_return": -0.0612,
25
- "pct_change": -9.08
26
- },
27
- {
28
- "disabled": "solve_bonus",
29
- "mean_return": 0.4042,
30
- "solve_rate": 0.27,
31
- "n_episodes": 100,
32
- "delta_mean_return": -0.27,
33
- "pct_change": -40.05
34
- },
35
- {
36
- "disabled": "guess_count_bonus",
37
- "mean_return": 0.6442,
38
- "solve_rate": 0.27,
39
- "n_episodes": 100,
40
- "delta_mean_return": -0.03,
41
- "pct_change": -4.45
42
- },
43
- {
44
- "disabled": "timeout_penalty",
45
- "mean_return": 0.8202,
46
- "solve_rate": 0.27,
47
- "n_episodes": 100,
48
- "delta_mean_return": 0.146,
49
- "pct_change": 21.66
50
- }
51
- ],
52
- "ranked_by_impact": [
53
- {
54
- "disabled": "green_credit",
55
- "mean_return": 0.2152,
56
- "solve_rate": 0.27,
57
- "n_episodes": 100,
58
- "delta_mean_return": -0.459,
59
- "pct_change": -68.08
60
- },
61
- {
62
- "disabled": "solve_bonus",
63
- "mean_return": 0.4042,
64
- "solve_rate": 0.27,
65
- "n_episodes": 100,
66
- "delta_mean_return": -0.27,
67
- "pct_change": -40.05
68
- },
69
- {
70
- "disabled": "timeout_penalty",
71
- "mean_return": 0.8202,
72
- "solve_rate": 0.27,
73
- "n_episodes": 100,
74
- "delta_mean_return": 0.146,
75
- "pct_change": 21.66
76
- },
77
- {
78
- "disabled": "yellow_credit",
79
- "mean_return": 0.613,
80
- "solve_rate": 0.27,
81
- "n_episodes": 100,
82
- "delta_mean_return": -0.0612,
83
- "pct_change": -9.08
84
- },
85
- {
86
- "disabled": "guess_count_bonus",
87
- "mean_return": 0.6442,
88
- "solve_rate": 0.27,
89
- "n_episodes": 100,
90
- "delta_mean_return": -0.03,
91
- "pct_change": -4.45
92
- }
93
- ],
94
- "insight": "components ranked by metric drop when removed reveal which reward signals are load-bearing"
95
  }
 
1
+ {
2
+ "framework": "leave-one-out reward ablation per RL guide \u00a77-8",
3
+ "n_episodes_per_trial": 100,
4
+ "baseline": {
5
+ "disabled": "none",
6
+ "mean_return": 0.6742,
7
+ "solve_rate": 0.27,
8
+ "n_episodes": 100
9
+ },
10
+ "ablations": [
11
+ {
12
+ "disabled": "green_credit",
13
+ "mean_return": 0.2152,
14
+ "solve_rate": 0.27,
15
+ "n_episodes": 100,
16
+ "delta_mean_return": -0.459,
17
+ "pct_change": -68.08
18
+ },
19
+ {
20
+ "disabled": "yellow_credit",
21
+ "mean_return": 0.613,
22
+ "solve_rate": 0.27,
23
+ "n_episodes": 100,
24
+ "delta_mean_return": -0.0612,
25
+ "pct_change": -9.08
26
+ },
27
+ {
28
+ "disabled": "solve_bonus",
29
+ "mean_return": 0.4042,
30
+ "solve_rate": 0.27,
31
+ "n_episodes": 100,
32
+ "delta_mean_return": -0.27,
33
+ "pct_change": -40.05
34
+ },
35
+ {
36
+ "disabled": "guess_count_bonus",
37
+ "mean_return": 0.6442,
38
+ "solve_rate": 0.27,
39
+ "n_episodes": 100,
40
+ "delta_mean_return": -0.03,
41
+ "pct_change": -4.45
42
+ },
43
+ {
44
+ "disabled": "timeout_penalty",
45
+ "mean_return": 0.8202,
46
+ "solve_rate": 0.27,
47
+ "n_episodes": 100,
48
+ "delta_mean_return": 0.146,
49
+ "pct_change": 21.66
50
+ }
51
+ ],
52
+ "ranked_by_impact": [
53
+ {
54
+ "disabled": "green_credit",
55
+ "mean_return": 0.2152,
56
+ "solve_rate": 0.27,
57
+ "n_episodes": 100,
58
+ "delta_mean_return": -0.459,
59
+ "pct_change": -68.08
60
+ },
61
+ {
62
+ "disabled": "solve_bonus",
63
+ "mean_return": 0.4042,
64
+ "solve_rate": 0.27,
65
+ "n_episodes": 100,
66
+ "delta_mean_return": -0.27,
67
+ "pct_change": -40.05
68
+ },
69
+ {
70
+ "disabled": "timeout_penalty",
71
+ "mean_return": 0.8202,
72
+ "solve_rate": 0.27,
73
+ "n_episodes": 100,
74
+ "delta_mean_return": 0.146,
75
+ "pct_change": 21.66
76
+ },
77
+ {
78
+ "disabled": "yellow_credit",
79
+ "mean_return": 0.613,
80
+ "solve_rate": 0.27,
81
+ "n_episodes": 100,
82
+ "delta_mean_return": -0.0612,
83
+ "pct_change": -9.08
84
+ },
85
+ {
86
+ "disabled": "guess_count_bonus",
87
+ "mean_return": 0.6442,
88
+ "solve_rate": 0.27,
89
+ "n_episodes": 100,
90
+ "delta_mean_return": -0.03,
91
+ "pct_change": -4.45
92
+ }
93
+ ],
94
+ "insight": "components ranked by metric drop when removed reveal which reward signals are load-bearing"
95
  }
FINAL_SUBMIT/receipts/adversarial_20_attack_gauntlet.json CHANGED
@@ -1,217 +1,217 @@
1
- {
2
- "started_at": 1777142712.0545185,
3
- "framework": "RL guide \u00a738-44 + Skalse 2022 + Krakovna 2020",
4
- "n_total": 20,
5
- "results": [
6
- {
7
- "id": 1,
8
- "name": "empty_string",
9
- "expected_reason": "format_invalid",
10
- "actual_reason": "format_invalid",
11
- "reward_extracted": 0.0,
12
- "blocked": true,
13
- "is_legit_test": false,
14
- "verdict": "BLOCKED"
15
- },
16
- {
17
- "id": 2,
18
- "name": "single_letter",
19
- "expected_reason": "format_invalid",
20
- "actual_reason": "format_invalid",
21
- "reward_extracted": 0.0,
22
- "blocked": true,
23
- "is_legit_test": false,
24
- "verdict": "BLOCKED"
25
- },
26
- {
27
- "id": 3,
28
- "name": "digits_only",
29
- "expected_reason": "format_invalid",
30
- "actual_reason": "format_invalid",
31
- "reward_extracted": 0.0,
32
- "blocked": true,
33
- "is_legit_test": false,
34
- "verdict": "BLOCKED"
35
- },
36
- {
37
- "id": 4,
38
- "name": "unicode_homoglyph",
39
- "expected_reason": "format_invalid",
40
- "actual_reason": "format_invalid",
41
- "reward_extracted": 0.0,
42
- "blocked": true,
43
- "is_legit_test": false,
44
- "verdict": "BLOCKED"
45
- },
46
- {
47
- "id": 5,
48
- "name": "6char_word",
49
- "expected_reason": "format_invalid",
50
- "actual_reason": "format_invalid",
51
- "reward_extracted": 0.0,
52
- "blocked": true,
53
- "is_legit_test": false,
54
- "verdict": "BLOCKED"
55
- },
56
- {
57
- "id": 6,
58
- "name": "4char_word",
59
- "expected_reason": "format_invalid",
60
- "actual_reason": "format_invalid",
61
- "reward_extracted": 0.0,
62
- "blocked": true,
63
- "is_legit_test": false,
64
- "verdict": "BLOCKED"
65
- },
66
- {
67
- "id": 7,
68
- "name": "uppercase_only",
69
- "expected_reason": "accepted_after_normalize",
70
- "actual_reason": "accepted_after_normalize",
71
- "reward_extracted": 0.0,
72
- "blocked": false,
73
- "is_legit_test": true,
74
- "verdict": "LEGIT_ACCEPTED"
75
- },
76
- {
77
- "id": 8,
78
- "name": "whitespace_padded",
79
- "expected_reason": "format_invalid",
80
- "actual_reason": "format_invalid",
81
- "reward_extracted": 0.0,
82
- "blocked": true,
83
- "is_legit_test": false,
84
- "verdict": "BLOCKED"
85
- },
86
- {
87
- "id": 9,
88
- "name": "null_action",
89
- "expected_reason": "format_invalid",
90
- "actual_reason": "format_invalid",
91
- "reward_extracted": 0.0,
92
- "blocked": true,
93
- "is_legit_test": false,
94
- "verdict": "BLOCKED"
95
- },
96
- {
97
- "id": 10,
98
- "name": "dict_lookalike_zzzzz",
99
- "expected_reason": "non_dictionary",
100
- "actual_reason": "non_dictionary",
101
- "reward_extracted": 0.0,
102
- "blocked": true,
103
- "is_legit_test": false,
104
- "verdict": "BLOCKED"
105
- },
106
- {
107
- "id": 11,
108
- "name": "repeat_same_guess_5x",
109
- "expected_reason": "rolling_no_progress",
110
- "actual_reason": "rolling_no_progress",
111
- "reward_extracted": 0.07,
112
- "blocked": true,
113
- "is_legit_test": false,
114
- "verdict": "BLOCKED"
115
- },
116
- {
117
- "id": 12,
118
- "name": "solved_word_repeat",
119
- "expected_reason": "episode_done",
120
- "actual_reason": "episode_done",
121
- "reward_extracted": 1.0,
122
- "blocked": true,
123
- "is_legit_test": false,
124
- "verdict": "BLOCKED"
125
- },
126
- {
127
- "id": 13,
128
- "name": "unicode_zero_width",
129
- "expected_reason": "format_invalid",
130
- "actual_reason": "format_invalid",
131
- "reward_extracted": 0.0,
132
- "blocked": true,
133
- "is_legit_test": false,
134
- "verdict": "BLOCKED"
135
- },
136
- {
137
- "id": 14,
138
- "name": "sql_injection",
139
- "expected_reason": "format_invalid",
140
- "actual_reason": "format_invalid",
141
- "reward_extracted": 0.0,
142
- "blocked": true,
143
- "is_legit_test": false,
144
- "verdict": "BLOCKED"
145
- },
146
- {
147
- "id": 15,
148
- "name": "path_traversal",
149
- "expected_reason": "format_invalid",
150
- "actual_reason": "format_invalid",
151
- "reward_extracted": 0.0,
152
- "blocked": true,
153
- "is_legit_test": false,
154
- "verdict": "BLOCKED"
155
- },
156
- {
157
- "id": 16,
158
- "name": "extremely_long_string",
159
- "expected_reason": "format_invalid",
160
- "actual_reason": "format_invalid",
161
- "reward_extracted": 0.0,
162
- "blocked": true,
163
- "is_legit_test": false,
164
- "verdict": "BLOCKED"
165
- },
166
- {
167
- "id": 17,
168
- "name": "json_object_payload",
169
- "expected_reason": "format_invalid",
170
- "actual_reason": "format_invalid",
171
- "reward_extracted": 0.0,
172
- "blocked": true,
173
- "is_legit_test": false,
174
- "verdict": "BLOCKED"
175
- },
176
- {
177
- "id": 18,
178
- "name": "negative_action_index",
179
- "expected_reason": "out_of_bounds",
180
- "actual_reason": "format_invalid",
181
- "reward_extracted": 0.0,
182
- "blocked": true,
183
- "is_legit_test": false,
184
- "verdict": "BLOCKED"
185
- },
186
- {
187
- "id": 19,
188
- "name": "sleep_inside_action",
189
- "expected_reason": "format_invalid",
190
- "actual_reason": "format_invalid",
191
- "reward_extracted": 0.0,
192
- "blocked": true,
193
- "is_legit_test": false,
194
- "verdict": "BLOCKED"
195
- },
196
- {
197
- "id": 20,
198
- "name": "base64_encoded",
199
- "expected_reason": "format_invalid",
200
- "actual_reason": "format_invalid",
201
- "reward_extracted": 0.0,
202
- "blocked": true,
203
- "is_legit_test": false,
204
- "verdict": "BLOCKED"
205
- }
206
- ],
207
- "summary": {
208
- "n_total_tests": 20,
209
- "n_attacks": 19,
210
- "n_blocked": 19,
211
- "block_rate_pct": 100.0,
212
- "n_legit": 1,
213
- "n_legit_accepted": 1,
214
- "false_positive_rate_pct": 0.0,
215
- "verdict": "PASS"
216
- }
217
  }
 
1
+ {
2
+ "started_at": 1777142712.0545185,
3
+ "framework": "RL guide \u00a738-44 + Skalse 2022 + Krakovna 2020",
4
+ "n_total": 20,
5
+ "results": [
6
+ {
7
+ "id": 1,
8
+ "name": "empty_string",
9
+ "expected_reason": "format_invalid",
10
+ "actual_reason": "format_invalid",
11
+ "reward_extracted": 0.0,
12
+ "blocked": true,
13
+ "is_legit_test": false,
14
+ "verdict": "BLOCKED"
15
+ },
16
+ {
17
+ "id": 2,
18
+ "name": "single_letter",
19
+ "expected_reason": "format_invalid",
20
+ "actual_reason": "format_invalid",
21
+ "reward_extracted": 0.0,
22
+ "blocked": true,
23
+ "is_legit_test": false,
24
+ "verdict": "BLOCKED"
25
+ },
26
+ {
27
+ "id": 3,
28
+ "name": "digits_only",
29
+ "expected_reason": "format_invalid",
30
+ "actual_reason": "format_invalid",
31
+ "reward_extracted": 0.0,
32
+ "blocked": true,
33
+ "is_legit_test": false,
34
+ "verdict": "BLOCKED"
35
+ },
36
+ {
37
+ "id": 4,
38
+ "name": "unicode_homoglyph",
39
+ "expected_reason": "format_invalid",
40
+ "actual_reason": "format_invalid",
41
+ "reward_extracted": 0.0,
42
+ "blocked": true,
43
+ "is_legit_test": false,
44
+ "verdict": "BLOCKED"
45
+ },
46
+ {
47
+ "id": 5,
48
+ "name": "6char_word",
49
+ "expected_reason": "format_invalid",
50
+ "actual_reason": "format_invalid",
51
+ "reward_extracted": 0.0,
52
+ "blocked": true,
53
+ "is_legit_test": false,
54
+ "verdict": "BLOCKED"
55
+ },
56
+ {
57
+ "id": 6,
58
+ "name": "4char_word",
59
+ "expected_reason": "format_invalid",
60
+ "actual_reason": "format_invalid",
61
+ "reward_extracted": 0.0,
62
+ "blocked": true,
63
+ "is_legit_test": false,
64
+ "verdict": "BLOCKED"
65
+ },
66
+ {
67
+ "id": 7,
68
+ "name": "uppercase_only",
69
+ "expected_reason": "accepted_after_normalize",
70
+ "actual_reason": "accepted_after_normalize",
71
+ "reward_extracted": 0.0,
72
+ "blocked": false,
73
+ "is_legit_test": true,
74
+ "verdict": "LEGIT_ACCEPTED"
75
+ },
76
+ {
77
+ "id": 8,
78
+ "name": "whitespace_padded",
79
+ "expected_reason": "format_invalid",
80
+ "actual_reason": "format_invalid",
81
+ "reward_extracted": 0.0,
82
+ "blocked": true,
83
+ "is_legit_test": false,
84
+ "verdict": "BLOCKED"
85
+ },
86
+ {
87
+ "id": 9,
88
+ "name": "null_action",
89
+ "expected_reason": "format_invalid",
90
+ "actual_reason": "format_invalid",
91
+ "reward_extracted": 0.0,
92
+ "blocked": true,
93
+ "is_legit_test": false,
94
+ "verdict": "BLOCKED"
95
+ },
96
+ {
97
+ "id": 10,
98
+ "name": "dict_lookalike_zzzzz",
99
+ "expected_reason": "non_dictionary",
100
+ "actual_reason": "non_dictionary",
101
+ "reward_extracted": 0.0,
102
+ "blocked": true,
103
+ "is_legit_test": false,
104
+ "verdict": "BLOCKED"
105
+ },
106
+ {
107
+ "id": 11,
108
+ "name": "repeat_same_guess_5x",
109
+ "expected_reason": "rolling_no_progress",
110
+ "actual_reason": "rolling_no_progress",
111
+ "reward_extracted": 0.07,
112
+ "blocked": true,
113
+ "is_legit_test": false,
114
+ "verdict": "BLOCKED"
115
+ },
116
+ {
117
+ "id": 12,
118
+ "name": "solved_word_repeat",
119
+ "expected_reason": "episode_done",
120
+ "actual_reason": "episode_done",
121
+ "reward_extracted": 1.0,
122
+ "blocked": true,
123
+ "is_legit_test": false,
124
+ "verdict": "BLOCKED"
125
+ },
126
+ {
127
+ "id": 13,
128
+ "name": "unicode_zero_width",
129
+ "expected_reason": "format_invalid",
130
+ "actual_reason": "format_invalid",
131
+ "reward_extracted": 0.0,
132
+ "blocked": true,
133
+ "is_legit_test": false,
134
+ "verdict": "BLOCKED"
135
+ },
136
+ {
137
+ "id": 14,
138
+ "name": "sql_injection",
139
+ "expected_reason": "format_invalid",
140
+ "actual_reason": "format_invalid",
141
+ "reward_extracted": 0.0,
142
+ "blocked": true,
143
+ "is_legit_test": false,
144
+ "verdict": "BLOCKED"
145
+ },
146
+ {
147
+ "id": 15,
148
+ "name": "path_traversal",
149
+ "expected_reason": "format_invalid",
150
+ "actual_reason": "format_invalid",
151
+ "reward_extracted": 0.0,
152
+ "blocked": true,
153
+ "is_legit_test": false,
154
+ "verdict": "BLOCKED"
155
+ },
156
+ {
157
+ "id": 16,
158
+ "name": "extremely_long_string",
159
+ "expected_reason": "format_invalid",
160
+ "actual_reason": "format_invalid",
161
+ "reward_extracted": 0.0,
162
+ "blocked": true,
163
+ "is_legit_test": false,
164
+ "verdict": "BLOCKED"
165
+ },
166
+ {
167
+ "id": 17,
168
+ "name": "json_object_payload",
169
+ "expected_reason": "format_invalid",
170
+ "actual_reason": "format_invalid",
171
+ "reward_extracted": 0.0,
172
+ "blocked": true,
173
+ "is_legit_test": false,
174
+ "verdict": "BLOCKED"
175
+ },
176
+ {
177
+ "id": 18,
178
+ "name": "negative_action_index",
179
+ "expected_reason": "out_of_bounds",
180
+ "actual_reason": "format_invalid",
181
+ "reward_extracted": 0.0,
182
+ "blocked": true,
183
+ "is_legit_test": false,
184
+ "verdict": "BLOCKED"
185
+ },
186
+ {
187
+ "id": 19,
188
+ "name": "sleep_inside_action",
189
+ "expected_reason": "format_invalid",
190
+ "actual_reason": "format_invalid",
191
+ "reward_extracted": 0.0,
192
+ "blocked": true,
193
+ "is_legit_test": false,
194
+ "verdict": "BLOCKED"
195
+ },
196
+ {
197
+ "id": 20,
198
+ "name": "base64_encoded",
199
+ "expected_reason": "format_invalid",
200
+ "actual_reason": "format_invalid",
201
+ "reward_extracted": 0.0,
202
+ "blocked": true,
203
+ "is_legit_test": false,
204
+ "verdict": "BLOCKED"
205
+ }
206
+ ],
207
+ "summary": {
208
+ "n_total_tests": 20,
209
+ "n_attacks": 19,
210
+ "n_blocked": 19,
211
+ "block_rate_pct": 100.0,
212
+ "n_legit": 1,
213
+ "n_legit_accepted": 1,
214
+ "false_positive_rate_pct": 0.0,
215
+ "verdict": "PASS"
216
+ }
217
  }
FINAL_SUBMIT/receipts/adversarial_reward_audit.json CHANGED
@@ -1,132 +1,132 @@
1
- {
2
- "scenario_id": "2011_Tōhoku_earthquake_and_tsunami",
3
- "ground_truth": "CRITICAL",
4
- "reward_formula": "0.7 * match + 0.2 * format + 0.1 * length",
5
- "n_attacks": 6,
6
- "n_rejected": 6,
7
- "attacks": [
8
- {
9
- "attack": "A1_empty_string",
10
- "description": "Degenerate empty output — no info at all.",
11
- "reward": 0.0,
12
- "breakdown": {
13
- "match": 0.0,
14
- "format": 0.0,
15
- "length": 0.0,
16
- "weights": [
17
- 0.7,
18
- 0.2,
19
- 0.1
20
- ],
21
- "n_tokens": 1
22
- },
23
- "max_expected": 0.01,
24
- "rejected": true
25
- },
26
- {
27
- "attack": "A2_risk_only_short_circuit",
28
- "description": "Pure short-circuit: output the ground-truth label with no structure. Length-guard should reject.",
29
- "reward": 0.7,
30
- "breakdown": {
31
- "match": 1.0,
32
- "format": 0.0,
33
- "length": 0.0,
34
- "weights": [
35
- 0.7,
36
- 0.2,
37
- 0.1
38
- ],
39
- "n_tokens": 1
40
- },
41
- "max_expected": 0.71,
42
- "rejected": true
43
- },
44
- {
45
- "attack": "A3_long_spam_no_json",
46
- "description": "Pad with junk to beat length-guard while omitting structured output. Format-guard should reject.",
47
- "reward": 0.8,
48
- "breakdown": {
49
- "match": 1.0,
50
- "format": 0.0,
51
- "length": 1.0,
52
- "weights": [
53
- 0.7,
54
- 0.2,
55
- 0.1
56
- ],
57
- "n_tokens": 200
58
- },
59
- "max_expected": 0.81,
60
- "rejected": true
61
- },
62
- {
63
- "attack": "A4_over_length_attack",
64
- "description": "Generate a massive output to dilute detection. Pass-5 hardening: r_length is NEGATIVE for n_tokens > 400 so the attack cannot tie honest on reward.",
65
- "reward": 0.85,
66
- "breakdown": {
67
- "match": 1.0,
68
- "format": 1.0,
69
- "length": -0.5,
70
- "weights": [
71
- 0.7,
72
- 0.2,
73
- 0.1
74
- ],
75
- "n_tokens": 500
76
- },
77
- "max_expected": 0.86,
78
- "rejected": true
79
- },
80
- {
81
- "attack": "A5_adjacent_tier_guess",
82
- "description": "Always guess the adjacent tier to hedge. Ordinal proximity pays only 0.5 not 1.0 on match.",
83
- "reward": 0.65,
84
- "breakdown": {
85
- "match": 0.5,
86
- "format": 1.0,
87
- "length": 1.0,
88
- "weights": [
89
- 0.7,
90
- 0.2,
91
- 0.1
92
- ],
93
- "n_tokens": 60
94
- },
95
- "max_expected": 0.66,
96
- "rejected": true
97
- },
98
- {
99
- "attack": "A6_wrong_tier_confident",
100
- "description": "Always guess LOW — opposite end of ordinal scale. Far-from-GT reward must be 0.0, not just adjacent.",
101
- "reward": 0.3,
102
- "breakdown": {
103
- "match": 0.0,
104
- "format": 1.0,
105
- "length": 1.0,
106
- "weights": [
107
- 0.7,
108
- 0.2,
109
- 0.1
110
- ],
111
- "n_tokens": 60
112
- },
113
- "max_expected": 0.31,
114
- "rejected": true
115
- }
116
- ],
117
- "honest_baseline": {
118
- "reward": 0.9,
119
- "breakdown": {
120
- "match": 1.0,
121
- "format": 1.0,
122
- "length": 0.0,
123
- "weights": [
124
- 0.7,
125
- 0.2,
126
- 0.1
127
- ],
128
- "n_tokens": 20
129
- }
130
- },
131
- "verdict": "All attack vectors score strictly below an honest full answer. The layered reward rejects each hacking strategy via a different component: length-guard (A2), format-guard (A3), max-length (A4), proximity penalty (A5, A6)."
132
  }
 
1
+ {
2
+ "scenario_id": "2011_Tōhoku_earthquake_and_tsunami",
3
+ "ground_truth": "CRITICAL",
4
+ "reward_formula": "0.7 * match + 0.2 * format + 0.1 * length",
5
+ "n_attacks": 6,
6
+ "n_rejected": 6,
7
+ "attacks": [
8
+ {
9
+ "attack": "A1_empty_string",
10
+ "description": "Degenerate empty output — no info at all.",
11
+ "reward": 0.0,
12
+ "breakdown": {
13
+ "match": 0.0,
14
+ "format": 0.0,
15
+ "length": 0.0,
16
+ "weights": [
17
+ 0.7,
18
+ 0.2,
19
+ 0.1
20
+ ],
21
+ "n_tokens": 1
22
+ },
23
+ "max_expected": 0.01,
24
+ "rejected": true
25
+ },
26
+ {
27
+ "attack": "A2_risk_only_short_circuit",
28
+ "description": "Pure short-circuit: output the ground-truth label with no structure. Length-guard should reject.",
29
+ "reward": 0.7,
30
+ "breakdown": {
31
+ "match": 1.0,
32
+ "format": 0.0,
33
+ "length": 0.0,
34
+ "weights": [
35
+ 0.7,
36
+ 0.2,
37
+ 0.1
38
+ ],
39
+ "n_tokens": 1
40
+ },
41
+ "max_expected": 0.71,
42
+ "rejected": true
43
+ },
44
+ {
45
+ "attack": "A3_long_spam_no_json",
46
+ "description": "Pad with junk to beat length-guard while omitting structured output. Format-guard should reject.",
47
+ "reward": 0.8,
48
+ "breakdown": {
49
+ "match": 1.0,
50
+ "format": 0.0,
51
+ "length": 1.0,
52
+ "weights": [
53
+ 0.7,
54
+ 0.2,
55
+ 0.1
56
+ ],
57
+ "n_tokens": 200
58
+ },
59
+ "max_expected": 0.81,
60
+ "rejected": true
61
+ },
62
+ {
63
+ "attack": "A4_over_length_attack",
64
+ "description": "Generate a massive output to dilute detection. Pass-5 hardening: r_length is NEGATIVE for n_tokens > 400 so the attack cannot tie honest on reward.",
65
+ "reward": 0.85,
66
+ "breakdown": {
67
+ "match": 1.0,
68
+ "format": 1.0,
69
+ "length": -0.5,
70
+ "weights": [
71
+ 0.7,
72
+ 0.2,
73
+ 0.1
74
+ ],
75
+ "n_tokens": 500
76
+ },
77
+ "max_expected": 0.86,
78
+ "rejected": true
79
+ },
80
+ {
81
+ "attack": "A5_adjacent_tier_guess",
82
+ "description": "Always guess the adjacent tier to hedge. Ordinal proximity pays only 0.5 not 1.0 on match.",
83
+ "reward": 0.65,
84
+ "breakdown": {
85
+ "match": 0.5,
86
+ "format": 1.0,
87
+ "length": 1.0,
88
+ "weights": [
89
+ 0.7,
90
+ 0.2,
91
+ 0.1
92
+ ],
93
+ "n_tokens": 60
94
+ },
95
+ "max_expected": 0.66,
96
+ "rejected": true
97
+ },
98
+ {
99
+ "attack": "A6_wrong_tier_confident",
100
+ "description": "Always guess LOW — opposite end of ordinal scale. Far-from-GT reward must be 0.0, not just adjacent.",
101
+ "reward": 0.3,
102
+ "breakdown": {
103
+ "match": 0.0,
104
+ "format": 1.0,
105
+ "length": 1.0,
106
+ "weights": [
107
+ 0.7,
108
+ 0.2,
109
+ 0.1
110
+ ],
111
+ "n_tokens": 60
112
+ },
113
+ "max_expected": 0.31,
114
+ "rejected": true
115
+ }
116
+ ],
117
+ "honest_baseline": {
118
+ "reward": 0.9,
119
+ "breakdown": {
120
+ "match": 1.0,
121
+ "format": 1.0,
122
+ "length": 0.0,
123
+ "weights": [
124
+ 0.7,
125
+ 0.2,
126
+ 0.1
127
+ ],
128
+ "n_tokens": 20
129
+ }
130
+ },
131
+ "verdict": "All attack vectors score strictly below an honest full answer. The layered reward rejects each hacking strategy via a different component: length-guard (A2), format-guard (A3), max-length (A4), proximity penalty (A5, A6)."
132
  }