Invalid JSON:Unexpected token 'I', ..."plexity": Infinity,
"... is not valid JSON
| { | |
| "source_model": "Qwen/Qwen3.5-9B", | |
| "technique": "refusal_direction_ablation", | |
| "method": "advanced", | |
| "method_config": { | |
| "n_directions": 4, | |
| "norm_preserve": true, | |
| "regularization": 0.3, | |
| "refinement_passes": 2, | |
| "project_biases": true, | |
| "use_chat_template": true, | |
| "use_whitened_svd": false, | |
| "true_iterative_refinement": false, | |
| "winsorize_activations": false, | |
| "float_layer_interpolation": false, | |
| "cot_aware": false, | |
| "use_kl_optimization": false, | |
| "use_lora_ablation": false, | |
| "spectral_cascade": true, | |
| "spectral_bands": 8, | |
| "spectral_threshold": 0.01 | |
| }, | |
| "references": [ | |
| "Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)", | |
| "Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)", | |
| "Norm-Preserving Biprojected Abliteration (grimjim, 2025)", | |
| "Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)", | |
| "Joad et al., More to Refusal than a Single Direction (2026)", | |
| "Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization", | |
| "OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)" | |
| ], | |
| "strong_layers": [ | |
| 31, | |
| 30, | |
| 29, | |
| 28, | |
| 26, | |
| 27, | |
| 25, | |
| 24 | |
| ], | |
| "n_harmful_prompts": 512, | |
| "n_harmless_prompts": 512, | |
| "quality_metrics": { | |
| "perplexity": "Infinity", | |
| "coherence": 0.0, | |
| "refusal_rate": 0.0, | |
| "degenerate_count": 30, | |
| "kl_divergence": "Infinity", | |
| "spectral_certification": null | |
| }, | |
| "kl_contributions": {}, | |
| "cot_preserved_layers": [], | |
| "float_layer_weights": {}, | |
| "lora_adapters_saved": false | |
| } |