{ "base_model": "Qwen/Qwen2.5-1.5B-Instruct", "training_method": "GRPO with Unsloth", "dataset_size": 1602, "training_steps": 360, "final_reward": 3.17, "improvement": "+67%", "key_achievements": [ "Foreign language bias detection", "Structured reasoning output", "67% reward improvement", "46% reduction in output variance" ] }