Spaces:

100XZX001
/

CodeReview-Professional-Workflow

Sleeping

App Files Files Community

100XZX001 committed on Apr 26

Commit

530fe32

verified ·

1 Parent(s): 91038d2

Update training.py

Browse files

Files changed (1) hide show

training.py +763 -745

training.py CHANGED Viewed

@@ -1,793 +1,811 @@
-# training.py
-import torch._dynamo
-torch._dynamo.config.disable = True
-import json
-import os
 import torch
 import torch.nn.functional as F
 from torch.optim import AdamW
-from dataclasses import dataclass
-from typing import List, Dict, Tuple, Optional
 import numpy as np
-import re
-import random
-import matplotlib.pyplot as plt
 from unsloth import FastLanguageModel
-from transformers import TrainingArguments
-from trl import SFTTrainer
-from datasets import Dataset
-# Import your environment and actions (unchanged)
 from environment import CodeReviewEnv
 from redteam import BUG_DB
-from models import (
-    RunTests, RunLinter, Inspect,
-    ProposeFix, WriteComment, AskQuestion,
-    Done, Skip , QueryDocs
 )
-# ======================================================================
-# 1. ACTION PARSING (improved with fallback)
-# ======================================================================
 @dataclass
 class AgentAction:
     action_type: str
     content: Optional[str] = None
-def parse_action(output: str) -> AgentAction:
-    """Robust JSON parsing with regex fallback and keyword detection."""
-    # Try strict JSON first
     try:
-        data = json.loads(output)
-        return AgentAction(
-            action_type=data.get("action_type", "").lower(),
-            content=data.get("content")
-        )
-    except:
         pass
-    # Try to extract JSON from markdown blocks
-    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', output, re.DOTALL)
-    if json_match:
-        try:
-            data = json.loads(json_match.group(1))
-            return AgentAction(
-                action_type=data.get("action_type", "").lower(),
-                content=data.get("content")
-            )
-        except:
-            pass
-    # Try to find "action_type" field with regex
-    action_pattern = r'"action_type"\s*:\s*"(\w+)"'
-    match = re.search(action_pattern, output)
-    if match:
-        return AgentAction(action_type=match.group(1).lower())
-    # Keyword detection as last resort
-    output_lower = output.lower()
-    if "test" in output_lower:
-        return AgentAction("run_tests")
-    if "lint" in output_lower:
-        return AgentAction("run_linter")
-    if "inspect" in output_lower:
-        return AgentAction("inspect")
-    if "doc" in output_lower or "documentation" in output_lower:
-        # Bridge natural language mentions to rltool-backed retrieval action.
-        return AgentAction("query_docs", "bug fix guidance")
-    return AgentAction("invalid", output)
 def map_to_env(action: AgentAction):
-    if action.action_type == "run_tests":
-        return RunTests()
-    elif action.action_type == "run_linter":
-        return RunLinter()
-    elif action.action_type == "inspect":
-        return Inspect()
-    elif action.action_type == "fix":
-        return ProposeFix(fix_code=action.content or "")
-    elif action.action_type == "comment":
-        return WriteComment(comment_text=action.content or "")
-    elif action.action_type == "question":
-        return AskQuestion(question=action.content or "")
-    elif action.action_type == "query_docs":               # <-- new
-        return QueryDocs(query_topic=action.content or "")
-    elif action.action_type == "done":
-        return Done()
-    else:
-        return Skip()
-# ======================================================================
-# 2. MODEL SETUP (stabilised LoRA)
-# ======================================================================
 def load_model():
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name="unsloth/gemma-2-2b-it-bnb-4bit",
-        max_seq_length=768,
-        load_in_4bit=True,
     )
-    # FIXED: Lower rank (16), dropout=0 for stability
     model = FastLanguageModel.get_peft_model(
         model,
-        r=16,                     # was 64 → causes collapse
-        target_modules=[
-            "q_proj", "k_proj", "v_proj", "o_proj",
-            "gate_proj", "up_proj", "down_proj"
-        ],
-        lora_alpha=32,            # adjusted for r=16
-        lora_dropout=0.0,         # dropout can cause empty outputs
     )
-    # Ensure tokenizer has correct chat template for Gemma-2
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}<start_of_turn>user\n{{ message['content'] }}<end_of_turn>\n<start_of_turn>model\n{% elif message['role'] == 'assistant' %}{{ message['content'] }}<end_of_turn>\n{% endif %}{% endfor %}"
     return model, tokenizer
-# ======================================================================
-# 3. MODEL SANITY CHECK (new – ensures model can generate text)
-# ======================================================================
-def test_model_sanity(model, tokenizer) -> bool:
-    print("\n" + "="*60)
-    print("SANITY CHECK: Testing base model generation")
-    print("="*60)
-    test_prompt = "Hello, how are you?"
-    messages = [{"role": "user", "content": test_prompt}]
-    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(formatted, return_tensors="pt", max_length=768, truncation=True).to("cuda")
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=30,
-            do_sample=True,
-            temperature=0.7,
-            min_new_tokens=1,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
-        )
-    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
-    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-    print(f"Prompt: {test_prompt}")
-    print(f"Response: {repr(response)}")
-    if len(response) == 0:
-        print("❌ Model produces empty output – cannot train.")
-        return False
-    print("✓ Model sanity check PASSED\n")
-    return True
-# ======================================================================
-# 4. SUPERVISED WARM-UP (teaches JSON output)
-# ======================================================================
-def supervised_warmup(model, tokenizer, n_examples=500, epochs=8):
-    print("\n" + "="*60)
-    print("SUPERVISED WARM-UP: Teaching JSON format")
-    print("="*60)
-    examples = []
-    action_templates = [
-        '{"action_type": "run_tests"}',
-        '{"action_type": "run_linter"}',
-        '{"action_type": "inspect"}',
-        '{"action_type": "fix", "content": "def corrected():\n    pass"}',
-        '{"action_type": "comment", "content": "This looks good."}',
-        '{"action_type": "question", "content": "Why is this variable used?"}',
-        '{"action_type": "query_docs", "content": "KeyError"}',
-        '{"action_type": "done"}',
-    ]
-    for i in range(n_examples):
-        code = f"def example_{i}():\n    return {i % 10}"
-        last_outputs = [
-            "Tests passed: 2/3",
-            "Linter found 1 error",
-            "Inspection complete",
-            "No previous action",
-        ]
-        last_output = random.choice(last_outputs)
-        # Use same prompt structure as build_prompt
-        prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix.
-The developer has a **defensive** personality and will only accept if you provide solid evidence:
-- Tests pass (high pass ratio)
-- Lint is clean (zero errors)
-- Documentation or references are provided
-- Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
-Workflow:
-1. Use `inspect` to understand the code.
-2. Use `run_tests` and `run_linter` to gather evidence.
-3. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
-4. If the developer pushes back, read their response carefully and address their specific concern.
-5. Once convinced, use `done` to finish.
-Code:
-{code}
-Author says:
-(no response yet – start with inspection)
-Last tool output:
-{last_output}
-Available actions:
-run_tests, run_linter, inspect, fix, comment, question, done, query_docs
-Respond ONLY in JSON:
-{{"action_type": "...", "content": "..."}}"""
-        action_json = random.choice(action_templates)
-        messages = [
-            {"role": "user", "content": prompt},
-            {"role": "assistant", "content": action_json}
-        ]
-        full_text = tokenizer.apply_chat_template(messages, tokenize=False)
-        examples.append({"text": full_text})
-    dataset = Dataset.from_list(examples)
-    trainer = SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        train_dataset=dataset,
-        dataset_text_field="text",
-        max_seq_length=512,
-        args=TrainingArguments(
-            output_dir="warmup_output",
-            num_train_epochs=epochs,
-            per_device_train_batch_size=4,
-            gradient_accumulation_steps=2,
-            learning_rate=2e-5,
-            logging_steps=50,
-            save_strategy="no",
-            fp16=True,
-        ),
     )
-    print(f"Training on {n_examples} examples for {epochs} epochs...")
-    trainer.train()
-    print("✓ Warm-up complete\n")
-    torch.cuda.empty_cache()
-# ======================================================================
-# 5. ACTION GENERATION WITH LOGPROB TRACKING (fixed)
-# ======================================================================
-def generate_action_with_logprob(
-    prompt: str,
-    model,
-    tokenizer,
-    temperature: float = 0.0,   # changed: greedy by default for stability
-    max_retries: int = 2
-) -> Tuple[str, float]:
-    """Generate action using correct chat template, with fallback."""
-    messages = [{"role": "user", "content": prompt}]
-    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(formatted, return_tensors="pt").to("cuda")
-    for attempt in range(max_retries):
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=128,
-                do_sample=(temperature > 0),
-                temperature=max(temperature, 0.01) if temperature > 0 else 1.0,
-                min_new_tokens=1,
-                return_dict_in_generate=True,
-                output_scores=True,
-            )
-        generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
-        action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-        # Compute logprob
-        logprobs = []
-        for idx, token_id in enumerate(generated_ids):
-            if idx < len(outputs.scores):
-                token_logits = outputs.scores[idx][0]
-                token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
-                logprobs.append(token_logprob)
-        total_logprob = sum(logprobs) if logprobs else -100.0
-        # If empty, use fallback
-        if not action_text:
-            fallback_actions = [
-                '{"action_type": "run_tests"}',
-                '{"action_type": "run_linter"}',
-                '{"action_type": "inspect"}',
-                '{"action_type": "skip"}',
-            ]
-            action_text = random.choice(fallback_actions)
-            total_logprob = -50.0
-            print(f"[WARN] Empty generation → using fallback: {action_text}")
-            return action_text, total_logprob
-        # Validate JSON
-        try:
-            json.loads(action_text)
-            return action_text, total_logprob
-        except:
-            if attempt == max_retries - 1:
-                return '{"action_type":"skip"}', -100.0
             continue
-    return '{"action_type":"skip"}', -100.0
-# ======================================================================
-# 6. PROMPT BUILDER (unchanged – exactly as you wrote)
-# ======================================================================
-def build_prompt(obs, history_lines: List[str]) -> str:
-    author_msg = getattr(obs, "author_response", "") or ""
-    tool_output = getattr(obs, "last_tool_output", "") or ""
-    # Personality hint (optional but helpful)
-    author_personality = getattr(obs, "author_personality", "defensive")  # e.g., from env
-    prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
-The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
-- Tests pass (high pass ratio)
-- Lint is clean (zero errors)
-- Documentation or references are provided
-- Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
-Workflow:
-1. Use `inspect` to understand the code.
-2. Use `run_tests` and `run_linter` to gather evidence.
-3. Use `query_docs` when you need references or language-specific guidance.
-4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
-5. If the developer pushes back, read their response carefully and address their specific concern.
-6. Once convinced, use `done` to finish.
-Code:
-{obs.code_snippet}
-Author says:
-{author_msg if author_msg else "(no response yet – start with inspection)"}
-Last tool output:
-{tool_output if tool_output else "(none)"}
-Available actions:
-run_tests, run_linter, inspect, query_docs, fix, comment, question, done
-Respond ONLY in JSON:
-{{"action_type": "...", "content": "..."}}"""
-    if history_lines:
-        history = "\n".join(history_lines[-6:])
-        prompt += f"\n\nPrevious steps:\n{history}"
-    return prompt
-# ======================================================================
-# 7. TRAJECTORY STORAGE (unchanged)
-# ======================================================================
-@dataclass
-class Trajectory:
-    states: List[str]
-    actions: List[str]
-    rewards: List[float]
-    logprobs: List[float]
-    dones: List[bool]
-    def __len__(self):
-        return len(self.states)
-    def to_dict(self):
-        return {
-            "states": self.states,
-            "actions": self.actions,
-            "rewards": self.rewards,
-            "logprobs": self.logprobs,
-            "dones": self.dones,
-        }
-# ======================================================================
-# 8. ROLLOUT COLLECTION (uses fixed generate)
-# ======================================================================
-def collect_trajectory(
-    env: CodeReviewEnv,
-    model,
-    tokenizer,
-    max_steps: int = 10,
-    temperature: float = 0.0   # changed to greedy
-) -> Trajectory:
     obs = env.reset()
-    history_lines = []
-    states = []
-    actions = []
-    rewards = []
-    logprobs = []
-    dones = []
-    for step in range(max_steps):
-        prompt = build_prompt(obs, history_lines)
-        states.append(prompt)
-        action_text, logprob = generate_action_with_logprob(
-            prompt, model, tokenizer, temperature
-        )
-        actions.append(action_text)
-        logprobs.append(logprob)
-        action = parse_action(action_text)
-        env_action = map_to_env(action)
-        next_obs, reward, done, _ = env.step(env_action)
-        rewards.append(reward.value)
-        dones.append(done)
-        history_lines.append(f"Agent: {action_text}")
-        history_lines.append(f"Env: {next_obs.last_tool_output}")
-        obs = next_obs
-        if done:
             break
-    return Trajectory(states, actions, rewards, logprobs, dones)
-def collect_trajectories(
-    env: CodeReviewEnv,
-    model,
-    tokenizer,
-    n_trajectories: int,
-    max_steps: int = 10,
-    task_levels: Optional[List[str]] = None,
-    task_weights: Optional[List[float]] = None,
-) -> List[Trajectory]:
-    # Link training to RedTeam's full bug distribution by sampling tasks
-    # per trajectory instead of training only on env default ("easy").
-    if task_levels is None:
-        task_levels = list(BUG_DB.keys())
-    if task_weights is not None and len(task_weights) != len(task_levels):
-        raise ValueError("task_weights must match task_levels length")
-    if task_weights is not None and sum(task_weights) <= 0:
-        raise ValueError("task_weights must have a positive total")
-    trajectories = []
-    for i in range(n_trajectories):
-        # Weighted sampling supports curriculum-style training schedules.
-        sampled_task = random.choices(task_levels, weights=task_weights, k=1)[0]
-        env.set_task(sampled_task)
-        traj = collect_trajectory(env, model, tokenizer, max_steps)
-        total_reward = sum(traj.rewards)
-        print(f"Trajectory {i+1}/{n_trajectories}: "
-              f"task={sampled_task}, steps={len(traj)}, reward={total_reward:.3f}")
-        trajectories.append(traj)
-    return trajectories
-# ======================================================================
-# 9. ADVANTAGE ESTIMATION (unchanged)
-# ======================================================================
-def compute_returns_and_advantages(
-    rewards: List[float],
-    dones: List[bool],
-    gamma: float = 0.99,
-    standardize: bool = True
-) -> Tuple[List[float], List[float]]:
-    """
-    Computes discounted returns and normalised advantages (no critic).
-    Advantages = returns - mean(returns)  (or zero baseline).
-    """
-    n = len(rewards)
-    returns = [0.0] * n
-    running_return = 0.0
-    for t in reversed(range(n)):
-        if dones[t]:
-            running_return = 0.0
-        running_return = rewards[t] + gamma * running_return
-        returns[t] = running_return
-    if standardize:
-        advantages = np.array(returns) - np.mean(returns)
-        adv_std = np.std(advantages) + 1e-8
-        advantages = (advantages / adv_std).tolist()
-    else:
-        advantages = returns.copy()
-    return advantages, returns
-# ======================================================================
-# 10. COMPUTE NEW LOGPROBS (unchanged)
-# ======================================================================
-def compute_logprob(prompt: str, action: str, model, tokenizer) -> float:
-    messages = [{"role": "user", "content": prompt}]
-    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    full_text = formatted + action
-    inputs = tokenizer(full_text, return_tensors="pt").to("cuda")
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-    action_ids = tokenizer.encode(action, add_special_tokens=False)
-    prefix_ids = tokenizer.encode(formatted, add_special_tokens=False)
-    action_start = len(prefix_ids)
-    logprobs = []
-    for idx, token_id in enumerate(action_ids):
-        position = action_start + idx - 1
-        if 0 <= position < logits.shape[1]:
-            token_logits = logits[0, position]
-            token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
-            logprobs.append(token_logprob)
-    return sum(logprobs) if logprobs else -100.0
-# ======================================================================
-# 11. PPO UPDATE (unchanged except uses compute_logprob correctly)
-# ======================================================================
-def ppo_update(
-    trajectories: List[Trajectory],
-    model,
-    tokenizer,
-    optimizer,
-    n_epochs: int = 4,
-    clip_epsilon: float = 0.2,
-    entropy_coef: float = 0.01,
-    gamma: float = 0.99,
-) -> Dict[str, float]:
     model.train()
-    all_states = []
-    all_actions = []
-    all_old_logprobs = []
-    all_advantages = []
     all_returns = []
     for traj in trajectories:
-        advantages, returns = compute_returns_and_advantages(
-            traj.rewards, traj.dones, gamma=gamma, standardize=True
-        )
-        all_states.extend(traj.states)
-        all_actions.extend(traj.actions)
-        all_old_logprobs.extend(traj.logprobs)
-        all_advantages.extend(advantages)
-        all_returns.extend(returns)
-    n_samples = len(all_states)
-    total_loss = 0.0
-    total_policy_loss = 0.0
-    total_entropy = 0.0
-    n_updates = 0
-    for epoch in range(n_epochs):
-        indices = np.random.permutation(n_samples)
-        for i in indices:
-            state = all_states[i]
-            action = all_actions[i]
-            old_logprob = all_old_logprobs[i]
-            advantage = all_advantages[i]
-            # Use the same chat template for PPO update
-            messages = [{"role": "user", "content": state}]
-            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            full_text = formatted + action
-            inputs = tokenizer(full_text, return_tensors="pt", max_length=768, truncation=True).to("cuda")
-            outputs = model(**inputs)
-            logits = outputs.logits
-            action_ids = tokenizer.encode(action, add_special_tokens=False)
-            prefix_ids = tokenizer.encode(formatted, add_special_tokens=False)
-            action_start = len(prefix_ids)
-            logprobs = []
-            entropy = 0.0
-            for idx, token_id in enumerate(action_ids):
-                position = action_start + idx - 1
-                if 0 <= position < logits.shape[1]:
-                    token_logits = logits[0, position]
-                    log_probs = F.log_softmax(token_logits, dim=-1)
-                    token_logprob = log_probs[token_id]
-                    logprobs.append(token_logprob)
-                    probs = F.softmax(token_logits, dim=-1)
-                    entropy += -(probs * log_probs).sum()
-            if not logprobs:
                 continue
-            new_logprob = sum(logprobs)
-            avg_entropy = entropy / len(logprobs) if logprobs else 0.0
-            ratio = torch.exp(new_logprob - old_logprob)
-            surr1 = ratio * advantage
-            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
-            policy_loss = -torch.min(surr1, surr2)
-            loss = policy_loss - entropy_coef * avg_entropy
             optimizer.zero_grad()
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             optimizer.step()
-            total_loss += loss.item()
-            total_policy_loss += policy_loss.item()
-            total_entropy += avg_entropy.item()
-            n_updates += 1
-    return {
-        "loss": total_loss / n_updates if n_updates > 0 else 0.0,
-        "policy_loss": total_policy_loss / n_updates if n_updates > 0 else 0.0,
-        "entropy": total_entropy / n_updates if n_updates > 0 else 0.0,
-    }
-# ======================================================================
-# 12. EVALUATION (unchanged)
-# ======================================================================
-def evaluate_policy(
-    env: CodeReviewEnv,
-    model,
-    tokenizer,
-    n_episodes: int = 10,
-    max_steps: int = 10
-) -> Dict[str, float]:
-    model.eval()
-    total_rewards = []
-    episode_lengths = []
-    success_count = 0
-    for _ in range(n_episodes):
-        traj = collect_trajectory(env, model, tokenizer, max_steps, temperature=0.0)
-        total_reward = sum(traj.rewards)
-        total_rewards.append(total_reward)
-        episode_lengths.append(len(traj))
-        if total_reward > 0.5:
-            success_count += 1
-    return {
-        "avg_reward": np.mean(total_rewards),
-        "std_reward": np.std(total_rewards),
-        "avg_length": np.mean(episode_lengths),
-        "success_rate": success_count / n_episodes,
-    }
-# ======================================================================
-# 13. MAIN TRAINING LOOP (added sanity check and warm-up)
-# ======================================================================
-def train_ppo(
-    n_iterations: int = 50,
-    trajectories_per_iter: int = 10,
-    n_epochs: int = 2,
-    max_steps: int = 10,
-    learning_rate: float = 3e-5,
-    clip_epsilon: float = 0.2,
-    entropy_coef: float = 0.01,
-    gamma: float = 0.99,
-    eval_every: int = 5,
-    task_levels: Optional[List[str]] = None,
-    curriculum_weighted_sampling: bool = True,
-    reward_profile: str = "full",
-):
-    print("Loading model...")
     model, tokenizer = load_model()
-    # NEW: Sanity check before any training
-    if not test_model_sanity(model, tokenizer):
-        print("\n❌ Model sanity check failed – cannot proceed.")
-        return
-    # NEW: Supervised warm-up to teach JSON format (500 steps with epochs=8)
-    supervised_warmup(model, tokenizer, n_examples=500, epochs=8)
-    optimizer = AdamW(model.parameters(), lr=learning_rate)
     env = CodeReviewEnv()
-    if task_levels is None:
-        task_levels = list(BUG_DB.keys())
-    print(f"\n{'='*60}")
-    print(f"Starting PPO Training")
-    print(f"Iterations: {n_iterations}")
-    print(f"Trajectories per iteration: {trajectories_per_iter}")
-    print(f"PPO epochs: {n_epochs}")
-    print(f"Reward profile: {reward_profile}")
-    print(f"{'='*60}\n")
-    reward_history: List[float] = []
-    loss_history: List[float] = []
-    for iteration in range(n_iterations):
-        print(f"\n--- Iteration {iteration + 1}/{n_iterations} ---")
-        # Optional weighted curriculum:
-        # start with easier tasks and smoothly ramp difficulty over training.
-        if curriculum_weighted_sampling:
-            progress = (iteration + 1) / max(n_iterations, 1)
-            easy_w = max(0.15, 0.55 - 0.40 * progress)
-            medium_w = max(0.15, 0.25 - 0.10 * progress)
-            hard_w = 0.10 + 0.05 * progress
-            harder_w = 0.05 + 0.20 * progress
-            hardest_w = 0.05 + 0.25 * progress
-            task_weight_map = {
-                "easy": easy_w,
-                "medium": medium_w,
-                "hard": hard_w,
-                "harder": harder_w,
-                "hardest": hardest_w,
-            }
-            task_weights = [task_weight_map.get(level, 1.0) for level in task_levels]
-        else:
-            task_weights = None
-        print("Collecting trajectories...")
-        trajectories = collect_trajectories(
-            env,
-            model,
-            tokenizer,
-            trajectories_per_iter,
-            max_steps,
-            task_levels=task_levels,
-            task_weights=task_weights,
-        )
-        avg_reward = np.mean([sum(t.rewards) for t in trajectories])
-        avg_length = np.mean([len(t) for t in trajectories])
-        reward_history.append(float(avg_reward))
-        print(f"Avg reward: {avg_reward:.3f}")
-        print(f"Avg length: {avg_length:.1f}")
-        print("Updating policy...")
-        metrics = ppo_update(
-            trajectories,
-            model,
-            tokenizer,
-            optimizer,
-            n_epochs=n_epochs,
-            clip_epsilon=clip_epsilon,
-            entropy_coef=entropy_coef,
-            gamma=gamma,
-        )
-        print(f"Loss: {metrics['loss']:.4f}")
-        print(f"Policy loss: {metrics['policy_loss']:.4f}")
-        print(f"Entropy: {metrics['entropy']:.4f}")
-        loss_history.append(float(metrics["loss"]))
-        if (iteration + 1) % eval_every == 0:
-            print("\nEvaluating policy...")
-            eval_metrics = evaluate_policy(env, model, tokenizer, n_episodes=10)
-            print(f"Eval avg reward: {eval_metrics['avg_reward']:.3f} ± {eval_metrics['std_reward']:.3f}")
-            print(f"Eval success rate: {eval_metrics['success_rate']:.2%}")
-            print(f"Eval avg length: {eval_metrics['avg_length']:.1f}")
     print("\n" + "="*60)
-    print("Training complete. Saving model...")
-    model.save_pretrained("ppo_final_model")
-    tokenizer.save_pretrained("ppo_final_model")
-    print("Model saved to ppo_final_model/")
-    # Save training curves for quick before/after comparisons.
-    if reward_history:
-        plt.figure(figsize=(8, 4))
-        plt.plot(range(1, len(reward_history) + 1), reward_history, marker="o")
-        plt.title("Average Reward per Iteration")
-        plt.xlabel("Iteration")
-        plt.ylabel("Average Reward")
-        plt.grid(alpha=0.3)
-        plt.tight_layout()
-        plt.savefig("reward_curve.png", dpi=150)
-        plt.close()
-    if loss_history:
-        plt.figure(figsize=(8, 4))
-        plt.plot(range(1, len(loss_history) + 1), loss_history, marker="o", color="tab:red")
-        plt.title("Training Loss per Iteration")
-        plt.xlabel("Iteration")
-        plt.ylabel("Loss")
-        plt.grid(alpha=0.3)
-        plt.tight_layout()
-        plt.savefig("loss_curve.png", dpi=150)
-        plt.close()
-    if os.path.exists("reward_curve.png") and os.path.exists("loss_curve.png"):
-        print("Saved reward_curve.png and loss_curve.png")
     print("="*60)
-# ======================================================================
-# 14. ENTRY POINT (unchanged)
-# ======================================================================
 if __name__ == "__main__":
-    train_ppo(
-        n_iterations=30,
-        trajectories_per_iter=10,
-        n_epochs=4,
-        max_steps=10,
-        learning_rate=3e-5,
-        clip_epsilon=0.2,
-        entropy_coef=0.01,
-        gamma=0.99,
-        eval_every=5,
-    )

+# training.py  –  PPO + QLoRA + Supervised Warm-up
+# Model : Qwen/Qwen2.5-1.5B-Instruct  (via Unsloth – 2× faster, fits Colab T4)
+# Fixed : label-masking, BPE-boundary alignment, log-ratio clamping, OOM guards
+# Evidence: reward curves, before/after traces, per-difficulty breakdown, KL, entropy
+# ============================================================
+import os, json, random, re
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
 import torch
 import torch.nn.functional as F
 from torch.optim import AdamW
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict
+from collections import Counter, defaultdict
 import numpy as np
+# ── Unsloth gives 2× throughput with identical outputs ────────────────────────
 from unsloth import FastLanguageModel
 from environment import CodeReviewEnv
 from redteam import BUG_DB
+# Resolve `model_map_to_env(action_type, content=None)` from the best source
+# available, in order of preference:
+#   1. the project's own `models.map_to_env`;
+#   2. an inline dispatcher built on the `models` action classes;
+#   3. a duck-typed stand-in when `models` cannot be imported at all.
+try:
+    from models import map_to_env as model_map_to_env
+    _HAVE_MODEL_MAP = True
+except (ImportError, AttributeError):
+    _HAVE_MODEL_MAP = False
+if not _HAVE_MODEL_MAP:
+    try:
+        from models import (RunTests, RunLinter, Inspect, ProposeFix,
+                            WriteComment, AskQuestion, Done, Skip, QueryDocs)
+        # One factory per action type, so only the requested action is ever
+        # instantiated: a constructor that rejects an empty payload can no
+        # longer break unrelated actions. Payloads are passed by keyword,
+        # using the field names the earlier map_to_env in this file used
+        # (fix_code / comment_text / question / query_topic), so the calls
+        # also work if the action classes only accept keyword arguments.
+        _ACTION_FACTORIES = {
+            "run_tests":  lambda content: RunTests(),
+            "run_linter": lambda content: RunLinter(),
+            "inspect":    lambda content: Inspect(),
+            "query_docs": lambda content: QueryDocs(query_topic=content or "python bug fix"),
+            "fix":        lambda content: ProposeFix(fix_code=content or ""),
+            "comment":    lambda content: WriteComment(comment_text=content or ""),
+            "question":   lambda content: AskQuestion(question=content or ""),
+            "done":       lambda content: Done(),
+        }
+        def model_map_to_env(action_type: str, content=None):
+            """Build the env action for `action_type`; unknown types become Skip()."""
+            factory = _ACTION_FACTORIES.get(action_type)
+            return factory(content) if factory is not None else Skip()
+    except ImportError:
+        # Last resort: duck-typed object the env can introspect.
+        class _EnvAction:
+            """Attribute bag exposing whatever keyword arguments it was given."""
+            def __init__(self, **kw):
+                self.__dict__.update(kw)
+        def model_map_to_env(action_type: str, content=None):
+            """Wrap the raw action type and content in an `_EnvAction`."""
+            return _EnvAction(action_type=action_type, content=content)
+# ══════════════════════════════════════════════════════════════════════════════
+# CONFIG
+# ══════════════════════════════════════════════════════════════════════════════
# Single source of truth for every hyper-parameter used in this script.
CFG = {
    # ── Model / LoRA ─────────────────────────────────────────────────────────
    "model_name": "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
    # Hard cap on tokenised sequence length (author's note: sized to avoid
    # OOM on a T4).
    "max_seq_len": 512,
    "lora_r": 16,
    "lora_alpha": 32,
    # ── Supervised warm-up ───────────────────────────────────────────────────
    "warmup_data": "training_data.json",
    "warmup_epochs": 2,
    "warmup_lr": 2e-5,
    # One example per micro-step, so the effective batch is 4 examples.
    "warmup_grad_acc": 4,
    # ── PPO ──────────────────────────────────────────────────────────────────
    "ppo_iters": 15,
    "trajs_per_iter": 6,
    "max_steps": 7,
    "ppo_lr": 3e-5,
    "clip_eps": 0.2,
    "entropy_coef": 0.01,
    "gamma": 0.99,
    # Bound on |log-ratio| before exp(); keeps the PPO ratio finite.
    "log_ratio_clamp": 5.0,
    "temp_start": 0.8,
    # NOTE: not read by train() in this file, which floors temperature at 0.35.
    "temp_end": 0.1,
    # ── Evaluation ───────────────────────────────────────────────────────────
    # Episodes per evaluation snapshot.
    "eval_episodes": 10,
}
# Use the GPU whenever torch reports one; otherwise everything runs on the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
# Task identifiers are the keys of BUG_DB, in that mapping's iteration order
# (the original author lists them as easy, medium, hard, harder, hardest).
TASK_LEVELS = [*BUG_DB.keys()]
# ══════════════════════════════════════════════════════════════════════════════
# DATA STRUCTURES
# ══════════════════════════════════════════════════════════════════════════════
@dataclass
class AgentAction:
    """One parsed agent decision: an action keyword plus an optional payload."""

    # Action keyword such as "inspect", "fix", "done" or "skip".
    action_type: str
    # Free-form payload (fix code, comment text, ...); None when absent.
    content: Optional[str] = field(default=None)
@dataclass
class Trajectory:
    """Step-aligned record of one episode; index i of each list is step i."""

    # Prompt shown to the model at each step.
    states: List[str]
    # Raw text the model produced at each step.
    actions: List[str]
    # Shaped, clipped reward credited to each step.
    rewards: List[float]
    # Log-probability recorded for each action at collection time.
    logprobs: List[float]
    # True on the step that ended the episode.
    dones: List[bool]
    # Task identifier the episode was run on.
    task: str = field(default="")
@dataclass
class EvalSnapshot:
    """Aggregate metrics from one evaluation pass, kept for before/after comparison."""

    # Mean episode reward over all evaluated episodes.
    avg_reward: float
    # Mean episode reward keyed by task identifier.
    per_task: Dict[str, float] = field(default_factory=dict)
    # Fraction of all emitted actions, keyed by action type.
    action_dist: Dict[str, float] = field(default_factory=dict)
    # Fraction of episodes counted as successful.
    success_rate: float = field(default=0.0)
    # Mean number of steps per episode.
    avg_steps: float = field(default=0.0)
    # One summary dict per evaluated episode.
    traces: List[dict] = field(default_factory=list)
+# ══════════════════════════════════════════════════════════════════════════════
+# ACTION PARSER
+# ══════════════════════════════════════════════════════════════════════════════
def parse_action(text: str) -> AgentAction:
    """Turn raw model output into an AgentAction without raising on odd JSON.

    Strategies, tried in order:
      1. the whole text parsed as a JSON object;
      2. the first JSON object embedded in the text (markdown fences, chatter
         before or after the object);
      3. regex extraction of the "action_type" / "content" fields;
      4. the first known action keyword that occurs anywhere in the text.
    If nothing matches, ``AgentAction("skip")`` is returned.

    Valid JSON that is not an object (a bare string, list or number) and a
    non-string "action_type" no longer raise AttributeError: the former falls
    through to the later strategies, the latter is treated as "skip".
    """
    text = text.strip()

    def _from_mapping(data: dict) -> AgentAction:
        # A missing or non-string action type degrades to "skip".
        kind = data.get("action_type", "skip")
        if not isinstance(kind, str):
            kind = "skip"
        return AgentAction(kind.lower(), data.get("content"))

    # 1) Strict JSON covering the whole output.
    try:
        parsed = json.loads(text)
    except ValueError:          # json.JSONDecodeError is a ValueError subclass
        parsed = None
    if isinstance(parsed, dict):
        return _from_mapping(parsed)

    # 2) A JSON object surrounded by other text.  raw_decode stops at the end
    #    of the first complete value, so trailing chatter is ignored.
    brace = text.find("{")
    if brace != -1:
        try:
            embedded, _ = json.JSONDecoder().raw_decode(text, brace)
        except ValueError:
            embedded = None
        if isinstance(embedded, dict) and "action_type" in embedded:
            return _from_mapping(embedded)

    # 3) Malformed JSON: pull the two fields out with regexes.  The content
    #    pattern skips over backslash-escaped characters so an escaped quote
    #    inside the payload does not end the match early.
    m = re.search(r'"action_type"\s*:\s*"(\w+)"', text)
    if m:
        cm = re.search(r'"content"\s*:\s*"((?:\\.|[^"\\])*)"', text, re.DOTALL)
        return AgentAction(m.group(1).lower(), cm.group(1) if cm else None)

    # 4) Keyword heuristic: plain substring search, first keyword listed wins.
    lowered = text.lower()
    for kw in ("run_tests", "run_linter", "inspect", "query_docs", "fix",
               "comment", "question", "done"):
        if kw in lowered:
            return AgentAction(kw)
    return AgentAction("skip")
 def map_to_env(action: AgentAction):
     """Convert a parsed AgentAction into the object handed to ``env.step``.

     Delegates to ``model_map_to_env`` with the action's type and payload.
     """
     kind, payload = action.action_type, action.content
     return model_map_to_env(kind, payload)
# ══════════════════════════════════════════════════════════════════════════════
# MODEL  (Qwen2.5-1.5B via Unsloth)
# ══════════════════════════════════════════════════════════════════════════════
def load_model():
    """Load the 4-bit base model named in CFG and attach LoRA adapters.

    Returns:
        (model, tokenizer): the adapter-wrapped model and its tokenizer, with
        the tokenizer's pad token set to its EOS token.
    """
    model_name = CFG["model_name"]
    print(f"Loading {model_name} …")

    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=CFG["max_seq_len"],
        load_in_4bit=True,
    )

    # Attention and MLP projection layers that receive LoRA adapters.
    lora_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    model = FastLanguageModel.get_peft_model(
        base_model,
        r=CFG["lora_r"],
        lora_alpha=CFG["lora_alpha"],
        target_modules=lora_targets,
        lora_dropout=0.0,
    )

    tokenizer.pad_token = tokenizer.eos_token

    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  trainable params: {n_trainable/1e6:.1f}M")
    return model, tokenizer
+# ══════════════════════════════════════════════════════════════════════════════
+# PROMPT BUILDER
+# ══════════════════════════════════════════════════════════════════════════════
def build_prompt(obs, history_lines: List[str]) -> str:
    """Assemble the instruction prompt for the next agent step.

    Args:
        obs: observation object; ``code_snippet`` is required, while
            ``author_response``, ``last_tool_output`` and
            ``author_personality`` are optional attributes.
        history_lines: transcript lines so far; only the last four are shown.

    Returns:
        The prompt text, with sections separated by blank lines.
    """
    author_msg = getattr(obs, "author_response", "") or ""
    tool_output = getattr(obs, "last_tool_output", "") or ""
    personality = getattr(obs, "author_personality", "defensive")

    # Cap the tool output so one noisy tool run cannot crowd out the prompt.
    max_tool_chars = 600
    if len(tool_output) > max_tool_chars:
        tool_output = tool_output[:max_tool_chars] + " …[truncated]"

    shown_author = author_msg if author_msg else "(no response yet – start with inspect)"
    shown_tool = tool_output if tool_output else "(none)"

    sections = [
        "You are an AI code review agent. Convince the developer (personality: "
        f"**{personality}**) to accept your fix. Name your fix function `fix`.",
        "Evidence required: tests pass, lint clean, docs cited, reasoning uses "
        "'because'/'therefore' (>30 words).",
        "Workflow: inspect → run_tests → run_linter → query_docs → fix → "
        "comment/question → done.",
        f"Code:\n{obs.code_snippet}",
        f"Author: {shown_author}",
        f"Last tool: {shown_tool}",
        "Actions: run_tests, run_linter, inspect, query_docs, fix, comment, question, done",
        'Respond ONLY in JSON: {"action_type": "...", "content": "..."}',
    ]
    if history_lines:
        recent = "\n".join(history_lines[-4:])
        sections.append("Recent steps:\n" + recent)
    return "\n\n".join(sections)
+# ══════════════════════════════════════════════════════════════════════════════
+# BUG FIX 1 – label masking in supervised warmup
+# (original: labels=inputs["input_ids"] trains on ALL tokens, including prompt)
+# ══════════════════════════════════════════════════════════════════════════════
def _masked_labels(input_ids: torch.Tensor, prompt_len: int) -> torch.Tensor:
    """Return a copy of `input_ids` usable as labels, with the prompt masked out.

    The first `prompt_len` positions along the last dimension are set to -100
    so the cross-entropy loss ignores them.  The mask is applied to every row,
    not just row 0, so a batch that shares one prompt length is handled too.

    Args:
        input_ids: token ids, laid out as [..., seq_len].
        prompt_len: number of leading prompt tokens; values below 0 are
            treated as 0 (nothing masked) and values beyond seq_len mask
            the whole sequence.

    Returns:
        A new tensor; `input_ids` itself is left untouched.
    """
    labels = input_ids.clone()
    n_masked = max(int(prompt_len), 0)   # a negative slice end would mask the wrong span
    labels[..., :n_masked] = -100
    return labels
+# ══════════════════════════════════════════════════════════════════════════════
+# BUG FIX 2 – BPE-boundary-safe logprob computation
+# (original: tokenize(prompt) + tokenize(action) ≠ tokenize(prompt+action))
+# ══════════════════════════════════════════════════════════════════════════════
def _compute_action_logprob(
    logits:      torch.Tensor,   # [1, seq_len, vocab]
    input_ids:   torch.Tensor,   # [1, seq_len]
    prompt_len:  int,            # #tokens in the prompt part of the joint sequence
) -> tuple:
    """
    Sum the log-probabilities of the *action* tokens of a jointly tokenised
    prompt+action sequence.

    The logit at position p-1 predicts the token at position p, so action
    tokens at positions [prompt_len, seq_len) are scored with the logits at
    [prompt_len-1, seq_len-1).  A token at position 0 has no predictor and is
    never scored.

    All tensors are created on ``logits.device`` (not a global device), and
    the positions are scored in one vectorised pass computed in float32.

    Returns (total_logprob, avg_entropy, n_tokens):
      total_logprob – 0-dim tensor, differentiable w.r.t. ``logits``;
      avg_entropy   – 0-dim tensor, mean per-token entropy, detached
                      (it carries no gradient);
      n_tokens      – number of action tokens actually scored (0 if none).
    """
    device = logits.device
    first = max(int(prompt_len), 1)                        # position 0 is unpredictable
    last = min(input_ids.shape[1], logits.shape[1] + 1)    # need a logit at pos-1
    n_scored = last - first
    if n_scored <= 0:
        zero = torch.zeros((), device=device)
        return zero, zero.clone(), 0

    targets = input_ids[0, first:last].to(device)
    log_dist = F.log_softmax(logits[0, first - 1:last - 1], dim=-1,
                             dtype=torch.float32)          # [n_scored, vocab]
    total_lp = log_dist.gather(-1, targets.unsqueeze(-1)).sum()

    # Entropy is a diagnostic only; skip graph construction to save memory.
    with torch.no_grad():
        token_entropy = -(log_dist.exp() * log_dist).sum(dim=-1)
        avg_entropy = token_entropy.mean()
    return total_lp, avg_entropy, n_scored
+# ══════════════════════════════════════════════════════════════════════════════
+# GENERATION  (returns text + joint-sequence logprob)
+# ══════════════════════════════════════════════════════════════════════════════
@torch.no_grad()
def generate_action(prompt: str, model, tokenizer,
                    temperature: float) -> tuple:
    """Sample one action from the model and return ``(text, logprob)``.

    Args:
        prompt: user-turn content; it is wrapped with the tokenizer's chat
            template with the generation prompt appended.
        model, tokenizer: the policy and its tokenizer.
        temperature: sampling temperature; values <= 0 select greedy decoding.

    Returns:
        (text, logprob): the decoded, stripped action text and the summed
        log-probability of its tokens as a Python float.  When generation
        decodes to an empty string, a random fallback action is returned with
        a fixed log-probability of -10.0.

    The log-probability is computed by re-tokenising ``chat + text`` with the
    same recipe ``ppo_update`` uses to re-score the action (same truncation
    lengths, same prompt-length rule).  Scoring the raw generated ids instead
    would include the EOS token and any whitespace removed by ``strip()``, so
    the stored "old" value would refer to a different token sequence than the
    "new" value and the PPO ratio would be off before any update.
    """
    max_new_tokens = 128
    chat = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False, add_generation_prompt=True,
    )
    inputs = tokenizer(
        chat, return_tensors="pt",
        max_length=CFG["max_seq_len"] - max_new_tokens,   # leave room for response
        truncation=True,
    ).to(DEVICE)
    prompt_len = inputs["input_ids"].shape[1]

    # output_scores is deliberately not requested: the per-step scores were
    # never read and only held one vocab-sized tensor per generated token.
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        return_dict_in_generate=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if temperature > 0:
        gen_kwargs["temperature"] = temperature

    out = model.generate(**inputs, **gen_kwargs)
    gen_ids = out.sequences[0][prompt_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    if not text:
        fallback = random.choice([
            '{"action_type":"inspect"}',
            '{"action_type":"run_tests"}',
            '{"action_type":"run_linter"}',
        ])
        print(f"  [WARN] empty generation → fallback {fallback}")
        # A mildly negative value (not a -100 sentinel) keeps the PPO ratio
        # exp(new - old) finite when the action is re-evaluated.
        return fallback, -10.0

    # Score the decoded text exactly the way ppo_update will re-score it.
    scored = tokenizer(
        chat + text, return_tensors="pt",
        max_length=CFG["max_seq_len"], truncation=True,
    ).to(DEVICE)
    prompt_only_ids = tokenizer(
        chat, return_tensors="pt",
        max_length=CFG["max_seq_len"] - 10, truncation=True,
    )["input_ids"]
    scored_ids = scored["input_ids"]
    score_prompt_len = min(prompt_only_ids.shape[1], scored_ids.shape[1] - 1)

    logits = model(**scored).logits
    lp, _, _ = _compute_action_logprob(logits, scored_ids, score_prompt_len)
    return text, lp.item()
+# ══════════════════════════════════════════════════════════════════════════════
+# TRAJECTORY COLLECTION
+# ══════════════════════════════════════════════════════════════════════════════
# Shaped reward credited for each intermediate action type.  The spread gives
# trajectories with real tool use a higher return than inspect-only episodes.
_STEP_REWARD = dict(
    run_tests=0.08,
    run_linter=0.05,
    fix=0.15,
    comment=0.08,
    query_docs=0.05,
    question=0.04,
    inspect=0.00,    # neutral: observing before acting is neither good nor bad
    done=0.00,       # the terminal reward comes from the env instead
    skip=-0.10,      # doing nothing is penalised
)
def collect_trajectory(env, model, tokenizer,
                       max_steps: int, temperature: float,
                       task: str) -> tuple:
    """Roll out one episode on `task` and record it as a Trajectory.

    The env's own done flag is ignored on purpose.  Per the original author's
    note, the env may end an episode (with a tiny positive reward) after a
    single non-terminal action such as inspect, which makes every trajectory
    identical and leaves PPO with zero advantage.  Therefore:
      * only an explicit "done" action ends the episode, and only then is the
        env's reward used as-is;
      * every other action earns its `_STEP_REWARD` entry, plus 30% of the env
        reward when that reward exceeds 0.1.
    Each recorded reward is clipped to [-1, 1].

    Args:
        env: environment exposing set_task / reset / step.
        model, tokenizer: policy used by `generate_action`.
        max_steps: upper bound on steps in the episode.
        temperature: sampling temperature passed to `generate_action`.
        task: task identifier passed to `env.set_task`.

    Returns:
        (trajectory, action_types): the Trajectory and the list of parsed
        action-type strings, one per step.
    """
    env.set_task(task)
    obs = env.reset()
    history: List[str] = []
    traj = Trajectory([], [], [], [], [], task=task)
    action_seq: List[str] = []

    for _ in range(max_steps):
        prompt = build_prompt(obs, history)
        text, lp = generate_action(prompt, model, tokenizer, temperature)
        action = parse_action(text)

        # The env's done flag and info payload are intentionally unused.
        obs, reward, _env_done, _info = env.step(map_to_env(action))
        raw_r = float(reward.value)

        finished = action.action_type == "done"
        if finished:
            # Agent explicitly chose to terminate: honour the env reward.
            shaped_r = raw_r
        else:
            shaped_r = _STEP_REWARD.get(action.action_type, 0.0)
            if raw_r > 0.1:            # env signalling meaningful progress
                shaped_r += raw_r * 0.3

        traj.states.append(prompt)
        traj.actions.append(text)
        traj.logprobs.append(lp)
        traj.rewards.append(float(np.clip(shaped_r, -1.0, 1.0)))
        traj.dones.append(finished)
        action_seq.append(action.action_type)

        # Same tolerant attribute access as build_prompt: a missing or None
        # tool output becomes an empty string instead of raising.
        tool_out = getattr(obs, "last_tool_output", "") or ""
        history.append(f"Agent: {text[:120]}")
        history.append(f"Env: {tool_out[:120]}")

        if finished:
            break
    return traj, action_seq
+# ══════════════════════════════════════════════════════════════════════════════
+# SUPERVISED WARM-UP  (BUG FIX 1: action-only label masking)
+# ══════════════════════════════════════════════════════════════════════════════
def supervised_warmup(model, tokenizer):
    """Supervised fine-tuning pass over the warm-up data, with action-only loss.

    Reads the JSON file named by CFG["warmup_data"]; each example is read via
    its "prompt" and "action" keys.  Prompt tokens are masked out of the loss,
    so only the assistant's action tokens are trained.

    Gradient accumulation counts *valid* examples.  Examples skipped because
    truncation removed the whole action no longer shift the accumulation
    boundary, and a partial group left at the end of an epoch is applied
    instead of being silently discarded.

    Returns:
        List with the mean (unscaled) loss of each epoch.
    """
    print("\n" + "="*60)
    print("SUPERVISED WARM-UP")
    print("="*60)
    with open(CFG["warmup_data"], encoding="utf-8") as f:
        data = json.load(f)

    opt = AdamW(model.parameters(), lr=CFG["warmup_lr"])
    grad_acc = CFG["warmup_grad_acc"]
    max_len = CFG["max_seq_len"]
    model.train()
    loss_history = []

    def _apply_accumulated_grads():
        # Clip, step and reset: shared by full groups and the epoch-end flush.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        opt.zero_grad()

    for epoch in range(CFG["warmup_epochs"]):
        random.shuffle(data)
        epoch_loss, n_valid, pending = 0.0, 0, 0
        opt.zero_grad()
        for step, ex in enumerate(data):
            # Render the prompt alone and the prompt+action conversation; the
            # prompt's token count marks where the action starts.
            prompt_chat = tokenizer.apply_chat_template(
                [{"role": "user", "content": ex["prompt"]}],
                tokenize=False, add_generation_prompt=True
            )
            full_chat = tokenizer.apply_chat_template(
                [{"role": "user",      "content": ex["prompt"]},
                 {"role": "assistant", "content": ex["action"]}],
                tokenize=False
            )
            prompt_ids = tokenizer(
                prompt_chat, return_tensors="pt",
                max_length=max_len, truncation=True
            )["input_ids"]
            full_inputs = tokenizer(
                full_chat, return_tensors="pt",
                max_length=max_len, truncation=True
            ).to(DEVICE)
            prompt_len = prompt_ids.shape[1]
            if prompt_len >= full_inputs["input_ids"].shape[1]:
                continue  # action got truncated away

            # Mask prompt tokens so the loss is action-only.
            labels = _masked_labels(full_inputs["input_ids"], prompt_len)
            out = model(**full_inputs, labels=labels)
            loss = out.loss / grad_acc
            loss.backward()

            pending += 1
            if pending == grad_acc:
                _apply_accumulated_grads()
                pending = 0

            epoch_loss += loss.item() * grad_acc
            n_valid += 1
            if (step + 1) % 50 == 0:
                print(f"  epoch {epoch+1}  step {step+1}/{len(data)}"
                      f"  loss={epoch_loss/n_valid:.4f}")

        # Flush the trailing partial group.  Its gradient is still divided by
        # grad_acc, so this last step is slightly smaller than a full one.
        if pending:
            _apply_accumulated_grads()

        avg = epoch_loss / max(n_valid, 1)
        loss_history.append(avg)
        print(f"  Epoch {epoch+1} complete: avg_loss={avg:.4f}")

    torch.cuda.empty_cache()
    print(f"✓ Warm-up done. Loss: {' → '.join(f'{l:.4f}' for l in loss_history)}\n")
    return loss_history
+# ══════════════════════════════════════════════════════════════════════════════
+# EVALUATION  (produces rich EvalSnapshot for comparison plots)
+# ══════════════════════════════════════════════════════════════════════════════
@torch.no_grad()
def evaluate(env, model, tokenizer, label: str = "") -> EvalSnapshot:
    """Run CFG["eval_episodes"] greedy episodes and summarise them.

    Tasks are taken from TASK_LEVELS in round-robin order.  An episode counts
    as a success only when the agent explicitly emitted a "done" action.  The
    model is put in eval mode for the rollouts and in train mode afterwards.

    Args:
        env, model, tokenizer: forwarded to `collect_trajectory`.
        label: when non-empty, a summary is printed under this heading.

    Returns:
        EvalSnapshot with the aggregate metrics and one trace per episode.
    """
    model.eval()

    rewards_by_task: Dict[str, List[float]] = defaultdict(list)
    action_tally: Counter = Counter()
    step_counts = []
    success_flags = []
    traces = []

    for episode in range(CFG["eval_episodes"]):
        task = TASK_LEVELS[episode % len(TASK_LEVELS)]
        # Temperature 0.0 → generate_action decodes greedily.
        traj, actions = collect_trajectory(
            env, model, tokenizer, CFG["max_steps"], 0.0, task
        )
        episode_reward = sum(traj.rewards)
        n_steps = len(traj.actions)

        rewards_by_task[task].append(episode_reward)
        action_tally.update(actions)
        step_counts.append(n_steps)
        # Success means an explicit "done"; a positive reward alone is not enough.
        success_flags.append(int("done" in actions))
        traces.append({"task": task, "reward": round(episode_reward, 4),
                       "steps": n_steps, "actions": actions})

    total_actions = max(sum(action_tally.values()), 1)

    flat_rewards = []
    for task_rewards in rewards_by_task.values():
        flat_rewards.extend(task_rewards)

    per_task_mean = {}
    for task_name, task_rewards in rewards_by_task.items():
        per_task_mean[task_name] = float(np.mean(task_rewards))

    action_share = {}
    for action_name, count in action_tally.most_common():
        action_share[action_name] = count / total_actions

    snap = EvalSnapshot(
        avg_reward=float(np.mean(flat_rewards)),
        per_task=per_task_mean,
        action_dist=action_share,
        success_rate=float(np.mean(success_flags)),
        avg_steps=float(np.mean(step_counts)),
        traces=traces,
    )

    if label:
        per_task_txt = "  ".join(f"{t}={v:+.3f}" for t, v in snap.per_task.items())
        top_actions = list(snap.action_dist.items())[:5]
        top_actions_txt = "  ".join(f"{a}={p:.0%}" for a, p in top_actions)
        print(f"\n── {label} ──")
        print(f"  avg_reward={snap.avg_reward:+.4f}  "
              f"success={snap.success_rate:.0%}  steps={snap.avg_steps:.1f}")
        print("  per-task: " + per_task_txt)
        print("  top actions: " + top_actions_txt)

    model.train()
    return snap
+# ══════════════════════════════════════════════════════════════════════════════
+# PPO UPDATE  (BUG FIX 2 + 3: BPE-safe logprob + log-ratio clamping)
+# ══════════════════════════════════════════════════════════════════════════════
def ppo_update(trajectories: List[Trajectory],
               model, tokenizer, optimizer) -> dict:
    """Run one clipped-PPO pass over the collected trajectories.

    Steps:
      1. compute discounted returns per step (a done flag resets the return);
      2. normalise all returns to zero mean / unit std and use them as
         advantages; if the std is below 1e-6 nothing is updated;
      3. for every (state, action) pair, re-score the action under the current
         policy on the jointly tokenised prompt+action, form the clamped
         log-ratio against the stored log-prob, and take one optimizer step on
         the clipped surrogate loss.

    Args:
        trajectories: episodes produced by `collect_trajectory`.
        model, tokenizer: the policy being trained and its tokenizer.
        optimizer: optimizer over the model's parameters.

    Returns:
        dict with mean "loss", "kl" (mean of old_logprob - new_logprob) and
        "entropy" over the steps that were updated; all 0.0 when none were.
    """
    model.train()
    losses, kls, entropies = [], [], []
    gamma = CFG["gamma"]

    # ── Discounted returns, built back-to-front then reversed (O(n)) ─────────
    traj_returns: List[List[float]] = []
    all_returns: List[float] = []
    for traj in trajectories:
        ret, running = [], 0.0
        for r, done in zip(reversed(traj.rewards), reversed(traj.dones)):
            running = r + gamma * (0.0 if done else running)
            ret.append(running)
        ret.reverse()
        traj_returns.append(ret)
        all_returns.extend(ret)

    # ── Advantage normalisation ──────────────────────────────────────────────
    # Subtracting the global mean and dividing by the std turns even a small
    # spread in returns into a usable signal; with no spread at all there is
    # nothing to learn, so the update is skipped.
    ret_arr = np.array(all_returns) if all_returns else np.array([0.0])
    ret_mean = float(ret_arr.mean())
    ret_std = float(ret_arr.std())
    if ret_std < 1e-6:
        print("  [PPO] Zero return variance – skipping gradient update.")
        return dict(loss=0.0, kl=0.0, entropy=0.0)

    for traj, returns in zip(trajectories, traj_returns):
        for state, action, old_lp, ret_value in zip(
            traj.states, traj.actions, traj.logprobs, returns
        ):
            adv = (ret_value - ret_mean) / (ret_std + 1e-8)

            # Tokenise prompt+action jointly so token boundaries match.
            prompt_chat = tokenizer.apply_chat_template(
                [{"role": "user", "content": state}],
                tokenize=False, add_generation_prompt=True
            )
            full_text = prompt_chat + action
            full_ids = tokenizer(
                full_text, return_tensors="pt",
                max_length=CFG["max_seq_len"], truncation=True
            ).to(DEVICE)
            # Prompt length, capped so at least one action token remains.
            prompt_ids = tokenizer(
                prompt_chat, return_tensors="pt",
                max_length=CFG["max_seq_len"] - 10, truncation=True
            )["input_ids"]
            prompt_len = min(prompt_ids.shape[1], full_ids["input_ids"].shape[1] - 1)

            logits = model(**full_ids).logits
            new_lp, avg_ent, n_tokens = _compute_action_logprob(
                logits, full_ids["input_ids"], prompt_len
            )
            if n_tokens == 0:
                continue

            # Clamp the log-ratio before exp() so the ratio stays finite.
            old_lp_t = torch.tensor(old_lp, dtype=torch.float32, device=DEVICE)
            log_ratio = torch.clamp(new_lp - old_lp_t,
                                    -CFG["log_ratio_clamp"],
                                    CFG["log_ratio_clamp"])
            ratio = torch.exp(log_ratio)

            adv_t = torch.tensor(adv, dtype=torch.float32, device=DEVICE)
            unclipped = ratio * adv_t
            clipped = torch.clamp(ratio,
                                  1.0 - CFG["clip_eps"],
                                  1.0 + CFG["clip_eps"]) * adv_t
            policy_loss = -torch.min(unclipped, clipped)
            # NOTE(review): _compute_action_logprob returns a detached entropy,
            # so this term shifts the reported loss but adds no gradient.
            loss = policy_loss - CFG["entropy_coef"] * avg_ent
            if torch.isnan(loss) or torch.isinf(loss):
                continue

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            losses.append(loss.item())
            kls.append((old_lp_t - new_lp).detach().cpu().item())
            entropies.append(avg_ent.item())

    torch.cuda.empty_cache()
    return dict(
        loss=float(np.mean(losses)) if losses else 0.0,
        kl=float(np.mean(kls)) if kls else 0.0,
        entropy=float(np.mean(entropies)) if entropies else 0.0,
    )
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOTTING  (rich evidence panel)
+# ══════════════════════════════════════════════════════════════════════════════
def plot_all(warmup_losses, reward_hist, success_hist, kl_hist, entropy_hist,
             baseline_snap: EvalSnapshot,
             postwarmup_snap: EvalSnapshot,
             final_snap: EvalSnapshot):
    """Render the training evidence to two PNG files in the working directory.

    * training_summary.png – 2×3 grid: warm-up loss, PPO reward (raw plus
      moving average, with the three evaluation levels as reference lines),
      success rate, KL, entropy, and per-difficulty baseline-vs-final reward.
    * action_distribution.png – action mix for the baseline, post-warm-up and
      final snapshots.

    Args:
        warmup_losses: per-epoch warm-up losses.
        reward_hist, success_hist, kl_hist, entropy_hist: one value per PPO
            iteration, all of the same length.
        baseline_snap, postwarmup_snap, final_snap: evaluation snapshots.
    """
    iters = list(range(1, len(reward_hist) + 1))

    # ── Figure 1: training curves (2×3 grid) ─────────────────────────────────
    fig = plt.figure(figsize=(18, 10))
    gs = gridspec.GridSpec(2, 3, figure=fig, hspace=0.45, wspace=0.35)

    # (0,0) Warm-up loss
    ax = fig.add_subplot(gs[0, 0])
    ax.plot(range(1, len(warmup_losses) + 1), warmup_losses,
            marker="o", color="mediumpurple", linewidth=2)
    ax.set_title("A. Warm-up CE Loss ↓", fontweight="bold")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.grid(alpha=0.3)

    # (0,1) PPO reward
    ax = fig.add_subplot(gs[0, 1])
    # np.convolve(mode="same") returns max(len(a), len(v)) samples and rejects
    # empty input, so the window is capped at the history length.  Histories
    # of length >= 3 get the same 3-point average as before.
    window = min(3, len(reward_hist))
    if window:
        smooth = np.convolve(reward_hist, np.ones(window) / window, mode="same")
    else:
        smooth = np.array([])
    ax.plot(iters, reward_hist, alpha=0.35, color="steelblue", linewidth=1)
    ax.plot(iters, smooth, color="steelblue", linewidth=2.5, label="reward (smoothed)")
    ax.axhline(baseline_snap.avg_reward, color="gray", linestyle=":",
               label=f"pre-warmup ({baseline_snap.avg_reward:+.3f})")
    ax.axhline(postwarmup_snap.avg_reward, color="mediumpurple", linestyle="--",
               label=f"post-warmup ({postwarmup_snap.avg_reward:+.3f})")
    ax.axhline(final_snap.avg_reward, color="forestgreen", linestyle="-.",
               label=f"final ({final_snap.avg_reward:+.3f})")
    ax.set_title("B. PPO Reward ↑", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Avg Reward")
    ax.legend(fontsize=7)
    ax.grid(alpha=0.3)

    # (0,2) Success rate
    ax = fig.add_subplot(gs[0, 2])
    ax.plot(iters, success_hist, marker="s", color="seagreen", linewidth=2)
    ax.set_ylim(0, 1)
    ax.set_title("C. Episode Success Rate ↑", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Fraction")
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))
    ax.grid(alpha=0.3)

    # (1,0) KL divergence
    ax = fig.add_subplot(gs[1, 0])
    ax.plot(iters, kl_hist, marker="^", color="tomato", linewidth=2)
    ax.axhline(0, color="gray", linewidth=0.8)
    ax.set_title("D. KL Divergence", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("KL")
    ax.grid(alpha=0.3)

    # (1,1) Entropy
    ax = fig.add_subplot(gs[1, 1])
    ax.plot(iters, entropy_hist, marker="D", color="darkorange", linewidth=2)
    ax.set_title("E. Policy Entropy", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Entropy")
    ax.grid(alpha=0.3)

    # (1,2) Per-difficulty reward, baseline vs final (missing tasks plot as 0)
    ax = fig.add_subplot(gs[1, 2])
    tasks = TASK_LEVELS
    vals_base = [baseline_snap.per_task.get(t, 0) for t in tasks]
    vals_final = [final_snap.per_task.get(t, 0) for t in tasks]
    x = np.arange(len(tasks))
    ax.bar(x - 0.2, vals_base, 0.35, label="baseline", color="lightcoral", alpha=0.8)
    ax.bar(x + 0.2, vals_final, 0.35, label="final", color="steelblue", alpha=0.8)
    ax.set_xticks(x)
    ax.set_xticklabels(tasks, fontsize=8)
    ax.set_title("F. Per-Difficulty Reward", fontweight="bold")
    ax.set_ylabel("Avg Reward")
    ax.legend(fontsize=8)
    ax.grid(alpha=0.3, axis="y")
    ax.axhline(0, color="gray", linewidth=0.8)

    fig.suptitle("Code-Review Agent – Full Training Evidence  "
                 "(Qwen2.5-1.5B, PPO + QLoRA)",
                 fontsize=13, fontweight="bold")
    fig.savefig("training_summary.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("  Saved: training_summary.png")

    # ── Figure 2: before / after action distribution ─────────────────────────
    fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=False)
    panels = zip(
        axes,
        [baseline_snap, postwarmup_snap, final_snap],
        ["Before (baseline)", "After warm-up", "After PPO (final)"],
    )
    for ax, snap, title in panels:
        if snap.action_dist:
            names = list(snap.action_dist.keys())
            percents = [snap.action_dist[name] * 100 for name in names]
            bars = ax.barh(names, percents,
                           color=plt.cm.tab10(np.linspace(0, 0.8, len(names))))
            ax.bar_label(bars, fmt="%.0f%%", padding=3, fontsize=8)
        ax.set_xlim(0, 105)
        ax.set_title(title, fontweight="bold")
        ax.set_xlabel("% of actions")
        ax.grid(alpha=0.3, axis="x")
    fig.suptitle("Action Distribution: Before vs After Training",
                 fontsize=12, fontweight="bold")
    plt.tight_layout()
    fig.savefig("action_distribution.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("  Saved: action_distribution.png")
+# ══════════════════════════════════════════════════════════════════════════════
+# MAIN
+# ══════════════════════════════════════════════════════════════════════════════
def train():
    """Run the whole pipeline and print/plot the evidence.

    Phases:
      0. evaluate the freshly loaded model (baseline);
      1. supervised warm-up, then evaluate again;
      2. PPO for CFG["ppo_iters"] iterations of CFG["trajs_per_iter"]
         trajectories each, cycling through TASK_LEVELS;
      3. final evaluation, summary table, per-task traces and plots.

    The exploration temperature decays exponentially from CFG["temp_start"]
    and never drops below a floor.  Decay factor and floor default to 0.93
    and 0.35, and can be overridden with the optional CFG keys "temp_decay"
    and "temp_floor".
    """
    model, tokenizer = load_model()
    env = CodeReviewEnv()

    # ── PHASE 0: pre-warmup baseline ────────────────────────────────────────
    print("\n" + "="*60)
    print("PHASE 0 – BASELINE (untrained)")
    print("="*60)
    baseline_snap = evaluate(env, model, tokenizer, "Baseline")

    # ── PHASE 1: supervised warm-up ─────────────────────────────────────────
    warmup_losses = supervised_warmup(model, tokenizer)
    postwarmup_snap = evaluate(env, model, tokenizer, "Post-Warmup")

    # ── PHASE 2: PPO ────────────────────────────────────────────────────────
    optimizer = AdamW(model.parameters(), lr=CFG["ppo_lr"])
    reward_hist, success_hist, kl_hist, entropy_hist = [], [], [], []
    temp_decay = CFG.get("temp_decay", 0.93)
    temp_floor = CFG.get("temp_floor", 0.35)

    print("\n" + "="*60)
    print(f"PHASE 2 – PPO ({CFG['ppo_iters']} iterations × "
          f"{CFG['trajs_per_iter']} trajectories)")
    print("="*60)

    for it in range(CFG["ppo_iters"]):
        # Exponential decay with a floor: keeping the temperature well above
        # zero preserves trajectory diversity, which PPO needs for a signal.
        temperature = max(CFG["temp_start"] * (temp_decay ** it), temp_floor)
        print(f"\n── Iteration {it+1}/{CFG['ppo_iters']}  temp={temperature:.2f} ──")

        trajectories, action_counts = [], Counter()
        successes = 0
        for j in range(CFG["trajs_per_iter"]):
            task = TASK_LEVELS[j % len(TASK_LEVELS)]
            traj, actions = collect_trajectory(
                env, model, tokenizer, CFG["max_steps"], temperature, task
            )
            trajectories.append(traj)
            action_counts.update(actions)
            ep_r = sum(traj.rewards)
            # Same success rule as evaluate(): only an explicit "done" counts.
            successes += int("done" in actions)
            print(f"  traj {j+1}/{CFG['trajs_per_iter']}  task={task}"
                  f"  steps={len(traj.actions)}  reward={ep_r:+.3f}")

        avg_r = float(np.mean([sum(tr.rewards) for tr in trajectories]))
        success_r = successes / CFG["trajs_per_iter"]
        m = ppo_update(trajectories, model, tokenizer, optimizer)

        reward_hist.append(avg_r)
        success_hist.append(success_r)
        kl_hist.append(m["kl"])
        entropy_hist.append(m["entropy"])

        delta = avg_r - baseline_snap.avg_reward
        print(f"  → avg_reward={avg_r:+.4f}  Δbaseline={delta:+.4f}"
              f"  success={success_r:.0%}"
              f"  loss={m['loss']:.4f}  kl={m['kl']:.4f}  ent={m['entropy']:.4f}")
        print(f"  actions: {dict(action_counts.most_common(5))}")

    # ── PHASE 3: final evaluation ───────────────────────────────────────────
    print("\n" + "="*60)
    print("PHASE 3 – FINAL EVALUATION")
    print("="*60)
    final_snap = evaluate(env, model, tokenizer, "Final")

    # ── Summary table ───────────────────────────────────────────────────────
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)
    print(f"  {'Stage':<20} {'Reward':>10} {'Success':>10} {'Δ baseline':>12}")
    print(f"  {'-'*54}")
    for label, snap in [("Baseline",    baseline_snap),
                        ("Post-warmup", postwarmup_snap),
                        ("Final (PPO)", final_snap)]:
        delta = snap.avg_reward - baseline_snap.avg_reward
        print(f"  {label:<20} {snap.avg_reward:>+10.4f}"
              f" {snap.success_rate:>10.0%}  {delta:>+11.4f}")

    improve = final_snap.avg_reward - baseline_snap.avg_reward
    verdict = "✓ LEARNED" if improve > 0 else "✗ NO IMPROVEMENT"
    print(f"\n  {verdict}  (total Δ = {improve:+.4f})")

    # One trace per task; when a task was evaluated more than once, the last
    # episode's trace is the one shown.
    print("\nBefore → After traces (one per difficulty):")
    baseline_by_task = {tr["task"]: tr for tr in baseline_snap.traces}
    final_by_task = {tr["task"]: tr for tr in final_snap.traces}
    for task in TASK_LEVELS:
        before = baseline_by_task.get(task, {})
        after = final_by_task.get(task, {})
        print(f"  {task:8s}  baseline actions={before.get('actions',[])}  "
              f"reward={before.get('reward',0):+.3f}"
              f"  │  final actions={after.get('actions',[])}  "
              f"reward={after.get('reward',0):+.3f}")

    # ── Plots ───────────────────────────────────────────────────────────────
    plot_all(warmup_losses, reward_hist, success_hist, kl_hist, entropy_hist,
             baseline_snap, postwarmup_snap, final_snap)
    print("\nAll done. Saved: training_summary.png  action_distribution.png")


if __name__ == "__main__":
    train()