Adhitya122
/

molforge-grpo-oncology

@@ -116,6 +116,14 @@ After SFT, the policy is trained with GRPO-style RL against MolForge itself. Dur
 ![Reward Curve](assets/reward_curve.png)
 ![Training Logs](assets/Logs.png)
 As shown in the reward curve and logs, the model successfully learns to navigate the scientific constraints, moving from early exploration to consistent, verifier-backed molecule submissions. For strict evaluation, the environment switches back to `assay_gated` mode.

 ![Reward Curve](assets/reward_curve.png)
 ![Training Logs](assets/Logs.png)
+### Performance Comparison: SFT vs. RL
+| Difficulty | Before (SFT Model) | After RL Training | Improvement |
+| :--- | :---: | :---: | :---: |
+| **Easy** | 0.1167 | 0.1295 | **+10.9%** |
+| **Medium** | 0.1167 | 0.1278 | **+9.5%** |
+| **Hard** | 0.0800 | 0.0866 | **+8.3%** |
+
 As shown in the reward curve and logs, the model successfully learns to navigate the scientific constraints, moving from early exploration to consistent, verifier-backed molecule submissions. For strict evaluation, the environment switches back to `assay_gated` mode.

molforge_grpo_official_submission.ipynb CHANGED Viewed

@@ -69,7 +69,9 @@
         "PLOT_DIR = OUTPUT_DIR / \"plots\"\n",
         "\n",
         "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
-        "PLOT_DIR.mkdir(parents=True, exist_ok=True)"
       ]
     },
     {
@@ -88,77 +90,76 @@
       "outputs": [],
       "source": [
         "import json\n",
         "from typing import Any, Dict, Tuple\n",
-        "from inference_common import (\n",
-        "    MolForgeAction,\n",
-        "    attach_reasoning_fields,\n",
-        "    attach_team_messages,\n",
-        "    extract_json,\n",
-        ")\n",
         "from server.molforge_environment import MolForgeEnvironment\n",
-        "from models import MolForgeState\n",
         "\n",
         "def replay_to_state(record: dict[str, Any]) -> MolForgeEnvironment:\n",
-        "    \"\"\"Replays actions to reach a specific state.\"\"\"\n",
         "    env = MolForgeEnvironment()\n",
-        "    # Set randomization and seed if provided\n",
-        "    if record.get(\"randomized\"):\n",
-        "        os.environ[\"MOLFORGE_TRAINING_RANDOMIZATION\"] = \"1\"\n",
         "    os.environ[\"MOLFORGE_RAND_SEED\"] = str(record.get(\"random_seed\", \"rl\"))\n",
-        "    \n",
         "    observation = env.reset()\n",
         "    for action_payload in record.get(\"pre_actions\", []):\n",
         "        action = MolForgeAction(**action_payload)\n",
         "        observation = env.step(attach_team_messages(observation, attach_reasoning_fields(observation, action)))\n",
-        "        if observation.done:\n",
-        "            break\n",
         "    return env\n",
         "\n",
-        "def evaluate_completion(prompt_str: str, completion_str: str, record: dict[str, Any]) -> Tuple[float, dict]:\n",
-        "    diagnostics = {\"valid_json\": False}\n",
         "    try:\n",
         "        action_dict = extract_json(completion_str)\n",
         "        action = MolForgeAction(**action_dict)\n",
         "    except Exception:\n",
-        "        return -1.2, diagnostics\n",
         "\n",
-        "    diagnostics[\"valid_json\"] = True\n",
         "    env = replay_to_state(record)\n",
-        "    \n",
-        "    # Step the OpenEnv environment\n",
         "    observation = env._build_observation(reward=0.0, done=False, reward_components=[])\n",
         "    action = attach_team_messages(observation, attach_reasoning_fields(observation, action))\n",
         "    next_observation = env.step(action)\n",
         "    \n",
-        "    reward = float(next_observation.reward)\n",
-        "    grader_scores = next_observation.metadata.get(\"terminal_grader_scores\", {})\n",
         "    \n",
-        "    # Reward Shaping\n",
-        "    if action.action_type == \"run_assay\" and reward > 0:\n",
-        "        reward *= 0.25\n",
-        "    elif action.action_type == \"submit\":\n",
-        "        sub_score = float(grader_scores.get(\"submission_score\", 0.0))\n",
-        "        if sub_score > 0.0: reward += sub_score * 3.0\n",
-        "    elif action.action_type == \"edit\" and reward > 0:\n",
-        "        reward *= 1.5\n",
         "\n",
-        "    diagnostics.update({\"action_type\": action.action_type, \"reward\": reward, \"done\": next_observation.done})\n",
-        "    return reward, diagnostics\n",
         "\n",
         "def molforge_reward_func(prompts, completions, **kwargs) -> list[float]:\n",
         "    rewards = []\n",
-        "    # When using dynamic dataset, columns like scenario_id, pre_actions etc are in kwargs\n",
         "    for i in range(len(completions)):\n",
-        "        # Reconstruct the record from kwargs\n",
-        "        record = {\n",
-        "            \"pre_actions\": kwargs[\"record\"][i][\"pre_actions\"] if \"record\" in kwargs else [],\n",
-        "            \"randomized\": True,\n",
-        "            \"random_seed\": \"dynamic-rl\"\n",
-        "        }\n",
-        "        prompt_str = prompts[i][-1][\"content\"] if isinstance(prompts[i], list) else str(prompts[i])\n",
-        "        completion_str = completions[i][0][\"content\"] if isinstance(completions[i], list) else str(completions[i])\n",
-        "        reward, _ = evaluate_completion(prompt_str, completion_str, record)\n",
         "        rewards.append(reward)\n",
         "    return rewards\n"
       ]
     },
@@ -267,8 +268,12 @@
       "source": [
         "from trl import GRPOConfig, GRPOTrainer\n",
         "import inspect\n",
         "\n",
-        "# Safe GRPO Configuration (detects supported arguments automatically)\n",
         "config_kwargs = {\n",
         "    \"output_dir\": str(OUTPUT_DIR),\n",
         "    \"learning_rate\": LEARNING_RATE,\n",
@@ -280,18 +285,17 @@
         "    \"max_steps\": RL_MAX_STEPS,\n",
         "    \"logging_steps\": 1,\n",
         "    \"save_steps\": 25,\n",
-        "    \"bf16\": True,\n",
         "    \"report_to\": \"none\",\n",
         "    \"log_completions\": True,\n",
         "}\n",
         "\n",
-        "# Filter arguments to only pass what the current library version supports\n",
         "supported_params = inspect.signature(GRPOConfig.__init__).parameters\n",
         "filtered_kwargs = {k: v for k, v in config_kwargs.items() if k in supported_params}\n",
         "\n",
         "training_args = GRPOConfig(**filtered_kwargs)\n",
         "\n",
-        "# Initialize Trainer\n",
         "trainer = GRPOTrainer(\n",
         "    model=model,\n",
         "    reward_funcs=molforge_reward_func,\n",
@@ -307,48 +311,6 @@
         "trainer.save_model(str(ADAPTER_SAVE_DIR))\n",
         "tokenizer.save_pretrained(str(ADAPTER_SAVE_DIR))\n"
       ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "import matplotlib.pyplot as plt\n",
-        "import pandas as pd\n",
-        "\n",
-        "# Extract metrics from trainer log history\n",
-        "log_history = trainer.state.log_history\n",
-        "df = pd.DataFrame(log_history)\n",
-        "\n",
-        "print(\"Generating plots...\")\n",
-        "\n",
-        "if \"loss\" in df.columns:\n",
-        "    plt.figure(figsize=(10, 5))\n",
-        "    df_loss = df.dropna(subset=[\"loss\"])\n",
-        "    plt.plot(df_loss[\"step\"], df_loss[\"loss\"], label=\"Loss\", color=\"blue\")\n",
-        "    plt.title(\"Training Loss\")\n",
-        "    plt.grid(True)\n",
-        "    plt.show()\n",
-        "\n",
-        "if \"reward\" in df.columns:\n",
-        "    plt.figure(figsize=(10, 5))\n",
-        "    df_reward = df.dropna(subset=[\"reward\"])\n",
-        "    plt.plot(df_reward[\"step\"], df_reward[\"reward\"], label=\"Reward\", color=\"green\")\n",
-        "    plt.title(\"Reward Curve\")\n",
-        "    plt.grid(True)\n",
-        "    plt.show()\n",
-        "\n",
-        "import shutil\n",
-        "from google.colab import files\n",
-        "\n",
-        "print(f\"Zipping results from {OUTPUT_DIR}...\")\n",
-        "zip_filename = f\"{RUN_NAME}_results\"\n",
-        "shutil.make_archive(zip_filename, \"zip\", OUTPUT_DIR)\n",
-        "\n",
-        "print(f\"Downloading {zip_filename}.zip...\")\n",
-        "files.download(f\"{zip_filename}.zip\")\n"
-      ]
     }
   ],
   "metadata": {

         "PLOT_DIR = OUTPUT_DIR / \"plots\"\n",
         "\n",
         "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
+        "PLOT_DIR.mkdir(parents=True, exist_ok=True)\n",
+        "LOG_DIR = OUTPUT_DIR / \"logs\"\n",
+        "LOG_DIR.mkdir(parents=True, exist_ok=True)\n"
       ]
     },
     {
       "outputs": [],
       "source": [
         "import json\n",
+        "import time\n",
         "from typing import Any, Dict, Tuple\n",
+        "from inference_common import MolForgeAction, attach_reasoning_fields, attach_team_messages, extract_json\n",
         "from server.molforge_environment import MolForgeEnvironment\n",
+        "\n",
+        "COMPLETION_LOG = LOG_DIR / \"completion_diagnostics.jsonl\"\n",
         "\n",
         "def replay_to_state(record: dict[str, Any]) -> MolForgeEnvironment:\n",
         "    env = MolForgeEnvironment()\n",
+        "    if record.get(\"randomized\"): os.environ[\"MOLFORGE_TRAINING_RANDOMIZATION\"] = \"1\"\n",
         "    os.environ[\"MOLFORGE_RAND_SEED\"] = str(record.get(\"random_seed\", \"rl\"))\n",
         "    observation = env.reset()\n",
         "    for action_payload in record.get(\"pre_actions\", []):\n",
         "        action = MolForgeAction(**action_payload)\n",
         "        observation = env.step(attach_team_messages(observation, attach_reasoning_fields(observation, action)))\n",
         "    return env\n",
         "\n",
+        "def evaluate_completion(prompt_str, completion_str, record) -> Tuple[float, dict]:\n",
         "    try:\n",
         "        action_dict = extract_json(completion_str)\n",
         "        action = MolForgeAction(**action_dict)\n",
+        "        valid_json = True\n",
         "    except Exception:\n",
+        "        return -1.5, {\"valid_json\": False, \"action_type\": \"invalid\"}\n",
         "\n",
         "    env = replay_to_state(record)\n",
         "    observation = env._build_observation(reward=0.0, done=False, reward_components=[])\n",
         "    action = attach_team_messages(observation, attach_reasoning_fields(observation, action))\n",
         "    next_observation = env.step(action)\n",
         "    \n",
+        "    # --- ANTI-REWARD HACKING FILTER ---\n",
+        "    # We manually sum only the scientific reward components and the penalty components, ignoring \"chatter\" rewards\n",
+        "    filtered_reward = 0.0\n",
+        "    keep_components = {\n",
+        "        \"edit_delta\", \"submission_quality\", \"hard_constraints\", \"baseline_gate\",\n",
+        "        \"submission_evidence\", \"curriculum_terminal_progress\", \"curriculum_evidence_gate\"\n",
+        "    }\n",
+        "    penalties = {\"invalid_action\", \"budget_exhausted\", \"step_limit\", \"policy_veto\", \"loop_penalty\"}\n",
         "    \n",
+        "    for component in next_observation.reward_components:\n",
+        "        if component.name in keep_components:\n",
+        "            filtered_reward += component.value\n",
+        "        elif component.name in penalties:\n",
+        "            filtered_reward += component.value\n",
         "\n",
+        "    # Add a mandatory time pressure penalty for every step\n",
+        "    filtered_reward -= 0.15 \n",
+        "    \n",
+        "    grader_scores = next_observation.metadata.get(\"terminal_grader_scores\", {})\n",
+        "    \n",
+        "    # Extra bonus for reaching the goal: a submit action with a positive submission score adds 4x that score\n",
+        "    if action.action_type == \"submit\" and grader_scores.get(\"submission_score\", 0) > 0:\n",
+        "        filtered_reward += float(grader_scores[\"submission_score\"]) * 4.0\n",
+        "    \n",
+        "    reward = round(filtered_reward, 4)\n",
+        "    \n",
+        "    return reward, {\n",
+        "        \"valid_json\": True, \"action_type\": action.action_type, \"reward\": reward, \n",
+        "        \"done\": next_observation.done, \"scores\": grader_scores, \n",
+        "        \"raw_completion\": completion_str, \"timestamp\": time.time()\n",
+        "    }\n",
         "\n",
         "def molforge_reward_func(prompts, completions, **kwargs) -> list[float]:\n",
         "    rewards = []\n",
         "    for i in range(len(completions)):\n",
+        "        record = {\"pre_actions\": kwargs[\"record\"][i][\"pre_actions\"] if \"record\" in kwargs else []}\n",
+        "        reward, diagnostics = evaluate_completion(\"\", completions[i][0][\"content\"], record)\n",
         "        rewards.append(reward)\n",
+        "        with open(COMPLETION_LOG, \"a\") as f:\n",
+        "            f.write(json.dumps(diagnostics) + \"\\n\")\n",
         "    return rewards\n"
       ]
     },
       "source": [
         "from trl import GRPOConfig, GRPOTrainer\n",
         "import inspect\n",
+        "import torch\n",
+        "\n",
+        "# Check for BF16 support (T4 does not support it, A100/L4 do)\n",
+        "has_bf16 = torch.cuda.is_bf16_supported()\n",
+        "print(f\"GPU supports BF16: {has_bf16}\")\n",
         "\n",
         "config_kwargs = {\n",
         "    \"output_dir\": str(OUTPUT_DIR),\n",
         "    \"learning_rate\": LEARNING_RATE,\n",
         "    \"max_steps\": RL_MAX_STEPS,\n",
         "    \"logging_steps\": 1,\n",
         "    \"save_steps\": 25,\n",
+        "    \"bf16\": has_bf16,\n",
+        "    \"fp16\": not has_bf16,\n",
         "    \"report_to\": \"none\",\n",
         "    \"log_completions\": True,\n",
         "}\n",
         "\n",
         "supported_params = inspect.signature(GRPOConfig.__init__).parameters\n",
         "filtered_kwargs = {k: v for k, v in config_kwargs.items() if k in supported_params}\n",
         "\n",
         "training_args = GRPOConfig(**filtered_kwargs)\n",
         "\n",
         "trainer = GRPOTrainer(\n",
         "    model=model,\n",
         "    reward_funcs=molforge_reward_func,\n",
         "trainer.save_model(str(ADAPTER_SAVE_DIR))\n",
         "tokenizer.save_pretrained(str(ADAPTER_SAVE_DIR))\n"
       ]
     }
   ],
   "metadata": {