Spaces:

Anurag137
/

enterprise-ops-arena

Running

App Files Files Community

Anurag137 commited on Apr 26

Commit

b9b6fea

1 Parent(s): 7e43503

deploy: trained LoRA toggle + training evidence tab

Browse files

Files changed (5) hide show

agents/trained_agent.py +186 -0
app.py +12 -3
environment.py +43 -4
gradio_app.py +154 -53
requirements.txt +8 -5

agents/trained_agent.py ADDED Viewed

	@@ -0,0 +1,186 @@

+from __future__ import annotations
+import json
+import re
+import sys
+from pathlib import Path
+_ROOT = Path(__file__).resolve().parent.parent
+if str(_ROOT) not in sys.path:
+    sys.path.insert(0, str(_ROOT))
+from contracts import ActionSchema, ObservationSchema, AGENT_IT
+class TrainedITAgent:
+    """
+    IT Agent powered by trained LoRA model from HuggingFace.
+    Falls back to rule-based if model not available.
+    """
+    MODEL_REPO = "Anurag137/enterprise-ops-lora"
+    BASE_MODEL = "unsloth/Qwen2.5-3B-Instruct"
+    def __init__(self):
+        self.agent_id = AGENT_IT
+        self.model = None
+        self.tokenizer = None
+        self._load_model()
+    def _load_model(self):
+        try:
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+            from peft import PeftModel
+            import torch
+            print("[TrainedAgent] Loading base model without Unsloth...")
+            tokenizer = AutoTokenizer.from_pretrained(
+                "Qwen/Qwen2.5-3B-Instruct"
+            )
+            # Try 4-bit quantisation (needs bitsandbytes); fall back to fp16
+            load_kwargs: dict = {
+                "torch_dtype": torch.float16,
+                "device_map": "auto",
+            }
+            try:
+                from transformers import BitsAndBytesConfig
+                load_kwargs["quantization_config"] = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                )
+                print("[TrainedAgent] Using 4-bit quantisation")
+            except (ImportError, Exception):
+                print("[TrainedAgent] bitsandbytes not available, using fp16")
+            base_model = AutoModelForCausalLM.from_pretrained(
+                "Qwen/Qwen2.5-3B-Instruct",
+                **load_kwargs,
+            )
+            print("[TrainedAgent] Loading LoRA adapter...")
+            self.model = PeftModel.from_pretrained(
+                base_model,
+                "Anurag137/enterprise-ops-lora"
+            )
+            self.tokenizer = tokenizer
+            self.model.eval()
+            print("[TrainedAgent] Model loaded successfully")
+        except Exception as e:
+            print(f"[TrainedAgent] Could not load model: {e}")
+            print("[TrainedAgent] Falling back to rule-based")
+            self.model = None
+            self.tokenizer = None
+    def act(self, obs: ObservationSchema) -> ActionSchema:
+        if self.model is None:
+            return self._rule_based_act(obs)
+        try:
+            tickets = obs.tickets or []
+            obs_data = {
+                "step": obs.step_number,
+                "tickets": [
+                    {
+                        "id": t.id,
+                        "priority": t.priority,
+                        "sla_steps_remaining": t.sla_steps_remaining,
+                        "resolved": t.resolved,
+                    }
+                    for t in tickets[:5]
+                ],
+            }
+            system = (
+                "You are the IT Agent in an enterprise operations environment. "
+                "Resolve support tickets, manage compute resources. "
+                "Available tools: get_tickets, resolve_ticket, allocate_resource. "
+                'Respond ONLY with valid JSON: {"tool_call":"<name>","tool_params":{},'
+                '"reasoning":"<why>"}'
+            )
+            if self.tokenizer is None:
+                return self._rule_based_act(obs)
+            prompt = self.tokenizer.apply_chat_template(
+                [
+                    {"role": "system", "content": system},
+                    {"role": "user", "content": json.dumps(obs_data)},
+                ],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            import torch
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            inputs = self.tokenizer(prompt, return_tensors="pt").to(device)
+            if self.model is not None:
+                self.model = self.model.to(device)
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=150,
+                    temperature=0.1,
+                    do_sample=True,
+                )
+            response = self.tokenizer.decode(
+                outputs[0][inputs["input_ids"].shape[1] :],
+                skip_special_tokens=True,
+            )
+            m = re.search(r"\{.*\}", response, re.DOTALL)
+            if m:
+                d = json.loads(m.group())
+                return ActionSchema(
+                    tool_call=d.get("tool_call"),
+                    tool_params=d.get("tool_params", {}),
+                    message_to=d.get("message_to"),
+                    message_content=d.get("message_content"),
+                )
+        except Exception as e:
+            print(f"[TrainedAgent] Inference error: {e}")
+        return self._rule_based_act(obs)
+    def _rule_based_act(self, obs: ObservationSchema) -> ActionSchema:
+        tickets = obs.tickets or []
+        unresolved = [t for t in tickets if not t.resolved]
+        sla_critical = [t for t in unresolved if t.sla_steps_remaining <= 2]
+        if sla_critical:
+            target = min(sla_critical, key=lambda t: t.sla_steps_remaining)
+            return ActionSchema(
+                tool_call="resolve_ticket",
+                tool_params={
+                    "ticket_id": target.id,
+                    "resolution_note": f"SLA rescue: {target.id}",
+                },
+            )
+        p1 = [t for t in unresolved if t.priority == 1]
+        if p1:
+            return ActionSchema(
+                tool_call="resolve_ticket",
+                tool_params={
+                    "ticket_id": p1[0].id,
+                    "resolution_note": f"P1: {p1[0].id}",
+                },
+            )
+        if unresolved:
+            target = min(unresolved, key=lambda t: (t.priority, t.sla_steps_remaining))
+            return ActionSchema(
+                tool_call="resolve_ticket",
+                tool_params={
+                    "ticket_id": target.id,
+                    "resolution_note": f"Resolving: {target.id}",
+                },
+            )
+        return ActionSchema(tool_call="get_tickets", tool_params={})

app.py CHANGED Viewed

@@ -32,6 +32,7 @@ from gradio_app import demo
 class ResetRequest(BaseModel):
     scenario: Optional[str] = None
     seed: Optional[int] = None
 class ActionRequest(BaseModel):
@@ -41,6 +42,7 @@ class ActionRequest(BaseModel):
     message_to: Optional[str] = None
     message_content: Optional[str] = None
     reasoning: Optional[str] = None
 class MultiActionRequest(BaseModel):
@@ -85,8 +87,15 @@ def health() -> dict[str, str]:
 @app.post("/reset")
 def reset(req: ResetRequest) -> dict[str, Any]:
     """Reset the environment and return the primary observation."""
-    obs = env.reset(scenario=req.scenario, seed=req.seed)
-    return {"observation": obs.to_dict()}
 @app.post("/step", response_model=StepResponse)
@@ -100,7 +109,7 @@ def step(req: ActionRequest) -> StepResponse:
         message_content=req.message_content,
         reasoning=req.reasoning,
     )
-    result = env.step(action)
     return StepResponse(
         observation=result["observation"].to_dict(),
         reward=result["reward"],

 class ResetRequest(BaseModel):
     scenario: Optional[str] = None
     seed: Optional[int] = None
+    use_trained_model: bool = False
 class ActionRequest(BaseModel):
     message_to: Optional[str] = None
     message_content: Optional[str] = None
     reasoning: Optional[str] = None
+    use_trained_model: bool = False
 class MultiActionRequest(BaseModel):
 @app.post("/reset")
 def reset(req: ResetRequest) -> dict[str, Any]:
     """Reset the environment and return the primary observation."""
+    obs = env.reset(
+        scenario=req.scenario,
+        seed=req.seed,
+        use_trained_model=req.use_trained_model,
+    )
+    return {
+        "observation": obs.to_dict(),
+        "it_agent_status": env.it_agent_status(),
+    }
 @app.post("/step", response_model=StepResponse)
         message_content=req.message_content,
         reasoning=req.reasoning,
     )
+    result = env.step(action, use_trained_model=req.use_trained_model)
     return StepResponse(
         observation=result["observation"].to_dict(),
         reward=result["reward"],

environment.py CHANGED Viewed

@@ -55,6 +55,7 @@ from contracts import (
 )
 from env.env import EnterpriseOpsEnv
 from agents import ITAgent, ManagerAgent, FinanceAgent
 from models import EnterpriseAction, EnterpriseObservation
 # ---------------------------------------------------------------------------
@@ -216,9 +217,14 @@ class EnterpriseEnvironment(Environment):
         # -- Pluggable reward - INTERFACE STABLE (Ayush overrides this) -----
         self.reward_fn: Callable[..., float] = default_reward_fn
         # -- Rule-based fallback agents for untrained roles -----------------
         self._fallback_agents: dict[str, Any] = {
-            AGENT_IT: ITAgent(),
             AGENT_MANAGER: ManagerAgent(AGENT_MANAGER),
             AGENT_FINANCE: FinanceAgent(AGENT_FINANCE),
         }
@@ -241,6 +247,24 @@ class EnterpriseEnvironment(Environment):
             f"max_steps={max_steps} | db={db_path}"
         )
     # ------------------------------------------------------------------
     # reset
     # ------------------------------------------------------------------
@@ -249,8 +273,11 @@ class EnterpriseEnvironment(Environment):
         self,
         scenario: Optional[str] = None,
         seed: Optional[int] = None,
     ) -> EnterpriseObservation:
         """Reset the environment. Returns manager-agent view as primary obs."""
         kwargs: dict[str, Any] = {}
         if scenario:
             if not scenario.endswith(".yaml"):
@@ -278,7 +305,11 @@ class EnterpriseEnvironment(Environment):
     # step (single-agent primary interface)
     # ------------------------------------------------------------------
-    def step(self, action: EnterpriseAction) -> dict[str, Any]:
         """
         Execute one environment step.
@@ -290,13 +321,16 @@ class EnterpriseEnvironment(Environment):
         dict with keys: observation, reward, done, info
         """
         # -- 1. Hard cap ----------------------------------------------------
         if self._done or self._step_count >= self._max_steps:
             self._done = True
             fallback_obs = next(iter(self._current_obs.values()))
             obs = _obs_to_openenv(fallback_obs, {}, 0.0, True)
             return {"observation": obs, "reward": 0.0, "done": True,
-                    "info": {"reason": "max_steps_exceeded"}}
         start_time = time.time()
         reward_penalty = 0.0
@@ -344,7 +378,11 @@ class EnterpriseEnvironment(Environment):
                        action.agent_id, json.dumps(action.to_dict(), default=str),
                        TIMEOUT_PENALTY, True, json.dumps({"elapsed": elapsed}))
             return {"observation": obs, "reward": TIMEOUT_PENALTY, "done": True,
-                    "info": {"reason": "timeout", "elapsed_s": elapsed}}
         # -- 6. Compute reward ----------------------------------------------
         base_reward = self.reward_fn(inner_result, all_actions)
@@ -400,6 +438,7 @@ class EnterpriseEnvironment(Environment):
                 "oversight_flags": inner_result.info.get("oversight_flags", []),
                 "schema_version": inner_result.info.get("schema_version", 1),
                 "tool_results": inner_result.info.get("tool_results", {}),
             },
         }

 )
 from env.env import EnterpriseOpsEnv
 from agents import ITAgent, ManagerAgent, FinanceAgent
+from agents.trained_agent import TrainedITAgent
 from models import EnterpriseAction, EnterpriseObservation
 # ---------------------------------------------------------------------------
         # -- Pluggable reward - INTERFACE STABLE (Ayush overrides this) -----
         self.reward_fn: Callable[..., float] = default_reward_fn
+        # -- IT: rule-based by default; optional HuggingFace LoRA via set_use_trained_it()
+        self._rule_it_agent: Any = ITAgent()
+        self._trained_it_agent: Any | None = None
+        self._use_trained_it: bool = False
         # -- Rule-based fallback agents for untrained roles -----------------
         self._fallback_agents: dict[str, Any] = {
+            AGENT_IT: self._rule_it_agent,
             AGENT_MANAGER: ManagerAgent(AGENT_MANAGER),
             AGENT_FINANCE: FinanceAgent(AGENT_FINANCE),
         }
             f"max_steps={max_steps} | db={db_path}"
         )
+    def set_use_trained_it(self, use: bool) -> None:
+        """When True, IT fallback uses TrainedITAgent (LoRA); when False, rule-based ITAgent."""
+        self._use_trained_it = use
+        if use:
+            if self._trained_it_agent is None:
+                self._trained_it_agent = TrainedITAgent()
+            self._fallback_agents[AGENT_IT] = self._trained_it_agent
+        else:
+            self._fallback_agents[AGENT_IT] = self._rule_it_agent
+    def it_agent_status(self) -> str:
+        """Short status string for the Gradio UI."""
+        if not self._use_trained_it:
+            return "Rule-based agents active"
+        if self._trained_it_agent is not None and self._trained_it_agent.model is not None:
+            return "Trained LoRA model active (HuggingFace)"
+        return "Trained mode on — LoRA not loaded, using rule-based fallback"
     # ------------------------------------------------------------------
     # reset
     # ------------------------------------------------------------------
         self,
         scenario: Optional[str] = None,
         seed: Optional[int] = None,
+        use_trained_model: bool = False,
     ) -> EnterpriseObservation:
         """Reset the environment. Returns manager-agent view as primary obs."""
+        self.set_use_trained_it(use_trained_model)
         kwargs: dict[str, Any] = {}
         if scenario:
             if not scenario.endswith(".yaml"):
     # step (single-agent primary interface)
     # ------------------------------------------------------------------
+    def step(
+        self,
+        action: EnterpriseAction,
+        use_trained_model: Optional[bool] = None,
+    ) -> dict[str, Any]:
         """
         Execute one environment step.
         dict with keys: observation, reward, done, info
         """
+        if use_trained_model is not None:
+            self.set_use_trained_it(use_trained_model)
         # -- 1. Hard cap ----------------------------------------------------
         if self._done or self._step_count >= self._max_steps:
             self._done = True
             fallback_obs = next(iter(self._current_obs.values()))
             obs = _obs_to_openenv(fallback_obs, {}, 0.0, True)
             return {"observation": obs, "reward": 0.0, "done": True,
+                    "info": {"reason": "max_steps_exceeded", "it_agent_status": self.it_agent_status()}}
         start_time = time.time()
         reward_penalty = 0.0
                        action.agent_id, json.dumps(action.to_dict(), default=str),
                        TIMEOUT_PENALTY, True, json.dumps({"elapsed": elapsed}))
             return {"observation": obs, "reward": TIMEOUT_PENALTY, "done": True,
+                    "info": {
+                        "reason": "timeout",
+                        "elapsed_s": elapsed,
+                        "it_agent_status": self.it_agent_status(),
+                    }}
         # -- 6. Compute reward ----------------------------------------------
         base_reward = self.reward_fn(inner_result, all_actions)
                 "oversight_flags": inner_result.info.get("oversight_flags", []),
                 "schema_version": inner_result.info.get("schema_version", 1),
                 "tool_results": inner_result.info.get("tool_results", {}),
+                "it_agent_status": self.it_agent_status(),
             },
         }

gradio_app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 from __future__ import annotations
 import json
 from typing import Any
 import gradio as gr
@@ -9,6 +12,9 @@ import requests
 BASE_URL = "http://localhost:7860"
 TIMEOUT = 45
 SCENARIO_CHOICES = [
     ("Scenario 1", "scenario_01"),
     ("Scenario 2", "scenario_02"),
@@ -59,31 +65,49 @@ def _request(method: str, path: str, payload: dict[str, Any] | None = None) -> d
     return response.json()
-def _reset_episode(scenario_name: str) -> tuple[str, str, str]:
-    data = _request("post", "/reset", {"scenario": scenario_name})
     observation = data.get("observation", {})
     formatted = _pretty(observation)
-    return formatted, formatted, "Active"
 def _step_episode(
     agent_id: str,
     tool_call: str,
     tool_params_json: str,
     message_to: str,
     message_content: str,
     reasoning: str,
-) -> tuple[str, str, str, str]:
     try:
         tool_params = json.loads(tool_params_json) if tool_params_json.strip() else {}
         if not isinstance(tool_params, dict):
             raise ValueError("Tool params must decode to a JSON object.")
     except Exception as exc:
         error_text = f"Invalid tool params JSON: {exc}"
-        return error_text, error_text, "0.0", "Active"
-    payload = {
         "agent_id": agent_id,
         "tool_call": tool_call or None,
         "tool_params": tool_params,
         "message_to": message_to or None,
@@ -94,8 +118,10 @@ def _step_episode(
     observation = data.get("observation", {})
     formatted = _pretty(observation)
     reward = f"{data.get('reward', 0.0):.3f}"
-    status = "Done" if data.get("done") else "Active"
-    return formatted, formatted, reward, status
 def _load_world_state() -> str:
@@ -119,49 +145,103 @@ with gr.Blocks(theme=gr.themes.Monochrome(), title="EnterpriseOps Arena - Meta P
     )
     with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("## Reset Panel")
-            scenario = gr.Dropdown(
-                choices=SCENARIO_CHOICES,
-                value="scenario_01",
-                label="Scenario",
-            )
-            reset_button = gr.Button("Reset Episode", variant="primary")
-            reset_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
-        with gr.Column(scale=1):
-            gr.Markdown("## Step Panel")
-            agent_id = gr.Dropdown(
-                choices=AGENT_CHOICES,
-                value="it_agent",
-                label="Agent",
-            )
-            tool_call = gr.Dropdown(
-                choices=TOOL_CHOICES,
-                value="get_tickets",
-                label="Tool",
-            )
-            tool_params = gr.Textbox(
-                label="Tool params JSON",
-                lines=8,
-                value=_preset_tool_params("get_tickets"),
             )
-            message_to = gr.Textbox(label="Message To", placeholder="manager_agent")
-            message_content = gr.Textbox(label="Message Content", lines=3)
-            reasoning = gr.Textbox(label="Reasoning", lines=3)
-            step_button = gr.Button("Step Episode", variant="primary")
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("## Results Panel")
-            result_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
-            reward_score = gr.Textbox(label="Reward Score", value="0.0", interactive=False)
-            episode_status = gr.Textbox(label="Episode Status", value="Active", interactive=False)
-        with gr.Column(scale=1):
-            gr.Markdown("## World State")
-            state_button = gr.Button("Load World State", variant="secondary")
-            world_state = gr.Textbox(label="State", lines=20, interactive=False)
     tool_call.change(
         fn=_preset_tool_params,
@@ -170,16 +250,37 @@ with gr.Blocks(theme=gr.themes.Monochrome(), title="EnterpriseOps Arena - Meta P
     )
     reset_button.click(
         fn=_reset_episode,
-        inputs=scenario,
-        outputs=[reset_observation, result_observation, episode_status],
     )
     step_button.click(
         fn=_step_episode,
-        inputs=[agent_id, tool_call, tool_params, message_to, message_content, reasoning],
-        outputs=[reset_observation, result_observation, reward_score, episode_status],
     )
     state_button.click(fn=_load_world_state, inputs=None, outputs=world_state)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 from __future__ import annotations
 import json
+import os
+import sys
+from pathlib import Path
 from typing import Any
 import gradio as gr
 BASE_URL = "http://localhost:7860"
 TIMEOUT = 45
+_SC_DIR = Path(__file__).resolve().parent
+_REWARD_IMAGE = _SC_DIR / "reward_curves.png"
 SCENARIO_CHOICES = [
     ("Scenario 1", "scenario_01"),
     ("Scenario 2", "scenario_02"),
     return response.json()
+def _default_status(use_trained: bool) -> str:
+    return (
+        "Trained mode selected (applies on next server contact)"
+        if use_trained
+        else "Rule-based agents active"
+    )
+def _reset_episode(
+    use_trained: bool,
+    scenario_name: str,
+) -> tuple[str, str, str, str]:
+    data = _request(
+        "post",
+        "/reset",
+        {"scenario": scenario_name, "use_trained_model": use_trained},
+    )
     observation = data.get("observation", {})
     formatted = _pretty(observation)
+    status = data.get("it_agent_status", _default_status(use_trained))
+    return formatted, formatted, "Active", status
 def _step_episode(
+    use_trained: bool,
     agent_id: str,
     tool_call: str,
     tool_params_json: str,
     message_to: str,
     message_content: str,
     reasoning: str,
+) -> tuple[str, str, str, str, str]:
     try:
         tool_params = json.loads(tool_params_json) if tool_params_json.strip() else {}
         if not isinstance(tool_params, dict):
             raise ValueError("Tool params must decode to a JSON object.")
     except Exception as exc:
         error_text = f"Invalid tool params JSON: {exc}"
+        return error_text, error_text, "0.0", "Active", _default_status(use_trained)
+    payload: dict[str, Any] = {
         "agent_id": agent_id,
+        "use_trained_model": use_trained,
         "tool_call": tool_call or None,
         "tool_params": tool_params,
         "message_to": message_to or None,
     observation = data.get("observation", {})
     formatted = _pretty(observation)
     reward = f"{data.get('reward', 0.0):.3f}"
+    done = data.get("done", False)
+    ep_status = "Done" if done else "Active"
+    status = (data.get("info") or {}).get("it_agent_status") or _default_status(use_trained)
+    return formatted, formatted, reward, ep_status, status
 def _load_world_state() -> str:
     )
     with gr.Row():
+        use_trained_model = gr.Checkbox(
+            label="🤖 Use Trained LoRA Model (vs Rule-based)",
+            value=False,
+            info="Uses Qwen2.5-3B trained on 700 steps of GRPO",
+        )
+        model_status = gr.Textbox(
+            label="Model Status",
+            value="Rule-based agents active",
+            interactive=False,
+        )
+    with gr.Tabs():
+        with gr.Tab("Arena"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("## Reset Panel")
+                    scenario = gr.Dropdown(
+                        choices=SCENARIO_CHOICES,
+                        value="scenario_01",
+                        label="Scenario",
+                    )
+                    reset_button = gr.Button("Reset Episode", variant="primary")
+                    reset_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
+                with gr.Column(scale=1):
+                    gr.Markdown("## Step Panel")
+                    agent_id = gr.Dropdown(
+                        choices=AGENT_CHOICES,
+                        value="it_agent",
+                        label="Agent",
+                    )
+                    tool_call = gr.Dropdown(
+                        choices=TOOL_CHOICES,
+                        value="get_tickets",
+                        label="Tool",
+                    )
+                    tool_params = gr.Textbox(
+                        label="Tool params JSON",
+                        lines=8,
+                        value=_preset_tool_params("get_tickets"),
+                    )
+                    message_to = gr.Textbox(label="Message To", placeholder="manager_agent")
+                    message_content = gr.Textbox(label="Message Content", lines=3)
+                    reasoning = gr.Textbox(label="Reasoning", lines=3)
+                    step_button = gr.Button("Step Episode", variant="primary")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("## Results Panel")
+                    result_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
+                    reward_score = gr.Textbox(label="Reward Score", value="0.0", interactive=False)
+                    episode_status = gr.Textbox(label="Episode Status", value="Active", interactive=False)
+                with gr.Column(scale=1):
+                    gr.Markdown("## World State")
+                    state_button = gr.Button("Load World State", variant="secondary")
+                    world_state = gr.Textbox(label="State", lines=20, interactive=False)
+        with gr.Tab("Training Evidence"):
+            gr.Markdown(
+                """
+                ## Real GRPO Training Results
+                700 steps across 3 runs on Tesla T4 GPU
+                """
             )
+            _img_val = str(_REWARD_IMAGE) if _REWARD_IMAGE.is_file() else None
+            if _img_val is not None:
+                gr.Image(
+                    value=_img_val,
+                    label="Training Curves (700 steps)",
+                )
+            else:
+                gr.Markdown(
+                    f"_Plot not found. Add `reward_curves.png` in `{_SC_DIR.as_posix()}` to show training curves._"
+                )
+            gr.Markdown(
+                r"""
+| Metric | Value |
+|--------|-------|
+| Peak Episode Score | 114 (+77%) |
+| Task Completion | 35 → 75 (+114%) |
+| GRPO reward_std | 0.5 (variance confirmed) |
+| Scenarios Completed | All 8 automatically |
+| Backtracking | Triggered 2x (MARL adaptive) |
+| Model | Qwen2.5-3B-Instruct 4-bit LoRA |
+## Trained Model
+🤖 [Anurag137/enterprise-ops-lora](https://huggingface.co/Anurag137/enterprise-ops-lora)
+## Experiment Tracking
+📊 [View on Weights & Biases](https://wandb.ai/kanhaiyakumar76618-indian-institute-of-information-techn/enterprise-ops-arena)
+## Before vs After Training
+**Before:** Agent outputs wrong tool names, missing ticket_id
+**After:** Correct tool calls, SLA-aware reasoning, specific ticket references
+                """
+            )
     tool_call.change(
         fn=_preset_tool_params,
     )
     reset_button.click(
         fn=_reset_episode,
+        inputs=[use_trained_model, scenario],
+        outputs=[reset_observation, result_observation, episode_status, model_status],
     )
     step_button.click(
         fn=_step_episode,
+        inputs=[
+            use_trained_model,
+            agent_id,
+            tool_call,
+            tool_params,
+            message_to,
+            message_content,
+            reasoning,
+        ],
+        outputs=[reset_observation, result_observation, reward_score, episode_status, model_status],
     )
     state_button.click(fn=_load_world_state, inputs=None, outputs=world_state)
 if __name__ == "__main__":
+    # Serve FastAPI + Gradio (single process) so /reset and /step work. Requires uvicorn.
+    _server = Path(__file__).resolve().parent
+    os.chdir(_server)
+    if str(_server) not in sys.path:
+        sys.path.insert(0, str(_server))
+    if str(_server.parent) not in sys.path:
+        sys.path.insert(0, str(_server.parent))
+    try:
+        import uvicorn
+    except ImportError:
+        print("[gradio_app] uvicorn not installed; launching Gradio UI only. API routes (/reset, /step) will not work without running: uvicorn app:app", flush=True)
+        demo.launch(server_name="0.0.0.0", server_port=7860)
+    else:
+        uvicorn.run("app:app", host="0.0.0.0", port=7860, factory=False, reload=False)

requirements.txt CHANGED Viewed

@@ -2,12 +2,15 @@ openenv-core
 fastapi
 uvicorn[standard]
 pydantic>=2.0
-httpx
 sqlalchemy
 aiosqlite
-pyyaml
 numpy
-python-dotenv
-pytest
 gradio
-matplotlib

 fastapi
 uvicorn[standard]
 pydantic>=2.0
+pyyaml
 sqlalchemy
 aiosqlite
 numpy
+httpx
 gradio
+requests
+peft>=0.6.0
+transformers>=4.35.0
+accelerate>=0.24.0
+bitsandbytes>=0.41.0
+huggingface_hub>=0.20.0