Anurag137 committed on
Commit
b9b6fea
·
1 Parent(s): 7e43503

deploy: trained LoRA toggle + training evidence tab

Browse files
Files changed (5) hide show
  1. agents/trained_agent.py +186 -0
  2. app.py +12 -3
  3. environment.py +43 -4
  4. gradio_app.py +154 -53
  5. requirements.txt +8 -5
agents/trained_agent.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ _ROOT = Path(__file__).resolve().parent.parent
9
+ if str(_ROOT) not in sys.path:
10
+ sys.path.insert(0, str(_ROOT))
11
+
12
+ from contracts import ActionSchema, ObservationSchema, AGENT_IT
13
+
14
+
15
class TrainedITAgent:
    """
    IT agent powered by a LoRA adapter loaded from the HuggingFace Hub.

    The model is loaded eagerly in ``__init__``. If loading fails for any
    reason, ``model`` and ``tokenizer`` stay ``None`` and every call to
    ``act`` is served by the built-in rule-based policy instead.
    """

    MODEL_REPO = "Anurag137/enterprise-ops-lora"
    # Kept unchanged for backward compatibility; inference does not load this
    # checkpoint (see INFERENCE_BASE_MODEL and the "without Unsloth" log line).
    BASE_MODEL = "unsloth/Qwen2.5-3B-Instruct"
    # Checkpoint actually loaded underneath the LoRA adapter.
    INFERENCE_BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"

    def __init__(self):
        self.agent_id = AGENT_IT
        self.model = None
        self.tokenizer = None
        self._load_model()

    @staticmethod
    def _quantization_config(torch):
        """
        Return a 4-bit ``BitsAndBytesConfig``, or ``None`` when unavailable.

        ``BitsAndBytesConfig`` can be imported from transformers even when the
        bitsandbytes package itself is missing, so the package is probed
        explicitly instead of relying on an ImportError.
        """
        import importlib.util

        if importlib.util.find_spec("bitsandbytes") is None:
            return None
        try:
            from transformers import BitsAndBytesConfig
        except ImportError:
            return None
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )

    def _load_model(self):
        """
        Load tokenizer, base model and LoRA adapter into ``self``.

        Tries a 4-bit quantised load first and retries in plain fp16 if that
        load fails. Never raises: on any error the agent is left with
        ``model = tokenizer = None`` so that ``act`` uses the rule-based path.
        """
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            from peft import PeftModel
            import torch

            print("[TrainedAgent] Loading base model without Unsloth...")

            tokenizer = AutoTokenizer.from_pretrained(self.INFERENCE_BASE_MODEL)

            load_kwargs: dict = {
                "torch_dtype": torch.float16,
                "device_map": "auto",
            }

            base_model = None
            quant_config = self._quantization_config(torch)
            if quant_config is None:
                print("[TrainedAgent] bitsandbytes not available, using fp16")
            else:
                print("[TrainedAgent] Using 4-bit quantisation")
                try:
                    base_model = AutoModelForCausalLM.from_pretrained(
                        self.INFERENCE_BASE_MODEL,
                        quantization_config=quant_config,
                        **load_kwargs,
                    )
                except Exception as quant_err:
                    # A failed quantised load must not disable the model
                    # entirely; retry below without quantisation.
                    print(
                        f"[TrainedAgent] 4-bit load failed ({quant_err}), using fp16"
                    )

            if base_model is None:
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.INFERENCE_BASE_MODEL,
                    **load_kwargs,
                )

            print("[TrainedAgent] Loading LoRA adapter...")
            model = PeftModel.from_pretrained(base_model, self.MODEL_REPO)
            model.eval()

            # Publish both attributes only after everything succeeded.
            self.model = model
            self.tokenizer = tokenizer
            print("[TrainedAgent] Model loaded successfully")

        except Exception as e:
            print(f"[TrainedAgent] Could not load model: {e}")
            print("[TrainedAgent] Falling back to rule-based")
            self.model = None
            self.tokenizer = None

    @staticmethod
    def _extract_action_dict(response: str) -> dict | None:
        """
        Return the first JSON object embedded in ``response``, or ``None``.

        Each ``{`` is tried as a possible start of a JSON value, so trailing
        text after the object (even text containing braces) does not break
        parsing the way a greedy ``{.*}`` match would.
        """
        decoder = json.JSONDecoder()
        start = response.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(response, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = response.find("{", start + 1)
        return None

    def act(self, obs: ObservationSchema) -> ActionSchema:
        """
        Choose the next action for ``obs``.

        Uses the LoRA model when it is loaded. Falls back to the rule-based
        policy when the model is unavailable, when inference raises, or when
        the generated text contains no JSON object.
        """
        if self.model is None or self.tokenizer is None:
            return self._rule_based_act(obs)

        try:
            import torch

            tickets = obs.tickets or []

            # Only the first five tickets are sent to keep the prompt short.
            obs_data = {
                "step": obs.step_number,
                "tickets": [
                    {
                        "id": t.id,
                        "priority": t.priority,
                        "sla_steps_remaining": t.sla_steps_remaining,
                        "resolved": t.resolved,
                    }
                    for t in tickets[:5]
                ],
            }

            system = (
                "You are the IT Agent in an enterprise operations environment. "
                "Resolve support tickets, manage compute resources. "
                "Available tools: get_tickets, resolve_ticket, allocate_resource. "
                'Respond ONLY with valid JSON: {"tool_call":"<name>","tool_params":{},'
                '"reasoning":"<why>"}'
            )

            prompt = self.tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": system},
                    {"role": "user", "content": json.dumps(obs_data)},
                ],
                tokenize=False,
                add_generation_prompt=True,
            )

            # The model was placed by device_map="auto" at load time; send the
            # inputs to wherever its parameters live instead of moving the
            # model on every step.
            device = next(self.model.parameters()).device
            inputs = self.tokenizer(prompt, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=150,
                    temperature=0.1,
                    do_sample=True,
                )

            # Decode only the newly generated tokens, not the echoed prompt.
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            d = self._extract_action_dict(response)
            if d is not None:
                tool_params = d.get("tool_params")
                if not isinstance(tool_params, dict):
                    # The model may emit null or a non-object here.
                    tool_params = {}
                return ActionSchema(
                    tool_call=d.get("tool_call"),
                    tool_params=tool_params,
                    message_to=d.get("message_to"),
                    message_content=d.get("message_content"),
                )
        except Exception as e:
            # Best-effort inference: report the problem and use the rules.
            print(f"[TrainedAgent] Inference error: {e}")

        return self._rule_based_act(obs)

    def _rule_based_act(self, obs: ObservationSchema) -> ActionSchema:
        """
        Deterministic fallback policy.

        Priority order: a ticket with at most two SLA steps remaining (the
        one with the fewest), then the first priority-1 ticket, then the
        unresolved ticket with the lowest (priority, sla_steps_remaining).
        With nothing unresolved, the agent calls ``get_tickets``.
        """
        tickets = obs.tickets or []
        unresolved = [t for t in tickets if not t.resolved]

        sla_critical = [t for t in unresolved if t.sla_steps_remaining <= 2]
        if sla_critical:
            target = min(sla_critical, key=lambda t: t.sla_steps_remaining)
            return ActionSchema(
                tool_call="resolve_ticket",
                tool_params={
                    "ticket_id": target.id,
                    "resolution_note": f"SLA rescue: {target.id}",
                },
            )

        p1 = [t for t in unresolved if t.priority == 1]
        if p1:
            return ActionSchema(
                tool_call="resolve_ticket",
                tool_params={
                    "ticket_id": p1[0].id,
                    "resolution_note": f"P1: {p1[0].id}",
                },
            )

        if unresolved:
            target = min(unresolved, key=lambda t: (t.priority, t.sla_steps_remaining))
            return ActionSchema(
                tool_call="resolve_ticket",
                tool_params={
                    "ticket_id": target.id,
                    "resolution_note": f"Resolving: {target.id}",
                },
            )

        return ActionSchema(tool_call="get_tickets", tool_params={})
app.py CHANGED
@@ -32,6 +32,7 @@ from gradio_app import demo
32
  class ResetRequest(BaseModel):
33
  scenario: Optional[str] = None
34
  seed: Optional[int] = None
 
35
 
36
 
37
  class ActionRequest(BaseModel):
@@ -41,6 +42,7 @@ class ActionRequest(BaseModel):
41
  message_to: Optional[str] = None
42
  message_content: Optional[str] = None
43
  reasoning: Optional[str] = None
 
44
 
45
 
46
  class MultiActionRequest(BaseModel):
@@ -85,8 +87,15 @@ def health() -> dict[str, str]:
85
  @app.post("/reset")
86
  def reset(req: ResetRequest) -> dict[str, Any]:
87
  """Reset the environment and return the primary observation."""
88
- obs = env.reset(scenario=req.scenario, seed=req.seed)
89
- return {"observation": obs.to_dict()}
 
 
 
 
 
 
 
90
 
91
 
92
  @app.post("/step", response_model=StepResponse)
@@ -100,7 +109,7 @@ def step(req: ActionRequest) -> StepResponse:
100
  message_content=req.message_content,
101
  reasoning=req.reasoning,
102
  )
103
- result = env.step(action)
104
  return StepResponse(
105
  observation=result["observation"].to_dict(),
106
  reward=result["reward"],
 
32
  class ResetRequest(BaseModel):
33
  scenario: Optional[str] = None
34
  seed: Optional[int] = None
35
+ use_trained_model: bool = False
36
 
37
 
38
  class ActionRequest(BaseModel):
 
42
  message_to: Optional[str] = None
43
  message_content: Optional[str] = None
44
  reasoning: Optional[str] = None
45
+ use_trained_model: bool = False
46
 
47
 
48
  class MultiActionRequest(BaseModel):
 
87
  @app.post("/reset")
88
  def reset(req: ResetRequest) -> dict[str, Any]:
89
  """Reset the environment and return the primary observation."""
90
+ obs = env.reset(
91
+ scenario=req.scenario,
92
+ seed=req.seed,
93
+ use_trained_model=req.use_trained_model,
94
+ )
95
+ return {
96
+ "observation": obs.to_dict(),
97
+ "it_agent_status": env.it_agent_status(),
98
+ }
99
 
100
 
101
  @app.post("/step", response_model=StepResponse)
 
109
  message_content=req.message_content,
110
  reasoning=req.reasoning,
111
  )
112
+ result = env.step(action, use_trained_model=req.use_trained_model)
113
  return StepResponse(
114
  observation=result["observation"].to_dict(),
115
  reward=result["reward"],
environment.py CHANGED
@@ -55,6 +55,7 @@ from contracts import (
55
  )
56
  from env.env import EnterpriseOpsEnv
57
  from agents import ITAgent, ManagerAgent, FinanceAgent
 
58
  from models import EnterpriseAction, EnterpriseObservation
59
 
60
  # ---------------------------------------------------------------------------
@@ -216,9 +217,14 @@ class EnterpriseEnvironment(Environment):
216
  # -- Pluggable reward - INTERFACE STABLE (Ayush overrides this) -----
217
  self.reward_fn: Callable[..., float] = default_reward_fn
218
 
 
 
 
 
 
219
  # -- Rule-based fallback agents for untrained roles -----------------
220
  self._fallback_agents: dict[str, Any] = {
221
- AGENT_IT: ITAgent(),
222
  AGENT_MANAGER: ManagerAgent(AGENT_MANAGER),
223
  AGENT_FINANCE: FinanceAgent(AGENT_FINANCE),
224
  }
@@ -241,6 +247,24 @@ class EnterpriseEnvironment(Environment):
241
  f"max_steps={max_steps} | db={db_path}"
242
  )
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  # ------------------------------------------------------------------
245
  # reset
246
  # ------------------------------------------------------------------
@@ -249,8 +273,11 @@ class EnterpriseEnvironment(Environment):
249
  self,
250
  scenario: Optional[str] = None,
251
  seed: Optional[int] = None,
 
252
  ) -> EnterpriseObservation:
253
  """Reset the environment. Returns manager-agent view as primary obs."""
 
 
254
  kwargs: dict[str, Any] = {}
255
  if scenario:
256
  if not scenario.endswith(".yaml"):
@@ -278,7 +305,11 @@ class EnterpriseEnvironment(Environment):
278
  # step (single-agent primary interface)
279
  # ------------------------------------------------------------------
280
 
281
- def step(self, action: EnterpriseAction) -> dict[str, Any]:
 
 
 
 
282
  """
283
  Execute one environment step.
284
 
@@ -290,13 +321,16 @@ class EnterpriseEnvironment(Environment):
290
  dict with keys: observation, reward, done, info
291
  """
292
 
 
 
 
293
  # -- 1. Hard cap ----------------------------------------------------
294
  if self._done or self._step_count >= self._max_steps:
295
  self._done = True
296
  fallback_obs = next(iter(self._current_obs.values()))
297
  obs = _obs_to_openenv(fallback_obs, {}, 0.0, True)
298
  return {"observation": obs, "reward": 0.0, "done": True,
299
- "info": {"reason": "max_steps_exceeded"}}
300
 
301
  start_time = time.time()
302
  reward_penalty = 0.0
@@ -344,7 +378,11 @@ class EnterpriseEnvironment(Environment):
344
  action.agent_id, json.dumps(action.to_dict(), default=str),
345
  TIMEOUT_PENALTY, True, json.dumps({"elapsed": elapsed}))
346
  return {"observation": obs, "reward": TIMEOUT_PENALTY, "done": True,
347
- "info": {"reason": "timeout", "elapsed_s": elapsed}}
 
 
 
 
348
 
349
  # -- 6. Compute reward ----------------------------------------------
350
  base_reward = self.reward_fn(inner_result, all_actions)
@@ -400,6 +438,7 @@ class EnterpriseEnvironment(Environment):
400
  "oversight_flags": inner_result.info.get("oversight_flags", []),
401
  "schema_version": inner_result.info.get("schema_version", 1),
402
  "tool_results": inner_result.info.get("tool_results", {}),
 
403
  },
404
  }
405
 
 
55
  )
56
  from env.env import EnterpriseOpsEnv
57
  from agents import ITAgent, ManagerAgent, FinanceAgent
58
+ from agents.trained_agent import TrainedITAgent
59
  from models import EnterpriseAction, EnterpriseObservation
60
 
61
  # ---------------------------------------------------------------------------
 
217
  # -- Pluggable reward - INTERFACE STABLE (Ayush overrides this) -----
218
  self.reward_fn: Callable[..., float] = default_reward_fn
219
 
220
+ # -- IT: rule-based by default; optional HuggingFace LoRA via set_use_trained_it()
221
+ self._rule_it_agent: Any = ITAgent()
222
+ self._trained_it_agent: Any | None = None
223
+ self._use_trained_it: bool = False
224
+
225
  # -- Rule-based fallback agents for untrained roles -----------------
226
  self._fallback_agents: dict[str, Any] = {
227
+ AGENT_IT: self._rule_it_agent,
228
  AGENT_MANAGER: ManagerAgent(AGENT_MANAGER),
229
  AGENT_FINANCE: FinanceAgent(AGENT_FINANCE),
230
  }
 
247
  f"max_steps={max_steps} | db={db_path}"
248
  )
249
 
250
def set_use_trained_it(self, use: bool) -> None:
    """Select which agent serves the IT fallback slot.

    A truthy ``use`` installs the LoRA-backed ``TrainedITAgent``, which is
    constructed on first request and reused afterwards. A falsy ``use``
    restores the rule-based ``ITAgent``.
    """
    self._use_trained_it = use

    if not use:
        self._fallback_agents[AGENT_IT] = self._rule_it_agent
        return

    # Build the trained agent lazily so it is only constructed on demand.
    if self._trained_it_agent is None:
        self._trained_it_agent = TrainedITAgent()
    self._fallback_agents[AGENT_IT] = self._trained_it_agent
259
+
260
def it_agent_status(self) -> str:
    """Return a one-line description of the active IT agent for the UI."""
    if not self._use_trained_it:
        return "Rule-based agents active"

    trained = self._trained_it_agent
    lora_ready = trained is not None and trained.model is not None
    if lora_ready:
        return "Trained LoRA model active (HuggingFace)"

    # Trained mode was requested but no model is loaded.
    return "Trained mode on — LoRA not loaded, using rule-based fallback"
267
+
268
  # ------------------------------------------------------------------
269
  # reset
270
  # ------------------------------------------------------------------
 
273
  self,
274
  scenario: Optional[str] = None,
275
  seed: Optional[int] = None,
276
+ use_trained_model: bool = False,
277
  ) -> EnterpriseObservation:
278
  """Reset the environment. Returns manager-agent view as primary obs."""
279
+ self.set_use_trained_it(use_trained_model)
280
+
281
  kwargs: dict[str, Any] = {}
282
  if scenario:
283
  if not scenario.endswith(".yaml"):
 
305
  # step (single-agent primary interface)
306
  # ------------------------------------------------------------------
307
 
308
+ def step(
309
+ self,
310
+ action: EnterpriseAction,
311
+ use_trained_model: Optional[bool] = None,
312
+ ) -> dict[str, Any]:
313
  """
314
  Execute one environment step.
315
 
 
321
  dict with keys: observation, reward, done, info
322
  """
323
 
324
+ if use_trained_model is not None:
325
+ self.set_use_trained_it(use_trained_model)
326
+
327
  # -- 1. Hard cap ----------------------------------------------------
328
  if self._done or self._step_count >= self._max_steps:
329
  self._done = True
330
  fallback_obs = next(iter(self._current_obs.values()))
331
  obs = _obs_to_openenv(fallback_obs, {}, 0.0, True)
332
  return {"observation": obs, "reward": 0.0, "done": True,
333
+ "info": {"reason": "max_steps_exceeded", "it_agent_status": self.it_agent_status()}}
334
 
335
  start_time = time.time()
336
  reward_penalty = 0.0
 
378
  action.agent_id, json.dumps(action.to_dict(), default=str),
379
  TIMEOUT_PENALTY, True, json.dumps({"elapsed": elapsed}))
380
  return {"observation": obs, "reward": TIMEOUT_PENALTY, "done": True,
381
+ "info": {
382
+ "reason": "timeout",
383
+ "elapsed_s": elapsed,
384
+ "it_agent_status": self.it_agent_status(),
385
+ }}
386
 
387
  # -- 6. Compute reward ----------------------------------------------
388
  base_reward = self.reward_fn(inner_result, all_actions)
 
438
  "oversight_flags": inner_result.info.get("oversight_flags", []),
439
  "schema_version": inner_result.info.get("schema_version", 1),
440
  "tool_results": inner_result.info.get("tool_results", {}),
441
+ "it_agent_status": self.it_agent_status(),
442
  },
443
  }
444
 
gradio_app.py CHANGED
@@ -1,6 +1,9 @@
1
  from __future__ import annotations
2
 
3
  import json
 
 
 
4
  from typing import Any
5
 
6
  import gradio as gr
@@ -9,6 +12,9 @@ import requests
9
  BASE_URL = "http://localhost:7860"
10
  TIMEOUT = 45
11
 
 
 
 
12
  SCENARIO_CHOICES = [
13
  ("Scenario 1", "scenario_01"),
14
  ("Scenario 2", "scenario_02"),
@@ -59,31 +65,49 @@ def _request(method: str, path: str, payload: dict[str, Any] | None = None) -> d
59
  return response.json()
60
 
61
 
62
- def _reset_episode(scenario_name: str) -> tuple[str, str, str]:
63
- data = _request("post", "/reset", {"scenario": scenario_name})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  observation = data.get("observation", {})
65
  formatted = _pretty(observation)
66
- return formatted, formatted, "Active"
 
67
 
68
 
69
  def _step_episode(
 
70
  agent_id: str,
71
  tool_call: str,
72
  tool_params_json: str,
73
  message_to: str,
74
  message_content: str,
75
  reasoning: str,
76
- ) -> tuple[str, str, str, str]:
77
  try:
78
  tool_params = json.loads(tool_params_json) if tool_params_json.strip() else {}
79
  if not isinstance(tool_params, dict):
80
  raise ValueError("Tool params must decode to a JSON object.")
81
  except Exception as exc:
82
  error_text = f"Invalid tool params JSON: {exc}"
83
- return error_text, error_text, "0.0", "Active"
84
 
85
- payload = {
86
  "agent_id": agent_id,
 
87
  "tool_call": tool_call or None,
88
  "tool_params": tool_params,
89
  "message_to": message_to or None,
@@ -94,8 +118,10 @@ def _step_episode(
94
  observation = data.get("observation", {})
95
  formatted = _pretty(observation)
96
  reward = f"{data.get('reward', 0.0):.3f}"
97
- status = "Done" if data.get("done") else "Active"
98
- return formatted, formatted, reward, status
 
 
99
 
100
 
101
  def _load_world_state() -> str:
@@ -119,49 +145,103 @@ with gr.Blocks(theme=gr.themes.Monochrome(), title="EnterpriseOps Arena - Meta P
119
  )
120
 
121
  with gr.Row():
122
- with gr.Column(scale=1):
123
- gr.Markdown("## Reset Panel")
124
- scenario = gr.Dropdown(
125
- choices=SCENARIO_CHOICES,
126
- value="scenario_01",
127
- label="Scenario",
128
- )
129
- reset_button = gr.Button("Reset Episode", variant="primary")
130
- reset_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
131
-
132
- with gr.Column(scale=1):
133
- gr.Markdown("## Step Panel")
134
- agent_id = gr.Dropdown(
135
- choices=AGENT_CHOICES,
136
- value="it_agent",
137
- label="Agent",
138
- )
139
- tool_call = gr.Dropdown(
140
- choices=TOOL_CHOICES,
141
- value="get_tickets",
142
- label="Tool",
143
- )
144
- tool_params = gr.Textbox(
145
- label="Tool params JSON",
146
- lines=8,
147
- value=_preset_tool_params("get_tickets"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  )
149
- message_to = gr.Textbox(label="Message To", placeholder="manager_agent")
150
- message_content = gr.Textbox(label="Message Content", lines=3)
151
- reasoning = gr.Textbox(label="Reasoning", lines=3)
152
- step_button = gr.Button("Step Episode", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- with gr.Row():
155
- with gr.Column(scale=1):
156
- gr.Markdown("## Results Panel")
157
- result_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
158
- reward_score = gr.Textbox(label="Reward Score", value="0.0", interactive=False)
159
- episode_status = gr.Textbox(label="Episode Status", value="Active", interactive=False)
160
 
161
- with gr.Column(scale=1):
162
- gr.Markdown("## World State")
163
- state_button = gr.Button("Load World State", variant="secondary")
164
- world_state = gr.Textbox(label="State", lines=20, interactive=False)
 
165
 
166
  tool_call.change(
167
  fn=_preset_tool_params,
@@ -170,16 +250,37 @@ with gr.Blocks(theme=gr.themes.Monochrome(), title="EnterpriseOps Arena - Meta P
170
  )
171
  reset_button.click(
172
  fn=_reset_episode,
173
- inputs=scenario,
174
- outputs=[reset_observation, result_observation, episode_status],
175
  )
176
  step_button.click(
177
  fn=_step_episode,
178
- inputs=[agent_id, tool_call, tool_params, message_to, message_content, reasoning],
179
- outputs=[reset_observation, result_observation, reward_score, episode_status],
 
 
 
 
 
 
 
 
180
  )
181
  state_button.click(fn=_load_world_state, inputs=None, outputs=world_state)
182
 
183
 
184
  if __name__ == "__main__":
185
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
  from typing import Any
8
 
9
  import gradio as gr
 
12
  BASE_URL = "http://localhost:7860"
13
  TIMEOUT = 45
14
 
15
+ _SC_DIR = Path(__file__).resolve().parent
16
+ _REWARD_IMAGE = _SC_DIR / "reward_curves.png"
17
+
18
  SCENARIO_CHOICES = [
19
  ("Scenario 1", "scenario_01"),
20
  ("Scenario 2", "scenario_02"),
 
65
  return response.json()
66
 
67
 
68
+ def _default_status(use_trained: bool) -> str:
69
+ return (
70
+ "Trained mode selected (applies on next server contact)"
71
+ if use_trained
72
+ else "Rule-based agents active"
73
+ )
74
+
75
+
76
def _reset_episode(
    use_trained: bool,
    scenario_name: str,
) -> tuple[str, str, str, str]:
    """POST to ``/reset`` and return the values for the reset outputs.

    Returns the formatted observation twice (once per observation box), the
    literal episode status ``"Active"``, and the IT-agent status reported by
    the server, or the local default text when the response omits it.
    """
    request_body = {"scenario": scenario_name, "use_trained_model": use_trained}
    data = _request("post", "/reset", request_body)

    rendered = _pretty(data.get("observation", {}))
    fallback_status = _default_status(use_trained)
    agent_status = data.get("it_agent_status", fallback_status)

    return rendered, rendered, "Active", agent_status
89
 
90
 
91
  def _step_episode(
92
+ use_trained: bool,
93
  agent_id: str,
94
  tool_call: str,
95
  tool_params_json: str,
96
  message_to: str,
97
  message_content: str,
98
  reasoning: str,
99
+ ) -> tuple[str, str, str, str, str]:
100
  try:
101
  tool_params = json.loads(tool_params_json) if tool_params_json.strip() else {}
102
  if not isinstance(tool_params, dict):
103
  raise ValueError("Tool params must decode to a JSON object.")
104
  except Exception as exc:
105
  error_text = f"Invalid tool params JSON: {exc}"
106
+ return error_text, error_text, "0.0", "Active", _default_status(use_trained)
107
 
108
+ payload: dict[str, Any] = {
109
  "agent_id": agent_id,
110
+ "use_trained_model": use_trained,
111
  "tool_call": tool_call or None,
112
  "tool_params": tool_params,
113
  "message_to": message_to or None,
 
118
  observation = data.get("observation", {})
119
  formatted = _pretty(observation)
120
  reward = f"{data.get('reward', 0.0):.3f}"
121
+ done = data.get("done", False)
122
+ ep_status = "Done" if done else "Active"
123
+ status = (data.get("info") or {}).get("it_agent_status") or _default_status(use_trained)
124
+ return formatted, formatted, reward, ep_status, status
125
 
126
 
127
  def _load_world_state() -> str:
 
145
  )
146
 
147
  with gr.Row():
148
+ use_trained_model = gr.Checkbox(
149
+ label="🤖 Use Trained LoRA Model (vs Rule-based)",
150
+ value=False,
151
+ info="Uses Qwen2.5-3B trained on 700 steps of GRPO",
152
+ )
153
+ model_status = gr.Textbox(
154
+ label="Model Status",
155
+ value="Rule-based agents active",
156
+ interactive=False,
157
+ )
158
+
159
+ with gr.Tabs():
160
+ with gr.Tab("Arena"):
161
+ with gr.Row():
162
+ with gr.Column(scale=1):
163
+ gr.Markdown("## Reset Panel")
164
+ scenario = gr.Dropdown(
165
+ choices=SCENARIO_CHOICES,
166
+ value="scenario_01",
167
+ label="Scenario",
168
+ )
169
+ reset_button = gr.Button("Reset Episode", variant="primary")
170
+ reset_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
171
+
172
+ with gr.Column(scale=1):
173
+ gr.Markdown("## Step Panel")
174
+ agent_id = gr.Dropdown(
175
+ choices=AGENT_CHOICES,
176
+ value="it_agent",
177
+ label="Agent",
178
+ )
179
+ tool_call = gr.Dropdown(
180
+ choices=TOOL_CHOICES,
181
+ value="get_tickets",
182
+ label="Tool",
183
+ )
184
+ tool_params = gr.Textbox(
185
+ label="Tool params JSON",
186
+ lines=8,
187
+ value=_preset_tool_params("get_tickets"),
188
+ )
189
+ message_to = gr.Textbox(label="Message To", placeholder="manager_agent")
190
+ message_content = gr.Textbox(label="Message Content", lines=3)
191
+ reasoning = gr.Textbox(label="Reasoning", lines=3)
192
+ step_button = gr.Button("Step Episode", variant="primary")
193
+
194
+ with gr.Row():
195
+ with gr.Column(scale=1):
196
+ gr.Markdown("## Results Panel")
197
+ result_observation = gr.Textbox(label="Observation", lines=12, interactive=False)
198
+ reward_score = gr.Textbox(label="Reward Score", value="0.0", interactive=False)
199
+ episode_status = gr.Textbox(label="Episode Status", value="Active", interactive=False)
200
+
201
+ with gr.Column(scale=1):
202
+ gr.Markdown("## World State")
203
+ state_button = gr.Button("Load World State", variant="secondary")
204
+ world_state = gr.Textbox(label="State", lines=20, interactive=False)
205
+
206
+ with gr.Tab("Training Evidence"):
207
+ gr.Markdown(
208
+ """
209
+ ## Real GRPO Training Results
210
+ 700 steps across 3 runs on Tesla T4 GPU
211
+ """
212
  )
213
+ _img_val = str(_REWARD_IMAGE) if _REWARD_IMAGE.is_file() else None
214
+ if _img_val is not None:
215
+ gr.Image(
216
+ value=_img_val,
217
+ label="Training Curves (700 steps)",
218
+ )
219
+ else:
220
+ gr.Markdown(
221
+ f"_Plot not found. Add `reward_curves.png` in `{_SC_DIR.as_posix()}` to show training curves._"
222
+ )
223
+ gr.Markdown(
224
+ r"""
225
+ | Metric | Value |
226
+ |--------|-------|
227
+ | Peak Episode Score | 114 (+77%) |
228
+ | Task Completion | 35 → 75 (+114%) |
229
+ | GRPO reward_std | 0.5 (variance confirmed) |
230
+ | Scenarios Completed | All 8 automatically |
231
+ | Backtracking | Triggered 2x (MARL adaptive) |
232
+ | Model | Qwen2.5-3B-Instruct 4-bit LoRA |
233
 
234
+ ## Trained Model
235
+ 🤖 [Anurag137/enterprise-ops-lora](https://huggingface.co/Anurag137/enterprise-ops-lora)
236
+
237
+ ## Experiment Tracking
238
+ 📊 [View on Weights & Biases](https://wandb.ai/kanhaiyakumar76618-indian-institute-of-information-techn/enterprise-ops-arena)
 
239
 
240
+ ## Before vs After Training
241
+ **Before:** Agent outputs wrong tool names, missing ticket_id
242
+ **After:** Correct tool calls, SLA-aware reasoning, specific ticket references
243
+ """
244
+ )
245
 
246
  tool_call.change(
247
  fn=_preset_tool_params,
 
250
  )
251
  reset_button.click(
252
  fn=_reset_episode,
253
+ inputs=[use_trained_model, scenario],
254
+ outputs=[reset_observation, result_observation, episode_status, model_status],
255
  )
256
  step_button.click(
257
  fn=_step_episode,
258
+ inputs=[
259
+ use_trained_model,
260
+ agent_id,
261
+ tool_call,
262
+ tool_params,
263
+ message_to,
264
+ message_content,
265
+ reasoning,
266
+ ],
267
+ outputs=[reset_observation, result_observation, reward_score, episode_status, model_status],
268
  )
269
  state_button.click(fn=_load_world_state, inputs=None, outputs=world_state)
270
 
271
 
272
if __name__ == "__main__":
    # Run FastAPI and Gradio in one process so /reset and /step are served.
    # This path needs uvicorn; without it only the Gradio UI is launched.
    _server = Path(__file__).resolve().parent
    os.chdir(_server)

    # Put this directory, then its parent, at the front of sys.path.
    for _entry in (str(_server), str(_server.parent)):
        if _entry not in sys.path:
            sys.path.insert(0, _entry)

    try:
        import uvicorn
    except ImportError:
        print("[gradio_app] uvicorn not installed; launching Gradio UI only. API routes (/reset, /step) will not work without running: uvicorn app:app", flush=True)
        demo.launch(server_name="0.0.0.0", server_port=7860)
    else:
        uvicorn.run("app:app", host="0.0.0.0", port=7860, factory=False, reload=False)
requirements.txt CHANGED
@@ -2,12 +2,15 @@ openenv-core
2
  fastapi
3
  uvicorn[standard]
4
  pydantic>=2.0
5
- httpx
6
  sqlalchemy
7
  aiosqlite
8
- pyyaml
9
  numpy
10
- python-dotenv
11
- pytest
12
  gradio
13
- matplotlib
 
 
 
 
 
 
2
  fastapi
3
  uvicorn[standard]
4
  pydantic>=2.0
5
+ pyyaml
6
  sqlalchemy
7
  aiosqlite
 
8
  numpy
9
+ httpx
 
10
  gradio
11
+ requests
12
+ peft>=0.6.0
13
+ transformers>=4.35.0
14
+ accelerate>=0.24.0
15
+ bitsandbytes>=0.41.0
16
+ huggingface_hub>=0.20.0