hypnonyx committed (verified)
Commit 1bb3ad9 · 1 Parent(s): 0573853

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
TrafficoDataset/dataset_traffico.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
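Each record follows the instruction / input / output schema that the training script below consumes (field names taken from the script; the record shown here is illustrative, not drawn from the dataset):

{"instruction": "Analizza il seguente flusso di traffico di rete TCP/IP. Classifica se è traffico normale o un attacco. Se è un attacco, indica la categoria e la tecnica MITRE ATT&CK corrispondente.", "input": "Protocollo: tcp | Porta dst: 22 | Byte src: 60 | Byte dst: 0 | Pacchetti: 1 | Durata: 0.002s", "output": "Attacco: PortScan. Tecnica MITRE ATT&CK: T1046 (Network Service Discovery)."}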
TrafficoDataset/train_gemma3_traffico.py ADDED
@@ -0,0 +1,241 @@
+ # ============================================================
+ # Gemma-3-270M – TCP/IP Network Traffic Analysis
+ # Unsloth + LoRA + JSONL dataset (CIC-IDS2017 + UNSW-NB15)
+ # ============================================================
+ # Structure based on your script, adapted to the network
+ # traffic analysis domain with MITRE ATT&CK mapping.
+ #
+ # PREREQUISITES:
+ # Google Colab with a GPU runtime (a T4 is enough)
+ # !pip install --no-deps unsloth
+ # !pip install transformers datasets trl peft accelerate sentencepiece
+ #
+ # REQUIRED FILES (in the same folder as this script):
+ # dataset_traffico.jsonl  ← generated by the companion dataset-builder script
+ # ============================================================
+
+ # ---------- INSTALL (Colab) ----------
+ # !pip install --no-deps unsloth
+ # !pip install transformers datasets trl peft accelerate sentencepiece
+
+ # ---------- IMPORTS ----------
+ from unsloth import FastModel
+ from unsloth.chat_templates import get_chat_template, train_on_responses_only
+ import torch
+ from datasets import load_dataset
+ from trl import SFTTrainer, SFTConfig
+
+ # ---------- CONFIG ----------
+ MODEL_NAME = "unsloth/gemma-3-270m-it"
+ DATASET_PATH = "dataset_traffico.jsonl"  # <== the JSONL we generated
+ OUTPUT_DIR = "outputs"
+ MAX_SEQ_LENGTH = 512  # 512 is enough for these prompts and saves memory
+
+ # ---------- LOAD MODEL ----------
+ model, tokenizer = FastModel.from_pretrained(
+     model_name = MODEL_NAME,
+     max_seq_length = MAX_SEQ_LENGTH,
+     load_in_4bit = False,
+     load_in_8bit = False,
+     full_finetuning = False,
+ )
+
+ # ---------- LoRA ----------
+ # A more aggressive rank (r=64) for a narrow domain like this one.
+ # Target modules: all the transformer projection layers.
+ model = FastModel.get_peft_model(
+     model,
+     r = 64,
+     target_modules = [
+         "q_proj", "k_proj", "v_proj", "o_proj",
+         "gate_proj", "up_proj", "down_proj",
+     ],
+     lora_alpha = 64,
+     lora_dropout = 0,
+     bias = "none",
+     use_gradient_checkpointing = "unsloth",
+     random_state = 3407,
+ )
+
+ # ---------- CHAT TEMPLATE (Gemma-3) ----------
+ tokenizer = get_chat_template(
+     tokenizer,
+     chat_template = "gemma3",
+ )
+
+ # ---------- LOAD DATASET ----------
+ dataset = load_dataset(
+     "json",
+     data_files = DATASET_PATH,
+     split = "train",
+ )
+
+ print(f"Dataset loaded: {len(dataset)} rows")
+ print(f"Fields present: {dataset.column_names}")
+ print("\nExample row 0:")
+ print(dataset[0])
+
+ # ---------- CONVERT TO CHATML ----------
+ # The JSONL has the fields: instruction, input, output.
+ # We convert them into the conversations format [system, user, assistant]
+ # that Gemma-3 expects.
+ def convert_to_chatml(example):
+     system_prompt = example["instruction"]
+
+     # If there is a 'context' field, append it to the system prompt
+     if "context" in example and example["context"]:
+         system_prompt += f"\nContesto: {example['context']}."
+
+     return {
+         "conversations": [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": example["input"]},
+             {"role": "assistant", "content": example["output"]},
+         ]
+     }
+
+ dataset = dataset.map(convert_to_chatml)
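+
+ # To make the mapping concrete, an illustrative record (hypothetical values,
+ # same shape as the dataset) such as
+ #   {"instruction": "Analizza il flusso ...", "input": "Protocollo: tcp | ...",
+ #    "output": "Attacco: PortScan ..."}
+ # comes out of convert_to_chatml as
+ #   {"conversations": [
+ #       {"role": "system",    "content": "Analizza il flusso ..."},
+ #       {"role": "user",      "content": "Protocollo: tcp | ..."},
+ #       {"role": "assistant", "content": "Attacco: PortScan ..."}]}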
+
+ # ---------- APPLY GEMMA-3 TEMPLATE ----------
+ # Apply the Gemma-3 chat template to every example.
+ # This produces the final string the model will see during training.
+ def formatting_prompts_func(examples):
+     convos = examples["conversations"]
+     texts = [
+         tokenizer.apply_chat_template(
+             convo,
+             tokenize = False,
+             add_generation_prompt = False,
+         ).removeprefix("<bos>")
+         for convo in convos
+     ]
+     return {"text": texts}
+
+ dataset = dataset.map(formatting_prompts_func, batched=True)
+
+ # Check what a formatted prompt looks like
+ print("\n" + "=" * 60)
+ print(" FORMATTED PROMPT (example)")
+ print("=" * 60)
+ print(dataset[0]["text"])
+ print("=" * 60)
+
+ # ---------- TRAINER ----------
+ trainer = SFTTrainer(
+     model = model,
+     tokenizer = tokenizer,
+     train_dataset = dataset,
+     eval_dataset = None,
+     args = SFTConfig(
+         dataset_text_field = "text",
+         per_device_train_batch_size = 4,
+         gradient_accumulation_steps = 4,  # effective batch = 4 * 4 = 16
+         warmup_steps = 10,
+         max_steps = 500,  # 500 steps * 16 examples ≈ 8k examples, just under one epoch on 10k rows
+         learning_rate = 2e-5,
+         logging_steps = 25,
+         optim = "adamw_8bit",
+         weight_decay = 0.001,
+         lr_scheduler_type = "linear",
+         seed = 3407,
+         output_dir = OUTPUT_DIR,
+         report_to = "none",
+     ),
+ )
+
+ # ---------- TRAIN ONLY ON ASSISTANT ----------
+ # Crucial: the model computes the loss ONLY on the assistant's reply,
+ # not on the prompt, so it does not "learn" to repeat the question.
+ trainer = train_on_responses_only(
+     trainer,
+     instruction_part = "<start_of_turn>user\n",
+     response_part = "<start_of_turn>model\n",
+ )
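+
+ # Sketch of the effect: in a rendered example
+ #   "<start_of_turn>user\n<prompt><end_of_turn>\n<start_of_turn>model\n<reply><end_of_turn>"
+ # train_on_responses_only masks the labels of every token up to and including
+ # "<start_of_turn>model\n" (set to -100), so cross-entropy is computed on <reply> only.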
+
+ # ---------- TRAIN ----------
+ trainer.train()
+
+ # ---------- SAVE LoRA ----------
+ model.save_pretrained("gemma3-traffico-rete-lora")
+ tokenizer.save_pretrained("gemma3-traffico-rete-lora")
+ print("\n✓ LoRA model saved to: gemma3-traffico-rete-lora/")
+ model.save_pretrained_merged(
+     "gemma3-traffico-rete-lora",  # output folder
+     tokenizer,
+     save_method = "merged_16bit",  # float16, needed for GGUF conversion
+ )
+ model.save_pretrained_gguf(
+     "gemma3-traffico-rete-lora",
+     tokenizer,
+     quantization_method = "BF16",  # for now only Q8_0, BF16, F16 are supported
+ )
+
+ # ---------- INFERENCE: TEST ----------
+ # After training, try the model on a few example flows.
+ from transformers import TextStreamer
+
+ test_cases = [
+     # Case 1: typical DoS profile (huge src byte volume, almost no dst bytes, minimal duration)
+     "Protocollo: tcp | Porta dst: 80 | Byte src: 480000 | Byte dst: 40 | Pacchetti: 5200 | Durata: 0.015s",
+     # Case 2: normal HTTPS traffic
+     "Protocollo: tcp | Porta dst: 443 | Byte src: 1500 | Byte dst: 6200 | Pacchetti: 9 | Durata: 3.200s",
+     # Case 3: PortScan profile (many distinct destinations, few bytes, near-zero duration)
+     "Protocollo: tcp | Porta dst: 22 | Byte src: 60 | Byte dst: 0 | Pacchetti: 1 | Durata: 0.002s",
+     # Case 4: SSH brute-force profile
+     "Protocollo: tcp | Porta dst: 22 | Byte src: 3200 | Byte dst: 8500 | Pacchetti: 45 | Durata: 1.800s",
+     # Case 5: infiltration / data-exfiltration profile
+     "Protocollo: tcp | Porta dst: 443 | Byte src: 8000 | Byte dst: 120000 | Pacchetti: 200 | Durata: 25.500s",
+ ]
+
+ streamer = TextStreamer(tokenizer, skip_prompt=True)
+
+ for i, test_input in enumerate(test_cases, 1):
+     messages = [
+         {
+             "role": "system",
+             "content": (
+                 "Analizza il seguente flusso di traffico di rete TCP/IP. "
+                 "Classifica se è traffico normale o un attacco. "
+                 "Se è un attacco, indica la categoria e la tecnica MITRE ATT&CK corrispondente."
+             ),
+         },
+         {"role": "user", "content": test_input},
+     ]
+
+     text = tokenizer.apply_chat_template(
+         messages,
+         tokenize = False,
+         add_generation_prompt = True,
+     ).removeprefix("<bos>")
+
+     print(f"\n{'─' * 60}")
+     print(f" TEST {i}: {test_input[:80]}...")
+     print(f"{'─' * 60}")
+     print(" Response: ", end="")
+
+     _ = model.generate(
+         **tokenizer(text, return_tensors="pt").to("cuda"),
+         max_new_tokens = 128,
+         temperature = 0.3,  # low temperature = more deterministic answers
+         top_p = 0.9,
+         top_k = 40,
+         streamer = streamer,
+     )
+
+ # ---------- SAVE MERGED (optional) ----------
+ # Merges the LoRA weights into the base model and saves it as a full model.
+ # Useful for deployment without a PEFT dependency.
+ model.save_pretrained_merged(
+     "gemma3-traffico-rete-merged",
+     tokenizer,
+     save_method = "merged_16bit",
+ )
+
+ # ---------- SAVE GGUF (optional) ----------
+ # GGUF format for local inference with llama.cpp / Ollama.
+ model.save_pretrained_gguf(
+     "gemma3-traffico-rete-gguf",
+     tokenizer,
+     quantization_method = "Q8_0",  # Q8_0 = good quality/size trade-off
+ )
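To reuse the fine-tune in a later session, here is a minimal reload sketch. It assumes the gemma3-traffico-rete-lora directory saved above, and deliberately uses plain transformers + peft so inference does not depend on Unsloth:

# Reload sketch (assumption: adapter saved in "gemma3-traffico-rete-lora").
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model, then attach the saved LoRA adapter on top.
base = AutoModelForCausalLM.from_pretrained(
    "unsloth/gemma-3-270m-it",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("gemma3-traffico-rete-lora")
model = PeftModel.from_pretrained(base, "gemma3-traffico-rete-lora")

# Same flow format as the training data.
messages = [{"role": "user", "content":
    "Protocollo: tcp | Porta dst: 22 | Byte src: 60 | Byte dst: 0 | Pacchetti: 1 | Durata: 0.002s"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))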
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<image_soft_token>": 262144
+ }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
+
+ {%- if messages[0]['role'] == 'system' -%}
+     {%- if messages[0]['content'] is string -%}
+         {%- set first_user_prefix = messages[0]['content'] + '
+
+ ' -%}
+     {%- else -%}
+         {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+
+ ' -%}
+     {%- endif -%}
+     {%- set loop_messages = messages[1:] -%}
+ {%- else -%}
+     {%- set first_user_prefix = "" -%}
+     {%- set loop_messages = messages -%}
+ {%- endif -%}
+ {%- for message in loop_messages -%}
+     {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+         {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+     {%- endif -%}
+     {%- if (message['role'] == 'assistant') -%}
+         {%- set role = "model" -%}
+     {%- else -%}
+         {%- set role = message['role'] -%}
+     {%- endif -%}
+     {{ '<start_of_turn>' + role + '
+ ' + (first_user_prefix if loop.first else "") }}
+     {%- if message['content'] is string -%}
+         {{ message['content'] | trim }}
+     {%- elif message['content'] is iterable -%}
+         {%- for item in message['content'] -%}
+             {%- if item['type'] == 'image' -%}
+                 {{ '<start_of_image>' }}
+             {%- elif item['type'] == 'text' -%}
+                 {{ item['text'] | trim }}
+             {%- endif -%}
+         {%- endfor -%}
+     {%- else -%}
+         {{ raise_exception("Invalid content type") }}
+     {%- endif -%}
+     {{ '<end_of_turn>
+ ' }}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+     {{ '<start_of_turn>model
+ ' }}
+ {%- endif -%}
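For a quick sanity check: given a system + user exchange, the template above folds the system text into the first user turn, renames the assistant role to model, and (with add_generation_prompt) ends with an open model turn, so the rendered string looks like:

<start_of_turn>user
{system prompt}

{user message}<end_of_turn>
<start_of_turn>model

(A <bos> may additionally be prepended by the tokenizer-side template, which is why the training script strips it with removeprefix("<bos>").)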
config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "_sliding_window_pattern": 6,
+   "architectures": [
+     "Gemma3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "attn_logit_softcapping": null,
+   "bos_token_id": 2,
+   "torch_dtype": "bfloat16",
+   "eos_token_id": 106,
+   "final_logit_softcapping": null,
+   "head_dim": 256,
+   "hidden_activation": "gelu_pytorch_tanh",
+   "hidden_size": 640,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "layer_types": [
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "sliding_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "model_type": "gemma3_text",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 18,
+   "num_key_value_heads": 1,
+   "pad_token_id": 0,
+   "query_pre_attn_scalar": 256,
+   "rms_norm_eps": 1e-06,
+   "rope_local_base_freq": 10000.0,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": 512,
+   "transformers_version": "4.57.3",
+   "unsloth_fixed": true,
+   "unsloth_version": "2026.1.4",
+   "use_bidirectional_attention": false,
+   "use_cache": true,
+   "vocab_size": 262144
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1debdbdbc6a711e1abf5f0285bda7fd2a7a93805ae3c4aa986012a7bd2eac39a
+ size 536223056
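The size is consistent with the config.json above: about 268M parameters at 2 bytes each (bfloat16) gives roughly 536 MB. A back-of-the-envelope sketch (tied embeddings, norm weights omitted):

# Parameter count implied by config.json (Gemma3ForCausalLM).
vocab, hidden, layers = 262_144, 640, 18
heads, kv_heads, head_dim, inter = 4, 1, 256, 2_048

embed = vocab * hidden                         # token embeddings, ~167.8M (tied with lm_head)
attn = (hidden * heads * head_dim              # q_proj
        + 2 * hidden * kv_heads * head_dim     # k_proj + v_proj (GQA, one KV head)
        + heads * head_dim * hidden)           # o_proj
mlp = 3 * hidden * inter                       # gate_proj + up_proj + down_proj
total = embed + layers * (attn + mlp)
print(f"~{total / 1e6:.0f}M params, ~{total * 2 / 1e6:.0f} MB in bf16")
# -> ~268M params, ~536 MB: matches the 536,223,056-byte file above.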
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "boi_token": "<start_of_image>",
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eoi_token": "<end_of_image>",
+   "eos_token": {
+     "content": "<end_of_turn>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<image_soft_token>",
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff