Wrap audio placeholders with <so_start>/<so_end> tokens
Browse files
Audio context is now expanded as <so_start>{<|audio_placeholder|>*N}<so_end>
instead of bare repeated placeholders, mirroring the <img>...</img>
wrapping used for vision tokens and matching vLLM's audio prompt format.
Signed-off-by: Chen Cui <chcui@nvidia.com>
- processing.py +3 -1
processing.py
CHANGED
|
@@ -91,6 +91,8 @@ class NemotronH_Nano_Omni_Reasoning_V3Processor(ProcessorMixin):
|
|
| 91 |
self.image_token = "<image>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 92 |
self.video_token = "<video>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 93 |
self.audio_token = "<so_embedding>" if not hasattr(tokenizer, "audio_token") else tokenizer.audio_token
|
|
|
|
|
|
|
| 94 |
self.image_start_token = "<img>" if not hasattr(tokenizer, "image_start_token") else tokenizer.image_start_token
|
| 95 |
self.image_end_token = "</img>" if not hasattr(tokenizer, "image_end_token") else tokenizer.image_end_token
|
| 96 |
self.image_token_id = (
|
|
@@ -305,7 +307,7 @@ class NemotronH_Nano_Omni_Reasoning_V3Processor(ProcessorMixin):
|
|
| 305 |
while self.audio_token in text[i]:
|
| 306 |
num_tokens = audio_num_tokens[index] if index < len(audio_num_tokens) else 1
|
| 307 |
# Replace <audio> with repeated audio tokens
|
| 308 |
-
text[i] = text[i].replace(self.audio_token, "<|audio_placeholder|>" * num_tokens, 1)
|
| 309 |
index += 1
|
| 310 |
text[i] = text[i].replace("<|audio_placeholder|>", self.audio_token)
|
| 311 |
|
|
|
|
| 91 |
self.image_token = "<image>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 92 |
self.video_token = "<video>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 93 |
self.audio_token = "<so_embedding>" if not hasattr(tokenizer, "audio_token") else tokenizer.audio_token
|
| 94 |
+
self.audio_start_token = "<so_start>"
|
| 95 |
+
self.audio_end_token = "<so_end>"
|
| 96 |
self.image_start_token = "<img>" if not hasattr(tokenizer, "image_start_token") else tokenizer.image_start_token
|
| 97 |
self.image_end_token = "</img>" if not hasattr(tokenizer, "image_end_token") else tokenizer.image_end_token
|
| 98 |
self.image_token_id = (
|
|
|
|
| 307 |
while self.audio_token in text[i]:
|
| 308 |
num_tokens = audio_num_tokens[index] if index < len(audio_num_tokens) else 1
|
| 309 |
# Replace <audio> with repeated audio tokens
|
| 310 |
+
text[i] = text[i].replace(self.audio_token, self.audio_start_token + "<|audio_placeholder|>" * num_tokens + self.audio_end_token, 1)
|
| 311 |
index += 1
|
| 312 |
text[i] = text[i].replace("<|audio_placeholder|>", self.audio_token)
|
| 313 |
|