cuichenx committed on
Commit
8d669c8
·
1 Parent(s): 3fcea88

Wrap audio placeholders with <so_start>/<so_end> tokens

Browse files

Audio context is now expanded as <so_start>{<|audio_placeholder|>*N}<so_end>
instead of bare repeated placeholders, mirroring the <img>...</img>
wrapping used for vision tokens and matching vLLM's audio prompt format.

Signed-off-by: Chen Cui <chcui@nvidia.com>

Files changed (1) hide show
  1. processing.py +3 -1
processing.py CHANGED
@@ -91,6 +91,8 @@ class NemotronH_Nano_Omni_Reasoning_V3Processor(ProcessorMixin):
91
  self.image_token = "<image>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
92
  self.video_token = "<video>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
93
  self.audio_token = "<so_embedding>" if not hasattr(tokenizer, "audio_token") else tokenizer.audio_token
 
 
94
  self.image_start_token = "<img>" if not hasattr(tokenizer, "image_start_token") else tokenizer.image_start_token
95
  self.image_end_token = "</img>" if not hasattr(tokenizer, "image_end_token") else tokenizer.image_end_token
96
  self.image_token_id = (
@@ -305,7 +307,7 @@ class NemotronH_Nano_Omni_Reasoning_V3Processor(ProcessorMixin):
305
  while self.audio_token in text[i]:
306
  num_tokens = audio_num_tokens[index] if index < len(audio_num_tokens) else 1
307
  # Replace <audio> with repeated audio tokens
308
- text[i] = text[i].replace(self.audio_token, "<|audio_placeholder|>" * num_tokens, 1)
309
  index += 1
310
  text[i] = text[i].replace("<|audio_placeholder|>", self.audio_token)
311
 
 
91
  self.image_token = "<image>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
92
  self.video_token = "<video>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
93
  self.audio_token = "<so_embedding>" if not hasattr(tokenizer, "audio_token") else tokenizer.audio_token
94
+ self.audio_start_token = "<so_start>"
95
+ self.audio_end_token = "<so_end>"
96
  self.image_start_token = "<img>" if not hasattr(tokenizer, "image_start_token") else tokenizer.image_start_token
97
  self.image_end_token = "</img>" if not hasattr(tokenizer, "image_end_token") else tokenizer.image_end_token
98
  self.image_token_id = (
 
307
  while self.audio_token in text[i]:
308
  num_tokens = audio_num_tokens[index] if index < len(audio_num_tokens) else 1
309
  # Replace <audio> with repeated audio tokens
310
+ text[i] = text[i].replace(self.audio_token, self.audio_start_token + "<|audio_placeholder|>" * num_tokens + self.audio_end_token, 1)
311
  index += 1
312
  text[i] = text[i].replace("<|audio_placeholder|>", self.audio_token)
313