Biorrith committed on
Commit
91577dc
·
1 Parent(s): 70c093a
Files changed (2) hide show
  1. app.py +14 -66
  2. src/chatterbox/mtl_tts.py +1 -2
app.py CHANGED
@@ -22,7 +22,7 @@ import gradio as gr
22
  import numpy as np
23
  import torch
24
 
25
- from src.chatterbox.mtl_tts import SUPPORTED_LANGUAGES, ChatterboxMultilingualTTS
26
 
27
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
28
  print(f"🚀 Running on device: {DEVICE}")
@@ -62,23 +62,6 @@ def get_danish_voice_options() -> list[tuple[str, str]]:
62
  return [("Mic", "mic"), ("Nic", "nic")]
63
 
64
 
65
- def get_supported_languages_display() -> str:
66
- """Generate a formatted display of all supported languages."""
67
- language_items = []
68
- for code, name in sorted(SUPPORTED_LANGUAGES.items()):
69
- language_items.append(f"**{name}** (`{code}`)")
70
-
71
- # Split into 2 lines
72
- mid = len(language_items) // 2
73
- line1 = " • ".join(language_items[:mid])
74
- line2 = " • ".join(language_items[mid:])
75
-
76
- return f"""
77
- ### Supported Languages
78
- {line1}
79
-
80
- {line2}
81
- """
82
 
83
 
84
  def get_or_load_model():
@@ -135,7 +118,7 @@ def generate_tts_audio(
135
  ) -> tuple[int, np.ndarray]:
136
  """
137
  Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
138
- Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
139
 
140
  This tool synthesizes natural-sounding speech from input text. When a reference audio file
141
  is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
@@ -189,14 +172,11 @@ def generate_tts_audio(
189
  with gr.Blocks() as demo:
190
  gr.Markdown(
191
  """
192
- # Danish TTS Demo 🇩🇰
193
- Generate high-quality danish speech from text with reference audio styling.
194
- This is a preview of a model that was developed as part of the CoRal project, and is a finetuned version of the Chatterbox Multilingual.
195
  """
196
  )
197
 
198
- # Display supported languages
199
- gr.Markdown(get_supported_languages_display())
200
  with gr.Row():
201
  with gr.Column():
202
  initial_lang = "da"
@@ -206,19 +186,11 @@ with gr.Blocks() as demo:
206
  max_lines=5
207
  )
208
 
209
- language_id = gr.Dropdown(
210
- choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
211
- value=initial_lang,
212
- label="Language",
213
- info="Select the language for text-to-speech synthesis"
214
- )
215
-
216
  danish_voice = gr.Dropdown(
217
  choices=get_danish_voice_options(),
218
  value="mic",
219
- label="Danish Voice Selection",
220
- info="Choose between different Danish voice options",
221
- visible=(initial_lang == "da")
222
  )
223
 
224
  ref_wav = gr.Audio(
@@ -228,10 +200,10 @@ with gr.Blocks() as demo:
228
  value=default_audio_for_ui(initial_lang)
229
  )
230
 
231
- gr.Markdown(
232
- "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
233
- elem_classes=["audio-note"]
234
- )
235
 
236
  cfg_weight = gr.Slider(
237
  0.2, 1, step=.05, label="CFG/Pace", value=0.5
@@ -241,36 +213,12 @@ with gr.Blocks() as demo:
241
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
242
  temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
243
 
244
- run_btn = gr.Button("Generate", variant="primary")
245
-
246
- with gr.Column():
247
- audio_output = gr.Audio(label="Output Audio")
248
-
249
- def on_language_change(lang, current_ref, current_text):
250
- is_danish = (lang == "da")
251
- danish_voice_val = "mic" if is_danish else "mic" # Default to mic
252
- return (
253
- default_audio_for_ui(lang, danish_voice_val),
254
- default_text_for_ui(lang),
255
- gr.update(visible=is_danish), # Update Danish voice dropdown visibility
256
- danish_voice_val
257
- )
258
-
259
- def on_danish_voice_change(lang, danish_voice_val):
260
- if lang == "da":
261
- return default_audio_for_ui(lang, danish_voice_val)
262
- return gr.update() # No change if not Danish
263
-
264
- language_id.change(
265
- fn=on_language_change,
266
- inputs=[language_id, ref_wav, text],
267
- outputs=[ref_wav, text, danish_voice, danish_voice],
268
- show_progress=False
269
- )
270
 
271
  danish_voice.change(
272
  fn=on_danish_voice_change,
273
- inputs=[language_id, danish_voice],
274
  outputs=[ref_wav],
275
  show_progress=False
276
  )
@@ -279,7 +227,7 @@ with gr.Blocks() as demo:
279
  fn=generate_tts_audio,
280
  inputs=[
281
  text,
282
- language_id,
283
  ref_wav,
284
  danish_voice,
285
  temp,
 
22
  import numpy as np
23
  import torch
24
 
25
+ from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS
26
 
27
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
28
  print(f"🚀 Running on device: {DEVICE}")
 
62
  return [("Mic", "mic"), ("Nic", "nic")]
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  def get_or_load_model():
 
118
  ) -> tuple[int, np.ndarray]:
119
  """
120
  Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
121
+ Supported languages: Danish
122
 
123
  This tool synthesizes natural-sounding speech from input text. When a reference audio file
124
  is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
 
172
  with gr.Blocks() as demo:
173
  gr.Markdown(
174
  """
175
+ # Røst V3 Chatterbox 500M Text-to-Speech 🇩🇰
176
+ Generate high-quality Danish speech from text with reference audio styling. This model was developed as part of the CoRal project, and is a finetuned version of Chatterbox Multilingual.
 
177
  """
178
  )
179
 
 
 
180
  with gr.Row():
181
  with gr.Column():
182
  initial_lang = "da"
 
186
  max_lines=5
187
  )
188
 
 
 
 
 
 
 
 
189
  danish_voice = gr.Dropdown(
190
  choices=get_danish_voice_options(),
191
  value="mic",
192
+ label="Voice Selection",
193
+ info="Choose between different voice options"
 
194
  )
195
 
196
  ref_wav = gr.Audio(
 
200
  value=default_audio_for_ui(initial_lang)
201
  )
202
 
203
+ run_btn = gr.Button("Generate", variant="primary")
204
+
205
+ with gr.Column():
206
+ audio_output = gr.Audio(label="Output Audio")
207
 
208
  cfg_weight = gr.Slider(
209
  0.2, 1, step=.05, label="CFG/Pace", value=0.5
 
213
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
214
  temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
215
 
216
+ def on_danish_voice_change(danish_voice_val):
217
+ return default_audio_for_ui("da", danish_voice_val)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  danish_voice.change(
220
  fn=on_danish_voice_change,
221
+ inputs=[danish_voice],
222
  outputs=[ref_wav],
223
  show_progress=False
224
  )
 
227
  fn=generate_tts_audio,
228
  inputs=[
229
  text,
230
+ gr.State("da"),
231
  ref_wav,
232
  danish_voice,
233
  temp,
src/chatterbox/mtl_tts.py CHANGED
@@ -18,12 +18,11 @@ from .models.voice_encoder import VoiceEncoder
18
  from .models.t3.modules.cond_enc import T3Cond
19
 
20
 
21
- REPO_ID = "CoRal-project/roest-chatterbox"
22
 
23
  # Supported languages for the multilingual model
24
  SUPPORTED_LANGUAGES = {
25
  "da": "🇩🇰 Danish",
26
- "en": "🇬🇧 English"
27
  }
28
 
29
 
 
18
  from .models.t3.modules.cond_enc import T3Cond
19
 
20
 
21
+ REPO_ID = "CoRal-project/roest-v3-chatterbox-500m"
22
 
23
  # Supported languages for the multilingual model
24
  SUPPORTED_LANGUAGES = {
25
  "da": "🇩🇰 Danish",
 
26
  }
27
 
28