Biorrith committed on
Commit
20739c9
·
1 Parent(s): 9658111

Error fix

Browse files
Files changed (1) hide show
  1. app.py +49 -42
app.py CHANGED
@@ -14,17 +14,19 @@ MODEL = None
14
 
15
  LANGUAGE_CONFIG = {
16
  "da": {
17
- "audio_options": {"mic": "voices/mic.wav", "nic": "voices/nic.wav"},
 
 
 
18
  "default_audio": "voices/mic.wav", # Default to mic
19
- "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal.",
20
  },
21
  "en": {
22
  "audio": "voices/en_f1.flac",
23
- "text": "Last month, we reached a new milestone with two billion views on our YouTube channel.",
24
  },
25
  }
26
 
27
-
28
  # --- UI Helpers ---
29
  def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
30
  config = LANGUAGE_CONFIG.get(lang, {})
@@ -47,12 +49,12 @@ def get_supported_languages_display() -> str:
47
  language_items = []
48
  for code, name in sorted(SUPPORTED_LANGUAGES.items()):
49
  language_items.append(f"**{name}** (`{code}`)")
50
-
51
  # Split into 2 lines
52
  mid = len(language_items) // 2
53
  line1 = " • ".join(language_items[:mid])
54
  line2 = " • ".join(language_items[mid:])
55
-
56
  return f"""
57
  ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
58
  {line1}
@@ -69,7 +71,7 @@ def get_or_load_model():
69
  print("Model not loaded, initializing...")
70
  try:
71
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
72
- if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
73
  MODEL.to(DEVICE)
74
  print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
75
  except Exception as e:
@@ -77,14 +79,12 @@ def get_or_load_model():
77
  raise
78
  return MODEL
79
 
80
-
81
  # Attempt to load the model at startup.
82
  try:
83
  get_or_load_model()
84
  except Exception as e:
85
  print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
86
 
87
-
88
  def set_seed(seed: int):
89
  """Sets the random seed for reproducibility across torch, numpy, and random."""
90
  torch.manual_seed(seed)
@@ -93,8 +93,7 @@ def set_seed(seed: int):
93
  torch.cuda.manual_seed_all(seed)
94
  random.seed(seed)
95
  np.random.seed(seed)
96
-
97
-
98
  def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
99
  """
100
  Decide which audio prompt to use:
@@ -115,14 +114,14 @@ def generate_tts_audio(
115
  exaggeration_input: float = 0.5,
116
  temperature_input: float = 0.8,
117
  seed_num_input: int = 0,
118
- cfgw_input: float = 0.5,
119
  ) -> tuple[int, np.ndarray]:
120
  """
121
  Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
122
  Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
123
-
124
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
125
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
126
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
127
 
128
  Args:
@@ -132,7 +131,7 @@ def generate_tts_audio(
132
  exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
133
  temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
134
  seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
135
- cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
136
 
137
  Returns:
138
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
@@ -146,7 +145,7 @@ def generate_tts_audio(
146
  set_seed(int(seed_num_input))
147
 
148
  print(f"Generating audio for text: '{text_input[:50]}...'")
149
-
150
  # Handle optional audio prompt
151
  chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
152
 
@@ -160,16 +159,15 @@ def generate_tts_audio(
160
  print(f"Using audio prompt: {chosen_prompt}")
161
  else:
162
  print("No audio prompt provided; using default voice.")
163
-
164
  wav = current_model.generate(
165
  text_input[:300], # Truncate text to max chars
166
  language_id=language_id,
167
- **generate_kwargs,
168
  )
169
  print("Audio generation complete.")
170
  return (current_model.sr, wav.squeeze(0).numpy())
171
 
172
-
173
  with gr.Blocks() as demo:
174
  gr.Markdown(
175
  """
@@ -177,47 +175,53 @@ with gr.Blocks() as demo:
177
  Generate high-quality danish speech from text with reference audio styling.
178
  """
179
  )
180
-
181
  # Display supported languages
182
  gr.Markdown(get_supported_languages_display())
183
  with gr.Row():
184
  with gr.Column():
185
  initial_lang = "da"
186
- text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text to synthesize (max chars 300)", max_lines=5)
187
-
 
 
 
 
188
  language_id = gr.Dropdown(
189
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
190
  value=initial_lang,
191
  label="Language",
192
- info="Select the language for text-to-speech synthesis",
193
  )
194
-
195
  danish_voice = gr.Dropdown(
196
  choices=get_danish_voice_options(),
197
  value="mic",
198
  label="Danish Voice Selection",
199
  info="Choose between different Danish voice options",
200
- visible=(initial_lang == "da"),
201
  )
202
-
203
  ref_wav = gr.Audio(
204
  sources=["upload", "microphone"],
205
  type="filepath",
206
  label="Reference Audio File (Optional)",
207
- value=default_audio_for_ui(initial_lang),
208
  )
209
-
210
  gr.Markdown(
211
  "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
212
- elem_classes=["audio-note"],
 
 
 
 
 
213
  )
214
-
215
- exaggeration = 0.5
216
- cfg_weight = gr.Slider(0.2, 1, step=0.05, label="CFG/Pace", value=0.5)
217
 
218
  with gr.Accordion("More options", open=False):
219
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
220
- temp = gr.Slider(0.05, 5, step=0.05, label="Temperature", value=0.8)
221
 
222
  run_btn = gr.Button("Generate", variant="primary")
223
 
@@ -225,13 +229,13 @@ with gr.Blocks() as demo:
225
  audio_output = gr.Audio(label="Output Audio")
226
 
227
  def on_language_change(lang, current_ref, current_text):
228
- is_danish = lang == "da"
229
  danish_voice_val = "mic" if is_danish else "mic" # Default to mic
230
  return (
231
- default_audio_for_ui(lang, danish_voice_val),
232
- default_text_for_ui(lang),
233
  gr.update(visible=is_danish), # Update Danish voice dropdown visibility
234
- danish_voice_val,
235
  )
236
 
237
  def on_danish_voice_change(lang, danish_voice_val):
@@ -243,11 +247,14 @@ with gr.Blocks() as demo:
243
  fn=on_language_change,
244
  inputs=[language_id, ref_wav, text],
245
  outputs=[ref_wav, text, danish_voice, danish_voice],
246
- show_progress=False,
247
  )
248
 
249
  danish_voice.change(
250
- fn=on_danish_voice_change, inputs=[language_id, danish_voice], outputs=[ref_wav], show_progress=False
 
 
 
251
  )
252
 
253
  run_btn.click(
@@ -257,7 +264,7 @@ with gr.Blocks() as demo:
257
  language_id,
258
  ref_wav,
259
  danish_voice,
260
- exaggeration,
261
  temp,
262
  seed_num,
263
  cfg_weight,
@@ -265,4 +272,4 @@ with gr.Blocks() as demo:
265
  outputs=[audio_output],
266
  )
267
 
268
- demo.launch() # mcp_server=True
 
14
 
15
  LANGUAGE_CONFIG = {
16
  "da": {
17
+ "audio_options": {
18
+ "mic": "voices/mic.wav",
19
+ "nic": "voices/nic.wav"
20
+ },
21
  "default_audio": "voices/mic.wav", # Default to mic
22
+ "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal."
23
  },
24
  "en": {
25
  "audio": "voices/en_f1.flac",
26
+ "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
27
  },
28
  }
29
 
 
30
  # --- UI Helpers ---
31
  def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
32
  config = LANGUAGE_CONFIG.get(lang, {})
 
49
  language_items = []
50
  for code, name in sorted(SUPPORTED_LANGUAGES.items()):
51
  language_items.append(f"**{name}** (`{code}`)")
52
+
53
  # Split into 2 lines
54
  mid = len(language_items) // 2
55
  line1 = " • ".join(language_items[:mid])
56
  line2 = " • ".join(language_items[mid:])
57
+
58
  return f"""
59
  ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
60
  {line1}
 
71
  print("Model not loaded, initializing...")
72
  try:
73
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
74
+ if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
75
  MODEL.to(DEVICE)
76
  print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
77
  except Exception as e:
 
79
  raise
80
  return MODEL
81
 
 
82
  # Attempt to load the model at startup.
83
  try:
84
  get_or_load_model()
85
  except Exception as e:
86
  print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
87
 
 
88
  def set_seed(seed: int):
89
  """Sets the random seed for reproducibility across torch, numpy, and random."""
90
  torch.manual_seed(seed)
 
93
  torch.cuda.manual_seed_all(seed)
94
  random.seed(seed)
95
  np.random.seed(seed)
96
+
 
97
  def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
98
  """
99
  Decide which audio prompt to use:
 
114
  exaggeration_input: float = 0.5,
115
  temperature_input: float = 0.8,
116
  seed_num_input: int = 0,
117
+ cfgw_input: float = 0.5
118
  ) -> tuple[int, np.ndarray]:
119
  """
120
  Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
121
  Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
122
+
123
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
124
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
125
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
126
 
127
  Args:
 
131
  exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
132
  temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
133
  seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
134
+ cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
135
 
136
  Returns:
137
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
 
145
  set_seed(int(seed_num_input))
146
 
147
  print(f"Generating audio for text: '{text_input[:50]}...'")
148
+
149
  # Handle optional audio prompt
150
  chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
151
 
 
159
  print(f"Using audio prompt: {chosen_prompt}")
160
  else:
161
  print("No audio prompt provided; using default voice.")
162
+
163
  wav = current_model.generate(
164
  text_input[:300], # Truncate text to max chars
165
  language_id=language_id,
166
+ **generate_kwargs
167
  )
168
  print("Audio generation complete.")
169
  return (current_model.sr, wav.squeeze(0).numpy())
170
 
 
171
  with gr.Blocks() as demo:
172
  gr.Markdown(
173
  """
 
175
  Generate high-quality danish speech from text with reference audio styling.
176
  """
177
  )
178
+
179
  # Display supported languages
180
  gr.Markdown(get_supported_languages_display())
181
  with gr.Row():
182
  with gr.Column():
183
  initial_lang = "da"
184
+ text = gr.Textbox(
185
+ value=default_text_for_ui(initial_lang),
186
+ label="Text to synthesize (max chars 300)",
187
+ max_lines=5
188
+ )
189
+
190
  language_id = gr.Dropdown(
191
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
192
  value=initial_lang,
193
  label="Language",
194
+ info="Select the language for text-to-speech synthesis"
195
  )
196
+
197
  danish_voice = gr.Dropdown(
198
  choices=get_danish_voice_options(),
199
  value="mic",
200
  label="Danish Voice Selection",
201
  info="Choose between different Danish voice options",
202
+ visible=(initial_lang == "da")
203
  )
204
+
205
  ref_wav = gr.Audio(
206
  sources=["upload", "microphone"],
207
  type="filepath",
208
  label="Reference Audio File (Optional)",
209
+ value=default_audio_for_ui(initial_lang)
210
  )
211
+
212
  gr.Markdown(
213
  "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
214
+ elem_classes=["audio-note"]
215
+ )
216
+
217
+ exaggeration = 0.5 # Fixed exaggeration value
218
+ cfg_weight = gr.Slider(
219
+ 0.2, 1, step=.05, label="CFG/Pace", value=0.5
220
  )
 
 
 
221
 
222
  with gr.Accordion("More options", open=False):
223
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
224
+ temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
225
 
226
  run_btn = gr.Button("Generate", variant="primary")
227
 
 
229
  audio_output = gr.Audio(label="Output Audio")
230
 
231
  def on_language_change(lang, current_ref, current_text):
232
+ is_danish = (lang == "da")
233
  danish_voice_val = "mic" if is_danish else "mic" # Default to mic
234
  return (
235
+ default_audio_for_ui(lang, danish_voice_val),
236
+ default_text_for_ui(lang),
237
  gr.update(visible=is_danish), # Update Danish voice dropdown visibility
238
+ danish_voice_val
239
  )
240
 
241
  def on_danish_voice_change(lang, danish_voice_val):
 
247
  fn=on_language_change,
248
  inputs=[language_id, ref_wav, text],
249
  outputs=[ref_wav, text, danish_voice, danish_voice],
250
+ show_progress=False
251
  )
252
 
253
  danish_voice.change(
254
+ fn=on_danish_voice_change,
255
+ inputs=[language_id, danish_voice],
256
+ outputs=[ref_wav],
257
+ show_progress=False
258
  )
259
 
260
  run_btn.click(
 
264
  language_id,
265
  ref_wav,
266
  danish_voice,
267
+ 0.5, # Fixed exaggeration
268
  temp,
269
  seed_num,
270
  cfg_weight,
 
272
  outputs=[audio_output],
273
  )
274
 
275
+ demo.launch() #mcp_server=True