Spaces:
Sleeping
Sleeping
Updated
Browse files- app.py +14 -66
- src/chatterbox/mtl_tts.py +1 -2
app.py
CHANGED
|
@@ -22,7 +22,7 @@ import gradio as gr
|
|
| 22 |
import numpy as np
|
| 23 |
import torch
|
| 24 |
|
| 25 |
-
from src.chatterbox.mtl_tts import
|
| 26 |
|
| 27 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 28 |
print(f"🚀 Running on device: {DEVICE}")
|
|
@@ -62,23 +62,6 @@ def get_danish_voice_options() -> list[tuple[str, str]]:
|
|
| 62 |
return [("Mic", "mic"), ("Nic", "nic")]
|
| 63 |
|
| 64 |
|
| 65 |
-
def get_supported_languages_display() -> str:
|
| 66 |
-
"""Generate a formatted display of all supported languages."""
|
| 67 |
-
language_items = []
|
| 68 |
-
for code, name in sorted(SUPPORTED_LANGUAGES.items()):
|
| 69 |
-
language_items.append(f"**{name}** (`{code}`)")
|
| 70 |
-
|
| 71 |
-
# Split into 2 lines
|
| 72 |
-
mid = len(language_items) // 2
|
| 73 |
-
line1 = " • ".join(language_items[:mid])
|
| 74 |
-
line2 = " • ".join(language_items[mid:])
|
| 75 |
-
|
| 76 |
-
return f"""
|
| 77 |
-
### Supported Languages
|
| 78 |
-
{line1}
|
| 79 |
-
|
| 80 |
-
{line2}
|
| 81 |
-
"""
|
| 82 |
|
| 83 |
|
| 84 |
def get_or_load_model():
|
|
@@ -135,7 +118,7 @@ def generate_tts_audio(
|
|
| 135 |
) -> tuple[int, np.ndarray]:
|
| 136 |
"""
|
| 137 |
Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
|
| 138 |
-
Supported languages:
|
| 139 |
|
| 140 |
This tool synthesizes natural-sounding speech from input text. When a reference audio file
|
| 141 |
is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
|
|
@@ -189,14 +172,11 @@ def generate_tts_audio(
|
|
| 189 |
with gr.Blocks() as demo:
|
| 190 |
gr.Markdown(
|
| 191 |
"""
|
| 192 |
-
#
|
| 193 |
-
Generate high-quality Danish speech from text with reference audio styling.
|
| 194 |
-
This is a preview of a model that was developed as part of the CoRal project, and is a finetuned version of the Chatterbox Multilingual.
|
| 195 |
"""
|
| 196 |
)
|
| 197 |
|
| 198 |
-
# Display supported languages
|
| 199 |
-
gr.Markdown(get_supported_languages_display())
|
| 200 |
with gr.Row():
|
| 201 |
with gr.Column():
|
| 202 |
initial_lang = "da"
|
|
@@ -206,19 +186,11 @@ with gr.Blocks() as demo:
|
|
| 206 |
max_lines=5
|
| 207 |
)
|
| 208 |
|
| 209 |
-
language_id = gr.Dropdown(
|
| 210 |
-
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 211 |
-
value=initial_lang,
|
| 212 |
-
label="Language",
|
| 213 |
-
info="Select the language for text-to-speech synthesis"
|
| 214 |
-
)
|
| 215 |
-
|
| 216 |
danish_voice = gr.Dropdown(
|
| 217 |
choices=get_danish_voice_options(),
|
| 218 |
value="mic",
|
| 219 |
-
label="
|
| 220 |
-
info="Choose between different
|
| 221 |
-
visible=(initial_lang == "da")
|
| 222 |
)
|
| 223 |
|
| 224 |
ref_wav = gr.Audio(
|
|
@@ -228,10 +200,10 @@ with gr.Blocks() as demo:
|
|
| 228 |
value=default_audio_for_ui(initial_lang)
|
| 229 |
)
|
| 230 |
|
| 231 |
-
gr.
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
)
|
| 235 |
|
| 236 |
cfg_weight = gr.Slider(
|
| 237 |
0.2, 1, step=.05, label="CFG/Pace", value=0.5
|
|
@@ -241,36 +213,12 @@ with gr.Blocks() as demo:
|
|
| 241 |
seed_num = gr.Number(value=0, label="Random seed (0 for random)")
|
| 242 |
temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
with gr.Column():
|
| 247 |
-
audio_output = gr.Audio(label="Output Audio")
|
| 248 |
-
|
| 249 |
-
def on_language_change(lang, current_ref, current_text):
|
| 250 |
-
is_danish = (lang == "da")
|
| 251 |
-
danish_voice_val = "mic" if is_danish else "mic" # Default to mic
|
| 252 |
-
return (
|
| 253 |
-
default_audio_for_ui(lang, danish_voice_val),
|
| 254 |
-
default_text_for_ui(lang),
|
| 255 |
-
gr.update(visible=is_danish), # Update Danish voice dropdown visibility
|
| 256 |
-
danish_voice_val
|
| 257 |
-
)
|
| 258 |
-
|
| 259 |
-
def on_danish_voice_change(lang, danish_voice_val):
|
| 260 |
-
if lang == "da":
|
| 261 |
-
return default_audio_for_ui(lang, danish_voice_val)
|
| 262 |
-
return gr.update() # No change if not Danish
|
| 263 |
-
|
| 264 |
-
language_id.change(
|
| 265 |
-
fn=on_language_change,
|
| 266 |
-
inputs=[language_id, ref_wav, text],
|
| 267 |
-
outputs=[ref_wav, text, danish_voice, danish_voice],
|
| 268 |
-
show_progress=False
|
| 269 |
-
)
|
| 270 |
|
| 271 |
danish_voice.change(
|
| 272 |
fn=on_danish_voice_change,
|
| 273 |
-
inputs=[
|
| 274 |
outputs=[ref_wav],
|
| 275 |
show_progress=False
|
| 276 |
)
|
|
@@ -279,7 +227,7 @@ with gr.Blocks() as demo:
|
|
| 279 |
fn=generate_tts_audio,
|
| 280 |
inputs=[
|
| 281 |
text,
|
| 282 |
-
|
| 283 |
ref_wav,
|
| 284 |
danish_voice,
|
| 285 |
temp,
|
|
|
|
| 22 |
import numpy as np
|
| 23 |
import torch
|
| 24 |
|
| 25 |
+
from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
| 26 |
|
| 27 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 28 |
print(f"🚀 Running on device: {DEVICE}")
|
|
|
|
| 62 |
return [("Mic", "mic"), ("Nic", "nic")]
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
def get_or_load_model():
|
|
|
|
| 118 |
) -> tuple[int, np.ndarray]:
|
| 119 |
"""
|
| 120 |
Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
|
| 121 |
+
Supported languages: Danish
|
| 122 |
|
| 123 |
This tool synthesizes natural-sounding speech from input text. When a reference audio file
|
| 124 |
is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
|
|
|
|
| 172 |
with gr.Blocks() as demo:
|
| 173 |
gr.Markdown(
|
| 174 |
"""
|
| 175 |
+
# Røst V3 Chatterbox 500M Text-to-Speech 🇩🇰
|
| 176 |
+
Generate high-quality Danish speech from text with reference audio styling. This model was developed as part of the CoRal project, and is a finetuned version of Chatterbox Multilingual.
|
|
|
|
| 177 |
"""
|
| 178 |
)
|
| 179 |
|
|
|
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
with gr.Column():
|
| 182 |
initial_lang = "da"
|
|
|
|
| 186 |
max_lines=5
|
| 187 |
)
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
danish_voice = gr.Dropdown(
|
| 190 |
choices=get_danish_voice_options(),
|
| 191 |
value="mic",
|
| 192 |
+
label="Voice Selection",
|
| 193 |
+
info="Choose between different voice options"
|
|
|
|
| 194 |
)
|
| 195 |
|
| 196 |
ref_wav = gr.Audio(
|
|
|
|
| 200 |
value=default_audio_for_ui(initial_lang)
|
| 201 |
)
|
| 202 |
|
| 203 |
+
run_btn = gr.Button("Generate", variant="primary")
|
| 204 |
+
|
| 205 |
+
with gr.Column():
|
| 206 |
+
audio_output = gr.Audio(label="Output Audio")
|
| 207 |
|
| 208 |
cfg_weight = gr.Slider(
|
| 209 |
0.2, 1, step=.05, label="CFG/Pace", value=0.5
|
|
|
|
| 213 |
seed_num = gr.Number(value=0, label="Random seed (0 for random)")
|
| 214 |
temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
|
| 215 |
|
| 216 |
+
def on_danish_voice_change(danish_voice_val):
|
| 217 |
+
return default_audio_for_ui("da", danish_voice_val)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
danish_voice.change(
|
| 220 |
fn=on_danish_voice_change,
|
| 221 |
+
inputs=[danish_voice],
|
| 222 |
outputs=[ref_wav],
|
| 223 |
show_progress=False
|
| 224 |
)
|
|
|
|
| 227 |
fn=generate_tts_audio,
|
| 228 |
inputs=[
|
| 229 |
text,
|
| 230 |
+
gr.State("da"),
|
| 231 |
ref_wav,
|
| 232 |
danish_voice,
|
| 233 |
temp,
|
src/chatterbox/mtl_tts.py
CHANGED
|
@@ -18,12 +18,11 @@ from .models.voice_encoder import VoiceEncoder
|
|
| 18 |
from .models.t3.modules.cond_enc import T3Cond
|
| 19 |
|
| 20 |
|
| 21 |
-
REPO_ID = "CoRal-project/roest-chatterbox"
|
| 22 |
|
| 23 |
# Supported languages for the multilingual model
|
| 24 |
SUPPORTED_LANGUAGES = {
|
| 25 |
"da": "🇩🇰 Danish",
|
| 26 |
-
"en": "🇬🇧 English"
|
| 27 |
}
|
| 28 |
|
| 29 |
|
|
|
|
| 18 |
from .models.t3.modules.cond_enc import T3Cond
|
| 19 |
|
| 20 |
|
| 21 |
+
REPO_ID = "CoRal-project/roest-v3-chatterbox-500m"
|
| 22 |
|
| 23 |
# Supported languages for the multilingual model
|
| 24 |
SUPPORTED_LANGUAGES = {
|
| 25 |
"da": "🇩🇰 Danish",
|
|
|
|
| 26 |
}
|
| 27 |
|
| 28 |
|