Spaces:

alexandrainst
/

roest-chatterbox-demo

Sleeping

App Files Community

Biorrith committed on Feb 22

Commit

91577dc

1 Parent(s): 70c093a

Updated

Browse files

Files changed (2) hide show

app.py +14 -66
src/chatterbox/mtl_tts.py +1 -2

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ import gradio as gr
 import numpy as np
 import torch
-from src.chatterbox.mtl_tts import SUPPORTED_LANGUAGES, ChatterboxMultilingualTTS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
@@ -62,23 +62,6 @@ def get_danish_voice_options() -> list[tuple[str, str]]:
     return [("Mic", "mic"), ("Nic", "nic")]
-def get_supported_languages_display() -> str:
-    """Generate a formatted display of all supported languages."""
-    language_items = []
-    for code, name in sorted(SUPPORTED_LANGUAGES.items()):
-        language_items.append(f"**{name}** (`{code}`)")
-    # Split into 2 lines
-    mid = len(language_items) // 2
-    line1 = " • ".join(language_items[:mid])
-    line2 = " • ".join(language_items[mid:])
-    return f"""
-### Supported Languages
-{line1}
-{line2}
-"""
 def get_or_load_model():
@@ -135,7 +118,7 @@ def generate_tts_audio(
 ) -> tuple[int, np.ndarray]:
     """
     Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
-    Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
     This tool synthesizes natural-sounding speech from input text. When a reference audio file
     is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
@@ -189,14 +172,11 @@ def generate_tts_audio(
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # Danish TTS Demo 🇩🇰
-        Generate high-quality danish speech from text with reference audio styling.
-        This is a preview of a model that was developed as part of the CoRal project, and is a finetuned version of the Chatterbox Multilingual.
         """
     )
-    # Display supported languages
-    gr.Markdown(get_supported_languages_display())
     with gr.Row():
         with gr.Column():
             initial_lang = "da"
@@ -206,19 +186,11 @@ with gr.Blocks() as demo:
                 max_lines=5
             )
-            language_id = gr.Dropdown(
-                choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
-                value=initial_lang,
-                label="Language",
-                info="Select the language for text-to-speech synthesis"
-            )
             danish_voice = gr.Dropdown(
                 choices=get_danish_voice_options(),
                 value="mic",
-                label="Danish Voice Selection",
-                info="Choose between different Danish voice options",
-                visible=(initial_lang == "da")
             )
             ref_wav = gr.Audio(
@@ -228,10 +200,10 @@ with gr.Blocks() as demo:
                 value=default_audio_for_ui(initial_lang)
             )
-            gr.Markdown(
-                "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
-                elem_classes=["audio-note"]
-            )
             cfg_weight = gr.Slider(
                 0.2, 1, step=.05, label="CFG/Pace", value=0.5
@@ -241,36 +213,12 @@ with gr.Blocks() as demo:
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
-            run_btn = gr.Button("Generate", variant="primary")
-        with gr.Column():
-            audio_output = gr.Audio(label="Output Audio")
-        def on_language_change(lang, current_ref, current_text):
-            is_danish = (lang == "da")
-            danish_voice_val = "mic" if is_danish else "mic"  # Default to mic
-            return (
-                default_audio_for_ui(lang, danish_voice_val),
-                default_text_for_ui(lang),
-                gr.update(visible=is_danish),  # Update Danish voice dropdown visibility
-                danish_voice_val
-            )
-        def on_danish_voice_change(lang, danish_voice_val):
-            if lang == "da":
-                return default_audio_for_ui(lang, danish_voice_val)
-            return gr.update()  # No change if not Danish
-        language_id.change(
-            fn=on_language_change,
-            inputs=[language_id, ref_wav, text],
-            outputs=[ref_wav, text, danish_voice, danish_voice],
-            show_progress=False
-        )
         danish_voice.change(
             fn=on_danish_voice_change,
-            inputs=[language_id, danish_voice],
             outputs=[ref_wav],
             show_progress=False
         )
@@ -279,7 +227,7 @@ with gr.Blocks() as demo:
         fn=generate_tts_audio,
         inputs=[
             text,
-            language_id,
             ref_wav,
             danish_voice,
             temp,

 import numpy as np
 import torch
+from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
     return [("Mic", "mic"), ("Nic", "nic")]
 def get_or_load_model():
 ) -> tuple[int, np.ndarray]:
     """
     Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
+    Supported languages: Danish
     This tool synthesizes natural-sounding speech from input text. When a reference audio file
     is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # Røst V3 Chatterbox 500M Text-to-Speech 🇩🇰
+        Generate high-quality danish speech from text with reference audio styling. This is model was developed as part of the CoRal project, and is a finetuned version of Chatterbox Multilingual.
         """
     )
     with gr.Row():
         with gr.Column():
             initial_lang = "da"
                 max_lines=5
             )
             danish_voice = gr.Dropdown(
                 choices=get_danish_voice_options(),
                 value="mic",
+                label="Voice Selection",
+                info="Choose between different voice options"
             )
             ref_wav = gr.Audio(
                 value=default_audio_for_ui(initial_lang)
             )
+            run_btn = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Output Audio")
             cfg_weight = gr.Slider(
                 0.2, 1, step=.05, label="CFG/Pace", value=0.5
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
+        def on_danish_voice_change(danish_voice_val):
+            return default_audio_for_ui("da", danish_voice_val)
         danish_voice.change(
             fn=on_danish_voice_change,
+            inputs=[danish_voice],
             outputs=[ref_wav],
             show_progress=False
         )
         fn=generate_tts_audio,
         inputs=[
             text,
+            gr.State("da"),
             ref_wav,
             danish_voice,
             temp,

src/chatterbox/mtl_tts.py CHANGED Viewed

@@ -18,12 +18,11 @@ from .models.voice_encoder import VoiceEncoder
 from .models.t3.modules.cond_enc import T3Cond
-REPO_ID = "CoRal-project/roest-chatterbox"
 # Supported languages for the multilingual model
 SUPPORTED_LANGUAGES = {
   "da": "🇩🇰 Danish",
-  "en": "🇬🇧 English"
 }

 from .models.t3.modules.cond_enc import T3Cond
+REPO_ID = "CoRal-project/roest-v3-chatterbox-500m"
 # Supported languages for the multilingual model
 SUPPORTED_LANGUAGES = {
   "da": "🇩🇰 Danish",
 }