Spaces:
Sleeping
fixed for real this time, hopefully
Browse files
app.py
CHANGED
|
@@ -111,7 +111,6 @@ def generate_tts_audio(
|
|
| 111 |
language_id: str,
|
| 112 |
audio_prompt_path_input: str = None,
|
| 113 |
danish_voice_input: str = "mic",
|
| 114 |
-
exaggeration_input: float = 0.5,
|
| 115 |
temperature_input: float = 0.8,
|
| 116 |
seed_num_input: int = 0,
|
| 117 |
cfgw_input: float = 0.5
|
|
@@ -128,7 +127,6 @@ def generate_tts_audio(
|
|
| 128 |
text_input (str): The text to synthesize into speech (maximum 300 characters)
|
| 129 |
language_id (str): The language code for synthesis (e.g. en, fr, de, es, it, pt, hi)
|
| 130 |
audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
|
| 131 |
-
exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
|
| 132 |
temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
|
| 133 |
seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
|
| 134 |
cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5; use 0 for language transfer.
|
|
@@ -137,6 +135,8 @@ def generate_tts_audio(
|
|
| 137 |
tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
|
| 138 |
"""
|
| 139 |
current_model = get_or_load_model()
|
|
|
|
|
|
|
| 140 |
|
| 141 |
if current_model is None:
|
| 142 |
raise RuntimeError("TTS model is not loaded.")
|
|
@@ -150,7 +150,7 @@ def generate_tts_audio(
|
|
| 150 |
chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
|
| 151 |
|
| 152 |
generate_kwargs = {
|
| 153 |
-
"exaggeration":
|
| 154 |
"temperature": temperature_input,
|
| 155 |
"cfg_weight": cfgw_input,
|
| 156 |
}
|
|
@@ -263,7 +263,6 @@ with gr.Blocks() as demo:
|
|
| 263 |
language_id,
|
| 264 |
ref_wav,
|
| 265 |
danish_voice,
|
| 266 |
-
0.5, # Fixed exaggeration
|
| 267 |
temp,
|
| 268 |
seed_num,
|
| 269 |
cfg_weight,
|
|
|
|
| 111 |
language_id: str,
|
| 112 |
audio_prompt_path_input: str = None,
|
| 113 |
danish_voice_input: str = "mic",
|
|
|
|
| 114 |
temperature_input: float = 0.8,
|
| 115 |
seed_num_input: int = 0,
|
| 116 |
cfgw_input: float = 0.5
|
|
|
|
| 127 |
text_input (str): The text to synthesize into speech (maximum 300 characters)
|
| 128 |
language_id (str): The language code for synthesis (e.g. en, fr, de, es, it, pt, hi)
|
| 129 |
audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
|
|
|
|
| 130 |
temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
|
| 131 |
seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
|
| 132 |
cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5; use 0 for language transfer.
|
|
|
|
| 135 |
tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
|
| 136 |
"""
|
| 137 |
current_model = get_or_load_model()
|
| 138 |
+
exaggeration: float = 0.5,
|
| 139 |
+
|
| 140 |
|
| 141 |
if current_model is None:
|
| 142 |
raise RuntimeError("TTS model is not loaded.")
|
|
|
|
| 150 |
chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
|
| 151 |
|
| 152 |
generate_kwargs = {
|
| 153 |
+
"exaggeration": exaggeration,
|
| 154 |
"temperature": temperature_input,
|
| 155 |
"cfg_weight": cfgw_input,
|
| 156 |
}
|
|
|
|
| 263 |
language_id,
|
| 264 |
ref_wav,
|
| 265 |
danish_voice,
|
|
|
|
| 266 |
temp,
|
| 267 |
seed_num,
|
| 268 |
cfg_weight,
|