PatnaikAshish committed on
Commit
8918877
·
verified ·
1 Parent(s): afcf4ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -274
app.py CHANGED
@@ -1,274 +1,274 @@
1
- import gradio as gr
2
- import os
3
- import re
4
- import torch
5
- import numpy as np
6
- from scipy.io.wavfile import write
7
- from phonemizer.backend.espeak.wrapper import EspeakWrapper
8
- from safetensors.torch import load_file
9
- from huggingface_hub import hf_hub_download
10
-
11
- from tts import commons
12
- from tts import utils
13
- from tts.models import SynthesizerTrn
14
- from text.symbols import symbols
15
- from text import text_to_sequence
16
-
17
- _ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
18
- if os.path.exists(_ESPEAK_LIBRARY):
19
- EspeakWrapper.set_library(_ESPEAK_LIBRARY)
20
- print(f"βœ… Found eSpeak-ng: {_ESPEAK_LIBRARY}")
21
-
22
-
23
- REPO_ID = "PatnaikAshish/Sonya-TTS"
24
-
25
- MODEL_FILENAME = "sonya-tts.safetensors"
26
- CONFIG_FILENAME = "config.json"
27
-
28
- LOCAL_MODEL_PATH = "checkpoints/sonya-tts.safetensors"
29
- LOCAL_CONFIG_PATH = "checkpoints/config.json"
30
-
31
- device = "cuda" if torch.cuda.is_available() else "cpu"
32
-
33
-
34
- def clean_text_for_vits(text):
35
- text = text.strip()
36
- text = text.replace("'", "'")
37
- text = text.replace(""", '"').replace(""", '"')
38
- text = text.replace("–", "-").replace("β€”", "-")
39
- text = re.sub(r"[()\[\]{}<>]", "", text)
40
- text = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", text)
41
- text = re.sub(r"\s+", " ", text)
42
- return text
43
-
44
-
45
- def get_text(text, hps):
46
- text = clean_text_for_vits(text)
47
- text_norm = text_to_sequence(text, hps.data.text_cleaners)
48
- if hps.data.add_blank:
49
- text_norm = commons.intersperse(text_norm, 0)
50
- return torch.LongTensor(text_norm)
51
-
52
-
53
- def split_sentences(text):
54
- text = clean_text_for_vits(text)
55
- if not text:
56
- return []
57
- return re.split(r'(?<=[.!?])\s+', text)
58
-
59
-
60
- print("πŸ”„ Loading Sonya TTS Model...")
61
-
62
- if os.path.exists(LOCAL_MODEL_PATH) and os.path.exists(LOCAL_CONFIG_PATH):
63
- print("βœ… Loading Sonya TTS from local checkpoints...")
64
- model_path = LOCAL_MODEL_PATH
65
- config_path = LOCAL_CONFIG_PATH
66
- else:
67
- print("🌍 Downloading Sonya TTS from Hugging Face...")
68
- model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
69
- config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME)
70
-
71
- hps = utils.get_hparams_from_file(config_path)
72
-
73
- net_g = SynthesizerTrn(
74
- len(symbols),
75
- hps.data.filter_length // 2 + 1,
76
- hps.train.segment_size // hps.data.hop_length,
77
- **hps.model
78
- ).to(device)
79
-
80
- net_g.eval()
81
-
82
- state_dict = load_file(model_path)
83
- net_g.load_state_dict(state_dict)
84
- print("πŸŽ‰ Sonya TTS loaded successfully!")
85
-
86
-
87
- def infer_short(text, noise_scale, noise_scale_w, length_scale):
88
- if not text.strip():
89
- return None
90
-
91
- stn_tst = get_text(text, hps)
92
-
93
- with torch.no_grad():
94
- x_tst = stn_tst.to(device).unsqueeze(0)
95
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
96
-
97
- audio = net_g.infer(
98
- x_tst,
99
- x_tst_lengths,
100
- noise_scale=noise_scale,
101
- noise_scale_w=noise_scale_w,
102
- length_scale=length_scale
103
- )[0][0,0].data.cpu().float().numpy()
104
-
105
- return (hps.data.sampling_rate, audio)
106
-
107
-
108
- def infer_long(text, length_scale, noise_scale):
109
- if not text.strip():
110
- return None
111
-
112
- sentences = split_sentences(text)
113
- audio_chunks = []
114
-
115
- fixed_noise_w = 0.6
116
- base_pause = 0.3
117
-
118
- for sent in sentences:
119
- if len(sent.strip()) < 2:
120
- continue
121
-
122
- stn_tst = get_text(sent, hps)
123
- with torch.no_grad():
124
- x_tst = stn_tst.to(device).unsqueeze(0)
125
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
126
-
127
- audio = net_g.infer(
128
- x_tst,
129
- x_tst_lengths,
130
- noise_scale=noise_scale,
131
- noise_scale_w=fixed_noise_w,
132
- length_scale=length_scale
133
- )[0][0,0].data.cpu().float().numpy()
134
-
135
- if sent.endswith("?"):
136
- pause_dur = base_pause + 0.2
137
- elif sent.endswith("!"):
138
- pause_dur = base_pause + 0.1
139
- else:
140
- pause_dur = base_pause
141
-
142
- silence = np.zeros(int(hps.data.sampling_rate * pause_dur))
143
-
144
- audio_chunks.append(audio)
145
- audio_chunks.append(silence)
146
-
147
- final_audio = np.concatenate(audio_chunks)
148
- return (hps.data.sampling_rate, final_audio)
149
-
150
-
151
- theme = gr.themes.Soft(
152
- primary_hue="pink",
153
- secondary_hue="rose",
154
- neutral_hue="slate"
155
- ).set(
156
- button_primary_background_fill="linear-gradient(90deg, #ff69b4, #ff1493)",
157
- button_primary_background_fill_hover="linear-gradient(90deg, #ff1493, #c71585)",
158
- button_primary_text_color="white",
159
- )
160
-
161
- custom_css = """
162
- .banner-container {
163
- width: 100%;
164
- max-width: 100%;
165
- margin: 0 auto 20px auto;
166
- display: flex;
167
- justify-content: center;
168
- align-items: center;
169
- }
170
-
171
- .banner-container img {
172
- width: 100%;
173
- max-width: 1800px;
174
- max-height: 120px;
175
- height: auto;
176
- object-fit: scale-down;
177
- object-position: center;
178
- border-radius: 8px;
179
- }
180
-
181
- .main-title {
182
- text-align: center;
183
- color: #ff1493;
184
- font-size: 2em;
185
- font-weight: 700;
186
- margin: 15px 0 8px 0;
187
- }
188
-
189
- .subtitle {
190
- text-align: center;
191
- color: white;
192
- font-size: 1.1em;
193
- margin-bottom: 25px;
194
- font-weight: 400;
195
- }
196
-
197
- footer {
198
- display: none !important;
199
- }
200
- """
201
-
202
-
203
- with gr.Blocks(theme=theme, css=custom_css, title="Sonya TTS") as app:
204
-
205
- with gr.Row(elem_classes="banner-container"):
206
- if os.path.exists("logo.png"):
207
- gr.Image("logo.png", show_label=False, container=False, elem_classes="banner-img")
208
-
209
- gr.HTML("""
210
- <h1 class="main-title">✨ Sonya TTS β€” A Beautiful, Expressive Neural Voice Engine</h1>
211
- <p class="subtitle">High-fidelity AI speech with emotion, rhythm, and audiobook mode</p>
212
- """)
213
-
214
- with gr.Tabs():
215
-
216
- with gr.TabItem("πŸŽ›οΈ Studio Mode"):
217
- with gr.Row():
218
- with gr.Column(scale=2):
219
- inp_short = gr.Textbox(
220
- label="πŸ’¬ Input Text",
221
- placeholder="Type something for Sonya to say...",
222
- lines=4,
223
- value="Hello! I am Sonya, your AI voice."
224
- )
225
-
226
- with gr.Accordion("βš™οΈ Voice Controls", open=True):
227
- slider_ns = gr.Slider(0.1, 1.0, value=0.4, label="🎭 Emotion", info="Higher = more expressive")
228
- slider_nsw = gr.Slider(0.1, 1.0, value=0.5, label="🎡 Rhythm", info="Higher = looser timing")
229
- slider_ls = gr.Slider(0.5, 1.5, value=0.97, label="⏱ Speed", info="Lower = faster, Higher = slower")
230
-
231
- btn_short = gr.Button("✨ Generate Voice", variant="primary", size="lg")
232
-
233
- with gr.Column(scale=1):
234
- out_short = gr.Audio(label="πŸ”Š Sonya's Voice", type="numpy")
235
-
236
- btn_short.click(
237
- infer_short,
238
- inputs=[inp_short, slider_ns, slider_nsw, slider_ls],
239
- outputs=[out_short]
240
- )
241
-
242
- with gr.TabItem("πŸ“– Audiobook Mode"):
243
- gr.Markdown(
244
- """<p style='text-align: center; color: #666; font-size: 1.05em;'>
245
- Paste long text. Sonya will read it beautifully with natural pauses.
246
- </p>""",
247
- elem_classes="audiobook-description"
248
- )
249
-
250
- with gr.Row():
251
- with gr.Column(scale=2):
252
- inp_long = gr.Textbox(
253
- label="πŸ“œ Long Text Input",
254
- placeholder="Paste your story or article here...",
255
- lines=10
256
- )
257
-
258
- with gr.Accordion("βš™οΈ Narration Settings", open=False):
259
- long_ls = gr.Slider(0.5, 1.5, value=1.0, label="⏱ Reading Speed")
260
- long_ns = gr.Slider(0.1, 1.0, value=0.5, label="🎭 Tone Variation")
261
-
262
- btn_long = gr.Button("🎧 Read Aloud", variant="primary", size="lg")
263
-
264
- with gr.Column(scale=1):
265
- out_long = gr.Audio(label="πŸ“’ Full Narration", type="numpy")
266
-
267
- btn_long.click(
268
- infer_long,
269
- inputs=[inp_long, long_ls, long_ns],
270
- outputs=[out_long]
271
- )
272
-
273
- if __name__ == "__main__":
274
- app.launch()
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import torch
5
+ import numpy as np
6
+ from scipy.io.wavfile import write
7
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
8
+ from safetensors.torch import load_file
9
+ from huggingface_hub import hf_hub_download
10
+
11
+ from tts import commons
12
+ from tts import utils
13
+ from tts.models import SynthesizerTrn
14
+ from text.symbols import symbols
15
+ from text import text_to_sequence
16
+
17
# Windows-only convenience: if a local eSpeak NG install exists at the
# default path, point phonemizer's espeak backend at its DLL explicitly.
# On other platforms (or when absent) phonemizer falls back to its own lookup.
_ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
if os.path.exists(_ESPEAK_LIBRARY):
    EspeakWrapper.set_library(_ESPEAK_LIBRARY)
    print(f"βœ… Found eSpeak-ng: {_ESPEAK_LIBRARY}")
21
+
22
+
23
# Hugging Face Hub repository holding the released model weights and config.
REPO_ID = "PatnaikAshish/Sonya-TTS"

# Paths of the files *inside the Hub repo* (this commit added the
# "checkpoints/" prefix so downloads match the repo layout).
MODEL_FILENAME = "checkpoints/sonya-tts.safetensors"
CONFIG_FILENAME = "checkpoints/config.json"

# Local paths checked first, to skip re-downloading during development.
LOCAL_MODEL_PATH = "checkpoints/sonya-tts.safetensors"
LOCAL_CONFIG_PATH = "checkpoints/config.json"

device = "cuda" if torch.cuda.is_available() else "cpu"
32
+
33
+
34
def clean_text_for_vits(text):
    """Normalize raw user text into the ASCII subset the VITS cleaner accepts.

    Typographic quotes/dashes are mapped to ASCII equivalents, brackets are
    stripped, remaining non-whitelisted characters are removed, and runs of
    whitespace collapse to single spaces.
    """
    text = text.strip()
    # Map typographic punctuation to ASCII. The original used literal curly
    # quotes, which were mangled in transit (the `"""` literal even broke the
    # syntax); unicode escapes are unambiguous and encoding-safe.
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("\u2013", "-").replace("\u2014", "-")
    text = re.sub(r"[()\[\]{}<>]", "", text)
    # Whitelist: letters, digits, whitespace and basic punctuation only
    # (note straight double quotes are intentionally dropped here).
    text = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text
43
+
44
+
45
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids per the model config."""
    cleaned = clean_text_for_vits(text)
    sequence = text_to_sequence(cleaned, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank (0) tokens between symbols, matching training.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
51
+
52
+
53
def split_sentences(text):
    """Split cleaned text into sentences at '.', '!' or '?' boundaries."""
    cleaned = clean_text_for_vits(text)
    # Lookbehind keeps the terminal punctuation attached to each sentence.
    return re.split(r'(?<=[.!?])\s+', cleaned) if cleaned else []
58
+
59
+
60
print("πŸ”„ Loading Sonya TTS Model...")

# Prefer a local checkpoint; otherwise fetch weights + config from the Hub.
if os.path.exists(LOCAL_MODEL_PATH) and os.path.exists(LOCAL_CONFIG_PATH):
    print("βœ… Loading Sonya TTS from local checkpoints...")
    model_path, config_path = LOCAL_MODEL_PATH, LOCAL_CONFIG_PATH
else:
    print("🌍 Downloading Sonya TTS from Hugging Face...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
    config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME)

hps = utils.get_hparams_from_file(config_path)

# Build the synthesizer exactly as configured at training time,
# then switch to inference mode before loading the weights.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
).to(device)
net_g.eval()

state_dict = load_file(model_path)
net_g.load_state_dict(state_dict)
print("πŸŽ‰ Sonya TTS loaded successfully!")
85
+
86
+
87
def infer_short(text, noise_scale, noise_scale_w, length_scale):
    """Synthesize one short utterance.

    Returns a (sample_rate, waveform) tuple for gr.Audio(type="numpy"),
    or None when the input is blank.
    """
    if not text.strip():
        return None

    tokens = get_text(text, hps)
    with torch.no_grad():
        x = tokens.to(device).unsqueeze(0)
        x_lengths = torch.LongTensor([tokens.size(0)]).to(device)
        wav = net_g.infer(
            x,
            x_lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()

    return (hps.data.sampling_rate, wav)
106
+
107
+
108
def infer_long(text, length_scale, noise_scale):
    """Narrate long text sentence-by-sentence with punctuation-aware pauses.

    Returns a (sample_rate, waveform) tuple, or None when there is nothing
    to speak (blank input, or every sentence filtered out after cleaning).
    """
    if not text.strip():
        return None

    sentences = split_sentences(text)
    audio_chunks = []

    fixed_noise_w = 0.6   # keep rhythm consistent across sentences
    base_pause = 0.3      # seconds of silence between sentences

    for sent in sentences:
        # Skip fragments too short to synthesize meaningfully.
        if len(sent.strip()) < 2:
            continue

        stn_tst = get_text(sent, hps)
        with torch.no_grad():
            x_tst = stn_tst.to(device).unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

            audio = net_g.infer(
                x_tst,
                x_tst_lengths,
                noise_scale=noise_scale,
                noise_scale_w=fixed_noise_w,
                length_scale=length_scale,
            )[0][0, 0].data.cpu().float().numpy()

        # Slightly longer pauses after questions/exclamations read naturally.
        if sent.endswith("?"):
            pause_dur = base_pause + 0.2
        elif sent.endswith("!"):
            pause_dur = base_pause + 0.1
        else:
            pause_dur = base_pause

        silence = np.zeros(int(hps.data.sampling_rate * pause_dur))

        audio_chunks.append(audio)
        audio_chunks.append(silence)

    # BUGFIX: np.concatenate raises ValueError on an empty list, which
    # happened when every sentence was filtered out (e.g. punctuation-only
    # input). Return None instead, matching the blank-input path.
    if not audio_chunks:
        return None

    final_audio = np.concatenate(audio_chunks)
    return (hps.data.sampling_rate, final_audio)
149
+
150
+
151
# Pink/rose soft theme; primary buttons get a hot-pink gradient via .set().
theme = gr.themes.Soft(
    primary_hue="pink",
    secondary_hue="rose",
    neutral_hue="slate"
).set(
    button_primary_background_fill="linear-gradient(90deg, #ff69b4, #ff1493)",
    button_primary_background_fill_hover="linear-gradient(90deg, #ff1493, #c71585)",
    button_primary_text_color="white",
)
160
+
161
# Custom CSS injected into gr.Blocks: centers/limits the banner image,
# styles the title/subtitle, and hides the default Gradio footer.
custom_css = """
.banner-container {
    width: 100%;
    max-width: 100%;
    margin: 0 auto 20px auto;
    display: flex;
    justify-content: center;
    align-items: center;
}

.banner-container img {
    width: 100%;
    max-width: 1800px;
    max-height: 120px;
    height: auto;
    object-fit: scale-down;
    object-position: center;
    border-radius: 8px;
}

.main-title {
    text-align: center;
    color: #ff1493;
    font-size: 2em;
    font-weight: 700;
    margin: 15px 0 8px 0;
}

.subtitle {
    text-align: center;
    color: white;
    font-size: 1.1em;
    margin-bottom: 25px;
    font-weight: 400;
}

footer {
    display: none !important;
}
"""
201
+
202
+
203
# UI layout: two tabs sharing the same loaded model — "Studio Mode" for a
# single utterance with full parameter control, "Audiobook Mode" for
# long-form narration via infer_long's sentence splitting.
with gr.Blocks(theme=theme, css=custom_css, title="Sonya TTS") as app:

    with gr.Row(elem_classes="banner-container"):
        # Banner logo is optional; shown only when bundled with the app.
        if os.path.exists("logo.png"):
            gr.Image("logo.png", show_label=False, container=False, elem_classes="banner-img")

    gr.HTML("""
    <h1 class="main-title">✨ Sonya TTS β€” A Beautiful, Expressive Neural Voice Engine</h1>
    <p class="subtitle">High-fidelity AI speech with emotion, rhythm, and audiobook mode</p>
    """)

    with gr.Tabs():

        with gr.TabItem("πŸŽ›οΈ Studio Mode"):
            with gr.Row():
                with gr.Column(scale=2):
                    inp_short = gr.Textbox(
                        label="πŸ’¬ Input Text",
                        placeholder="Type something for Sonya to say...",
                        lines=4,
                        value="Hello! I am Sonya, your AI voice."
                    )

                    # Sliders map directly onto infer_short's VITS knobs:
                    # noise_scale, noise_scale_w, length_scale.
                    with gr.Accordion("βš™οΈ Voice Controls", open=True):
                        slider_ns = gr.Slider(0.1, 1.0, value=0.4, label="🎭 Emotion", info="Higher = more expressive")
                        slider_nsw = gr.Slider(0.1, 1.0, value=0.5, label="🎡 Rhythm", info="Higher = looser timing")
                        slider_ls = gr.Slider(0.5, 1.5, value=0.97, label="⏱ Speed", info="Lower = faster, Higher = slower")

                    btn_short = gr.Button("✨ Generate Voice", variant="primary", size="lg")

                with gr.Column(scale=1):
                    out_short = gr.Audio(label="πŸ”Š Sonya's Voice", type="numpy")

            btn_short.click(
                infer_short,
                inputs=[inp_short, slider_ns, slider_nsw, slider_ls],
                outputs=[out_short]
            )

        with gr.TabItem("πŸ“– Audiobook Mode"):
            gr.Markdown(
                """<p style='text-align: center; color: #666; font-size: 1.05em;'>
                Paste long text. Sonya will read it beautifully with natural pauses.
                </p>""",
                elem_classes="audiobook-description"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    inp_long = gr.Textbox(
                        label="πŸ“œ Long Text Input",
                        placeholder="Paste your story or article here...",
                        lines=10
                    )

                    # Audiobook mode fixes noise_scale_w internally (0.6);
                    # only speed and tone variation are exposed here.
                    with gr.Accordion("βš™οΈ Narration Settings", open=False):
                        long_ls = gr.Slider(0.5, 1.5, value=1.0, label="⏱ Reading Speed")
                        long_ns = gr.Slider(0.1, 1.0, value=0.5, label="🎭 Tone Variation")

                    btn_long = gr.Button("🎧 Read Aloud", variant="primary", size="lg")

                with gr.Column(scale=1):
                    out_long = gr.Audio(label="πŸ“’ Full Narration", type="numpy")

            btn_long.click(
                infer_long,
                inputs=[inp_long, long_ls, long_ns],
                outputs=[out_long]
            )

if __name__ == "__main__":
    app.launch()