| import gradio as gr |
| import os |
| import shutil |
|
|
| |
| import numpy as np |
| from scipy.io import wavfile |
| """ |
| model_ids = [ |
| 'suno/bark', |
| ] |
| |
| for model_id in model_ids: |
| model_name = model_id.split('/')[-1] |
| snapshot_download(model_id, local_dir=f'checkpoints/{model_name}') |
| |
| from TTS.tts.configs.bark_config import BarkConfig |
| from TTS.tts.models.bark import Bark |
| |
| #os.environ['CUDA_VISIBLE_DEVICES'] = '1' |
| config = BarkConfig() |
| model = Bark.init_from_config(config) |
| model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True) |
| """ |
from TTS.api import TTS
# Load the multilingual Bark model once at import time (gpu=True puts it on
# CUDA). This single instance is reused by every call to `infer` below.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
|
|
def infer(prompt, input_wav_file):
    """Clone the voice in *input_wav_file* and speak *prompt* with it.

    Parameters
    ----------
    prompt : str
        Text to synthesize.
    input_wav_file : str
        Filesystem path to the uploaded reference WAV (Gradio passes a
        filepath because the input component uses type="filepath").

    Returns
    -------
    tuple
        (path to "output.wav", waveform video, gr.update for the .npz
        speaker-embedding file component).
    """
    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")

    # Coqui's Bark cloning expects the reference audio laid out as
    # bark_voices/<speaker_id>/<speaker_id>.wav — move the upload there.
    source_path = input_wav_file
    destination_directory = "bark_voices"
    file_name = os.path.splitext(os.path.basename(source_path))[0]
    destination_path = os.path.join(destination_directory, file_name)
    os.makedirs(destination_path, exist_ok=True)
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))

    # Synthesize to output.wav. On the first run for a speaker this also
    # writes a <speaker>.npz voice embedding into the speaker directory.
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=file_name)

    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # BUG FIX: the original returned contents[1], but os.listdir order is
    # arbitrary, so index 1 could be the .wav (or raise IndexError with a
    # single entry). Select the .npz embedding explicitly instead.
    npz_name = next((f for f in contents if f.endswith(".npz")), None)

    tts_video = gr.make_waveform(audio="output.wav")

    if npz_name is not None:
        npz_update = gr.update(value=f"bark_voices/{file_name}/{npz_name}",
                               visible=True)
    else:
        # No embedding was produced; keep the file component hidden rather
        # than pointing it at a wrong file.
        npz_update = gr.update(visible=False)

    return "output.wav", tts_video, npz_update
|
|
|
|
# Page CSS: cap the main column width and center any image whose src ends
# in "#center" (used by the "Duplicate this Space" badge markdown below).
css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
img[src*='#center'] {
    display: block;
    margin: auto;
}
"""
|
|
# Gradio UI: prompt + reference WAV in, cloned audio / waveform video /
# speaker .npz out. Layout is unchanged from the original.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):

        # Header / instructions. FIX: user-facing typo "Coqui TSS" -> "Coqui TTS".
        gr.Markdown("""
        <h1 style="text-align: center;">Instant Voice Cloning</h1>
        <p style="text-align: center;">
        Clone any voice in less than 2 minutes with this <a href="https://tts.readthedocs.io/en/dev/models/bark.html" target="_blank">Coqui TTS + Bark</a> demo ! <br />
        Upload a clean 20 seconds WAV file of the voice you want to clone, <br />
        type your text-to-speech prompt and hit submit ! <br />
        </p>

        [![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm.svg#center)](https://huggingface.co/spaces/fffiloni/instant-TTS-Bark-cloning?duplicate=true)

        """)
        with gr.Row():
            with gr.Column():
                # Text the cloned voice will speak.
                prompt = gr.Textbox(
                    label="Text to speech prompt"
                )

                # type="filepath" so `infer` receives a path it can move
                # into the bark_voices/ speaker directory.
                audio_in = gr.Audio(
                    label="WAV voice to clone",
                    type="filepath",
                    source="upload"
                )

                submit_btn = gr.Button("Submit")

            with gr.Column():
                # Synthesized speech in the cloned voice.
                cloned_out = gr.Audio(
                    label="Text to speech output"
                )

                # Animated waveform rendering of output.wav.
                video_out = gr.Video(
                    label="Waveform video"
                )

                # Bark speaker embedding; revealed by `infer` after a run.
                npz_file = gr.File(
                    label=".npz file",
                    visible=False
                )

        submit_btn.click(
            fn=infer,
            inputs=[
                prompt,
                audio_in
            ],
            outputs=[
                cloned_out,
                video_out,
                npz_file
            ]
        )

# Queue requests so long Bark syntheses don't time out concurrent users.
demo.queue(max_size=20).launch()