Spaces:
Build error
Build error
| import os | |
| import numpy as np | |
| import librosa | |
| from pydub import AudioSegment | |
| import soundfile as sf | |
| import gdown | |
| from TTS.api import TTS | |
| from langdetect import detect | |
| from scipy.spatial.distance import cosine | |
| import torch | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| import streamlit as st | |
| from io import BytesIO | |
| # === Utility Functions === | |
| def convert_mp3_to_wav(mp3_file, wav_file): | |
| audio = AudioSegment.from_file(mp3_file, format="mp3") | |
| audio.export(wav_file, format="wav") | |
| def extract_mfcc(wav_file): | |
| y, sr = librosa.load(wav_file, sr=None) | |
| mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) | |
| return np.mean(mfcc, axis=1) | |
| def clone_and_compare(tts, ref_wav, text, language, output_wav="cloned.wav"): | |
| tts.tts_to_file(text=text, speaker_wav=ref_wav, language=language, file_path=output_wav) | |
| orig = extract_mfcc(ref_wav) | |
| clone = extract_mfcc(output_wav) | |
| similarity = 1 - cosine(orig, clone) | |
| return similarity, output_wav | |
| def standardize_audio_format(input_file, output_file, sample_rate=22050): | |
| y, sr = librosa.load(input_file, sr=sample_rate) | |
| sf.write(output_file, y, sample_rate) | |
| # === Streamlit App === | |
| def main(): | |
| st.title("ποΈ Voice Cloning App") | |
| st.write("Clone voices and compare similarity with the original") | |
| # Initialize TTS model | |
| if 'tts' not in st.session_state: | |
| with st.spinner("Loading TTS model..."): | |
| st.session_state.tts = TTS( | |
| model_name="tts_models/multilingual/multi-dataset/your_tts", | |
| progress_bar=False, | |
| gpu=torch.cuda.is_available() | |
| ) | |
| # Input method selection | |
| input_method = st.radio( | |
| "How do you want to provide the voice/text data?", | |
| options=[ | |
| "Upload audio and text manually", | |
| "Enter local paths", | |
| "Use Google Drive link", | |
| "Upload existing CSV file" | |
| ] | |
| ) | |
| wav_file = None | |
| input_text = None | |
| csv_data = None | |
| if input_method == "Upload audio and text manually": | |
| audio_file = st.file_uploader("Upload your audio (MP3) file", type=["mp3"]) | |
| text_file = st.file_uploader("Upload your text file", type=["txt"]) | |
| if audio_file and text_file: | |
| wav_file = "input.wav" | |
| with open("temp.mp3", "wb") as f: | |
| f.write(audio_file.getbuffer()) | |
| convert_mp3_to_wav("temp.mp3", wav_file) | |
| input_text = text_file.read().decode("utf-8") | |
| elif input_method == "Enter local paths": | |
| mp3_path = st.text_input("Enter path to your MP3 file") | |
| text_path = st.text_input("Enter path to your text file") | |
| if mp3_path and text_path: | |
| wav_file = mp3_path.replace(".mp3", ".wav") | |
| convert_mp3_to_wav(mp3_path, wav_file) | |
| with open(text_path, 'r') as file: | |
| input_text = file.read() | |
| elif input_method == "Use Google Drive link": | |
| gdrive_url = st.text_input("Enter the Google Drive MP3 link") | |
| input_text = st.text_area("Enter the text to be spoken using cloned voice") | |
| if gdrive_url and input_text: | |
| mp3_file = "input.mp3" | |
| wav_file = "input.wav" | |
| try: | |
| file_id = gdrive_url.split("/d/")[1].split("/")[0] | |
| download_url = f"https://drive.google.com/uc?id={file_id}" | |
| gdown.download(download_url, mp3_file, quiet=False) | |
| convert_mp3_to_wav(mp3_file, wav_file) | |
| except Exception as e: | |
| st.error(f"Error downloading from Google Drive: {e}") | |
| elif input_method == "Upload existing CSV file": | |
| csv_file = st.file_uploader("Upload your voice_dataset.csv", type=["csv"]) | |
| if csv_file: | |
| csv_data = pd.read_csv(csv_file) | |
| st.write("Uploaded CSV data:") | |
| st.dataframe(csv_data) | |
| # Process cloning if we have the required inputs | |
| if csv_data is not None: | |
| st.success("β You uploaded an existing CSV, skipping voice cloning.") | |
| elif wav_file and input_text: | |
| try: | |
| language = detect(input_text) | |
| st.write(f"Detected language: {language}") | |
| if st.button("Start Voice Cloning"): | |
| best_similarity = 0 | |
| best_output = "" | |
| results = [] | |
| st.write("π Running 5 cloning attempts for best match...") | |
| progress_bar = st.progress(0) | |
| for i in range(5): | |
| with st.spinner(f"Running attempt {i+1}/5..."): | |
| sim, out_file = clone_and_compare( | |
| st.session_state.tts, | |
| wav_file, | |
| input_text, | |
| language, | |
| f"clone_try_{i}.wav" | |
| ) | |
| results.append({"Attempt": i + 1, "Similarity": sim}) | |
| progress_bar.progress((i+1)/5) | |
| st.write(f"Attempt {i+1}: Similarity = {sim*100:.2f}%") | |
| if sim > best_similarity: | |
| best_similarity = sim | |
| best_output = out_file | |
| # Standardize & Save Final Audio | |
| standardize_audio_format(best_output, "final_cloned_voice.wav") | |
| st.success(f"β Best voice with similarity {best_similarity*100:.2f}%") | |
| # Save CSV | |
| df = pd.DataFrame(results) | |
| df.to_csv("voice_dataset.csv", index=False) | |
| # Plot | |
| fig, ax = plt.subplots() | |
| ax.plot(df['Attempt'], df['Similarity'] * 100, marker='o') | |
| ax.set_title("Voice Similarity Over Attempts") | |
| ax.set_xlabel("Attempt") | |
| ax.set_ylabel("Similarity (%)") | |
| ax.set_ylim(0, 100) | |
| ax.grid(True) | |
| st.pyplot(fig) | |
| # Download options | |
| st.subheader("π₯ Download Results") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| with open("voice_dataset.csv", "rb") as f: | |
| st.download_button( | |
| "Download CSV", | |
| f, | |
| file_name="voice_dataset.csv", | |
| mime="text/csv" | |
| ) | |
| with col2: | |
| with open("final_cloned_voice.wav", "rb") as f: | |
| st.download_button( | |
| "Download Audio", | |
| f, | |
| file_name="final_cloned_voice.wav", | |
| mime="audio/wav" | |
| ) | |
| except Exception as e: | |
| st.error(f"An error occurred: {str(e)}") | |
| if __name__ == "__main__": | |
| main() |