ahmedumeraziz's picture
Create app.py
708f5fe verified
import os
import numpy as np
import librosa
from pydub import AudioSegment
import soundfile as sf
import gdown
from TTS.api import TTS
from langdetect import detect
from scipy.spatial.distance import cosine
import torch
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from io import BytesIO
# === Utility Functions ===
def convert_mp3_to_wav(mp3_file, wav_file):
audio = AudioSegment.from_file(mp3_file, format="mp3")
audio.export(wav_file, format="wav")
def extract_mfcc(wav_file):
y, sr = librosa.load(wav_file, sr=None)
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
return np.mean(mfcc, axis=1)
def clone_and_compare(tts, ref_wav, text, language, output_wav="cloned.wav"):
tts.tts_to_file(text=text, speaker_wav=ref_wav, language=language, file_path=output_wav)
orig = extract_mfcc(ref_wav)
clone = extract_mfcc(output_wav)
similarity = 1 - cosine(orig, clone)
return similarity, output_wav
def standardize_audio_format(input_file, output_file, sample_rate=22050):
y, sr = librosa.load(input_file, sr=sample_rate)
sf.write(output_file, y, sample_rate)
# === Streamlit App ===
def main():
st.title("πŸŽ™οΈ Voice Cloning App")
st.write("Clone voices and compare similarity with the original")
# Initialize TTS model
if 'tts' not in st.session_state:
with st.spinner("Loading TTS model..."):
st.session_state.tts = TTS(
model_name="tts_models/multilingual/multi-dataset/your_tts",
progress_bar=False,
gpu=torch.cuda.is_available()
)
# Input method selection
input_method = st.radio(
"How do you want to provide the voice/text data?",
options=[
"Upload audio and text manually",
"Enter local paths",
"Use Google Drive link",
"Upload existing CSV file"
]
)
wav_file = None
input_text = None
csv_data = None
if input_method == "Upload audio and text manually":
audio_file = st.file_uploader("Upload your audio (MP3) file", type=["mp3"])
text_file = st.file_uploader("Upload your text file", type=["txt"])
if audio_file and text_file:
wav_file = "input.wav"
with open("temp.mp3", "wb") as f:
f.write(audio_file.getbuffer())
convert_mp3_to_wav("temp.mp3", wav_file)
input_text = text_file.read().decode("utf-8")
elif input_method == "Enter local paths":
mp3_path = st.text_input("Enter path to your MP3 file")
text_path = st.text_input("Enter path to your text file")
if mp3_path and text_path:
wav_file = mp3_path.replace(".mp3", ".wav")
convert_mp3_to_wav(mp3_path, wav_file)
with open(text_path, 'r') as file:
input_text = file.read()
elif input_method == "Use Google Drive link":
gdrive_url = st.text_input("Enter the Google Drive MP3 link")
input_text = st.text_area("Enter the text to be spoken using cloned voice")
if gdrive_url and input_text:
mp3_file = "input.mp3"
wav_file = "input.wav"
try:
file_id = gdrive_url.split("/d/")[1].split("/")[0]
download_url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(download_url, mp3_file, quiet=False)
convert_mp3_to_wav(mp3_file, wav_file)
except Exception as e:
st.error(f"Error downloading from Google Drive: {e}")
elif input_method == "Upload existing CSV file":
csv_file = st.file_uploader("Upload your voice_dataset.csv", type=["csv"])
if csv_file:
csv_data = pd.read_csv(csv_file)
st.write("Uploaded CSV data:")
st.dataframe(csv_data)
# Process cloning if we have the required inputs
if csv_data is not None:
st.success("βœ… You uploaded an existing CSV, skipping voice cloning.")
elif wav_file and input_text:
try:
language = detect(input_text)
st.write(f"Detected language: {language}")
if st.button("Start Voice Cloning"):
best_similarity = 0
best_output = ""
results = []
st.write("πŸ” Running 5 cloning attempts for best match...")
progress_bar = st.progress(0)
for i in range(5):
with st.spinner(f"Running attempt {i+1}/5..."):
sim, out_file = clone_and_compare(
st.session_state.tts,
wav_file,
input_text,
language,
f"clone_try_{i}.wav"
)
results.append({"Attempt": i + 1, "Similarity": sim})
progress_bar.progress((i+1)/5)
st.write(f"Attempt {i+1}: Similarity = {sim*100:.2f}%")
if sim > best_similarity:
best_similarity = sim
best_output = out_file
# Standardize & Save Final Audio
standardize_audio_format(best_output, "final_cloned_voice.wav")
st.success(f"βœ… Best voice with similarity {best_similarity*100:.2f}%")
# Save CSV
df = pd.DataFrame(results)
df.to_csv("voice_dataset.csv", index=False)
# Plot
fig, ax = plt.subplots()
ax.plot(df['Attempt'], df['Similarity'] * 100, marker='o')
ax.set_title("Voice Similarity Over Attempts")
ax.set_xlabel("Attempt")
ax.set_ylabel("Similarity (%)")
ax.set_ylim(0, 100)
ax.grid(True)
st.pyplot(fig)
# Download options
st.subheader("πŸ“₯ Download Results")
col1, col2 = st.columns(2)
with col1:
with open("voice_dataset.csv", "rb") as f:
st.download_button(
"Download CSV",
f,
file_name="voice_dataset.csv",
mime="text/csv"
)
with col2:
with open("final_cloned_voice.wav", "rb") as f:
st.download_button(
"Download Audio",
f,
file_name="final_cloned_voice.wav",
mime="audio/wav"
)
except Exception as e:
st.error(f"An error occurred: {str(e)}")
if __name__ == "__main__":
main()