Spaces:

ahmedumeraziz
/

custom_audio_generator

Build error

App Files Files Community

custom_audio_generator / app.py

ahmedumeraziz

Create app.py

708f5fe verified 11 months ago

raw

history blame contribute delete

7 kB

	import os
	import numpy as np
	import librosa
	from pydub import AudioSegment
	import soundfile as sf
	import gdown
	from TTS.api import TTS
	from langdetect import detect
	from scipy.spatial.distance import cosine
	import torch
	import matplotlib.pyplot as plt
	import pandas as pd
	import streamlit as st
	from io import BytesIO

	# === Utility Functions ===
	def convert_mp3_to_wav(mp3_file, wav_file):
	    """Decode the MP3 at ``mp3_file`` and export it as a WAV to ``wav_file``."""
	    AudioSegment.from_file(mp3_file, format="mp3").export(wav_file, format="wav")

	def extract_mfcc(wav_file):
	    """Return the mean of each of 13 MFCC coefficients for ``wav_file``.

	    The audio is loaded with ``sr=None`` (no resampling requested), 13 MFCCs
	    are computed from it, and the resulting matrix is averaged along axis 1
	    so that one value per coefficient is returned.
	    """
	    samples, rate = librosa.load(wav_file, sr=None)
	    coefficients = librosa.feature.mfcc(y=samples, sr=rate, n_mfcc=13)
	    return coefficients.mean(axis=1)

	def clone_and_compare(tts, ref_wav, text, language, output_wav="cloned.wav"):
	    """Synthesize ``text`` in the reference voice and score the result.

	    ``tts.tts_to_file`` writes the synthesized speech to ``output_wav`` using
	    ``ref_wav`` as the speaker reference. The mean-MFCC vectors of the
	    reference and of the generated file are then compared.

	    Returns:
	        A ``(similarity, output_wav)`` tuple, where ``similarity`` is
	        ``1 - cosine distance`` between the two feature vectors.
	    """
	    tts.tts_to_file(
	        text=text,
	        speaker_wav=ref_wav,
	        language=language,
	        file_path=output_wav,
	    )
	    reference_features = extract_mfcc(ref_wav)
	    cloned_features = extract_mfcc(output_wav)
	    distance = cosine(reference_features, cloned_features)
	    return 1 - distance, output_wav

	def standardize_audio_format(input_file, output_file, sample_rate=22050):
	    """Load ``input_file`` at ``sample_rate`` and write it to ``output_file``.

	    Args:
	        input_file: Path of the audio file to read.
	        output_file: Path the re-encoded audio is written to.
	        sample_rate: Target sampling rate passed to ``librosa.load``. ``None``
	            keeps the file's own rate (the same convention ``sr=None`` has in
	            ``librosa.load``).
	    """
	    # Write with the rate librosa reports rather than the raw argument: the
	    # two are equal for a numeric ``sample_rate``, but when ``None`` is passed
	    # only the returned value is a usable rate for ``sf.write``.
	    samples, effective_rate = librosa.load(input_file, sr=sample_rate)
	    sf.write(output_file, samples, effective_rate)

	# === Streamlit App ===
	def _load_tts_model():
	    """Return the TTS model stored in the Streamlit session, loading it once."""
	    if 'tts' not in st.session_state:
	        with st.spinner("Loading TTS model..."):
	            st.session_state.tts = TTS(
	                model_name="tts_models/multilingual/multi-dataset/your_tts",
	                progress_bar=False,
	                gpu=torch.cuda.is_available(),
	            )
	    return st.session_state.tts

	def _inputs_from_upload():
	    """Return ``(wav_file, input_text)`` from uploaded MP3 and TXT files."""
	    audio_file = st.file_uploader("Upload your audio (MP3) file", type=["mp3"])
	    text_file = st.file_uploader("Upload your text file", type=["txt"])
	    if not (audio_file and text_file):
	        return None, None

	    wav_file = "input.wav"
	    try:
	        with open("temp.mp3", "wb") as f:
	            f.write(audio_file.getbuffer())
	        convert_mp3_to_wav("temp.mp3", wav_file)
	        # getvalue() does not depend on the stream position, so the text is
	        # still complete if the upload object was already read once.
	        input_text = text_file.getvalue().decode("utf-8")
	    except Exception as e:
	        st.error(f"Error processing uploaded files: {e}")
	        return None, None
	    return wav_file, input_text

	def _inputs_from_local_paths():
	    """Return ``(wav_file, input_text)`` from user-typed file system paths."""
	    mp3_path = st.text_input("Enter path to your MP3 file")
	    text_path = st.text_input("Enter path to your text file")
	    if not (mp3_path and text_path):
	        return None, None

	    # Swap only the final extension. str.replace(".mp3", ".wav") rewrote
	    # every occurrence in the path (directories included) and left names such
	    # as "Song.MP3" unchanged, making the WAV export overwrite the source.
	    wav_file = os.path.splitext(mp3_path)[0] + ".wav"
	    try:
	        convert_mp3_to_wav(mp3_path, wav_file)
	        # Decode as UTF-8 explicitly, matching the upload branch.
	        with open(text_path, 'r', encoding="utf-8") as file:
	            input_text = file.read()
	    except Exception as e:
	        st.error(f"Error reading local files: {e}")
	        return None, None
	    return wav_file, input_text

	def _inputs_from_gdrive():
	    """Return ``(wav_file, input_text)`` using an MP3 fetched from Google Drive."""
	    gdrive_url = st.text_input("Enter the Google Drive MP3 link")
	    input_text = st.text_area("Enter the text to be spoken using cloned voice")
	    if not (gdrive_url and input_text):
	        return None, input_text

	    mp3_file = "input.mp3"
	    wav_file = "input.wav"
	    try:
	        file_id = gdrive_url.split("/d/")[1].split("/")[0]
	        download_url = f"https://drive.google.com/uc?id={file_id}"
	        downloaded = gdown.download(download_url, mp3_file, quiet=False)
	        if not downloaded:
	            # Without this check a leftover input.mp3 from an earlier run
	            # would be converted as if the download had succeeded.
	            raise RuntimeError("no file was downloaded")
	        convert_mp3_to_wav(mp3_file, wav_file)
	    except Exception as e:
	        st.error(f"Error downloading from Google Drive: {e}")
	        # Report no audio so the caller does not clone from a stale or
	        # missing input.wav.
	        return None, input_text
	    return wav_file, input_text

	def _load_csv_upload():
	    """Show an uploaded CSV and return it as a DataFrame (``None`` if absent)."""
	    csv_file = st.file_uploader("Upload your voice_dataset.csv", type=["csv"])
	    if not csv_file:
	        return None
	    csv_data = pd.read_csv(csv_file)
	    st.write("Uploaded CSV data:")
	    st.dataframe(csv_data)
	    return csv_data

	def _match_model_language(tts, detected):
	    """Map a langdetect code onto one of the model's language names.

	    Falls back to ``detected`` unchanged when the model exposes no language
	    list or nothing matches.
	    """
	    # NOTE(review): relies on the TTS object exposing ``languages``; for the
	    # YourTTS model these are presumably names like "fr-fr" rather than the
	    # bare "fr" that langdetect returns - confirm against the TTS API docs.
	    supported = getattr(tts, "languages", None) or []
	    if detected in supported:
	        return detected
	    base = detected.split("-")[0].lower()
	    for name in supported:
	        if name.lower().split("-")[0] == base:
	            return name
	    return detected

	def _run_cloning_attempts(tts, wav_file, input_text, language):
	    """Run the cloning attempts; return ``(results, best_similarity, best_output)``."""
	    attempts = 5
	    # Start below any real score so that zero or negative similarities can
	    # still be selected; ``best_output`` stays None only if no score compared
	    # greater (e.g. every score was NaN).
	    best_similarity = float("-inf")
	    best_output = None
	    results = []

	    st.write(f"🔁 Running {attempts} cloning attempts for best match...")
	    progress_bar = st.progress(0)

	    for i in range(attempts):
	        with st.spinner(f"Running attempt {i+1}/{attempts}..."):
	            sim, out_file = clone_and_compare(
	                tts,
	                wav_file,
	                input_text,
	                language,
	                f"clone_try_{i}.wav",
	            )
	        results.append({"Attempt": i + 1, "Similarity": sim})
	        progress_bar.progress((i + 1) / attempts)
	        st.write(f"Attempt {i+1}: Similarity = {sim*100:.2f}%")

	        if sim > best_similarity:
	            best_similarity = sim
	            best_output = out_file

	    return results, best_similarity, best_output

	def _render_results(results, best_similarity, best_output):
	    """Save the best clone and the scores, then show the plot and downloads."""
	    # Standardize & Save Final Audio
	    standardize_audio_format(best_output, "final_cloned_voice.wav")
	    st.success(f"✅ Best voice with similarity {best_similarity*100:.2f}%")

	    # Save CSV
	    df = pd.DataFrame(results)
	    df.to_csv("voice_dataset.csv", index=False)

	    # Plot
	    fig, ax = plt.subplots()
	    ax.plot(df['Attempt'], df['Similarity'] * 100, marker='o')
	    ax.set_title("Voice Similarity Over Attempts")
	    ax.set_xlabel("Attempt")
	    ax.set_ylabel("Similarity (%)")
	    ax.set_ylim(0, 100)
	    ax.grid(True)
	    st.pyplot(fig)
	    # pyplot keeps every figure it creates until it is closed explicitly.
	    plt.close(fig)

	    # Download options
	    # NOTE(review): these buttons are rendered only inside the
	    # "Start Voice Cloning" branch; if pressing a download button reruns the
	    # script, the results section will disappear - consider persisting the
	    # results in st.session_state.
	    st.subheader("📥 Download Results")

	    col1, col2 = st.columns(2)

	    with col1:
	        with open("voice_dataset.csv", "rb") as f:
	            st.download_button(
	                "Download CSV",
	                f,
	                file_name="voice_dataset.csv",
	                mime="text/csv",
	            )

	    with col2:
	        with open("final_cloned_voice.wav", "rb") as f:
	            st.download_button(
	                "Download Audio",
	                f,
	                file_name="final_cloned_voice.wav",
	                mime="audio/wav",
	            )

	def main():
	    """Render the voice-cloning Streamlit page.

	    The page loads the TTS model once per session, gathers a reference
	    recording and text through one of four input methods, and - when the
	    user presses the start button - runs several cloning attempts, keeps the
	    one with the highest MFCC similarity, and offers the scores (CSV) and the
	    best audio (WAV) for download. Uploading an existing CSV only displays it
	    and skips cloning.
	    """
	    st.title("🎙️ Voice Cloning App")
	    st.write("Clone voices and compare similarity with the original")

	    # Initialize TTS model
	    tts = _load_tts_model()

	    # Input method selection
	    input_method = st.radio(
	        "How do you want to provide the voice/text data?",
	        options=[
	            "Upload audio and text manually",
	            "Enter local paths",
	            "Use Google Drive link",
	            "Upload existing CSV file",
	        ],
	    )

	    wav_file = None
	    input_text = None
	    csv_data = None

	    if input_method == "Upload audio and text manually":
	        wav_file, input_text = _inputs_from_upload()
	    elif input_method == "Enter local paths":
	        wav_file, input_text = _inputs_from_local_paths()
	    elif input_method == "Use Google Drive link":
	        wav_file, input_text = _inputs_from_gdrive()
	    elif input_method == "Upload existing CSV file":
	        csv_data = _load_csv_upload()

	    # Process cloning if we have the required inputs
	    if csv_data is not None:
	        st.success("✅ You uploaded an existing CSV, skipping voice cloning.")
	        return
	    if not (wav_file and input_text):
	        return

	    try:
	        detected = detect(input_text)
	        st.write(f"Detected language: {detected}")

	        if st.button("Start Voice Cloning"):
	            language = _match_model_language(tts, detected)
	            results, best_similarity, best_output = _run_cloning_attempts(
	                tts, wav_file, input_text, language
	            )
	            if best_output is None:
	                st.error("No cloning attempt produced a usable similarity score.")
	                return
	            _render_results(results, best_similarity, best_output)
	    except Exception as e:
	        # Top-level UI boundary: show the failure on the page.
	        st.error(f"An error occurred: {str(e)}")

	# Script entry point: build the page only when this file is executed directly.
	if __name__ == "__main__":
	    main()