Spaces:
Running
Running
| import os | |
| import sys | |
| from os import rename | |
| from os.path import basename | |
| from zipfile import ZipFile, ZIP_DEFLATED | |
| from shutil import rmtree | |
| from importlib.metadata import version | |
| import sphn | |
| import torch | |
| import sentry_sdk | |
| import gradio as gr | |
| from gradio.themes import Soft | |
| from inference import inference_file | |
# Error reporting is opt-in: Sentry is initialised only when a DSN is present
# in the environment.
if os.environ.get("SENTRY_DSN") is not None:
    sentry_sdk.init(
        dsn=os.environ["SENTRY_DSN"],
        send_default_pii=True,
    )

    print("Sentry SDK is activated")

# Pick the torch device once at import time; the value is shown on the
# environment tab.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"

if use_cuda:
    print("CUDA is available, setting correct device variable.")
# Markdown shown on the "Authors" tab.
# The table was generated with https://www.tablesgenerator.com/markdown_tables
authors_table = """
## Authors
Follow them in social networks and **contact** if you need any help or have any questions:
| **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram |
| https://x.com/yehor_smoliakov at X |
| https://github.com/egorsmkv at GitHub |
| https://huggingface.co/Yehor at Hugging Face |
| or use egorsmkv@gmail.com |
""".strip()

# Markdown describing the runtime (interpreter, torch device) and the model.
tech_env = f"""
#### Environment
- Python: {sys.version}
- Torch device: {device}
#### Models
##### Acoustic model (Voice Activity Detection)
- Name: MarbleNet
- URL: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_marblenet
""".strip()

# Markdown listing the installed version of each library, one bullet per
# distribution, looked up through importlib.metadata.
tech_libraries = "\n".join(
    [
        "#### Libraries",
        *(
            f"- {package}: {version(package)}"
            for package in ("torch", "sphn", "gradio", "sentry_sdk")
        ),
    ]
)

# Heading and one-line summary rendered at the top of the main tab.
description_head = """
# MarbleNet
Split an audio file to voice chunks.
""".strip()

# Value passed as ``concurrency_limit`` when the "Run" click handler is
# registered.
concurrency_limit = 5
def inference_func(wav_file, min_sec, max_sec):
    """Split an uploaded WAV file into voice chunks and pack them into a ZIP.

    The file is validated (readable, at least 0.1 s long, mono), moved to
    ``input.wav`` and passed to ``inference_file``. Every detected chunk whose
    duration lies strictly between ``min_sec`` and ``max_sec`` is added to the
    archive.

    Args:
        wav_file: Path of the uploaded audio file.
        min_sec: Chunks lasting this many seconds or less are skipped.
        max_sec: Chunks lasting this many seconds or more are skipped.

    Returns:
        The path of the ZIP archive containing the selected chunks.

    Raises:
        gr.Error: If the interval is missing or empty, the file cannot be
            read, the audio is too short or not mono, or inference fails.
    """
    archive_name = "tmp.zip"
    n_files = 0
    duration_secs = 0

    # NOTE(review): "tmp.zip", "input.wav" and "chunks" are fixed paths in the
    # working directory, while the click handler is registered with a
    # concurrency limit above one. Overlapping requests can overwrite or delete
    # each other's files. Per-request paths would fix this, but that depends
    # on how inference_file picks its output location - confirm there first.

    # A cleared number field arrives as None, which would otherwise fail with
    # a TypeError at the duration comparison below; an empty interval can
    # never match any chunk.
    if min_sec is None or max_sec is None:
        raise gr.Error("Both the minimum and the maximum seconds must be set")
    if min_sec >= max_sec:
        raise gr.Error("Minimum seconds must be less than maximum seconds")

    # Only reading and measuring the audio is guarded, so the validation
    # errors below reach the user verbatim instead of being re-labelled as a
    # read failure.
    try:
        data, sr = sphn.read(wav_file)
        n_channels = len(data)
        duration = len(data[0]) / sr
    except Exception as e:
        raise gr.Error(f"Can't read your file, the problem: {e}") from e

    if duration < 0.1:
        raise gr.Error("The duration is too low")

    if n_channels > 1:
        raise gr.Error(
            f"Your file must be in the mono format. The file has {n_channels} channels."
        )

    # Rename the file
    old_wav_file = wav_file
    wav_file = "input.wav"
    rename(old_wav_file, wav_file)

    try:
        # Run the model before opening the archive so that a failed run does
        # not leave a stub archive behind.
        try:
            results = inference_file(wav_file)
        except Exception as exc:
            sentry_sdk.capture_exception(exc)
            raise gr.Error(
                "Something went wrong, we will be notified about this"
            ) from exc

        with ZipFile(
            archive_name,
            "w",
            compression=ZIP_DEFLATED,
            allowZip64=True,
            compresslevel=9,
        ) as zip_file:
            for result in results:
                duration = result["speech"]["duration"]
                print(result, duration)

                if duration <= min_sec or duration >= max_sec:
                    print("Skipping...")
                    continue

                # Store chunks flat in the archive, without their directory.
                arc_name = basename(result["filename"])
                zip_file.write(result["filename"], arc_name)

                duration_secs += duration
                n_files += 1
    finally:
        # Remove the chunk files whether or not the run succeeded, and do not
        # fail when the directory was never created.
        # Presumably inference_file writes its chunks here - confirm.
        rmtree("chunks", ignore_errors=True)

    mins = round(duration_secs / 60, 4)

    gr.Success(
        f"VAD model identified {n_files} files in interval [{min_sec}:{max_sec}], total duration = {mins} min."
    )

    return archive_name
def create_app():
    """Build the main tab: file upload, duration limits, and the Run button.

    Returns:
        The ``gr.Blocks`` instance holding the VAD user interface.
    """
    with gr.Blocks(
        title="MarbleNet",
        analytics_enabled=False,
        theme=Soft(),
    ) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        # Inputs: the audio file and the accepted chunk-duration interval.
        with gr.Column():
            wav_file = gr.File(
                label="WAV file to process",
                file_count="single",
                file_types=[".wav"],
            )
            min_sec = gr.Number(
                label="Minimum seconds", value=0.1, minimum=0.01, maximum=59.99
            )
            max_sec = gr.Number(
                label="Maximum seconds", value=30, minimum=0.02, maximum=60
            )

        # Output: the archive returned by the click handler.
        with gr.Column():
            zip_file = gr.File(label="ZIP file with voice chunks")

        run_button = gr.Button("Run")
        run_button.click(
            fn=inference_func,
            inputs=[wav_file, min_sec, max_sec],
            outputs=[zip_file],
            concurrency_limit=concurrency_limit,
        )

    return tab
def create_env():
    """Build the tab that shows the environment, model, and library details.

    Returns:
        The ``gr.Blocks`` instance rendering ``tech_env`` and
        ``tech_libraries``.
    """
    tab = gr.Blocks(theme=Soft())

    with tab:
        for section in (tech_env, tech_libraries):
            gr.Markdown(section)

    return tab
def create_authors():
    """Build the tab that shows the authors' contact table.

    Returns:
        The ``gr.Blocks`` instance rendering ``authors_table``.
    """
    tab = gr.Blocks(theme=Soft())

    with tab:
        gr.Markdown(authors_table)

    return tab
def create_demo():
    """Assemble the three tabs into a single tabbed interface.

    Returns:
        A ``gr.TabbedInterface`` with the VAD, authors, and environment tabs,
        in that order.
    """
    # Tab title -> builder. Dict order fixes both the build order and the
    # on-screen tab order.
    tab_builders = {
        "🎙️ VAD": create_app,
        "👥 Authors": create_authors,
        "📦 Environment, Models, and Libraries": create_env,
    }

    tabs = [build() for build in tab_builders.values()]

    return gr.TabbedInterface(tabs, tab_names=list(tab_builders))
# Script entry point: build the interface, enable the request queue, and
# start the server.
if __name__ == "__main__":
    demo = create_demo()
    demo.queue().launch()