Spaces:
Paused
Paused
yutoAb committed on
Commit ·
4ee8d5c
1
Parent(s): b2eed5f
Add application file
Browse files- Dockerfile +39 -0
- README.md +59 -1
- app.py +201 -0
- requirements.txt +2 -0
Dockerfile
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /app

# Needed for Triton/Inductor JIT compilation (gcc, etc.)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies first so these layers are cached independently
# of application-code changes.
COPY requirements.txt .

RUN pip install --no-cache-dir uv
RUN uv pip install --system --no-cache-dir -r requirements.txt

# Copy application code last: editing app.py no longer invalidates the
# dependency-install layers above (previously app.py was copied before
# the install, busting the cache on every code change).
COPY app.py .

# Stability: pin BLAS thread counts; allow the CUDA allocator to grow segments
ENV OMP_NUM_THREADS=1
ENV MKL_NUM_THREADS=1
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Unbuffered stdout so logs do not appear to stall
ENV PYTHONUNBUFFERED=1

# Lighter startup: disable torch.compile machinery
ENV TORCHDYNAMO_DISABLE=1
ENV TORCHINDUCTOR_DISABLE=1

ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Expose ports for:
# - 7860: Main Gradio interface
# - 8998: Moshi server 1 (finetuned-step-9282)
# - 8999: Moshi server 2 (j-moshi)
EXPOSE 7860 8998 8999

CMD ["python", "app.py"]
README.md
CHANGED
|
@@ -6,6 +6,64 @@ colorTo: indigo
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: apache-2.0
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: apache-2.0
|
| 9 |
+
short_description: Compare two Moshi models with tabbed interface
|
| 10 |
+
app_port: 7860
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🎙️ Dual Moshi Model Interface
|
| 14 |
+
|
| 15 |
+
Compare two Moshi voice AI models side-by-side with a tabbed interface.
|
| 16 |
+
Each model runs with full WebRTC support for real-time voice conversation.
|
| 17 |
+
|
| 18 |
+
## Available Models
|
| 19 |
+
|
| 20 |
+
### 1. Finetuned Step 9282
|
| 21 |
+
- **Repository**: `abePclWaseda/moshi-finetuned-step-9282`
|
| 22 |
+
- Fine-tuned Moshi model at training step 9282
|
| 23 |
+
- Optimized for specific use cases
|
| 24 |
+
- **Port**: 8998
|
| 25 |
+
|
| 26 |
+
### 2. J-Moshi (Japanese)
|
| 27 |
+
- **Repository**: `nu-dialogue/j-moshi`
|
| 28 |
+
- Japanese-optimized full-duplex spoken dialogue system
|
| 29 |
+
- Built on Moshi 7B with additional Japanese training data
|
| 30 |
+
- Supports natural turn-taking and backchannel responses (相槌)
|
| 31 |
+
- **Port**: 8999
|
| 32 |
+
|
| 33 |
+
## Features
|
| 34 |
+
|
| 35 |
+
- **Tabbed Interface**: Switch between models using tabs
|
| 36 |
+
- **Full WebRTC Support**: Complete `moshi.server` UI for each model
|
| 37 |
+
- **Dual Model Execution**: Both models run simultaneously
|
| 38 |
+
- **Real-time Voice**: Full-duplex conversation with microphone input
|
| 39 |
+
- **GPU Optimized**: Designed for 48GB+ GPU environments
|
| 40 |
+
|
| 41 |
+
## Architecture
|
| 42 |
+
|
| 43 |
+
The application runs:
|
| 44 |
+
1. **Main Gradio Interface** (Port 7860) - Tabbed UI for model selection
|
| 45 |
+
2. **Moshi Server 1** (Port 8998) - Finetuned Step 9282 model
|
| 46 |
+
3. **Moshi Server 2** (Port 8999) - J-Moshi model
|
| 47 |
+
|
| 48 |
+
Each tab embeds the complete `moshi.server` interface with WebRTC support.
|
| 49 |
+
|
| 50 |
+
## Requirements
|
| 51 |
+
|
| 52 |
+
- **GPU**: 48GB+ VRAM (e.g., NVIDIA A100 80GB or L40S 48GB) or equivalent recommended
|
| 53 |
+
- **Memory**: ~48GB GPU VRAM (24GB per model running simultaneously)
|
| 54 |
+
- **Docker**: Containerized deployment
|
| 55 |
+
|
| 56 |
+
## Usage
|
| 57 |
+
|
| 58 |
+
1. Open the application (port 7860)
|
| 59 |
+
2. Click on a tab to select which model to use
|
| 60 |
+
3. Click inside the embedded interface to interact with Moshi
|
| 61 |
+
4. Use the microphone button to start voice conversation
|
| 62 |
+
5. Switch tabs to compare different models
|
| 63 |
+
|
| 64 |
+
## Technical Details
|
| 65 |
+
|
| 66 |
+
- **Framework**: Gradio + moshi.server
|
| 67 |
+
- **Models**: Moshi (7B parameters each)
|
| 68 |
+
- **Codec**: MIMI audio codec
|
| 69 |
+
- **Ports**: 7860 (main), 8998, 8999 (model servers)
|
app.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Dual Moshi Model Interface
Runs two moshi.server instances and provides a tabbed interface for comparison
"""

import logging
import multiprocessing
import os   # NOTE(review): appears unused in this file — confirm before removing
import subprocess
import sys  # NOTE(review): appears unused in this file — confirm before removing
import time
from typing import List

import gradio as gr

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# Model configurations: one entry per moshi.server instance to launch.
# "repo" is the Hugging Face model repository ID; "port" is the local TCP
# port the server listens on (must match the EXPOSE list in the Dockerfile
# and the iframe URLs built in create_gradio_interface()).
MODELS = [
    {
        "name": "Finetuned Step 9282",
        "repo": "abePclWaseda/moshi-finetuned-step-9282",
        "port": 8998,
        "description": "Fine-tuned Moshi model at training step 9282"
    },
    {
        "name": "J-Moshi (Japanese)",
        "repo": "nu-dialogue/j-moshi",
        "port": 8999,
        "description": "Japanese-optimized full-duplex dialogue system"
    }
]
def start_moshi_server(model_repo: str, port: int, model_name: str) -> None:
    """Start a moshi.server instance for a specific model.

    Blocks until the server process exits, relaying its combined
    stdout/stderr into this process's logger line by line. Intended to
    run inside a dedicated multiprocessing.Process (see main()).

    Args:
        model_repo: Hugging Face repository ID of the model to serve.
        port: TCP port the server should listen on.
        model_name: Human-readable label used as a log prefix.

    Raises:
        Exception: re-raises anything that prevents the server from being
            launched (e.g. the ``uv`` binary missing from PATH).
    """
    logger.info(f"Starting {model_name} on port {port}...")
    logger.info(f"  Repository: {model_repo}")

    try:
        # Run moshi.server with specified model and port
        cmd = [
            "uv", "run", "-m", "moshi.server",
            "--hf-repo", model_repo,
            "--host", "0.0.0.0",
            "--port", str(port)
        ]

        logger.info(f"  Command: {' '.join(cmd)}")

        # Merge stderr into stdout so a single reader loop captures
        # everything. text=True is the modern spelling of the deprecated
        # universal_newlines=True; bufsize=1 gives line buffering in
        # text mode.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )

        # Relay server output; this blocks until the server closes its
        # stdout (normally at process exit).
        for line in process.stdout:
            logger.info(f"  [{model_name}:{port}] {line.rstrip()}")

        return_code = process.wait()
        if return_code != 0:
            logger.error(f"{model_name} exited with code {return_code}")

    except Exception as e:
        logger.error(f"Failed to start {model_name}: {e}")
        raise
+
|
| 80 |
+
def create_gradio_interface() -> gr.Blocks:
    """Build the tabbed Gradio UI, one embedded moshi.server iframe per model."""

    # The CSS gives each embedded server a fixed-height bordered container
    # whose iframe fills it completely.
    with gr.Blocks(
        title="Dual Moshi Model Interface",
        theme=gr.themes.Soft(),
        css="""
        .iframe-container {
            width: 100%;
            height: 800px;
            border: 1px solid #e0e0e0;
            border-radius: 8px;
            overflow: hidden;
        }
        .iframe-container iframe {
            width: 100%;
            height: 100%;
            border: none;
        }
        """
    ) as demo:

        # Page header.
        gr.Markdown("""
        # 🎙️ Dual Moshi Model Interface

        Compare two Moshi voice AI models side-by-side. Each tab shows the full WebRTC interface
        for a different model. Switch tabs to compare responses.
        """)

        # One tab per configured model.
        with gr.Tabs() as tabs:
            for cfg in MODELS:
                with gr.TabItem(f"🎤 {cfg['name']}"):
                    # Model summary shown above the embedded interface.
                    gr.Markdown(f"""
                    ### {cfg['name']}
                    **Repository**: `{cfg['repo']}`

                    {cfg['description']}

                    ---
                    """)

                    # Embed the moshi.server UI; the "allow" attribute grants
                    # the iframe microphone/camera access for WebRTC.
                    gr.HTML(f"""
                    <div class="iframe-container">
                        <iframe src="http://localhost:{cfg['port']}"
                                allow="microphone; camera"
                                title="{cfg['name']} Interface">
                        </iframe>
                    </div>
                    """)

                    gr.Markdown(f"""
                    ---
                    💡 **Tip**: Click inside the frame to interact with the Moshi interface.
                    The server is running on port {cfg['port']}.
                    """)

        # Usage notes (Japanese): pick a tab, use the mic, compare models.
        gr.Markdown("""
        ---
        ### 使い方

        1. **タブを選択**: 上のタブで使用したいモデルを選択
        2. **音声入力**: 各タブ内のMoshi UIでマイクボタンをクリック
        3. **比較**: タブを切り替えて異なるモデルの応答を比較

        ### システム情報

        - 両方のモデルが独立したプロセスで同時に実行中
        - 各モデルは専用のポートで動作(8998, 8999)
        - フルWebRTC対応の音声ストリーミング
        """)

    return demo
| 154 |
+
|
| 155 |
+
def main():
    """Main entry point.

    Launches one moshi.server child process per entry in MODELS, waits
    briefly for them to come up, then serves the tabbed Gradio interface
    on port 7860 (blocking). Child processes are terminated on exit.
    """
    logger.info("="*60)
    logger.info("Dual Moshi Model Interface Starting...")
    logger.info("="*60)

    # Start moshi.server instances in separate processes. daemon=True
    # ensures they are killed if this process dies unexpectedly.
    processes: List[multiprocessing.Process] = []

    for model in MODELS:
        logger.info(f"Launching process for {model['name']}...")
        process = multiprocessing.Process(
            target=start_moshi_server,
            args=(model['repo'], model['port'], model['name']),
            daemon=True
        )
        process.start()
        processes.append(process)
        logger.info(f"  Process started: PID {process.pid}")

    # Give the servers a head start before the UI embeds them in iframes.
    # NOTE(review): a fixed sleep is a heuristic — first-run model
    # download/load can take much longer; confirm acceptable in practice.
    logger.info("Waiting for moshi.server instances to initialize...")
    time.sleep(10)

    # Create and launch the main Gradio interface
    logger.info("Creating Gradio interface...")
    demo = create_gradio_interface()

    # Launch on a different port (main interface)
    main_port = 7860
    logger.info(f"Launching main interface on port {main_port}...")

    try:
        demo.launch(
            server_name="0.0.0.0",
            server_port=main_port,
            share=False
        )
    except KeyboardInterrupt:
        logger.info("Shutting down...")
    finally:
        # Always reap the child servers. Previously terminate() ran only
        # on KeyboardInterrupt, so any other exception from launch()
        # (e.g. the port already bound) left the children running.
        for process in processes:
            process.terminate()
        logger.info("All processes terminated")
| 199 |
+
|
| 200 |
+
# Script entry point (invoked by the Dockerfile's CMD ["python", "app.py"]).
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
moshi>=0.2.0,<=0.2.2
|
| 2 |
+
gradio>=4.0.0
|