yutoAb commited on
Commit
4ee8d5c
·
1 Parent(s): b2eed5f

Add application file

Browse files
Files changed (4) hide show
  1. Dockerfile +39 -0
  2. README.md +59 -1
  3. app.py +201 -0
  4. requirements.txt +2 -0
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.11-slim

WORKDIR /app

# gcc & friends are required for Triton/Inductor JIT compilation
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies first so the (slow) install layers are cached and only
# invalidated when requirements.txt changes — not on every app.py edit.
COPY requirements.txt .
RUN pip install --no-cache-dir uv
RUN uv pip install --system --no-cache-dir -r requirements.txt

# Application code last: editing it only rebuilds this cheap layer.
COPY app.py .

# Stability: keep BLAS/OpenMP single-threaded
ENV OMP_NUM_THREADS=1
ENV MKL_NUM_THREADS=1
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Make Python logs appear immediately instead of being block-buffered
ENV PYTHONUNBUFFERED=1

# Lighter startup: disable the torch.compile machinery
ENV TORCHDYNAMO_DISABLE=1
ENV TORCHINDUCTOR_DISABLE=1

ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Expose ports for:
# - 7860: Main Gradio interface
# - 8998: Moshi server 1 (finetuned-step-9282)
# - 8999: Moshi server 2 (j-moshi)
EXPOSE 7860 8998 8999

CMD ["python", "app.py"]
README.md CHANGED
@@ -6,6 +6,64 @@ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
 
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
9
+ short_description: Compare two Moshi models with tabbed interface
10
+ app_port: 7860
11
  ---
12
 
13
+ # 🎙️ Dual Moshi Model Interface
14
+
15
+ Compare two Moshi voice AI models side-by-side with a tabbed interface.
16
+ Each model runs with full WebRTC support for real-time voice conversation.
17
+
18
+ ## Available Models
19
+
20
+ ### 1. Finetuned Step 9282
21
+ - **Repository**: `abePclWaseda/moshi-finetuned-step-9282`
22
+ - Fine-tuned Moshi model at training step 9282
23
+ - Optimized for specific use cases
24
+ - **Port**: 8998
25
+
26
+ ### 2. J-Moshi (Japanese)
27
+ - **Repository**: `nu-dialogue/j-moshi`
28
+ - Japanese-optimized full-duplex spoken dialogue system
29
+ - Built on Moshi 7B with additional Japanese training data
30
+ - Supports natural turn-taking and backchannel responses (相槌)
31
+ - **Port**: 8999
32
+
33
+ ## Features
34
+
35
+ - **Tabbed Interface**: Switch between models using tabs
36
+ - **Full WebRTC Support**: Complete `moshi.server` UI for each model
37
+ - **Dual Model Execution**: Both models run simultaneously
38
+ - **Real-time Voice**: Full-duplex conversation with microphone input
39
+ - **GPU Optimized**: Designed for 48GB+ GPU environments
40
+
41
+ ## Architecture
42
+
43
+ The application runs:
44
+ 1. **Main Gradio Interface** (Port 7860) - Tabbed UI for model selection
45
+ 2. **Moshi Server 1** (Port 8998) - Finetuned Step 9282 model
46
+ 3. **Moshi Server 2** (Port 8999) - J-Moshi model
47
+
48
+ Each tab embeds the complete `moshi.server` interface with WebRTC support.
49
+
50
+ ## Requirements
51
+
52
+ - **GPU**: 48 GB+ of VRAM (e.g., NVIDIA A100 80GB or RTX A6000) recommended
53
+ - **Memory**: ~48GB GPU VRAM (24GB per model running simultaneously)
54
+ - **Docker**: Containerized deployment
55
+
56
+ ## Usage
57
+
58
+ 1. Open the application (port 7860)
59
+ 2. Click on a tab to select which model to use
60
+ 3. Click inside the embedded interface to interact with Moshi
61
+ 4. Use the microphone button to start voice conversation
62
+ 5. Switch tabs to compare different models
63
+
64
+ ## Technical Details
65
+
66
+ - **Framework**: Gradio + moshi.server
67
+ - **Models**: Moshi (7B parameters each)
68
+ - **Codec**: MIMI audio codec
69
+ - **Ports**: 7860 (main), 8998, 8999 (model servers)
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dual Moshi Model Interface
4
+ Runs two moshi.server instances and provides a tabbed interface for comparison
5
+ """
6
+
7
+ import logging
8
+ import multiprocessing
9
+ import os
10
+ import subprocess
11
+ import sys
12
+ import time
13
+ from typing import List
14
+
15
+ import gradio as gr
16
+
17
# Configure logging: INFO level with a timestamp/name/level prefix, shared by
# this module and the worker-process server wrappers. PYTHONUNBUFFERED=1 in
# the Dockerfile ensures these lines flush promptly in container output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# Model configurations: one entry per moshi.server instance to launch.
# Each dict supplies the Hugging Face repository to load (--hf-repo), the
# local TCP port the server binds to, and display text for its Gradio tab.
MODELS = [
    {
        "name": "Finetuned Step 9282",
        "repo": "abePclWaseda/moshi-finetuned-step-9282",
        "port": 8998,  # must stay in sync with the Dockerfile EXPOSE list
        "description": "Fine-tuned Moshi model at training step 9282"
    },
    {
        "name": "J-Moshi (Japanese)",
        "repo": "nu-dialogue/j-moshi",
        "port": 8999,  # must stay in sync with the Dockerfile EXPOSE list
        "description": "Japanese-optimized full-duplex dialogue system"
    }
]
40
+
41
+
42
def start_moshi_server(model_repo: str, port: int, model_name: str):
    """Launch one moshi.server subprocess and stream its output to our logger.

    Blocks until the server process exits, so it is meant to run inside a
    dedicated worker process (see main()).

    Args:
        model_repo: Hugging Face repository id passed via --hf-repo.
        port: TCP port the server binds to on 0.0.0.0.
        model_name: Human-readable label used as a log prefix.

    Raises:
        Exception: re-raises whatever prevented the subprocess from running.
    """
    logger.info(f"Starting {model_name} on port {port}...")
    logger.info(f"  Repository: {model_repo}")

    try:
        launch_cmd = [
            "uv", "run", "-m", "moshi.server",
            "--hf-repo", model_repo,
            "--host", "0.0.0.0",
            "--port", str(port),
        ]
        logger.info(f"  Command: {' '.join(launch_cmd)}")

        # Merge stderr into stdout and read line-buffered text so the child's
        # output interleaves cleanly with this process's logging.
        server_proc = subprocess.Popen(
            launch_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )

        for raw_line in server_proc.stdout:
            logger.info(f"  [{model_name}:{port}] {raw_line.rstrip()}")

        exit_code = server_proc.wait()
        if exit_code != 0:
            logger.error(f"{model_name} exited with code {exit_code}")

    except Exception as err:
        logger.error(f"Failed to start {model_name}: {err}")
        raise
78
+
79
+
80
def create_gradio_interface() -> gr.Blocks:
    """Build the top-level Gradio app: one tab per entry in MODELS.

    Each tab embeds the corresponding moshi.server front-end (its full
    WebRTC microphone UI) in an <iframe> pointing at the model's port.

    NOTE(review): the iframe src uses http://localhost:<port>, which resolves
    on the *viewer's* machine — confirm this works in the target deployment
    (e.g. behind a proxy) rather than only for same-host access.
    """
    frame_css = """
    .iframe-container {
        width: 100%;
        height: 800px;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        overflow: hidden;
    }
    .iframe-container iframe {
        width: 100%;
        height: 100%;
        border: none;
    }
    """

    with gr.Blocks(
        title="Dual Moshi Model Interface",
        theme=gr.themes.Soft(),
        css=frame_css,
    ) as ui:

        gr.Markdown("""
        # 🎙️ Dual Moshi Model Interface

        Compare two Moshi voice AI models side-by-side. Each tab shows the full WebRTC interface
        for a different model. Switch tabs to compare responses.
        """)

        with gr.Tabs():
            for spec in MODELS:
                with gr.TabItem(f"🎤 {spec['name']}"):
                    gr.Markdown(f"""
                    ### {spec['name']}
                    **Repository**: `{spec['repo']}`

                    {spec['description']}

                    ---
                    """)

                    # Embed the full moshi.server UI rather than re-implement
                    # its WebRTC audio handling here.
                    gr.HTML(f"""
                    <div class="iframe-container">
                        <iframe src="http://localhost:{spec['port']}"
                                allow="microphone; camera"
                                title="{spec['name']} Interface">
                        </iframe>
                    </div>
                    """)

                    gr.Markdown(f"""
                    ---
                    💡 **Tip**: Click inside the frame to interact with the Moshi interface.
                    The server is running on port {spec['port']}.
                    """)

        gr.Markdown("""
        ---
        ### 使い方

        1. **タブを選択**: 上のタブで使用したいモデルを選択
        2. **音声入力**: 各タブ内のMoshi UIでマイクボタンをクリック
        3. **比較**: タブを切り替えて異なるモデルの応答を比較

        ### システム情報

        - 両方のモデルが独立したプロセスで同時に実行中
        - 各モデルは専用のポートで動作(8998, 8999)
        - フルWebRTC対応の音声ストリーミング
        """)

    return ui
153
+
154
+
155
def main():
    """Main entry point.

    Spawns one daemon worker process per configured model (each runs
    moshi.server via start_moshi_server), waits briefly for them to come up,
    then serves the tabbed Gradio UI on port 7860. Worker processes are
    terminated and joined on *any* exit path from launch(), not only Ctrl-C.
    """
    logger.info("="*60)
    logger.info("Dual Moshi Model Interface Starting...")
    logger.info("="*60)

    # Start moshi.server instances in separate processes
    processes: List[multiprocessing.Process] = []

    for model in MODELS:
        logger.info(f"Launching process for {model['name']}...")
        process = multiprocessing.Process(
            target=start_moshi_server,
            args=(model['repo'], model['port'], model['name']),
            daemon=True
        )
        process.start()
        processes.append(process)
        logger.info(f"  Process started: PID {process.pid}")

    # Give the servers a head start before exposing the UI.
    # NOTE(review): a fixed sleep is a heuristic — model download/load can
    # take much longer; the iframe simply shows a connection error until the
    # corresponding server is actually listening.
    logger.info("Waiting for moshi.server instances to initialize...")
    time.sleep(10)

    # Create and launch the main Gradio interface
    logger.info("Creating Gradio interface...")
    demo = create_gradio_interface()

    main_port = 7860
    logger.info(f"Launching main interface on port {main_port}...")

    try:
        demo.launch(
            server_name="0.0.0.0",
            server_port=main_port,
            share=False
        )
    except KeyboardInterrupt:
        logger.info("Shutting down...")
    finally:
        # Reap the children on ANY exit (clean return, Ctrl-C, or an
        # unexpected launch() failure). The original only cleaned up on
        # KeyboardInterrupt, leaving the workers un-reaped otherwise.
        for process in processes:
            if process.is_alive():
                process.terminate()
            process.join(timeout=5)
        logger.info("All processes terminated")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ moshi>=0.2.0,<=0.2.2
2
+ gradio>=4.0.0