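"""FastAPI voice-assistant server.

Pipeline (summarized from the code below): a client streams audio, text, and
images over a WebSocket; Whisper transcribes speech, SmolVLM2 generates an
optionally image-grounded reply token by token, and Kokoro synthesizes 24 kHz
speech with word-level timings that are streamed back as base64 PCM.
"""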
import asyncio
import json
import base64
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers import (
    AutoModelForImageTextToText,
    TextIteratorStreamer,
    GenerationConfig,
)
import numpy as np
import logging
import sys
import io
from PIL import Image
import time
import os
from datetime import datetime
from pathlib import Path
from threading import Thread
import re
from typing import Optional, Dict, Any
import uvicorn

# FastAPI imports
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from contextlib import asynccontextmanager

# Import Kokoro TTS library
from kokoro import KPipeline

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Add compatibility for Python < 3.10 where anext is not available
try:
    anext
except NameError:

    async def anext(iterator):
        """Get the next item from an async iterator, or raise StopAsyncIteration."""
        return await iterator.__anext__()

class ImageManager:
    """Manages image saving and verification"""

    def __init__(self, save_directory="received_images"):
        self.save_directory = Path(save_directory)
        self.save_directory.mkdir(exist_ok=True)
        logger.info(f"Image save directory: {self.save_directory.absolute()}")

    def save_image(self, image_data: bytes, client_id: str, prefix: str = "img") -> Optional[str]:
        """Save image data and return the file path, or None on failure"""
        try:
            # Create timestamp-based filename (trim microseconds to milliseconds)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
            filename = f"{prefix}_{client_id}_{timestamp}.jpg"
            filepath = self.save_directory / filename

            # Save the image
            with open(filepath, "wb") as f:
                f.write(image_data)

            # Log file info
            file_size = len(image_data)
            logger.info(f"💾 Saved image: {filename} ({file_size:,} bytes)")
            return str(filepath)
        except Exception as e:
            logger.error(f"❌ Error saving image: {e}")
            return None

    def verify_image(self, filepath: str) -> dict:
        """Verify saved image and return info"""
        try:
            if not os.path.exists(filepath):
                return {"error": "File not found"}

            # Get file stats
            stat = os.stat(filepath)
            file_size = stat.st_size

            # Try to open with PIL to verify it's a valid image
            with Image.open(filepath) as img:
                info = {
                    "filepath": filepath,
                    "file_size": file_size,
                    "format": img.format,
                    "mode": img.mode,
                    "size": img.size,
                    "width": img.width,
                    "height": img.height,
                    "valid": True,
                }
            logger.info(f"✅ Image verified: {info}")
            return info
        except Exception as e:
            logger.error(f"❌ Error verifying image {filepath}: {e}")
            return {"error": str(e), "valid": False}

class WhisperProcessor:
    """Handles speech-to-text using Whisper model"""

    _instance = None

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        logger.info(f"Using device for Whisper: {self.device}")

        # Load Whisper model
        model_id = "openai/whisper-tiny"
        logger.info(f"Loading {model_id}...")
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        self.model.to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)

        # Create pipeline
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )
        logger.info("Whisper model ready for transcription")
        self.transcription_count = 0

    async def transcribe_audio(self, audio_bytes):
        """Transcribe audio bytes to text"""
        try:
            # Convert 16-bit PCM bytes to a normalized float32 array; a raw
            # array is assumed to already be at Whisper's 16 kHz sampling rate
            audio_array = (
                np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
            )

            # Run transcription in an executor to avoid blocking the event loop
            result = await asyncio.get_event_loop().run_in_executor(
                None, lambda: self.pipe(audio_array)
            )
            transcribed_text = result["text"].strip()
            self.transcription_count += 1
            logger.info(
                f"Transcription #{self.transcription_count}: '{transcribed_text}'"
            )

            # Check for noise/empty transcription
            if not transcribed_text or len(transcribed_text) < 3:
                return "NO_SPEECH"

            # Check for common noise indicators (typical Whisper hallucinations on silence)
            noise_indicators = ["thank you", "thanks for watching", "you", ".", ""]
            if transcribed_text.lower().strip() in noise_indicators:
                return "NOISE_DETECTED"

            return transcribed_text
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return None

class SmolVLMProcessor:
    """Handles image + text processing using SmolVLM2 model"""

    _instance = None

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device for SmolVLM2: {self.device}")

        # Load SmolVLM2 model
        model_path = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
        logger.info(f"Loading {model_path}...")
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_path,
            torch_dtype=torch.float32,  # float32 for better CPU compatibility
            # attn_implementation="flash_attention_2",  # Disabled for CPU/Free Tier
            device_map="auto",
        )
        logger.info("SmolVLM2 model ready for multimodal generation")

        # Cache for most recent image
        self.last_image = None
        self.last_image_timestamp = 0
        self.lock = asyncio.Lock()

        # Message history management (tracked for future exchanges; note it is
        # not currently injected into the prompt built below)
        self.message_history = []
        self.max_history_messages = 4  # Keep last 4 exchanges

        # Counter
        self.generation_count = 0

    async def set_image(self, image_data):
        """Cache the most recent image received"""
        async with self.lock:
            try:
                # Convert image data to PIL Image
                image = Image.open(io.BytesIO(image_data))

                # Resize to 75% of original size for efficiency
                new_size = (int(image.size[0] * 0.75), int(image.size[1] * 0.75))
                image = image.resize(new_size, Image.Resampling.LANCZOS)

                # Clear message history when a new image is set
                self.message_history = []
                self.last_image = image
                self.last_image_timestamp = time.time()
                logger.info("Image cached successfully")
                return True
            except Exception as e:
                logger.error(f"Error processing image: {e}")
                return False

    async def process_text_with_image(self, text, initial_chunks=3):
        """Process text with image context using SmolVLM2"""
        async with self.lock:
            try:
                if not self.last_image:
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": text},
                            ],
                        },
                    ]
                else:
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "image", "url": self.last_image},
                                {"type": "text", "text": text},
                            ],
                        },
                    ]

                # Apply chat template; cast inputs to float32 to match the
                # dtype the model was loaded with above
                inputs = self.processor.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    return_tensors="pt",
                ).to(self.device, dtype=torch.float32)

                # Create a streamer for token-by-token generation
                streamer = TextIteratorStreamer(
                    tokenizer=self.processor.tokenizer,
                    skip_special_tokens=True,
                    skip_prompt=True,
                    clean_up_tokenization_spaces=False,
                )

                # Configure generation parameters
                generation_kwargs = dict(
                    **inputs,
                    do_sample=False,
                    max_new_tokens=1200,
                    streamer=streamer,
                )

                # Start generation in a separate thread
                thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
                thread.start()

                # Collect initial text until we have a complete sentence or enough content
                initial_text = ""
                min_chars = 50  # Minimum characters to collect for initial chunk
                sentence_end_pattern = re.compile(r"[.!?]")
                has_sentence_end = False
                initial_collection_stopped_early = False

                # Collect the first sentence or minimum character count
                for chunk in streamer:
                    initial_text += chunk
                    logger.info(f"Streaming chunk: '{chunk}'")

                    # Check if we have a sentence end
                    if sentence_end_pattern.search(chunk):
                        has_sentence_end = True
                        # If we have at least some content, break after the sentence end
                        if len(initial_text) >= min_chars / 2:
                            initial_collection_stopped_early = True
                            break

                    # If we have enough content, break
                    if len(initial_text) >= min_chars and (
                        has_sentence_end or "," in initial_text
                    ):
                        initial_collection_stopped_early = True
                        break

                    # Safety check: break if we've collected a lot of text without a sentence end
                    if len(initial_text) >= min_chars * 2:
                        initial_collection_stopped_early = True
                        break

                # Return initial text and the streamer for continued generation
                self.generation_count += 1
                logger.info(
                    f"SmolVLM2 initial generation: '{initial_text}' ({len(initial_text)} chars)"
                )

                # Store user message and initial response
                self.pending_user_message = text
                self.pending_response = initial_text

                return streamer, initial_text, initial_collection_stopped_early
            except Exception as e:
                logger.error(f"SmolVLM2 streaming generation error: {e}")
                return None, f"Error processing: {text}", False

    def update_history_with_complete_response(
        self, user_text, initial_response, remaining_text=None
    ):
        """Update message history with complete response, including any remaining text"""
        # Combine initial and remaining text if available
        complete_response = initial_response
        if remaining_text:
            complete_response = initial_response + remaining_text

        # Add to history for context in future exchanges
        self.message_history.append({"role": "user", "text": user_text})
        self.message_history.append({"role": "assistant", "text": complete_response})

        # Trim history to keep only recent messages
        if len(self.message_history) > self.max_history_messages:
            self.message_history = self.message_history[-self.max_history_messages:]

        logger.info(
            f"Updated message history with complete response ({len(complete_response)} chars)"
        )

class KokoroTTSProcessor:
    """Handles text-to-speech conversion using Kokoro model"""

    _instance = None

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        logger.info("Initializing Kokoro TTS processor...")
        try:
            # Initialize Kokoro TTS pipeline ("a" selects American English)
            self.pipeline = KPipeline(lang_code="a")

            # Set voice
            self.default_voice = "af_sarah"
            logger.info("Kokoro TTS processor initialized successfully")

            # Counter
            self.synthesis_count = 0
        except Exception as e:
            logger.error(f"Error initializing Kokoro TTS: {e}")
            self.pipeline = None

    async def synthesize_initial_speech_with_timing(self, text):
        """Convert initial text to speech and return (audio, word timings)"""
        if not text or not self.pipeline:
            return None, []

        try:
            logger.info(f"Synthesizing initial speech for text: '{text}'")
            audio_segments = []
            all_word_timings = []
            time_offset = 0  # Track cumulative time across segments

            # Run the TTS pipeline in a thread pool to avoid blocking;
            # no splitting for the initial text so it synthesizes faster
            generator = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: self.pipeline(
                    text,
                    voice=self.default_voice,
                    speed=1,
                    split_pattern=None,
                ),
            )

            # Process all generated segments and extract native timing
            for i, result in enumerate(generator):
                gs = result.graphemes  # str: the text graphemes (unused here)
                ps = result.phonemes  # str: the phonemes (unused here)
                audio = result.audio.cpu().numpy()  # numpy array
                tokens = result.tokens  # List[en.MToken]: token-level timing data

                logger.info(
                    f"Segment {i}: {len(tokens)} tokens, audio shape: {audio.shape}"
                )

                # Extract word timing from native tokens with null checks
                for token in tokens:
                    # Check if timing data is available
                    if token.start_ts is not None and token.end_ts is not None:
                        word_timing = {
                            "word": token.text,
                            "start_time": (token.start_ts + time_offset) * 1000,  # ms
                            "end_time": (token.end_ts + time_offset) * 1000,  # ms
                        }
                        all_word_timings.append(word_timing)
                        logger.debug(
                            f"Word: '{token.text}' Start: {word_timing['start_time']:.1f}ms End: {word_timing['end_time']:.1f}ms"
                        )
                    else:
                        # Log when timing data is missing
                        logger.debug(
                            f"Word: '{token.text}' - No timing data available (start_ts: {token.start_ts}, end_ts: {token.end_ts})"
                        )

                # Add audio segment
                audio_segments.append(audio)

                # Update time offset for the next segment (Kokoro outputs 24 kHz audio)
                if len(audio) > 0:
                    segment_duration = len(audio) / 24000  # seconds
                    time_offset += segment_duration

            # Combine all audio segments
            if audio_segments:
                combined_audio = np.concatenate(audio_segments)
                self.synthesis_count += 1
                logger.info(
                    f"✨ Initial speech synthesis complete: {len(combined_audio)} samples, {len(all_word_timings)} word timings"
                )
                return combined_audio, all_word_timings
            return None, []
        except Exception as e:
            logger.error(f"Initial speech synthesis with timing error: {e}")
            return None, []

    async def synthesize_remaining_speech_with_timing(self, text):
        """Convert remaining text to speech and return (audio, word timings)"""
        if not text or not self.pipeline:
            return None, []

        try:
            preview = f"{text[:50]}..." if len(text) > 50 else text
            logger.info(f"Synthesizing chunk speech for text: '{preview}'")

            audio_segments = []
            all_word_timings = []
            time_offset = 0  # Track cumulative time across segments

            # Determine an appropriate split pattern based on text length
            if len(text) < 100:
                split_pattern = None  # No splitting for very short chunks
            else:
                split_pattern = r"[.!?。!?]+"

            # Run the TTS pipeline in a thread pool to avoid blocking
            generator = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: self.pipeline(
                    text, voice=self.default_voice, speed=1, split_pattern=split_pattern
                ),
            )

            # Process all generated segments and extract native timing
            for i, result in enumerate(generator):
                gs = result.graphemes  # str (unused here)
                ps = result.phonemes  # str (unused here)
                audio = result.audio.cpu().numpy()  # numpy array
                tokens = result.tokens  # List[en.MToken]: token-level timing data

                logger.info(
                    f"Chunk segment {i}: {len(tokens)} tokens, audio shape: {audio.shape}"
                )

                # Extract word timing from native tokens with null checks
                for token in tokens:
                    # Check if timing data is available
                    if token.start_ts is not None and token.end_ts is not None:
                        word_timing = {
                            "word": token.text,
                            "start_time": (token.start_ts + time_offset) * 1000,  # ms
                            "end_time": (token.end_ts + time_offset) * 1000,  # ms
                        }
                        all_word_timings.append(word_timing)
                        logger.debug(
                            f"Chunk word: '{token.text}' Start: {word_timing['start_time']:.1f}ms End: {word_timing['end_time']:.1f}ms"
                        )
                    else:
                        # Log when timing data is missing
                        logger.debug(
                            f"Chunk word: '{token.text}' - No timing data available (start_ts: {token.start_ts}, end_ts: {token.end_ts})"
                        )

                # Add audio segment
                audio_segments.append(audio)

                # Update time offset for the next segment
                if len(audio) > 0:
                    segment_duration = len(audio) / 24000  # seconds
                    time_offset += segment_duration

            # Combine all audio segments
            if audio_segments:
                combined_audio = np.concatenate(audio_segments)
                self.synthesis_count += 1
                logger.info(
                    f"✨ Chunk speech synthesis complete: {len(combined_audio)} samples, {len(all_word_timings)} word timings"
                )
                return combined_audio, all_word_timings
            return None, []
        except Exception as e:
            logger.error(f"Chunk speech synthesis with timing error: {e}")
            return None, []

async def collect_remaining_text(streamer, chunk_size=80):
    """Collect remaining text from the streamer in smaller chunks.

    Args:
        streamer: The text streamer object
        chunk_size: Maximum characters per chunk before yielding

    Yields:
        Text chunks as they become available
    """
    current_chunk = ""

    if streamer:
        try:
            for chunk in streamer:
                current_chunk += chunk
                logger.info(f"Collecting remaining text chunk: '{chunk}'")

                # Check if we've reached a good breaking point (sentence end)
                if len(current_chunk) >= chunk_size and (
                    current_chunk.endswith(".")
                    or current_chunk.endswith("!")
                    or current_chunk.endswith("?")
                    or "." in current_chunk[-15:]
                ):
                    logger.info(f"Yielding text chunk of length {len(current_chunk)}")
                    yield current_chunk
                    current_chunk = ""

            # Yield any remaining text
            if current_chunk:
                logger.info(f"Yielding final text chunk of length {len(current_chunk)}")
                yield current_chunk
        except asyncio.CancelledError:
            # If there's text collected before cancellation, yield it
            if current_chunk:
                logger.info(
                    f"Yielding partial text chunk before cancellation: {len(current_chunk)} chars"
                )
                yield current_chunk
            raise

# Store active connections
class ConnectionManager:
    def __init__(self):
        self.active_connections: Dict[str, WebSocket] = {}
        # Track current processing tasks for each client
        self.current_tasks: Dict[str, Dict[str, asyncio.Task]] = {}
        # Add image manager
        self.image_manager = ImageManager()
        # Track statistics
        self.stats = {
            "audio_segments_received": 0,
            "images_received": 0,
            "audio_with_image_received": 0,
            "last_reset": datetime.now(),
        }

    async def connect(self, websocket: WebSocket, client_id: str):
        await websocket.accept()
        self.active_connections[client_id] = websocket
        self.current_tasks[client_id] = {"processing": None, "tts": None}
        logger.info(f"Client {client_id} connected")

    def disconnect(self, client_id: str):
        if client_id in self.active_connections:
            del self.active_connections[client_id]
        if client_id in self.current_tasks:
            del self.current_tasks[client_id]
        logger.info(f"Client {client_id} disconnected")

    async def cancel_current_tasks(self, client_id: str):
        """Cancel any ongoing processing tasks for a client"""
        if client_id in self.current_tasks:
            tasks = self.current_tasks[client_id]

            # Cancel processing task
            if tasks["processing"] and not tasks["processing"].done():
                logger.info(f"Cancelling processing task for client {client_id}")
                tasks["processing"].cancel()
                try:
                    await tasks["processing"]
                except asyncio.CancelledError:
                    pass

            # Cancel TTS task
            if tasks["tts"] and not tasks["tts"].done():
                logger.info(f"Cancelling TTS task for client {client_id}")
                tasks["tts"].cancel()
                try:
                    await tasks["tts"]
                except asyncio.CancelledError:
                    pass

            # Reset tasks
            self.current_tasks[client_id] = {"processing": None, "tts": None}

    def set_task(self, client_id: str, task_type: str, task: asyncio.Task):
        """Set a task for a client"""
        if client_id in self.current_tasks:
            self.current_tasks[client_id][task_type] = task

    def update_stats(self, event_type: str):
        """Update statistics"""
        if event_type in self.stats:
            self.stats[event_type] += 1

    def get_stats(self) -> dict:
        """Get current statistics"""
        uptime = datetime.now() - self.stats["last_reset"]
        return {
            **self.stats,
            "uptime_seconds": uptime.total_seconds(),
            "active_connections": len(self.active_connections),
        }


manager = ConnectionManager()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup
    lightweight_mode = os.getenv("LIGHTWEIGHT_MODE", "false").lower() == "true"
    if lightweight_mode:
        logger.warning("⚠️ LIGHTWEIGHT MODE ENABLED - AI models will NOT be loaded")
        logger.warning("⚠️ Server will run but AI features will be disabled")
    else:
        logger.info("Initializing models on startup...")
        try:
            # Initialize processors to load models
            whisper_processor = WhisperProcessor.get_instance()
            smolvlm_processor = SmolVLMProcessor.get_instance()
            tts_processor = KokoroTTSProcessor.get_instance()
            logger.info("All models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {e}")
            raise

    yield  # Server is running

    # Shutdown
    logger.info("Shutting down server...")
    # Close any remaining connections
    for client_id in list(manager.active_connections.keys()):
        try:
            await manager.active_connections[client_id].close()
        except Exception as e:
            logger.error(f"Error closing connection for {client_id}: {e}")
        manager.disconnect(client_id)
    logger.info("Server shutdown complete")

# Initialize FastAPI app
app = FastAPI(
    title="Whisper + SmolVLM2 Voice Assistant",
    description="Real-time voice assistant with speech recognition, image processing, and text-to-speech",
    version="1.0.0",
    lifespan=lifespan,
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure this appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# NOTE: the route decorators below were missing from the original listing;
# the paths are reasonable assumptions based on each handler's purpose.
@app.get("/")
async def root():
    """Root endpoint to check if server is running"""
    return {"status": "online", "message": "TalkMateAI Server is running"}


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy"}


@app.get("/stats")
async def get_stats():
    """Get server statistics"""
    return manager.get_stats()


@app.get("/images")
async def list_saved_images():
    """List all saved images"""
    try:
        images_dir = manager.image_manager.save_directory
        if not images_dir.exists():
            return {"images": [], "message": "No images directory found"}

        images = []
        for image_file in images_dir.glob("*.jpg"):
            stat = image_file.stat()
            images.append(
                {
                    "filename": image_file.name,
                    "path": str(image_file),
                    "size": stat.st_size,
                    "created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
                }
            )

        images.sort(key=lambda x: x["created"], reverse=True)  # Most recent first
        return {"images": images, "count": len(images)}
    except Exception as e:
        logger.error(f"Error listing images: {e}")
        return {"error": str(e)}

# NOTE: the websocket decorator was also missing; the path is an assumption.
@app.websocket("/ws/{client_id}")
async def websocket_endpoint(websocket: WebSocket, client_id: str):
    """WebSocket endpoint for real-time multimodal interaction"""
    await manager.connect(websocket, client_id)

    # Get instances of processors
    whisper_processor = WhisperProcessor.get_instance()
    smolvlm_processor = SmolVLMProcessor.get_instance()
    tts_processor = KokoroTTSProcessor.get_instance()

    try:
        # Send initial configuration confirmation
        await websocket.send_text(
            json.dumps({"status": "connected", "client_id": client_id})
        )

        async def send_keepalive():
            """Send periodic keepalive pings"""
            while True:
                try:
                    await websocket.send_text(
                        json.dumps({"type": "ping", "timestamp": time.time()})
                    )
                    await asyncio.sleep(10)  # Send ping every 10 seconds
                except Exception:
                    break
        async def process_text_message(text, image_data=None):
            """Process a text message from the user with optional image"""
            try:
                # Log what we received
                if image_data:
                    logger.info(
                        f"💬 Processing text+image: text='{text}', image={len(image_data)} bytes"
                    )
                    manager.update_stats("audio_with_image_received")

                    # Save the image for verification
                    saved_path = manager.image_manager.save_image(
                        image_data, client_id, "text_multimodal"
                    )
                    if saved_path:
                        verification = manager.image_manager.verify_image(saved_path)
                        if verification.get("valid"):
                            logger.info(
                                f"📸 Image verified successfully: {verification['size']} pixels"
                            )
                else:
                    logger.info(f"💬 Processing text-only: '{text}'")

                # Send interrupt signal
                logger.info("Sending interrupt signal")
                interrupt_message = json.dumps({"interrupt": True})
                await websocket.send_text(interrupt_message)

                # Set image if provided
                if image_data:
                    await smolvlm_processor.set_image(image_data)
                    logger.info("🖼️ Image set for multimodal processing")

                # Process text with SmolVLM2
                logger.info("Starting SmolVLM2 generation from text")
                streamer, initial_text, initial_collection_stopped_early = (
                    await smolvlm_processor.process_text_with_image(text)
                )
                logger.info(
                    f"SmolVLM2 initial text: '{initial_text[:50]}...' ({len(initial_text)} chars)"
                )

                # Check if VLM response indicates noise
                if initial_text.startswith("NOISE:"):
                    logger.info(
                        f"Noise detected in VLM processing: '{initial_text}'. Skipping TTS."
                    )
                    return

                # Generate TTS for initial text
                if initial_text:
                    logger.info("Starting TTS for initial text")
                    tts_task = asyncio.create_task(
                        tts_processor.synthesize_initial_speech_with_timing(initial_text)
                    )
                    manager.set_task(client_id, "tts", tts_task)

                    tts_result = await tts_task
                    if isinstance(tts_result, tuple) and len(tts_result) == 2:
                        initial_audio, initial_timings = tts_result
                    else:
                        initial_audio = tts_result
                        initial_timings = []

                    logger.info(
                        f"Initial TTS complete: {len(initial_audio) if initial_audio is not None else 0} samples"
                    )

                    if initial_audio is not None and len(initial_audio) > 0:
                        # Convert to base64 and send to client
                        audio_bytes = (initial_audio * 32767).astype(np.int16).tobytes()
                        base64_audio = base64.b64encode(audio_bytes).decode("utf-8")

                        audio_message = {
                            "audio": base64_audio,
                            "word_timings": initial_timings,
                            "sample_rate": 24000,
                            "method": "native_kokoro_timing",
                            "modality": "text_multimodal" if image_data else "text_only",
                        }
                        await websocket.send_text(json.dumps(audio_message))
                        logger.info(
                            f"✨ Initial audio sent to client with {len(initial_timings)} word timings [text message]"
                        )

                    # Process remaining text chunks if available
                    if initial_collection_stopped_early:
                        logger.info("Processing remaining text chunks")
                        collected_chunks = []
                        try:
                            text_iterator = collect_remaining_text(streamer)
                            while True:
                                try:
                                    text_chunk = await anext(text_iterator)
                                    logger.info(
                                        f"Processing text chunk: '{text_chunk[:30]}...'"
                                    )
                                    collected_chunks.append(text_chunk)

                                    # Generate TTS for this chunk
                                    chunk_tts_task = asyncio.create_task(
                                        tts_processor.synthesize_remaining_speech_with_timing(
                                            text_chunk
                                        )
                                    )
                                    manager.set_task(client_id, "tts", chunk_tts_task)

                                    chunk_tts_result = await chunk_tts_task
                                    if (
                                        isinstance(chunk_tts_result, tuple)
                                        and len(chunk_tts_result) == 2
                                    ):
                                        chunk_audio, chunk_timings = chunk_tts_result
                                    else:
                                        chunk_audio = chunk_tts_result
                                        chunk_timings = []

                                    if chunk_audio is not None and len(chunk_audio) > 0:
                                        audio_bytes = (
                                            (chunk_audio * 32767).astype(np.int16).tobytes()
                                        )
                                        base64_audio = base64.b64encode(audio_bytes).decode(
                                            "utf-8"
                                        )

                                        chunk_audio_message = {
                                            "audio": base64_audio,
                                            "word_timings": chunk_timings,
                                            "sample_rate": 24000,
                                            "method": "native_kokoro_timing",
                                            "chunk": True,
                                            "modality": (
                                                "text_multimodal"
                                                if image_data
                                                else "text_only"
                                            ),
                                        }
                                        await websocket.send_text(
                                            json.dumps(chunk_audio_message)
                                        )
                                        logger.info(
                                            "✨ Chunk audio sent to client [text message]"
                                        )
                                except StopAsyncIteration:
                                    logger.info("All text chunks processed")
                                    break
                                except asyncio.CancelledError:
                                    logger.info("Text chunk processing cancelled")
                                    raise

                            # Update history
                            if collected_chunks:
                                complete_remaining_text = "".join(collected_chunks)
                                smolvlm_processor.update_history_with_complete_response(
                                    text, initial_text, complete_remaining_text
                                )
                        except asyncio.CancelledError:
                            logger.info("Remaining text processing cancelled")
                            if collected_chunks:
                                partial_remaining_text = "".join(collected_chunks)
                                smolvlm_processor.update_history_with_complete_response(
                                    text, initial_text, partial_remaining_text
                                )
                            else:
                                smolvlm_processor.update_history_with_complete_response(
                                    text, initial_text
                                )
                            return
                    else:
                        smolvlm_processor.update_history_with_complete_response(
                            text, initial_text
                        )

                # Signal end of audio stream
                await websocket.send_text(json.dumps({"audio_complete": True}))
                logger.info("Text message processing complete")
            except asyncio.CancelledError:
                logger.info("Text message processing cancelled")
                raise
            except Exception as e:
                logger.error(f"Error processing text message: {e}")
                import traceback

                logger.error(f"Full traceback: {traceback.format_exc()}")
        async def process_audio_segment(audio_data, image_data=None):
            """Process a complete audio segment through the pipeline with optional image"""
            try:
                # Log what we received
                if image_data:
                    logger.info(
                        f"🎥 Processing audio+image segment: audio={len(audio_data)} bytes, image={len(image_data)} bytes"
                    )
                    manager.update_stats("audio_with_image_received")

                    # Save the image for verification
                    saved_path = manager.image_manager.save_image(
                        image_data, client_id, "multimodal"
                    )
                    if saved_path:
                        # Verify the saved image
                        verification = manager.image_manager.verify_image(saved_path)
                        if verification.get("valid"):
                            logger.info(
                                f"📸 Image verified successfully: {verification['size']} pixels"
                            )
                        else:
                            logger.warning(
                                f"⚠️ Image verification failed: {verification}"
                            )
                else:
                    logger.info(
                        f"🎤 Processing audio-only segment: {len(audio_data)} bytes"
                    )
                    manager.update_stats("audio_segments_received")

                # Send interrupt immediately since the frontend determined this is valid speech
                logger.info("Sending interrupt signal")
                interrupt_message = json.dumps({"interrupt": True})
                await websocket.send_text(interrupt_message)

                # Step 1: Transcribe audio with Whisper
                logger.info("Starting Whisper transcription")
                transcribed_text = await whisper_processor.transcribe_audio(audio_data)
                logger.info(f"Transcription result: '{transcribed_text}'")

                # Check if transcription indicates noise
                if transcribed_text in ["NOISE_DETECTED", "NO_SPEECH", None]:
                    logger.info(
                        f"Noise detected in transcription: '{transcribed_text}'. Skipping further processing."
                    )
                    return

                # Step 2: Set image if provided, then process text
                if image_data:
                    await smolvlm_processor.set_image(image_data)
                    logger.info("🖼️ Image set for multimodal processing")

                # Process transcribed text with image using SmolVLM2
                logger.info("Starting SmolVLM2 generation")
                streamer, initial_text, initial_collection_stopped_early = (
                    await smolvlm_processor.process_text_with_image(transcribed_text)
                )
                logger.info(
                    f"SmolVLM2 initial text: '{initial_text[:50]}...' ({len(initial_text)} chars)"
                )

                # Check if VLM response indicates noise
                if initial_text.startswith("NOISE:"):
                    logger.info(
                        f"Noise detected in VLM processing: '{initial_text}'. Skipping TTS."
                    )
                    return

                # Step 3: Generate TTS for initial text with native timing
                if initial_text:
                    logger.info("Starting TTS for initial text")
                    tts_task = asyncio.create_task(
                        tts_processor.synthesize_initial_speech_with_timing(initial_text)
                    )
                    manager.set_task(client_id, "tts", tts_task)

                    # Properly unpack the (audio, timings) tuple
                    tts_result = await tts_task
                    if isinstance(tts_result, tuple) and len(tts_result) == 2:
                        initial_audio, initial_timings = tts_result
                    else:
                        # Fallback for legacy method
                        initial_audio = tts_result
                        initial_timings = []
                        logger.warning(
                            "TTS returned single value instead of tuple - no timing data available"
                        )

                    logger.info(
                        f"Initial TTS complete: {len(initial_audio) if initial_audio is not None else 0} samples, {len(initial_timings)} word timings"
                    )

                    if initial_audio is not None and len(initial_audio) > 0:
                        # Convert to base64 and send to client with timing data
                        audio_bytes = (initial_audio * 32767).astype(np.int16).tobytes()
                        base64_audio = base64.b64encode(audio_bytes).decode("utf-8")

                        # Send audio with native timing information
                        audio_message = {
                            "audio": base64_audio,
                            "word_timings": initial_timings,  # Native Kokoro timing data
                            "sample_rate": 24000,
                            "method": "native_kokoro_timing",
                            "modality": "multimodal" if image_data else "audio_only",
                        }
                        await websocket.send_text(json.dumps(audio_message))
                        logger.info(
                            f"✨ Initial audio sent to client with {len(initial_timings)} NATIVE word timings [{audio_message['modality']}]"
                        )

                    # Step 4: Process remaining text chunks if available
                    if initial_collection_stopped_early:
                        logger.info("Processing remaining text chunks")
                        collected_chunks = []
                        try:
                            text_iterator = collect_remaining_text(streamer)
                            while True:
                                try:
                                    text_chunk = await anext(text_iterator)
                                    logger.info(
                                        f"Processing text chunk: '{text_chunk[:30]}...' ({len(text_chunk)} chars)"
                                    )
                                    collected_chunks.append(text_chunk)

                                    # Generate TTS for this chunk with native timing
                                    chunk_tts_task = asyncio.create_task(
                                        tts_processor.synthesize_remaining_speech_with_timing(
                                            text_chunk
                                        )
                                    )
                                    manager.set_task(client_id, "tts", chunk_tts_task)

                                    # Properly unpack the tuple for chunks too
                                    chunk_tts_result = await chunk_tts_task
                                    if (
                                        isinstance(chunk_tts_result, tuple)
                                        and len(chunk_tts_result) == 2
                                    ):
                                        chunk_audio, chunk_timings = chunk_tts_result
                                    else:
                                        # Fallback for legacy method
                                        chunk_audio = chunk_tts_result
                                        chunk_timings = []
                                        logger.warning(
                                            "Chunk TTS returned single value instead of tuple"
                                        )

                                    logger.info(
                                        f"Chunk TTS complete: {len(chunk_audio) if chunk_audio is not None else 0} samples, {len(chunk_timings)} word timings"
                                    )

                                    if chunk_audio is not None and len(chunk_audio) > 0:
                                        # Convert to base64 and send to client with timing data
                                        audio_bytes = (
                                            (chunk_audio * 32767).astype(np.int16).tobytes()
                                        )
                                        base64_audio = base64.b64encode(audio_bytes).decode(
                                            "utf-8"
                                        )

                                        # Send chunk audio with native timing information
                                        chunk_audio_message = {
                                            "audio": base64_audio,
                                            "word_timings": chunk_timings,  # Native Kokoro timing data
                                            "sample_rate": 24000,
                                            "method": "native_kokoro_timing",
                                            "chunk": True,
                                            "modality": (
                                                "multimodal" if image_data else "audio_only"
                                            ),
                                        }
                                        await websocket.send_text(
                                            json.dumps(chunk_audio_message)
                                        )
                                        logger.info(
                                            f"✨ Chunk audio sent to client with {len(chunk_timings)} NATIVE word timings [{chunk_audio_message['modality']}]"
                                        )
                                except StopAsyncIteration:
                                    logger.info("All text chunks processed")
                                    break
                                except asyncio.CancelledError:
                                    logger.info("Text chunk processing cancelled")
                                    raise

                            # Update history with complete response
                            if collected_chunks:
                                complete_remaining_text = "".join(collected_chunks)
                                smolvlm_processor.update_history_with_complete_response(
                                    transcribed_text,
                                    initial_text,
                                    complete_remaining_text,
                                )
                        except asyncio.CancelledError:
                            logger.info("Remaining text processing cancelled")
                            # Update history with partial response
                            if collected_chunks:
                                partial_remaining_text = "".join(collected_chunks)
                                smolvlm_processor.update_history_with_complete_response(
                                    transcribed_text,
                                    initial_text,
                                    partial_remaining_text,
                                )
                            else:
                                smolvlm_processor.update_history_with_complete_response(
                                    transcribed_text, initial_text
                                )
                            return
                    else:
                        # No remaining text, just update history with initial response
                        smolvlm_processor.update_history_with_complete_response(
                            transcribed_text, initial_text
                        )

                # Signal end of audio stream
                await websocket.send_text(json.dumps({"audio_complete": True}))
                logger.info("Audio processing complete")
            except asyncio.CancelledError:
                logger.info("Audio processing cancelled")
                raise
            except Exception as e:
                logger.error(f"Error processing audio segment: {e}")
                # Add more detailed error info
                import traceback

                logger.error(f"Full traceback: {traceback.format_exc()}")
        async def receive_and_process():
            """Receive and process messages from the client"""
            try:
                while True:
                    data = await websocket.receive_text()
                    try:
                        message = json.loads(data)

                        # Handle complete audio segments from the frontend
                        if "audio_segment" in message:
                            # Cancel any current processing
                            await manager.cancel_current_tasks(client_id)

                            # Decode audio data
                            audio_data = base64.b64decode(message["audio_segment"])

                            # Check if an image is also included
                            image_data = None
                            if "image" in message:
                                image_data = base64.b64decode(message["image"])
                                logger.info(
                                    f"Received audio+image: audio={len(audio_data)} bytes, image={len(image_data)} bytes"
                                )
                            else:
                                logger.info(
                                    f"Received audio-only: {len(audio_data)} bytes"
                                )

                            # Start processing the audio segment with optional image
                            processing_task = asyncio.create_task(
                                process_audio_segment(audio_data, image_data)
                            )
                            manager.set_task(client_id, "processing", processing_task)

                        # Handle text messages with optional image
                        elif "text_message" in message:
                            # Cancel any current processing
                            await manager.cancel_current_tasks(client_id)

                            text = message["text_message"]

                            # Check if an image is also included
                            image_data = None
                            if "image" in message:
                                image_data = base64.b64decode(message["image"])
                                logger.info(
                                    f"Received text+image: text='{text}', image={len(image_data)} bytes"
                                )
                            else:
                                logger.info(f"Received text-only: '{text}'")

                            # Start processing the text message with optional image
                            processing_task = asyncio.create_task(
                                process_text_message(text, image_data)
                            )
                            manager.set_task(client_id, "processing", processing_task)

                        # Handle standalone images (only if not currently processing)
                        elif "image" in message:
                            if not (
                                client_id in manager.current_tasks
                                and manager.current_tasks[client_id]["processing"]
                                and not manager.current_tasks[client_id]["processing"].done()
                            ):
                                image_data = base64.b64decode(message["image"])
                                manager.update_stats("images_received")

                                # Save standalone image
                                saved_path = manager.image_manager.save_image(
                                    image_data, client_id, "standalone"
                                )
                                if saved_path:
                                    verification = manager.image_manager.verify_image(
                                        saved_path
                                    )
                                    logger.info(
                                        f"📸 Standalone image saved and verified: {verification}"
                                    )

                                await smolvlm_processor.set_image(image_data)
                                logger.info("Image updated")

                        # Handle realtime input (for backward compatibility)
                        elif "realtime_input" in message:
                            for chunk in message["realtime_input"]["media_chunks"]:
                                if chunk["mime_type"] == "audio/pcm":
                                    # Treat as a complete audio segment
                                    await manager.cancel_current_tasks(client_id)
                                    audio_data = base64.b64decode(chunk["data"])
                                    processing_task = asyncio.create_task(
                                        process_audio_segment(audio_data)
                                    )
                                    manager.set_task(
                                        client_id, "processing", processing_task
                                    )
                                elif chunk["mime_type"] == "image/jpeg":
                                    # Only process the image if not currently processing audio
                                    if not (
                                        client_id in manager.current_tasks
                                        and manager.current_tasks[client_id]["processing"]
                                        and not manager.current_tasks[client_id][
                                            "processing"
                                        ].done()
                                    ):
                                        image_data = base64.b64decode(chunk["data"])
                                        manager.update_stats("images_received")

                                        # Save image from realtime input
                                        saved_path = manager.image_manager.save_image(
                                            image_data, client_id, "realtime"
                                        )
                                        if saved_path:
                                            verification = manager.image_manager.verify_image(
                                                saved_path
                                            )
                                            logger.info(
                                                f"📸 Realtime image saved and verified: {verification}"
                                            )

                                        await smolvlm_processor.set_image(image_data)
                    except json.JSONDecodeError as e:
                        logger.error(f"Error decoding JSON: {e}")
                        await websocket.send_text(
                            json.dumps({"error": "Invalid JSON format"})
                        )
                    except KeyError as e:
                        logger.error(f"Missing key in message: {e}")
                        await websocket.send_text(
                            json.dumps({"error": f"Missing required field: {e}"})
                        )
                    except Exception as e:
                        logger.error(f"Error processing message: {e}")
                        await websocket.send_text(
                            json.dumps({"error": f"Processing error: {str(e)}"})
                        )
            except WebSocketDisconnect:
                logger.info("WebSocket connection closed during receive loop")
        # Run tasks concurrently
        receive_task = asyncio.create_task(receive_and_process())
        keepalive_task = asyncio.create_task(send_keepalive())

        # Wait for any task to complete (usually due to disconnection or error)
        done, pending = await asyncio.wait(
            [receive_task, keepalive_task],
            return_when=asyncio.FIRST_COMPLETED,
        )

        # Cancel pending tasks
        for task in pending:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        # Log results of completed tasks
        for task in done:
            try:
                result = task.result()
            except Exception as e:
                logger.error(f"Task finished with error: {e}")
    except WebSocketDisconnect:
        logger.info(f"Client {client_id} disconnected normally")
    except Exception as e:
        logger.error(f"WebSocket session error for client {client_id}: {e}")
    finally:
        # Cleanup
        logger.info(f"Cleaning up resources for client {client_id}")
        await manager.cancel_current_tasks(client_id)
        manager.disconnect(client_id)

def main():
    """Main function to start the FastAPI server"""
    logger.info("Starting FastAPI Whisper + SmolVLM2 Voice Assistant server...")

    # Configure uvicorn
    config = uvicorn.Config(
        app=app,
        host="0.0.0.0",
        port=8000,
        log_level="info",
        access_log=True,
        ws_ping_interval=20,
        ws_ping_timeout=60,
        timeout_keep_alive=30,
    )
    server = uvicorn.Server(config)

    try:
        server.run()
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
    except Exception as e:
        logger.error(f"Server error: {e}")


if __name__ == "__main__":
    main()
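
# --- Example client (sketch) -------------------------------------------------
# A minimal sketch of how a client might talk to this server. It assumes the
# WebSocket route is mounted at /ws/{client_id} (see the assumed decorator
# above), a local server on port 8000, and 16 kHz 16-bit mono PCM input audio;
# the "websockets" package is an extra dependency not used by the server
# itself. The message keys match the handlers in receive_and_process().
#
#     import asyncio, base64, json
#     import websockets
#
#     async def demo():
#         async with websockets.connect("ws://localhost:8000/ws/demo") as ws:
#             pcm = open("utterance.pcm", "rb").read()  # hypothetical input file
#             await ws.send(json.dumps(
#                 {"audio_segment": base64.b64encode(pcm).decode("utf-8")}
#             ))
#             while True:
#                 reply = json.loads(await ws.recv())
#                 if "audio" in reply:  # base64 24 kHz PCM plus word_timings
#                     print(f"got {len(reply['word_timings'])} word timings")
#                 if reply.get("audio_complete"):
#                     break
#
#     asyncio.run(demo())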