# import os
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# from typing import List, Dict
# import logging

# # Set up logging to help us debug model loading and inference
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# class MedicalAssistant:
#     def __init__(self):
#         """Initialize the medical assistant with model and tokenizer"""
#         try:
#             logger.info("Starting model initialization...")
            
#             # Model configuration - adjust these based on your available compute
#             self.model_name = "mradermacher/Llama3-Med42-8B-GGUF"
#             self.max_length = 1048
#             self.device = "cuda" if torch.cuda.is_available() else "cpu"
            
#             logger.info(f"Using device: {self.device}")
            
#             # Load tokenizer first - this is typically faster and can catch issues early
#             logger.info("Loading tokenizer...")
#             self.tokenizer = AutoTokenizer.from_pretrained(
#                 self.model_name,
#                 padding_side="left",
#                 trust_remote_code=True
#             )
            
#             # Set padding token if not set
#             if self.tokenizer.pad_token is None:
#                 self.tokenizer.pad_token = self.tokenizer.eos_token
            
#             # Load model with memory optimizations
#             logger.info("Loading model...")
#             self.model = AutoModelForCausalLM.from_pretrained(
#                 self.model_name,
#                 torch_dtype=torch.float16,
#                 device_map="auto",
#                 load_in_8bit=True,
#                 trust_remote_code=True
#             )
            
#             logger.info("Model initialization completed successfully!")
            
#         except Exception as e:
#             logger.error(f"Error during initialization: {str(e)}")
#             raise

#     def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
#         """Generate a response to the user's message"""
#         try:
#             # Prepare the prompt
#             system_prompt = """You are a medical AI assistant. Respond to medical queries 
#             professionally and accurately. If you're unsure, always recommend consulting 
#             with a healthcare provider."""
            
#             # Combine system prompt, chat history, and current message
#             full_prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
            
#             # Tokenize input
#             inputs = self.tokenizer(
#                 full_prompt,
#                 return_tensors="pt",
#                 padding=True,
#                 truncation=True,
#                 max_length=self.max_length
#             ).to(self.device)
            
#             # Generate response
#             with torch.no_grad():
#                 outputs = self.model.generate(
#                     **inputs,
#                     max_new_tokens=512,
#                     do_sample=True,
#                     temperature=0.7,
#                     top_p=0.95,
#                     pad_token_id=self.tokenizer.pad_token_id,
#                     repetition_penalty=1.1
#                 )
            
#             # Decode and clean up response
#             response = self.tokenizer.decode(
#                 outputs[0],
#                 skip_special_tokens=True
#             )
            
#             # Extract just the assistant's response
#             response = response.split("Assistant:")[-1].strip()
            
#             return response
            
#         except Exception as e:
#             logger.error(f"Error during response generation: {str(e)}")
#             return f"I apologize, but I encountered an error. Please try again."

# # Initialize the assistant
# assistant = None

# def initialize_assistant():
#     """Initialize the assistant and handle any errors"""
#     global assistant
#     try:
#         assistant = MedicalAssistant()
#         return True
#     except Exception as e:
#         logger.error(f"Failed to initialize assistant: {str(e)}")
#         return False

# def chat_response(message: str, history: List[Dict]):
#     """Handle chat messages and return responses"""
#     global assistant
    
#     # Check if assistant is initialized
#     if assistant is None:
#         if not initialize_assistant():
#             return "I apologize, but I'm currently unavailable. Please try again later."
    
#     try:
#         return assistant.generate_response(message, history)
#     except Exception as e:
#         logger.error(f"Error in chat response: {str(e)}")
#         return "I encountered an error. Please try again."

# # Create Gradio interface
# demo = gr.ChatInterface(
#     fn=chat_response,
#     title="Medical Assistant (Test Version)",
#     description="""This is a test version of the medical assistant. 
#                    Please use it to verify basic functionality.""",
#     examples=[
#         "What are the symptoms of malaria?",
#         "How can I prevent type 2 diabetes?",
#         "What should I do for a mild headache?"
#     ],
#     # retry_btn=None,
#     # undo_btn=None,
#     # clear_btn="Clear"
# )

# # Launch the interface
# if __name__ == "__main__":
#     demo.launch()

import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from typing import List, Dict
import logging
import traceback

# Set up logging to help us track what's happening
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class MedicalAssistant:
    def __init__(self):
        """
        Initialize the medical assistant with the Llama3-Med42 model.
        This model is specifically trained on medical data and quantized to 4-bit precision
        for better memory efficiency while maintaining good performance.
        """
        try:
            logger.info("Starting model initialization...")
            
            # Updated model to use Llama3-Med42
            self.model_name = "emircanerol/Llama3-Med42-8B-4bit"
            self.max_length = 2048
            
            # Initialize the pipeline for simplified text generation
            # The pipeline handles tokenizer and model loading automatically
            logger.info("Initializing pipeline...")
            self.pipe = pipeline(
                "text-generation",
                model=self.model_name,
                token=os.getenv('HUGGING_FACE_TOKEN'),
                device_map="auto",
                torch_dtype=torch.float16,  # Use half precision for 4-bit model
                load_in_4bit=True          # Enable 4-bit quantization
            )
            
            # Load tokenizer separately for more control over text processing
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                token=os.getenv('HUGGING_FACE_TOKEN'),
                trust_remote_code=True
            )
            
            # Ensure proper padding token configuration
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                
            logger.info("Medical Assistant initialized successfully!")
            
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
        """
        Generate a response using the Llama3-Med42 pipeline.
        This method formats the conversation history and generates appropriate medical responses.
        """
        try:
            logger.info("Preparing message for generation")
            
            # Create a medical context-aware prompt
            system_prompt = """You are a medical AI assistant based on Llama3-Med42, 
            specifically trained on medical knowledge. Provide accurate, professional 
            medical guidance while acknowledging limitations. Always recommend 
            consulting healthcare providers for specific medical advice."""
            
            # Format the conversation for the model
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message}
            ]
            
            # Add chat history if available
            if chat_history:
                for chat in chat_history:
                    messages.append({
                        "role": "user" if chat["role"] == "user" else "assistant",
                        "content": chat["content"]
                    })
            
            logger.info("Generating response")
            # Generate response using the pipeline
            response = self.pipe(
                messages,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.1
            )[0]["generated_text"]
            
            # Clean up the response by extracting the last assistant message
            response = response.split("assistant:")[-1].strip()
            
            logger.info("Response generated successfully")
            return response
            
        except Exception as e:
            logger.error(f"Error during response generation: {str(e)}")
            logger.error(traceback.format_exc())
            return f"I apologize, but I encountered an error: {str(e)}"

# Initialize the assistant
assistant = None

def initialize_assistant():
    """Initialize the assistant with proper error handling"""
    global assistant
    try:
        logger.info("Attempting to initialize assistant")
        assistant = MedicalAssistant()
        logger.info("Assistant initialized successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to initialize assistant: {str(e)}")
        logger.error(traceback.format_exc())
        return False

def chat_response(message: str, history: List[Dict]):
    """Handle chat interactions with error recovery"""
    global assistant
    
    if assistant is None:
        logger.info("Assistant not initialized, attempting initialization")
        if not initialize_assistant():
            return "I apologize, but I'm currently unavailable. Please try again later."
    
    try:
        return assistant.generate_response(message, history)
    except Exception as e:
        logger.error(f"Error in chat response: {str(e)}")
        logger.error(traceback.format_exc())
        return f"I encountered an error: {str(e)}"

# Create the Gradio interface
demo = gr.ChatInterface(
    fn=chat_response,
    title="Medical Assistant (Llama3-Med42)",
    description="""This medical assistant is powered by NURSEOGE, 
                   a model specifically trained on medical knowledge. It provides 
                   guidance and information about health-related queries while 
                   maintaining professional medical standards.""",
    examples=[
        "What are the symptoms of malaria?",
        "How can I prevent type 2 diabetes?",
        "What should I do for a mild headache?"
    ]
)

# Launch the interface
if __name__ == "__main__":
    logger.info("Starting the application")
    demo.launch()