import gradio as gr
from llama_cpp import Llama

# 1. Model details (the GGUF file is fetched directly from the Hugging Face Hub)
# The E2B ("2B") variant is used so that inference stays fast on a free Space.
# NOTE(review): the repository used to be the E4B one while `filename` names
# the E2B quantization, so the file lookup could not match. The repo below is
# aligned with the filename -- confirm the exact repository name on the Hub.
repo_id = "unsloth/gemma-4-E2B-it-GGUF"
filename = "gemma-4-E2B-it-Q4_K_M.gguf"

print("Model download ho raha hai... isme 1-2 minute lag sakte hain.")

# 2. Load the model
# Free Hugging Face Spaces have little RAM, so the parameters are kept small.
llm = Llama.from_pretrained(
    repo_id=repo_id,
    filename=filename,
    n_ctx=2048,  # Context window (prompt + generated tokens)
    verbose=False
)

# Tokens reserved for the reply; the prompt has to fit in whatever is left of
# the model's context window.
_MAX_NEW_TOKENS = 512


def _content_text(content):
    """Return the plain text carried by a history entry's content.

    A bare string is returned as-is. A list of content parts contributes the
    ``"text"`` value of every dict part that has one. Anything else (files,
    components, ``None``) yields an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_turns(history):
    """Normalize chat history into a flat list of ``(role, text)`` pairs.

    Two shapes are understood: dict entries with ``"role"``/``"content"``
    keys, and two-item ``[user_text, assistant_text]`` pairs. The assistant
    role is mapped to ``"model"``, the role name used in the prompt markup
    below. Entries without usable text are skipped.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = {"user": "user", "assistant": "model"}.get(entry.get("role"))
            text = _content_text(entry.get("content"))
            if role and text:
                turns.append((role, text))
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            for role, text in zip(("user", "model"), entry):
                if isinstance(text, str) and text:
                    turns.append((role, text))
    return turns


def _build_prompt(message, history, count_tokens, budget):
    """Render the conversation as a turn-marked prompt within ``budget`` tokens.

    The new user message is always included. Earlier turns are added from
    newest to oldest until the next one would exceed the budget, so the most
    recent context survives when the conversation gets long.
    """
    tail = f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    remaining = budget - count_tokens(tail)

    kept = []  # newest first
    for role, text in reversed(_history_turns(history)):
        segment = f"<start_of_turn>{role}\n{text}<end_of_turn>\n"
        remaining -= count_tokens(segment)
        if remaining < 0:
            break
        kept.append((role, segment))

    # Do not open the prompt with a reply whose question was trimmed away.
    while kept and kept[-1][0] != "user":
        kept.pop()

    kept.reverse()
    return "".join(segment for _, segment in kept) + tail


def chat_function(message, history):
    """Stream the model's reply to ``message``, taking ``history`` into account.

    Args:
        message: The user's new message text.
        history: Previous conversation turns as supplied by the chat UI
            (role/content dicts or ``[user, assistant]`` pairs). May be empty.

    Yields:
        The reply accumulated so far, growing with every generated chunk.
    """
    def count_tokens(text):
        return len(llm.tokenize(text.encode("utf-8"), add_bos=False, special=True))

    # No literal "<bos>" in the prompt: the completion call prepends the BOS
    # token itself when the model asks for one, so spelling it out as well
    # would start the prompt with two BOS tokens. One token of the budget is
    # left for that BOS.
    budget = llm.n_ctx() - _MAX_NEW_TOKENS - 1
    formatted_prompt = _build_prompt(message, history, count_tokens, budget)

    response = llm(
        formatted_prompt,
        max_tokens=_MAX_NEW_TOKENS,
        stop=["<end_of_turn>"],
        stream=True
    )

    output_text = ""
    for chunk in response:
        output_text += chunk["choices"][0]["text"]
        yield output_text

# 3. Gradio interface
# Chat UI wired to `chat_function`, which yields the reply as it grows.
demo = gr.ChatInterface(
    chat_function,
    theme="soft",
    description="Running on Hugging Face Spaces (CPU Only)",
    title="Gemma 4 Cloud Chat",
)

if __name__ == "__main__":
    # Listen on every network interface, on port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")