import gradio as gr from llama_cpp import Llama # 1. Model Details (GGUF direct link handle karega) # Hum 2B version use kar rahe hain taaki Free Space par speed fast rahe repo_id = "unsloth/gemma-4-E4B-it-GGUF" filename = "gemma-4-E2B-it-Q4_K_M.gguf" print("Model download ho raha hai... isme 1-2 minute lag sakte hain.") # 2. Model Load karna # Hugging Face ki free space par RAM kam hoti hai, isliye parameters optimized hain llm = Llama.from_pretrained( repo_id=repo_id, filename=filename, n_ctx=2048, # Context window verbose=False ) def chat_function(message, history): # Gemma ka special prompt format formatted_prompt = f"user\n{message}\nmodel\n" response = llm( formatted_prompt, max_tokens=512, stop=[""], stream=True ) output_text = "" for chunk in response: token = chunk["choices"][0]["text"] output_text += token yield output_text # 3. Gradio Interface demo = gr.ChatInterface( fn=chat_function, title="Gemma 4 Cloud Chat", description="Running on Hugging Face Spaces (CPU Only)", theme="soft" ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)