gemma4-4b / app.py
Aakash098's picture
Update app.py
843b3d1 verified
import gradio as gr
from llama_cpp import Llama
# 1. Model details (from_pretrained is given a Hub repo id plus a GGUF file name).
# The smaller E2B variant is used so generation stays fast on a free Space.
# Both identifiers are derived from one variant tag: previously the repo id
# named the E4B repository while the file name named the E2B model, so the
# requested file could not be found inside the requested repo.
# NOTE(review): confirm this repo/file pair exists on the Hugging Face Hub.
model_variant = "E2B"
repo_id = f"unsloth/gemma-4-{model_variant}-it-GGUF"
filename = f"gemma-4-{model_variant}-it-Q4_K_M.gguf"

print("Model download ho raha hai... isme 1-2 minute lag sakte hain.")

# 2. Load the model.
# Free Hugging Face Spaces have little RAM, so the parameters are kept small.
llm = Llama.from_pretrained(
    repo_id=repo_id,
    filename=filename,
    n_ctx=2048,  # Context window, in tokens
    verbose=False,
)
def _content_text(content):
    """Return the plain text carried by one chat-history content value.

    Accepts a plain string, or a list of content parts (dicts with a string
    ``"text"`` field). Anything else (``None``, file references, ...) yields
    an empty string so the caller can skip it.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def build_gemma_prompt(message, history=None):
    """Render the conversation so far plus *message* in Gemma's turn format.

    Args:
        message: The new user message.
        history: Earlier turns, either as ``[user, assistant]`` pairs or as
            ``{"role": ..., "content": ...}`` dicts. ``None`` or empty means
            a fresh conversation.

    Returns:
        The prompt string, ending with an open ``model`` turn for the reply.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            # Gemma calls the assistant side "model".
            role = "model" if role == "assistant" else role
            candidates = [(role, entry.get("content"))]
        else:
            candidates = list(zip(("user", "model"), entry))
        for role, content in candidates:
            text = _content_text(content)
            if role not in ("user", "model") or not text:
                continue
            if not turns and role == "model":
                # The conversation must open with a user turn (this can
                # happen after old history has been trimmed away).
                continue
            turns.append(f"<start_of_turn>{role}\n{text}<end_of_turn>\n")
    turns.append(
        f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    )
    # NOTE(review): llama-cpp-python may prepend its own BOS token when it
    # tokenizes the prompt; confirm the literal <bos> does not double it.
    return "<bos>" + "".join(turns)


def chat_function(message, history):
    """Stream the model's reply to *message*, taking *history* into account.

    Args:
        message: The new user message.
        history: Earlier turns of the conversation as passed by the chat UI.

    Yields:
        The reply accumulated so far, once per generated chunk.
    """
    max_new_tokens = 512
    # Leave room for the reply inside the model's context window.
    prompt_budget = llm.n_ctx() - max_new_tokens

    recent = list(history or [])
    formatted_prompt = build_gemma_prompt(message, recent)
    # Drop the oldest turns until the prompt fits; without this a long chat
    # would eventually exceed the context window.
    while recent and len(
        llm.tokenize(formatted_prompt.encode("utf-8"), add_bos=True, special=True)
    ) > prompt_budget:
        del recent[0]
        formatted_prompt = build_gemma_prompt(message, recent)

    response = llm(
        formatted_prompt,
        max_tokens=max_new_tokens,
        stop=["<end_of_turn>"],
        stream=True,
    )
    output_text = ""
    for chunk in response:
        output_text += chunk["choices"][0]["text"]
        yield output_text
# 3. Gradio chat UI built around chat_function.
demo = gr.ChatInterface(
    chat_function,
    theme="soft",
    title="Gemma 4 Cloud Chat",
    description="Running on Hugging Face Spaces (CPU Only)",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")