import gradio as gr
from llama_cpp import Llama
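
# Assumed Space dependencies (a minimal requirements.txt guess, not from the source):
#   gradio
#   llama-cpp-python
#   huggingface_hub  # needed by Llama.from_pretrained for the download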

# 1. Model details (handles the GGUF direct download)
# We use the E2B variant so inference stays fast on a free Space
repo_id = "unsloth/gemma-4-E2B-it-GGUF"
filename = "gemma-4-E2B-it-Q4_K_M.gguf"

print("Downloading the model... this can take 1-2 minutes.")

# 2. Load the model
# Free Hugging Face Spaces have limited RAM, so the parameters are kept conservative
llm = Llama.from_pretrained(
    repo_id=repo_id,
    filename=filename,
    n_ctx=2048,  # Context window
    verbose=False,
)
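
# Optional CPU tuning (a sketch, untested here): llama.cpp picks a thread count
# automatically, but it can be pinned explicitly with the n_threads parameter,
# which from_pretrained forwards to Llama.__init__, e.g.:
#
#   llm = Llama.from_pretrained(
#       repo_id=repo_id,
#       filename=filename,
#       n_ctx=2048,
#       n_threads=2,  # free CPU Spaces expose 2 vCPUs
#       verbose=False,
#   )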

def chat_function(message, history):
    # Gemma's turn-based prompt format (history is ignored, so each turn is independent)
    formatted_prompt = f"<bos><start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    response = llm(
        formatted_prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
        stream=True,
    )
    output_text = ""
    for chunk in response:
        token = chunk["choices"][0]["text"]
        output_text += token
        yield output_text  # Yield the growing text so Gradio streams it token by token
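
# Alternative sketch (not wired into the UI): llama-cpp-python's
# create_chat_completion applies the chat template bundled with the GGUF and
# lets us replay Gradio's history for real multi-turn context. Assumes the
# classic tuple-style history; chat_function_with_history is a hypothetical name.
def chat_function_with_history(message, history):
    messages = []
    for user_msg, bot_msg in history:  # Gradio passes (user, assistant) pairs
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    output_text = ""
    # Streamed chunks follow the OpenAI-style delta schema
    for chunk in llm.create_chat_completion(messages=messages, max_tokens=512, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            output_text += delta["content"]
            yield output_text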

# 3. Gradio interface
demo = gr.ChatInterface(
    fn=chat_function,
    title="Gemma 4 Cloud Chat",
    description="Running on Hugging Face Spaces (CPU Only)",
    theme="soft",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)