import gradio as gr
from llama_cpp import Llama
|
|
# GGUF build fetched from the Hugging Face Hub; repo_id and filename must name
# the same variant. The Gemma 3n E2B (effective-2B) instruction-tuned build is
# small enough for a CPU-only Space.
repo_id = "unsloth/gemma-3n-E2B-it-GGUF"
filename = "gemma-3n-E2B-it-Q4_K_M.gguf"
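
# The larger E4B sibling should work as a drop-in swap if the Space has the
# memory for it (assumed repo: "unsloth/gemma-3n-E4B-it-GGUF" with the matching
# Q4_K_M file); expect noticeably slower CPU generation.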
|
|
print("Downloading the model... this can take 1-2 minutes.")
|
|
# Download the GGUF file from the Hub and load it into memory.
llm = Llama.from_pretrained(
    repo_id=repo_id,
    filename=filename,
    n_ctx=2048,  # context window, in tokens
    verbose=False,
)
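
# Optional tuning knobs (llama-cpp-python's Llama also accepts n_threads; the
# value below is only an assumption about a free Space's 2-vCPU quota):
#   llm = Llama.from_pretrained(repo_id=repo_id, filename=filename,
#                               n_ctx=2048, n_threads=2, verbose=False)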
|
|
def chat_function(message, history):
    # Wrap the latest user message in Gemma's chat template; `history` is not
    # used here, so see the multi-turn sketch after this function.
    formatted_prompt = f"<bos><start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"

    # Generate with streaming so tokens reach the UI as soon as they decode.
    response = llm(
        formatted_prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
        stream=True,
    )

    # Accumulate streamed tokens and yield the running text; Gradio re-renders
    # the partial reply on every yield.
    output_text = ""
    for chunk in response:
        token = chunk["choices"][0]["text"]
        output_text += token
        yield output_text
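
# A minimal multi-turn sketch that folds `history` back into the prompt,
# assuming tuple-style history ([(user_msg, bot_msg), ...]); newer Gradio
# releases may instead pass OpenAI-style message dicts:
#
#   formatted_prompt = "<bos>"
#   for user_msg, bot_msg in history:
#       formatted_prompt += (
#           f"<start_of_turn>user\n{user_msg}<end_of_turn>\n"
#           f"<start_of_turn>model\n{bot_msg}<end_of_turn>\n"
#       )
#   formatted_prompt += f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"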
|
|
# Gradio's built-in chat UI: `fn` takes (message, history), and because
# chat_function is a generator, the reply streams into the chat window.
demo = gr.ChatInterface(
    fn=chat_function,
    title="Gemma 3n Cloud Chat",
    description="Running on Hugging Face Spaces (CPU Only)",
    theme="soft",
)
|
|
if __name__ == "__main__":
    # Bind to all interfaces on port 7860, the port Hugging Face Spaces expects.
    demo.launch(server_name="0.0.0.0", server_port=7860)