Will this work?
I tried other GGUFs but they don't work. Does this one work? I just want to do local translation.
This is a base model, so it won't work for chat or translation. If you want to do translation, I suggest you use the sarvam-translate model instead. If you are comfortable using llama.cpp, it can be significantly faster. Here's how I have set it up:
# Notes on the flags below (shell line continuations cannot carry trailing
# comments — a "\ # ..." sequence escapes the space and feeds the comment to
# the program as arguments — so the explanations live up here instead):
#   --port              pick whichever free port you like
#   --ctx-size 8192     although it is based on gemma3, the context size comes
#                       from the base sarvam model, so I'd suggest you don't exceed it
#   --batch-size /
#   --ubatch-size       tune depending on your use
#   --chat-template-file  it's best to use the model's own chat template
#   --path              in case you have a front-end set up
/home/path/to/llama.cpp/build/bin/llama-server \
  --model /path/to/your/model/sarvam-translate.q8_0.gguf \
  --host 0.0.0.0 \
  --port 9212 \
  --alias sarvam-translate \
  --threads-http 4 \
  --threads 5 \
  --ctx-size 8192 \
  --batch-size 4096 \
  --ubatch-size 4096 \
  --temp 0.01 \
  --repeat-penalty 1.1 \
  --flash-attn \
  --slots \
  --metrics \
  --jinja \
  --chat-template-file /path/to/your/model/sarvam-translate/chat_template.jinja \
  --path /path/to/sarvam-translate/frontend/
If you prefer Python, you can use this code instead. I have added streaming to it. You will still need a GGUF file.
from fastapi.responses import StreamingResponse, HTMLResponse
import os
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
import json

# NOTE(review): the "π" below looks like a mis-encoded emoji from the original
# paste; it is harmless but could be cleaned up.
print("π NEW Sarvam code loaded at runtime-Final!")

from llama_cpp import Llama

# Expect the GGUF weights to sit next to this script.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "sarvam-translate.q8_0.gguf")

# Load the model once at import time; these settings mirror the llama-server
# invocation above (8K context, 5 threads, 4096 batch).
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=8192,
    n_threads=5,
    n_batch=4096,
    verbose=True,
)

# Map ISO-style language codes (with and without the -IN region suffix) to the
# full language names the prompt template expects.
lang_map = {
    "en-IN": "English", "hi-IN": "Hindi", "gu-IN": "Gujarati", "mr-IN": "Marathi", "sa-IN": "Sanskrit",
    "en": "English", "hi": "Hindi", "gu": "Gujarati", "mr": "Marathi", "sa": "Sanskrit"
    # ...add more as needed
}
def build_prompt(text: str, src_lang: str, tgt_lang: str) -> str:
    """Build the chat-style prompt that asks the model to translate *text*.

    Args:
        text: The source text to translate.
        src_lang: Source language code (e.g. "en-IN", "hi").
        tgt_lang: Target language code.

    Returns:
        The fully formatted prompt string.
    """
    # Fix: an unknown source code previously fell back to "English" silently,
    # while an unknown target code passed through as-is. Both now pass through,
    # so the prompt never misstates the source language.
    source_language_full = lang_map.get(src_lang, src_lang)
    target_language_full = lang_map.get(tgt_lang, tgt_lang)
    # NOTE(review): <|begin_of_text|>/<|start_header_id|> are Llama-3-style
    # special tokens, but sarvam-translate is Gemma-3 based — confirm these
    # match the model's actual chat template.
    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
**Instruction:**
You are an expert multilingual translator. Your task is to translate the text provided in the 'Source Text' block from {source_language_full} to {target_language_full}.
Provide only the raw translated text as the output, without any extra commentary.
**Source Text ({source_language_full}):**
\"\"\"
{text}
\"\"\"<|eot_id|><|start_header_id|>model<|end_header_id|>
**Translated Text ({target_language_full}):**
\"\"\"
"""
    print(f"DEBUG: Using prompt:\n---\n{prompt}\n---")
    return prompt
def stream_translate(text: str, src_lang: str, tgt_lang: str):
    """Generator: yield translated-text fragments as the model produces them."""
    completion = llm.create_completion(
        prompt=build_prompt(text, src_lang, tgt_lang),
        # Slightly above zero: fully greedy decoding gave poor translations
        # through the Python bindings; adjust to taste.
        temperature=0.07,
        max_tokens=1024,
        # Stop once the closing quote block or end-of-turn token appears.
        stop=['"""', '<|eot_id|>'],
        # Mild repetition penalty to reduce gibberish loops.
        repeat_penalty=1.5,
        stream=True,
    )
    for piece in completion:
        yield piece["choices"][0]["text"]
# FastAPI application with fully open CORS (fine for a local tool; tighten
# allow_origins before exposing this beyond localhost).
app = FastAPI(title="Sarvam Translator API")
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
@app.post("/v1/translate")
async def translate(req: Request):
    """Stream a translation of the JSON body.

    Expects: {"text": ..., "source_language_code": ..., "target_language_code": ...}
    Returns: the translated text streamed as plain-text chunks, or a JSON
    error object with HTTP 400 if any key is missing.
    """
    data = await req.json()
    text = data.get("text")
    src = data.get("source_language_code")
    tgt = data.get("target_language_code")
    if not text or not tgt or not src:
        payload = json.dumps({"error": "Provide 'text', 'source_language_code', and 'target_language_code' keys"}) + "\n"
        # Fix: wrap the payload in a one-item iterator — a bare str handed to
        # StreamingResponse is streamed character-by-character — and signal the
        # client error with a 400 status instead of 200.
        return StreamingResponse(iter([payload]), media_type="application/json", status_code=400)

    def event_stream():
        # Stream as plain text (or as SSE if you prefer)
        for tok in stream_translate(text, src, tgt):
            yield tok

    return StreamingResponse(event_stream(), media_type="text/plain")
@app.get("/v1/health")
def health():
    """Liveness probe: always reports the service as up."""
    return {"status": "ok"}
@app.get("/", response_class=HTMLResponse)
async def serve_frontend():
    """Serve the bundled front-end page from ./frontend/index.html."""
    # Fix: read with an explicit UTF-8 encoding — the platform default
    # (e.g. cp1252 on Windows) can corrupt or reject non-ASCII HTML.
    with open(os.path.join(os.path.dirname(__file__), "frontend", "index.html"), encoding="utf-8") as f:
        return f.read()

print("Sarvam-Translate API is running")