Will this work?
I tried other GGUFs but they don't work. Does this one work? I just want to do local translation.
This is a base model, so it won't work for chat or translation. If you want to do translation, I suggest you use the sarvam-translate model instead. If you are comfortable using llama.cpp, it can be significantly faster. Here's how I have set it up:
# Notes on the flags below (shell line continuations cannot carry trailing
# comments — a "\ # ..." sequence escapes the space and feeds the comment to
# the program as arguments — so the explanations live up here instead):
#   --port              pick whichever free port you like
#   --ctx-size 8192     although it is based on gemma3, the context size comes
#                       from the base sarvam model, so I'd suggest you don't exceed it
#   --batch-size /
#   --ubatch-size       tune depending on your use
#   --chat-template-file  it's best to use the model's own chat template
#   --path              in case you have a front-end set up
/home/path/to/llama.cpp/build/bin/llama-server \
  --model /path/to/your/model/sarvam-translate.q8_0.gguf \
  --host 0.0.0.0 \
  --port 9212 \
  --alias sarvam-translate \
  --threads-http 4 \
  --threads 5 \
  --ctx-size 8192 \
  --batch-size 4096 \
  --ubatch-size 4096 \
  --temp 0.01 \
  --repeat-penalty 1.1 \
  --flash-attn \
  --slots \
  --metrics \
  --jinja \
  --chat-template-file /path/to/your/model/sarvam-translate/chat_template.jinja \
  --path /path/to/sarvam-translate/frontend/
If you prefer Python, you can use this code instead. I have added streaming to it. You will still need a GGUF file.
from fastapi.responses import StreamingResponse, HTMLResponse
import os
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
import json

# NOTE(review): the "π" below looks like a mis-encoded emoji from the original
# paste; it is harmless but could be cleaned up.
print("π NEW Sarvam code loaded at runtime-Final!")

from llama_cpp import Llama

# Expect the GGUF weights to sit next to this script.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "sarvam-translate.q8_0.gguf")

# Load the model once at import time; these settings mirror the llama-server
# invocation above (8K context, 5 threads, 4096 batch).
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=8192,
    n_threads=5,
    n_batch=4096,
    verbose=True,
)

# Map ISO-style language codes (with and without the -IN region suffix) to the
# full language names the prompt template expects.
lang_map = {
    "en-IN": "English", "hi-IN": "Hindi", "gu-IN": "Gujarati", "mr-IN": "Marathi", "sa-IN": "Sanskrit",
    "en": "English", "hi": "Hindi", "gu": "Gujarati", "mr": "Marathi", "sa": "Sanskrit"
    # ...add more as needed
}
def build_prompt(text: str, src_lang: str, tgt_lang: str) -> str:
    """Build the chat-style prompt that asks the model to translate *text*.

    Args:
        text: The source text to translate.
        src_lang: Source language code (e.g. "en-IN", "hi").
        tgt_lang: Target language code.

    Returns:
        The fully formatted prompt string.
    """
    # Fix: an unknown source code previously fell back to "English" silently,
    # while an unknown target code passed through as-is. Both now pass through,
    # so the prompt never misstates the source language.
    source_language_full = lang_map.get(src_lang, src_lang)
    target_language_full = lang_map.get(tgt_lang, tgt_lang)
    # NOTE(review): <|begin_of_text|>/<|start_header_id|> are Llama-3-style
    # special tokens, but sarvam-translate is Gemma-3 based — confirm these
    # match the model's actual chat template.
    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
**Instruction:**
You are an expert multilingual translator. Your task is to translate the text provided in the 'Source Text' block from {source_language_full} to {target_language_full}.
Provide only the raw translated text as the output, without any extra commentary.
**Source Text ({source_language_full}):**
\"\"\"
{text}
\"\"\"<|eot_id|><|start_header_id|>model<|end_header_id|>
**Translated Text ({target_language_full}):**
\"\"\"
"""
    print(f"DEBUG: Using prompt:\n---\n{prompt}\n---")
    return prompt
def stream_translate(text: str, src_lang: str, tgt_lang: str):
    """Generator: yield translated-text fragments as the model produces them."""
    completion = llm.create_completion(
        prompt=build_prompt(text, src_lang, tgt_lang),
        # Slightly above zero: fully greedy decoding gave poor translations
        # through the Python bindings; adjust to taste.
        temperature=0.07,
        max_tokens=1024,
        # Stop once the closing quote block or end-of-turn token appears.
        stop=['"""', '<|eot_id|>'],
        # Mild repetition penalty to reduce gibberish loops.
        repeat_penalty=1.5,
        stream=True,
    )
    for piece in completion:
        yield piece["choices"][0]["text"]
# FastAPI application with fully open CORS (fine for a local tool; tighten
# allow_origins before exposing this beyond localhost).
app = FastAPI(title="Sarvam Translator API")
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
@app.post("/v1/translate")
async def translate(req: Request):
    """Stream a translation of the JSON body.

    Expects: {"text": ..., "source_language_code": ..., "target_language_code": ...}
    Returns: the translated text streamed as plain-text chunks, or a JSON
    error object with HTTP 400 if any key is missing.
    """
    data = await req.json()
    text = data.get("text")
    src = data.get("source_language_code")
    tgt = data.get("target_language_code")
    if not text or not tgt or not src:
        payload = json.dumps({"error": "Provide 'text', 'source_language_code', and 'target_language_code' keys"}) + "\n"
        # Fix: wrap the payload in a one-item iterator — a bare str handed to
        # StreamingResponse is streamed character-by-character — and signal the
        # client error with a 400 status instead of 200.
        return StreamingResponse(iter([payload]), media_type="application/json", status_code=400)

    def event_stream():
        # Stream as plain text (or as SSE if you prefer)
        for tok in stream_translate(text, src, tgt):
            yield tok

    return StreamingResponse(event_stream(), media_type="text/plain")
@app.get("/v1/health")
def health():
    """Liveness probe: always reports the service as up."""
    return {"status": "ok"}
@app.get("/", response_class=HTMLResponse)
async def serve_frontend():
    """Serve the bundled front-end page from ./frontend/index.html."""
    # Fix: read with an explicit UTF-8 encoding — the platform default
    # (e.g. cp1252 on Windows) can corrupt or reject non-ASCII HTML.
    with open(os.path.join(os.path.dirname(__file__), "frontend", "index.html"), encoding="utf-8") as f:
        return f.read()

print("Sarvam-Translate API is running")