Fine-tuned Orpheus-3B: German Emotional Speech Synthesis
This repository contains a fine-tuned implementation of the Orpheus-3B model, specialized for German speech synthesis with advanced support for emotional cues and non-verbal audio tokens.
The model was fine-tuned using LoRA (Low-Rank Adaptation) on a curated German dataset containing high-quality audio with diverse emotional expressions and non-verbal cues.
🚀 Key Highlights
- 54.2% WER Improvement: Reduced Word Error Rate on emotional prompts from 0.7046 (Base) to 0.3226 (Fine-tuned).
- 37.1% CER Improvement: Reduced Character Error Rate from 0.5471 (Base) to 0.3440 (Fine-tuned).
- Architecture: Orpheus-3B (Merged 4-bit Standalone).
🎭 Supported Tags
The model has been fine-tuned on Dataset_eleven_v3 and supports a wide range of emotional and paralinguistic tags. Use square brackets [tag] for inference:
- Emotions:
[happy], [angry], [sad], [thoughtful], [neutral], [sleepy], [whisper], [worried], [annoyed], [surprised], [fearful], [contemptuous], [disgusted]
- Paralinguistic Tokens:
[sighs], [laughter], [cry], [growl], [sob], [cheer], [breath], [pause], [grit], [snarl], [exhales sharply], [grits teeth], [breathes heavily], [exclaims], [hush], [soft], [quiet], [softbreath], [hm], [yawn], [mumble], [slowbreath], [ugh], [ew], [scoff], [snort], [tremble], [shaky_breath], [sigh], [nervous_laugh], [chuckles], [short pause], [sniffles], [inhales deeply]
🏋️ Training Details
The model was trained using the following optimal parameters:
- Learning Rate: 0.0008
- LoRA Rank (R): 32
- LoRA Alpha: 32
- Precision: 4-bit (bitsandbytes/unsloth)
- Framework: Unsloth 2024.12
🔊 Inference Example
This is a standalone merged model. You can load it directly without needing separate adapters.
"""
Script for Orpheus TTS.
Uses meaningful, emotion-specific sentences and authentic tags.
Aligned with training format: {Emotion}: [tag] {text}
"""
import os
import re
import torch
import soundfile as sf
from unsloth import FastLanguageModel
from snac import SNAC

# ============= CONFIGURATION =============
# HuggingFace Hub id of the merged model (LoRA adapters already baked in).
MODEL_NAME = "Vishalshendge3198/orpheus-3b-tts-german-emotional-merged"
# All generated .wav samples are written into this directory.
OUTPUT_DIR = "generated_samples"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Token IDs from Orpheus config.
# The special control tokens are laid out immediately past the base
# tokenizer vocabulary (TOKENISER_LENGTH), as offsets from it.
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
# Framing tokens for the human text turn and the AI speech turn.
START_OF_SPEECH = TOKENISER_LENGTH + 1
END_OF_SPEECH = TOKENISER_LENGTH + 2
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6
# First id of the audio-code range: generated ids >= this value are
# treated as SNAC codes (see the extraction loop in verify()).
AUDIO_TOKENS_START = TOKENISER_LENGTH + 10
# Premium meaningful sentences (German)
# Format: "Emotion": "Text INCLUDING internal tags"
# Each value embeds the leading emotion tag plus inline paralinguistic tags
# ([laughing], [sighs], ...) drawn from the tag vocabulary the model was
# fine-tuned on. The dict key becomes the "{Emotion}: " prompt prefix.
PROMPTS = {
    "Happy": "[happy][laughing] Das ist ja der absolute Wahnsinn, ich freu mich so für uns! [chuckles]",
    "Angry": "[angry][growl] Ich hab dir doch gesagt, dass das so nicht funktioniert, hör mir doch mal zu! [grit]",
    "Sad": "[sad][sighs] Es tut mir leid, aber ich kann heute einfach nicht mehr… alles ist so schwer. [sniffles]",
    "Thoughtful": "[thoughtful][sighs] Vielleicht gibt es ja doch noch einen anderen Weg, wer weiß… wir sollten darüber nachdenken. [pause]",
    "Neutral": "[neutral][breath] Wir müssen die Ergebnisse erst noch einmal in Ruhe prüfen, bevor wir entscheiden. [pause]",
    "Sleepy": "[sleepy][yawn] Es ist schon so spät, ich glaube, ich gehe jetzt wirklich schlafen. [mumble]",
    "Whisper": "[whisper][hush] Sei bitte ganz leise, damit uns hier niemand im Dunkeln hören kann. [softbreath]",
    "Worried": "[worried][tremble] Ich hab ein ganz ungutes Gefühl bei der Sache, hoffentlich geht am Ende alles gut. [sigh]",
    "Annoyed": "[annoyed][scoff] Musst du das wirklich jetzt schon wieder zur Sprache bringen? Das nervt langsam echt. [exhales]",
    "Surprised": "[surprised][exclaims] Oh mein Gott, damit hätte ich ja nun wirklich niemals im Leben gerechnet! [breath]",
    "Fearful": "[fearful][tremble] Bitte, lass mich einfach in Ruhe, ich will keinen Ärger haben! [shaky_breath]",
    "Contemptuous": "[contemptuous][scoff] Glaubst du wirklich, dass mich deine Meinung auch nur im Geringsten interessiert? [tsk]",
    "Disgusted": "[disgusted][ew] Igitt, das riecht ja schrecklich, wie kann man sowas denn nur essen? [ugh]"
}
# ============= UTILS =============
def normalize_german_text(text):
    """
    Lowercase and normalize German text for tokenization.

    Keeps German letters (a-z, ä, ö, ü, ß), digits, whitespace, basic
    punctuation (. , ! ? -) and the <>[] characters used by emotion /
    paralinguistic tags; every other character — including ':' — is
    stripped. Digits are spelled out in German via num2words when that
    package is available; otherwise they are left as-is. Runs of
    whitespace collapse to single spaces and the result is stripped.
    """
    text = text.lower()
    text = re.sub(r'[^a-zäöüß\s\d.,!?\-<>\[\]]', '', text)
    try:
        # Optional dependency: spell standalone numbers out ("5" -> "fünf").
        from num2words import num2words
        text = re.sub(r'\b\d+\b', lambda m: num2words(int(m.group()), lang='de'), text)
    except ImportError:
        # num2words not installed — deliberate best-effort fallback to raw
        # digits. (Was a bare `except`, which also swallowed KeyboardInterrupt
        # and real conversion bugs.)
        pass
    return re.sub(r'\s+', ' ', text).strip()
def remove_duplicate_frames(codes_list):
    """
    Drop consecutive 7-code frames whose leading code repeats the
    previous kept frame's leading code.

    Input must be a flat list whose length is a multiple of 7 (one SNAC
    frame = 7 codes); otherwise it is returned unchanged. The first
    frame is always kept.
    """
    if len(codes_list) % 7 != 0:
        return codes_list
    kept = codes_list[:7]
    for start in range(7, len(codes_list), 7):
        # kept[-7] is the leading code of the most recently kept frame.
        if codes_list[start] != kept[-7]:
            kept.extend(codes_list[start:start + 7])
    return kept
def redistribute_codes(code_list, snac_model):
    """
    Re-interleave flat 7-token Orpheus frames into SNAC's three codebook
    layers and decode them to audio.

    Within each 7-code frame, positions map to layers as
    L1: [0], L2: [1, 4], L3: [2, 3, 5, 6]; every code is reduced
    modulo 4096 (the per-codebook size). Returns whatever
    snac_model.decode yields for the three (1, T) index tensors.
    """
    coarse, mid, fine = [], [], []
    for frame_start in range(0, 7 * (len(code_list) // 7), 7):
        frame = [c % 4096 for c in code_list[frame_start:frame_start + 7]]
        coarse.append(frame[0])
        mid.append(frame[1])
        fine.extend(frame[2:4])
        mid.append(frame[4])
        fine.extend(frame[5:7])
    codes = [
        torch.tensor(layer).to(DEVICE).unsqueeze(0)
        for layer in (coarse, mid, fine)
    ]
    return snac_model.decode(codes)
# ============= MAIN VERIFICATION =============
def verify():
    """
    Generate one audio sample per emotion in PROMPTS and save each as a
    .wav file in OUTPUT_DIR.

    Loads the merged 4-bit model via Unsloth and the SNAC vocoder, builds
    the training-style prompt sequence for every emotion, samples audio
    tokens, converts them to SNAC codes, and decodes to 24 kHz audio.
    """
    print(f" Initializing Premium Verification for: {MODEL_NAME}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("\n Loading merged model...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = MODEL_NAME,
        max_seq_length = 2048,
        dtype = None,
        load_in_4bit = True,
    )
    FastLanguageModel.for_inference(model)
    print(" Model and tokenizer ready")

    print("\n Loading SNAC model...")
    # Vocoder that turns generated audio codes back into a waveform.
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(DEVICE).eval()
    print(" SNAC ready")

    print(f"\n Generating high-quality emotional samples...")
    for emotion, text in PROMPTS.items():
        # EXACT Training Prompt Format: {Emotion}: {text}
        prompt = f"{emotion}: {text}"
        print(f" -> Processing: [{emotion}]")
        print(f" Full Prompt: {prompt}")

        # Normalize
        # NOTE(review): normalize_german_text strips ':' from the prompt,
        # so the literal "{Emotion}: " separator does not survive
        # normalization — confirm the training data was normalized the
        # same way before relying on this format.
        normalized_prompt = normalize_german_text(prompt)

        # Tokenize (Exact Training Sequence)
        tokens = tokenizer(normalized_prompt, add_special_tokens=True).input_ids
        # Wrap the text in the human-turn framing and open the AI speech turn,
        # so generation continues directly with audio tokens.
        input_ids = (
            [START_OF_HUMAN]
            + tokens
            + [END_OF_TEXT]
            + [END_OF_HUMAN]
            + [START_OF_AI]
            + [START_OF_SPEECH]
        )
        input_ids = torch.tensor([input_ids], dtype=torch.int64).to(DEVICE)

        # Generate
        with torch.no_grad():
            output = model.generate(
                input_ids = input_ids,
                max_new_tokens = 2048,
                do_sample = True,
                temperature = 0.5,
                top_p = 0.96,
                repetition_penalty = 1.3,
                eos_token_id = END_OF_SPEECH,
                use_cache = True
            )
        # Slice off the prompt: keep only the newly generated tokens.
        generated_tokens = output[0][input_ids.shape[1]:]

        # Extract Codes: ids >= AUDIO_TOKENS_START are SNAC codes
        # (shifted back into 0-based code space); END_OF_SPEECH ends it.
        gen_codes = []
        for t in generated_tokens:
            val = t.item()
            if val >= AUDIO_TOKENS_START:
                gen_codes.append(val - AUDIO_TOKENS_START)
            elif val == END_OF_SPEECH:
                break
        # Truncate to whole 7-token SNAC frames.
        new_length = (len(gen_codes) // 7) * 7
        gen_codes = gen_codes[:new_length]
        if not gen_codes:
            print(f"No SNAC codes generated.")
            continue

        # Clean up and Decode
        clean_codes = remove_duplicate_frames(gen_codes)
        try:
            # decode returns a batched tensor; [0, 0] selects the mono waveform.
            audio_hat = redistribute_codes(clean_codes, snac_model).cpu().detach().numpy()[0, 0]
            file_path = os.path.join(OUTPUT_DIR, f"orpheus_premium_{emotion.lower()}.wav")
            sf.write(file_path, audio_hat, 24000)
            print(f"Saved: {file_path}")
        except Exception as e:
            # Best-effort per-sample: a decode failure skips this emotion only.
            print(f"Error: {e}")

    print(f"\nAudio generation is complete! Results in: {OUTPUT_DIR}")


if __name__ == "__main__":
    verify()
📊 Performance
| Metric | Base Model (3B) | Fine-tuned (German) | Improvement |
|---|---|---|---|
| Avg WER | 0.7046 | 0.3226 | 54.2% |
| Avg CER | 0.5471 | 0.3440 | 37.1% |
| Emotional Prosody | Basic | Advanced | High |
📜 Credits
Developed by Vishal Shendge as part of a German TTS fine-tuning project using the Orpheus-3B architecture. Special thanks to the Unsloth team for providing the optimization framework.
- Downloads last month
- 16
Model tree for Vishalshendge3198/orpheus-3b-tts-german-emotional-merged
Base model
meta-llama/Llama-3.2-3B-Instruct