Spaces:

ianro04
/

ScandiProb

Running

File size: 3,887 Bytes

194df75
 
 
 
 
 
c0ac551
194df75

import os
import re
import unicodedata

import gradio as gr
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face access token taken from the environment; None when the
# variable is unset. It is forwarded to both from_pretrained() calls below.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hub repository that both the tokenizer and the classifier are loaded from.
repo_id = "ianro04/ScandiProb"

# Output classes. ScandiProb() pairs labels[i] with the i-th model output,
# so the order here must match the order of the classifier's outputs.
# NOTE(review): presumably this is the label order used in training -- confirm.
labels = ["Norwegian", "Swedish", "Danish", "Non-Scandinavian"]

# The tokenizer and model are loaded once, at import time, and shared by
# every call to ScandiProb().
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=HF_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
# Switch to evaluation mode so inference-time behaviour (e.g. no dropout) is used.
model.eval()

def nonscandi_penalty(text):  # Heuristic copied alongside the raw model
    """Return the share of ``text`` that falls outside a Scandinavian keyboard.

    The result is a ratio in ``[0.0, 1.0]``: ``0.0`` means every character is
    one that can be typed on a Scandinavian keyboard layout, ``1.0`` means the
    text should be treated as entirely non-Scandinavian.

    Args:
        text: The raw user input. ``None``, empty and whitespace-only input
            are all treated as fully non-Scandinavian.

    Returns:
        ``1.0`` when the input is blank or when fewer than half of its
        characters are Scandinavian letters or whitespace; otherwise the
        fraction of characters that are not letters, digits, whitespace or
        ASCII punctuation from the Scandinavian keyboard set.
    """
    if not text or not text.strip():
        return 1.0

    # Compose decomposed sequences (e.g. "a" + combining ring) into the single
    # code points listed in the character classes below, so that visually
    # identical input is scored identically.
    text = unicodedata.normalize("NFC", text)

    # Whitespace is matched with \s rather than a literal space: the input box
    # is multi-line, and newlines or tabs are not evidence of a foreign script.
    scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9\s!@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
    scandi_keyboard_alpha_only = r"[a-zA-ZæøåÆØÅäöÄÖéÉ\s]"

    total_chars = len(text)
    key_count = len(re.findall(scandi_keyboard, text))
    alpha_count = len(re.findall(scandi_keyboard_alpha_only, text))

    # Text that is mostly digits, punctuation or foreign characters carries
    # too little alphabetic signal; treat it as fully non-Scandinavian.
    if alpha_count < total_chars * 0.5:
        return 1.0

    return 1 - (key_count / total_chars)

def da_no_cross_skew(text):
    """Score spelling cues that separate Danish from Norwegian.

    The text is stripped and lower-cased, then scanned with a fixed set of
    regular expressions, each tagged as a Danish or a Norwegian cue. Every
    match moves one language's skew up and the other's down by the same
    amount, so the two returned values are always opposites of each other.

    The step per match is ``1 / len(stripped_text)``, multiplied by 1.5 when
    the text has six words or fewer.

    Args:
        text: The raw input string.

    Returns:
        A two-element list ``[no_skew, da_skew]``; ``[0.0, 0.0]`` when the
        input is empty or contains only whitespace.
    """
    if not text:
        return [0.0, 0.0]

    normalized = text.strip().lower()
    tokens = normalized.split()
    if not tokens:
        return [0.0, 0.0]

    # (pattern, direction): +1 is a Danish cue, -1 is a Norwegian cue.
    cue_rules = (
        (r"æ[bgltv]", 1),
        (r"[eø]j", 1),
        (r"\b\w+hed(?:en|et)?\b", 1),
        (r"\b\w*([bdfgklnprst])\1\b", -1),
        (r"(?:g|k|sk)j[eæø]", -1),
    )

    per_match = 1.0 / len(normalized)
    # Short inputs give each cue more weight.
    short_text_boost = 1.5 if len(tokens) <= 6 else 1

    norwegian, danish = 0.0, 0.0
    for pattern, direction in cue_rules:
        hits = len(re.findall(pattern, normalized))
        delta = hits * per_match * short_text_boost
        if direction > 0:
            danish += delta
            norwegian -= delta
        else:
            norwegian += delta
            danish -= delta

    return [norwegian, danish]

def ScandiProb(text):
    """Classify ``text`` and return one independent probability per label.

    The classifier's logits are passed through a sigmoid (one independent
    probability per label, not a softmax) and then adjusted by two heuristics:

    * ``nonscandi_penalty`` scales the three Scandinavian labels down, and
      pulls the "Non-Scandinavian" label toward 1.0, in proportion to how
      much of the text lies outside the Scandinavian keyboard set.
    * ``da_no_cross_skew`` nudges the Norwegian and Danish scores in opposite
      directions based on spelling cues.

    Args:
        text: The raw user input. ``None``, empty and whitespace-only input
            yield a probability of 0.0 for every label.

    Returns:
        A dict mapping every entry of ``labels`` to a float clamped to
        ``[0.0, 1.0]``.
    """
    if not text or not text.strip():
        return {label: 0.0 for label in labels}

    # Truncate over-long input to the tokenizer's configured maximum length;
    # without this, text longer than the model's position limit can make the
    # forward pass fail instead of returning a prediction.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Index 0 selects the single sequence in the batch.
    raw_probs = torch.sigmoid(outputs.logits)[0]

    nonscandi_ratio = nonscandi_penalty(text)
    no_skew, da_skew = da_no_cross_skew(text)

    final_probs = {}

    for i, label in enumerate(labels):
        prob = raw_probs[i].item()

        if label in ["Norwegian", "Swedish", "Danish"]:
            # Shrink Scandinavian scores by the non-Scandinavian share.
            adjusted = prob * (1.0 - nonscandi_ratio)
        else:
            # Move the Non-Scandinavian score toward 1.0 by the same share.
            adjusted = prob + ((1.0 - prob) * nonscandi_ratio)

        # no_skew and da_skew are opposites of each other, so both factors
        # in each branch push the score in the same direction.
        if label == "Norwegian":
            adjusted = adjusted * (1.0 + no_skew)
            adjusted = adjusted * (1.0 - da_skew)
        elif label == "Danish":
            adjusted = adjusted * (1.0 + da_skew)
            adjusted = adjusted * (1.0 - no_skew)

        # The skew factors can push a score outside [0, 1]; clamp it back.
        adjusted = min(1.0, max(0.0, adjusted))
        final_probs[label] = float(adjusted)

    return final_probs

# Build the web UI. Components are registered in the order they are created
# inside the nested context managers, so statement order defines the layout.
with gr.Blocks() as demo:
    gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
    gr.Markdown("Enter text to output independent probabilities that it is written in Norwegian, Swedish, Danish, or a Non-Scandinavian language. This model utilizes a fine-tuned ScandiBERT combined with linguistic regex heuristics.")
    
    with gr.Row():
        # First column: multi-line text input and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                lines=5, 
                placeholder="Type your text here...", 
                label="Input Text"
            )
            submit_btn = gr.Button("Classify")
        
        # Second column: the label widget that displays the returned
        # {label: probability} dict; 4 matches the number of entries in `labels`.
        with gr.Column():
            output_labels = gr.Label(num_top_classes=4, label="Predicted Probabilities")
            
    # Clicking the button passes the textbox value to ScandiProb and renders
    # its result in the label widget.
    submit_btn.click(fn=ScandiProb, inputs=input_text, outputs=output_labels)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()