File size: 5,912 Bytes
194df75
 
 
1f85a2e
194df75
 
1f85a2e
194df75
c0ac551
194df75
 
 
 
 
 
 
 
 
7d97239
1f85a2e
 
 
 
 
 
 
 
 
 
 
 
 
194df75
bfff6ca
 
194df75
 
 
 
 
 
 
 
 
 
 
 
 
 
bfff6ca
194df75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfff6ca
194df75
 
 
 
 
 
 
 
 
 
c12026a
 
ad933f7
194df75
1f85a2e
194df75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad933f7
 
 
 
194df75
f6d154f
194df75
1f85a2e
 
 
 
 
194df75
 
4d2e877
1f85a2e
30f52cc
 
 
4d2e877
194df75
 
1f85a2e
 
 
 
194df75
 
 
ad933f7
 
194df75
1f85a2e
194df75
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import re
import torch
import pypdf
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from docx import Document

# Hugging Face access token, read from the environment (may be None for
# anonymous access to a public repo).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Model repository on the Hugging Face Hub containing the fine-tuned classifier.
repo_id = "ianro04/ScandiProb"

# Output classes, index-aligned with the model's logits (see ScandiProb()).
labels = ["Norwegian", "Swedish", "Danish", "Non-Scandinavian"]

# Load once at import time so every request reuses the same weights.
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=HF_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
model.eval()

def read_file(file_path): # Alt input method for Hugging Space
    """Return the text content of an uploaded .txt, .docx, or .pdf file.

    Args:
        file_path: Path to the uploaded file, or None when nothing was uploaded.

    Returns:
        The extracted text, or "" for None and for unsupported extensions.
    """
    if file_path is None:
        return ""
    # Compare the extension case-insensitively so names like "REPORT.TXT" or
    # "scan.PDF" are handled; the previous endswith() checks silently returned
    # "" for upper-case extensions.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    if ext == ".docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    if ext == ".pdf":
        reader = pypdf.PdfReader(file_path)
        # extract_text() can return None for image-only pages; substitute "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    return ""

def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
    """Estimate how 'non-Scandinavian' a text looks, as a penalty in [0, 1].

    1.0 means the text looks entirely foreign to a Scandinavian keyboard
    layout; 0.0 means every character is typeable on one.
    """
    stripped = text.strip()
    # Texts shorter than two characters carry no signal: maximal penalty.
    if len(stripped) < 2:
        return 1.0

    # Characters reachable on a Scandinavian keyboard: full set vs. letters/space only.
    keyboard_re = re.compile(r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]")
    alpha_re = re.compile(r"[a-zA-ZæøåÆØÅäöÄÖéÉ ]")

    total = len(stripped)
    keyboard_hits = sum(1 for ch in stripped if keyboard_re.match(ch))
    alpha_hits = sum(1 for ch in stripped if alpha_re.match(ch))

    # If fewer than half the characters are Scandinavian letters/spaces,
    # treat the text as fully non-Scandinavian.
    if alpha_hits < total * 0.5:
        return 1.0
    return 1 - keyboard_hits / total

def da_no_cross_skew(text):
    """Compute heuristic orthographic skews toward Norwegian vs. Danish.

    Each regex rule that fires shifts weight toward one language and away
    from the other by 1/len(text) per match, doubled for short inputs of
    six words or fewer.

    Returns:
        [norwegian_skew, danish_skew]; both 0.0 for empty input.
    """
    normalized = text.strip().lower()
    if not normalized:
        return [0.0, 0.0]

    token_count = len(normalized.split())
    if token_count == 0:
        return [0.0, 0.0]

    unit = 1.0 / len(normalized)
    boost = 2 if token_count <= 6 else 1

    # Orthographic cues: direction +1 pushes toward Norwegian, -1 toward Danish.
    rules = (
        (r"æ[bgltv]", -1),                    # Danish æ-clusters
        (r"[eø]j", -1),                       # Danish ej/øj diphthongs
        (r"\b\w+hed(?:en|et)?\b", -1),        # Danish -hed noun suffix
        (r"\b\w*([bdfgklnprst])\1\b", 1),     # Norwegian word-final double consonants
        (r"(?:g|k|sk)j[eæø]", 1),             # Norwegian gj/kj/skj onsets
    )

    no_skew = 0.0
    da_skew = 0.0
    for pattern, direction in rules:
        delta = len(re.findall(pattern, normalized)) * unit * boost
        no_skew += direction * delta
        da_skew -= direction * delta

    return [no_skew, da_skew]

def ScandiProb(text):
    """Classify text into independent per-language probabilities.

    Combines the fine-tuned model's sigmoid outputs (multi-label, so the
    probabilities do not need to sum to 1) with two regex heuristics: a
    non-Scandinavian keyboard penalty and a Danish/Norwegian cross-skew.

    Returns:
        A (top_labels_string, {label: probability}) pair, where the string
        lists every label above 50% or "Indefinitive" if none qualify.
    """
    cleaned = text.strip()
    if not cleaned:
        return "None", dict.fromkeys(labels, 0.0)

    encoded = tokenizer(cleaned, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**encoded)

    # Independent (multi-label) probabilities, one per language.
    raw_probs = torch.sigmoid(outputs.logits)[0]

    penalty = nonscandi_penalty(cleaned)
    no_skew, da_skew = da_no_cross_skew(cleaned)

    final_probs = {}

    for idx, label in enumerate(labels):
        p = raw_probs[idx].item()

        if label == "Non-Scandinavian":
            # Push the non-Scandi probability up in proportion to the penalty.
            p = p + (1.0 - p) * penalty
        else:
            # Scale the three Scandinavian languages down by the same penalty.
            p = p * (1.0 - penalty)

        # Cross-skew: what favors Norwegian disfavors Danish and vice versa.
        if label == "Norwegian":
            p = p * (1.0 + no_skew)
            p = p * (1.0 - da_skew)
        elif label == "Danish":
            p = p * (1.0 + da_skew)
            p = p * (1.0 - no_skew)

        final_probs[label] = float(min(1.0, max(0.0, p)))

    confident = [lbl for lbl, prob in final_probs.items() if prob > 0.5]

    return (", ".join(confident) if confident else "Indefinitive"), final_probs

def classify(text, file):
    """Gradio callback: an uploaded file takes precedence over typed text."""
    source = text if file is None else read_file(file)
    return ScandiProb(source)

# Gradio UI: text box / file upload on the left, predictions on the right.
with gr.Blocks() as demo:
    gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
    gr.Markdown("### By Ian Rodriguez")
    gr.Markdown("Enter text or upload a file to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**. Only first 512 tokens of input will be used.")
    # sanitize_html=False keeps the markdown links rendering as-is.
    gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), and combined with regex-enforced heuristics. Achieves ~93% macro-F1 score on OPUS-100 test set and ~84% macro-F1 score against the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide), with a fraction of the training data used in SLIDE.", sanitize_html=False)
    gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)", sanitize_html=False)
    gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))", sanitize_html=False)
    
    with gr.Row():
        # Left column: the two alternative input modes plus the submit button.
        with gr.Column():
            with gr.Tab("Text Input"):
                input_text = gr.Textbox(lines=5, placeholder="Enter text...", label="Input Text")
            with gr.Tab("File Upload"):
                input_file = gr.File(file_types=[".txt", ".docx", ".pdf"])
            submit_btn = gr.Button("Classify")
        
        # Right column: the >50% summary string and the full probability bars.
        with gr.Column():
            top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
            output_labels = gr.Label(num_top_classes=4, label="All Probabilities")
            
    # classify() prefers the uploaded file over the typed text when both are set.
    submit_btn.click(fn=classify, inputs=[input_text, input_file], outputs=[top_prediction, output_labels])

if __name__ == "__main__":
    demo.launch()