import os
import re
import torch
import pypdf
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from docx import Document
# Hugging Face access token for the Hub download; may be None if the repo is public.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Fine-tuned ScandiBERT checkpoint on the Hugging Face Hub.
repo_id = "ianro04/ScandiProb"
# Output labels; index order must match the model head's logit order
# (ScandiProb() indexes logits by position in this list).
labels = ["Norwegian", "Swedish", "Danish", "Non-Scandinavian"]
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=HF_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
model.eval()  # inference mode: disables dropout; gradients are skipped via no_grad at call time
def read_file(file_path):  # Alt input method for Hugging Space
    """Extract plain text from an uploaded .txt, .docx, or .pdf file.

    Args:
        file_path: Path to the uploaded file, or None when no file was given.

    Returns:
        The file's text content, or "" for None / unsupported extensions.
    """
    if file_path is None:
        return ""
    # Fix: match extensions case-insensitively so "REPORT.TXT" / "Scan.PDF"
    # don't silently fall through to "".
    suffix = file_path.lower()
    if suffix.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    if suffix.endswith(".docx"):
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    if suffix.endswith(".pdf"):
        reader = pypdf.PdfReader(file_path)
        # extract_text() can return None for image-only pages; substitute "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    return ""
def nonscandi_penalty(text):  # Heuristic layer kept separate from the raw model
    """Estimate how non-Scandinavian *text* looks, as a ratio in [0, 1].

    1.0 means fully penalised (too short, or under half the characters are
    Scandinavian letters/spaces); otherwise the fraction of characters that
    fall outside a typical Scandinavian keyboard layout.
    """
    stripped = text.strip()
    if len(stripped) < 2:
        return 1.0
    # Characters reachable on a Scandinavian keyboard (letters, digits, punctuation).
    keyboard_pat = re.compile(
        r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
    )
    # Letters and spaces only — used for the "mostly alphabetic?" gate.
    alpha_pat = re.compile(r"[a-zA-ZæøåÆØÅäöÄÖéÉ ]")
    total = len(stripped)
    alpha_hits = len(alpha_pat.findall(stripped))
    # Fewer than half the characters are Scandinavian letters/spaces → full penalty.
    if alpha_hits < total * 0.5:
        return 1.0
    keyboard_hits = len(keyboard_pat.findall(stripped))
    return 1 - (keyboard_hits / total)
def da_no_cross_skew(text):
    """Compute opposing Norwegian/Danish skews from orthographic cues.

    Returns [norwegian_skew, danish_skew]; each cue match pushes one language
    up and the other down by the same amount.
    """
    lowered = text.strip().lower()
    if not lowered:
        return [0.0, 0.0]
    # (pattern, language) pairs: spellings typical of Danish vs Norwegian.
    cues = (
        (r"æ[bgltv]", "DA"),
        (r"[eø]j", "DA"),
        (r"\b\w+hed(?:en|et)?\b", "DA"),
        (r"\b\w*([bdfgklnprst])\1\b", "NO"),
        (r"(?:g|k|sk)j[eæø]", "NO"),
    )
    tokens = lowered.split()
    if not tokens:
        return [0.0, 0.0]
    per_match = 1.0 / len(lowered)  # normalised by character count, not word count
    boost = 2 if len(tokens) <= 6 else 1  # short inputs get double weight per cue
    no_skew = 0.0
    da_skew = 0.0
    for pattern, lang in cues:
        delta = len(re.findall(pattern, lowered)) * per_match * boost
        if lang == "NO":
            no_skew += delta
            da_skew -= delta
        elif lang == "DA":
            da_skew += delta
            no_skew -= delta
    return [no_skew, da_skew]
def ScandiProb(text):
    """Classify *text* and return (top_labels_str, {label: probability}).

    Runs the fine-tuned model (sigmoid per label, so probabilities are
    independent), then applies the non-Scandinavian penalty and the
    Danish/Norwegian cross-skew heuristics on top of the raw scores.
    """
    text = text.strip()
    if not text:
        return "None", {label: 0.0 for label in labels}
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    raw_probs = torch.sigmoid(logits)[0]
    penalty = nonscandi_penalty(text)
    no_skew, da_skew = da_no_cross_skew(text)
    final_probs = {}
    for idx, label in enumerate(labels):
        p = raw_probs[idx].item()
        if label == "Non-Scandinavian":
            # Pull the catch-all label up toward 1 by the penalty amount.
            p = p + ((1.0 - p) * penalty)
        else:
            # Scale Scandinavian labels down by the same penalty.
            p = p * (1.0 - penalty)
        if label == "Norwegian":
            p = p * (1.0 + no_skew)
            p = p * (1.0 - da_skew)
        elif label == "Danish":
            p = p * (1.0 + da_skew)
            p = p * (1.0 - no_skew)
        final_probs[label] = float(min(1.0, max(0.0, p)))
    confident = [lbl for lbl, prob in final_probs.items() if prob > 0.5]
    header = ", ".join(confident) if confident else "Indefinitive"
    return header, final_probs
def classify(text, file):
    """Gradio callback: an uploaded file takes precedence over the textbox."""
    source = read_file(file) if file is not None else text
    return ScandiProb(source)
# Gradio UI: text/file inputs on the left, predictions on the right.
with gr.Blocks() as demo:
    gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
    gr.Markdown("### By Ian Rodriguez")
    gr.Markdown("Enter text or upload a file to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**. Only first 512 tokens of input will be used.")
    # sanitize_html=False so the raw markdown links render as-is.
    gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), and combined with regex-enforced heuristics. Achieves ~93% macro-F1 score on OPUS-100 test set and ~84% macro-F1 score against the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide), with a fraction of the training data used in SLIDE.", sanitize_html=False)
    gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)", sanitize_html=False)
    gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))", sanitize_html=False)
    with gr.Row():
        with gr.Column():
            # Tabbed input: free text or a file upload (mutually exclusive in classify()).
            with gr.Tab("Text Input"):
                input_text = gr.Textbox(lines=5, placeholder="Enter text...", label="Input Text")
            with gr.Tab("File Upload"):
                input_file = gr.File(file_types=[".txt", ".docx", ".pdf"])
            submit_btn = gr.Button("Classify")
        with gr.Column():
            # classify() returns (top_labels_str, {label: prob}) — one output each.
            top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
            output_labels = gr.Label(num_top_classes=4, label="All Probabilities")
    submit_btn.click(fn=classify, inputs=[input_text, input_file], outputs=[top_prediction, output_labels])
if __name__ == "__main__":
    demo.launch()