# ScandiProb — Hugging Face Spaces (Gradio) app entry point.
import os
import re

import gradio as gr
import pypdf
import torch
from docx import Document
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# --- Model setup -------------------------------------------------------------
# Hub auth token; None when the env var is unset (e.g. a local run).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Fine-tuned checkpoint and the label order its classification head emits.
repo_id = "ianro04/ScandiProb"
labels = ["Norwegian", "Swedish", "Danish", "Non-Scandinavian"]

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=HF_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
model.eval()  # inference only — disable dropout etc.
def read_file(file_path):
    """Extract plain text from an uploaded .txt, .docx, or .pdf file.

    Args:
        file_path: Path to the uploaded file, or None when nothing was uploaded.

    Returns:
        The file's text content, or "" for None / unsupported extensions.
    """
    if file_path is None:
        return ""
    # Bug fix: compare extensions case-insensitively so uploads like
    # "REPORT.PDF" or "Notes.TXT" are no longer silently ignored.
    lowered = file_path.lower()
    if lowered.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    if lowered.endswith(".docx"):
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    if lowered.endswith(".pdf"):
        reader = pypdf.PdfReader(file_path)
        # extract_text() can return None for image-only pages; substitute "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    return ""
def nonscandi_penalty(text):
    """Heuristic fraction of the input that looks non-Scandinavian.

    Returns 1.0 for inputs shorter than 2 chars after stripping, or when
    fewer than half of the characters are Scandinavian-alphabet letters or
    spaces; otherwise the share of characters falling outside a typical
    Scandinavian keyboard layout.
    """
    stripped = text.strip()
    if len(stripped) < 2:
        # Too little signal to judge — treat as fully non-Scandinavian.
        return 1.0
    keyboard_pattern = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
    alpha_pattern = r"[a-zA-ZæøåÆØÅäöÄÖéÉ ]"
    total = len(stripped)
    alpha_count = len(re.findall(alpha_pattern, stripped))
    # Mostly non-Latin script: apply the maximum penalty outright.
    if alpha_count < total * 0.5:
        return 1.0
    keyboard_count = len(re.findall(keyboard_pattern, stripped))
    return 1 - (keyboard_count / total)
def da_no_cross_skew(text):
    """Nudge Danish and Norwegian apart via orthographic cues.

    Each regex hit for a pattern characteristic of one language adds a
    per-character skew toward that language and subtracts the same amount
    from the other, so the two values always sum to zero. Inputs of six
    words or fewer get double weight per hit.

    Returns:
        [norwegian_skew, danish_skew] as floats (both 0.0 for empty input).
    """
    normalized = text.strip().lower()
    if not normalized:
        return [0.0, 0.0]
    word_list = normalized.split()
    if not word_list:
        return [0.0, 0.0]
    # Pattern -> language it indicates (DA = Danish, NO = Norwegian).
    rules = {
        r"æ[bgltv]": "DA",
        r"[eø]j": "DA",
        r"\b\w+hed(?:en|et)?\b": "DA",
        r"\b\w*([bdfgklnprst])\1\b": "NO",
        r"(?:g|k|sk)j[eæø]": "NO",
    }
    per_char = 1.0 / len(normalized)
    norwegian_skew, danish_skew = 0.0, 0.0
    for pattern, language in rules.items():
        hits = len(re.findall(pattern, normalized))
        bump = hits * per_char * (2 if len(word_list) <= 6 else 1)
        if language == "NO":
            norwegian_skew += bump
            danish_skew -= bump
        elif language == "DA":
            danish_skew += bump
            norwegian_skew -= bump
    return [norwegian_skew, danish_skew]
def ScandiProb(text):
    """Score *text* against every language label.

    Blends the model's per-label sigmoid probabilities with the
    non-Scandinavian penalty and the Danish/Norwegian cross-skew
    heuristics, clamping each result to [0, 1].

    Returns:
        A tuple of (comma-separated labels whose adjusted probability
        exceeds 0.5, or "Indefinitive"/"None") and a dict mapping every
        label to its adjusted probability.
    """
    text = text.strip()
    if not text:
        return "None", {label: 0.0 for label in labels}

    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    raw_probs = torch.sigmoid(logits)[0]

    nonscandi_ratio = nonscandi_penalty(text)
    no_skew, da_skew = da_no_cross_skew(text)

    final_probs = {}
    for index, label in enumerate(labels):
        score = raw_probs[index].item()
        if label == "Non-Scandinavian":
            # Pull the non-Scandinavian probability toward 1 by the penalty.
            score = score + ((1.0 - score) * nonscandi_ratio)
        else:
            # Scale every Scandinavian probability down by the penalty.
            score = score * (1.0 - nonscandi_ratio)
        if label == "Norwegian":
            score = score * (1.0 + no_skew)
            score = score * (1.0 - da_skew)
        elif label == "Danish":
            score = score * (1.0 + da_skew)
            score = score * (1.0 - no_skew)
        final_probs[label] = float(min(1.0, max(0.0, score)))

    confident = [label for label, prob in final_probs.items() if prob > 0.5]
    return (", ".join(confident) if confident else "Indefinitive"), final_probs
def classify(text, file):
    """Gradio callback: classify an uploaded file if present, else the textbox text."""
    source = text if file is None else read_file(file)
    return ScandiProb(source)
with gr.Blocks() as demo:
    # --- Page header and project notes --------------------------------------
    gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
    gr.Markdown("### By Ian Rodriguez")
    gr.Markdown("Enter text or upload a file to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**. Only first 512 tokens of input will be used.")
    gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), and combined with regex-enforced heuristics. Achieves ~93% macro-F1 score on OPUS-100 test set and ~84% macro-F1 score against the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
    gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
    gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")

    # --- Input column (text tab / file tab) next to the output column -------
    with gr.Row():
        with gr.Column():
            with gr.Tab("Text Input"):
                input_text = gr.Textbox(lines=5, placeholder="Enter text...", label="Input Text")
            with gr.Tab("File Upload"):
                input_file = gr.File(file_types=[".txt", ".docx", ".pdf"])
            submit_btn = gr.Button("Classify")
        with gr.Column():
            top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
            output_labels = gr.Label(num_top_classes=4, label="All Probabilities")

    # Wire the button to the classifier; an uploaded file wins over typed text.
    submit_btn.click(fn=classify, inputs=[input_text, input_file], outputs=[top_prediction, output_labels])

if __name__ == "__main__":
    demo.launch()