Spaces:
Running
Running
Updated nonscandi_penalty and da_no_cross_skew to match the latest Kaggle Notebook, and added a link to the raw model page in the app's Markdown
Browse files
app.py
CHANGED
|
@@ -15,7 +15,8 @@ model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOK
|
|
| 15 |
model.eval()
|
| 16 |
|
| 17 |
def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
|
| 18 |
-
|
|
|
|
| 19 |
return 1.0
|
| 20 |
|
| 21 |
scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
|
|
@@ -30,11 +31,11 @@ def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model
|
|
| 30 |
return nonscandi_percent
|
| 31 |
|
| 32 |
def da_no_cross_skew(text):
|
|
|
|
| 33 |
if not text:
|
| 34 |
return [0.0, 0.0]
|
| 35 |
|
| 36 |
da_skew, no_skew = 0.0, 0.0
|
| 37 |
-
text = text.strip().lower()
|
| 38 |
|
| 39 |
da_no_regex = {
|
| 40 |
r"æ[bgltv]": "DA",
|
|
@@ -52,7 +53,7 @@ def da_no_cross_skew(text):
|
|
| 52 |
|
| 53 |
for rule, lang in da_no_regex.items():
|
| 54 |
rule_matches = len(re.findall(rule, text))
|
| 55 |
-
skew_inc = rule_matches * skew_amount * (
|
| 56 |
if lang == "NO":
|
| 57 |
no_skew += skew_inc
|
| 58 |
da_skew -= skew_inc
|
|
@@ -109,13 +110,13 @@ with gr.Blocks() as demo:
|
|
| 109 |
gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
|
| 110 |
gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
|
| 111 |
gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
|
| 112 |
-
gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/))")
|
| 113 |
|
| 114 |
with gr.Row():
|
| 115 |
with gr.Column():
|
| 116 |
input_text = gr.Textbox(
|
| 117 |
lines=5,
|
| 118 |
-
placeholder="
|
| 119 |
label="Input Text"
|
| 120 |
)
|
| 121 |
submit_btn = gr.Button("Classify")
|
|
|
|
| 15 |
model.eval()
|
| 16 |
|
| 17 |
def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
|
| 18 |
+
text = text.strip()
|
| 19 |
+
if len(text) < 2:
|
| 20 |
return 1.0
|
| 21 |
|
| 22 |
scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
|
|
|
|
| 31 |
return nonscandi_percent
|
| 32 |
|
| 33 |
def da_no_cross_skew(text):
|
| 34 |
+
text = text.strip().lower()
|
| 35 |
if not text:
|
| 36 |
return [0.0, 0.0]
|
| 37 |
|
| 38 |
da_skew, no_skew = 0.0, 0.0
|
|
|
|
| 39 |
|
| 40 |
da_no_regex = {
|
| 41 |
r"æ[bgltv]": "DA",
|
|
|
|
| 53 |
|
| 54 |
for rule, lang in da_no_regex.items():
|
| 55 |
rule_matches = len(re.findall(rule, text))
|
| 56 |
+
skew_inc = rule_matches * skew_amount * (2 if len(words) <= 6 else 1)
|
| 57 |
if lang == "NO":
|
| 58 |
no_skew += skew_inc
|
| 59 |
da_skew -= skew_inc
|
|
|
|
| 110 |
gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
|
| 111 |
gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
|
| 112 |
gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
|
| 113 |
+
gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")
|
| 114 |
|
| 115 |
with gr.Row():
|
| 116 |
with gr.Column():
|
| 117 |
input_text = gr.Textbox(
|
| 118 |
lines=5,
|
| 119 |
+
placeholder="Enter text...",
|
| 120 |
label="Input Text"
|
| 121 |
)
|
| 122 |
submit_btn = gr.Button("Classify")
|