Spaces:

ianro04
/

ScandiProb

Running

App Files Files Community

ianro04 committed 1 day ago

Commit

bfff6ca

verified ·

1 Parent(s): c12026a

Updated nonscandi_penalty and da_no_cross_skew to match latest Kaggle Notebook + linked model page in Markdown

Browse files

Files changed (1) hide show

app.py +6 -5

app.py CHANGED Viewed

@@ -15,7 +15,8 @@ model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOK
 model.eval()
 def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
-    if not text.strip():
         return 1.0
     scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
@@ -30,11 +31,11 @@ def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model
     return nonscandi_percent
 def da_no_cross_skew(text):
     if not text:
         return [0.0, 0.0]
     da_skew, no_skew = 0.0, 0.0
-    text = text.strip().lower()
     da_no_regex = {
         r"æ[bgltv]": "DA",
@@ -52,7 +53,7 @@ def da_no_cross_skew(text):
     for rule, lang in da_no_regex.items():
         rule_matches = len(re.findall(rule, text))
-        skew_inc = rule_matches * skew_amount * (1.5 if len(words) <= 6 else 1)
         if lang == "NO":
             no_skew += skew_inc
             da_skew -= skew_inc
@@ -109,13 +110,13 @@ with gr.Blocks() as demo:
     gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
     gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
     gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
-    gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/))")
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
                 lines=5,
-                placeholder="Type your text here...",
                 label="Input Text"
             )
             submit_btn = gr.Button("Classify")

 model.eval()
 def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
+    text = text.strip()
+    if len(text) < 2:
         return 1.0
     scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
     return nonscandi_percent
 def da_no_cross_skew(text):
+    text = text.strip().lower()
     if not text:
         return [0.0, 0.0]
     da_skew, no_skew = 0.0, 0.0
     da_no_regex = {
         r"æ[bgltv]": "DA",
     for rule, lang in da_no_regex.items():
         rule_matches = len(re.findall(rule, text))
+        skew_inc = rule_matches * skew_amount * (2 if len(words) <= 6 else 1)
         if lang == "NO":
             no_skew += skew_inc
             da_skew -= skew_inc
     gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
     gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
     gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
+    gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
                 lines=5,
+                placeholder="Enter text...",
                 label="Input Text"
             )
             submit_btn = gr.Button("Classify")