ianro04 committed on
Commit
bfff6ca
·
verified ·
1 Parent(s): c12026a

Updated nonscandi_penalty and da_no_cross_skew to match the latest Kaggle Notebook, and added a link to the model page in the Markdown

Browse files
Files changed (1) hide show
  1. app.py +6 -5
app.py CHANGED
@@ -15,7 +15,8 @@ model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOK
15
  model.eval()
16
 
17
  def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
18
- if not text.strip():
 
19
  return 1.0
20
 
21
  scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
@@ -30,11 +31,11 @@ def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model
30
  return nonscandi_percent
31
 
32
  def da_no_cross_skew(text):
 
33
  if not text:
34
  return [0.0, 0.0]
35
 
36
  da_skew, no_skew = 0.0, 0.0
37
- text = text.strip().lower()
38
 
39
  da_no_regex = {
40
  r"æ[bgltv]": "DA",
@@ -52,7 +53,7 @@ def da_no_cross_skew(text):
52
 
53
  for rule, lang in da_no_regex.items():
54
  rule_matches = len(re.findall(rule, text))
55
- skew_inc = rule_matches * skew_amount * (1.5 if len(words) <= 6 else 1)
56
  if lang == "NO":
57
  no_skew += skew_inc
58
  da_skew -= skew_inc
@@ -109,13 +110,13 @@ with gr.Blocks() as demo:
109
  gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
110
  gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
111
  gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
112
- gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/))")
113
 
114
  with gr.Row():
115
  with gr.Column():
116
  input_text = gr.Textbox(
117
  lines=5,
118
- placeholder="Type your text here...",
119
  label="Input Text"
120
  )
121
  submit_btn = gr.Button("Classify")
 
15
  model.eval()
16
 
17
  def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
18
+ text = text.strip()
19
+ if len(text) < 2:
20
  return 1.0
21
 
22
  scandi_keyboard = r"[a-zA-ZæøåÆØÅäöÄÖéÉ0-9 !@#$%^&*()\-_=+\[\]{};':\",.<>?/`~\\|]"
 
31
  return nonscandi_percent
32
 
33
  def da_no_cross_skew(text):
34
+ text = text.strip().lower()
35
  if not text:
36
  return [0.0, 0.0]
37
 
38
  da_skew, no_skew = 0.0, 0.0
 
39
 
40
  da_no_regex = {
41
  r"æ[bgltv]": "DA",
 
53
 
54
  for rule, lang in da_no_regex.items():
55
  rule_matches = len(re.findall(rule, text))
56
+ skew_inc = rule_matches * skew_amount * (2 if len(words) <= 6 else 1)
57
  if lang == "NO":
58
  no_skew += skew_inc
59
  da_skew -= skew_inc
 
110
  gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
111
  gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
112
  gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
113
+ gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")
114
 
115
  with gr.Row():
116
  with gr.Column():
117
  input_text = gr.Textbox(
118
  lines=5,
119
+ placeholder="Enter text...",
120
  label="Input Text"
121
  )
122
  submit_btn = gr.Button("Classify")