Spaces:

ianro04
/

ScandiProb

Running

App Files Files Community

ianro04 committed about 24 hours ago

Commit

1f85a2e

verified ·

1 Parent(s): bfff6ca

Attempt at implementing file upload option for input text in app.py + markdown change again

Browse files

Files changed (1) hide show

app.py +30 -9

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import os
 import re
 import torch
 import gradio as gr
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 HF_TOKEN = os.environ.get("HF_TOKEN")
 repo_id = "ianro04/ScandiProb"
@@ -14,6 +16,20 @@ tokenizer = AutoTokenizer.from_pretrained(repo_id, token=HF_TOKEN)
 model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
 model.eval()
 def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
     text = text.strip()
     if len(text) < 2:
@@ -68,7 +84,7 @@ def ScandiProb(text):
     if not text:
         return "None", {label: 0.0 for label in labels}
-    inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         outputs = model(**inputs)
@@ -104,28 +120,33 @@ def ScandiProb(text):
     return top_labels_str, final_probs
 with gr.Blocks() as demo:
     gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
     gr.Markdown("### By Ian Rodriguez")
-    gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
-    gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
     gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
     gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")
     with gr.Row():
         with gr.Column():
-            input_text = gr.Textbox(
-                lines=5,
-                placeholder="Enter text...",
-                label="Input Text"
-            )
             submit_btn = gr.Button("Classify")
         with gr.Column():
             top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
             output_labels = gr.Label(num_top_classes=4, label="All Probabilities")
-    submit_btn.click(fn=ScandiProb, inputs=input_text, outputs=[top_prediction, output_labels])
 if __name__ == "__main__":
     demo.launch()

 import os
 import re
 import torch
+import pypdf
 import gradio as gr
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from docx import Document
 HF_TOKEN = os.environ.get("HF_TOKEN")
 repo_id = "ianro04/ScandiProb"
 model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
 model.eval()
def read_file(file_path):
    """Return the text extracted from the file at *file_path*.

    Supported extensions are ``.txt``, ``.docx`` and ``.pdf``; the extension
    is matched case-insensitively so that names such as ``NOTES.TXT`` are
    accepted as well.

    Args:
        file_path: Path of the file to read, or ``None`` when no file was
            supplied.

    Returns:
        The extracted text. An empty string is returned when *file_path* is
        ``None`` or the extension is not one of the supported ones.
    """
    if file_path is None:
        return ""

    # Compare against a lower-cased copy; the original path is still used for I/O.
    lowered = file_path.lower()

    if lowered.endswith(".txt"):
        # "utf-8-sig" drops a leading byte-order mark if one is present, and
        # errors="replace" keeps a non-UTF-8 file from raising UnicodeDecodeError.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()
    if lowered.endswith(".docx"):
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    if lowered.endswith(".pdf"):
        reader = pypdf.PdfReader(file_path)
        # extract_text() may yield a falsy value for a page; fall back to "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    return ""
 def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
     text = text.strip()
     if len(text) < 2:
     if not text:
         return "None", {label: 0.0 for label in labels}
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
     with torch.no_grad():
         outputs = model(**inputs)
     return top_labels_str, final_probs
def classify(text, file):
    """Run ScandiProb on the file's contents if a file is given, else on *text*.

    When *file* is not ``None`` its contents (via ``read_file``) replace
    *text* entirely; the typed text is ignored in that case.
    """
    chosen_text = text if file is None else read_file(file)
    return ScandiProb(chosen_text)
 with gr.Blocks() as demo:
     gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
     gr.Markdown("### By Ian Rodriguez")
+    gr.Markdown("Enter text or upload a file to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**. Only first 512 tokens of input will be used.")
+    gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), and combined with regex-enforced heuristics.
+    Achieves ~93% macro-F1 score on OPUS-100 test set and ~84% macro-F1 score against the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
     gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
     gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")
     with gr.Row():
         with gr.Column():
+            with gr.Tab("Text Input"):
+                input_text = gr.Textbox(lines=5, placeholder="Enter text...", label="Input Text")
+            with gr.Tab("File Upload"):
+                input_file = gr.File(file_types=[".txt", ".docx", ".pdf"])
             submit_btn = gr.Button("Classify")
         with gr.Column():
             top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
             output_labels = gr.Label(num_top_classes=4, label="All Probabilities")
+    submit_btn.click(fn=classify, inputs=[input_text, input_file], outputs=[top_prediction, output_labels])
 if __name__ == "__main__":
     demo.launch()