ianro04 committed on
Commit
1f85a2e
·
verified ·
1 Parent(s): bfff6ca

Attempt at implementing file upload option for input text in app.py + markdown change again

Browse files
Files changed (1) hide show
  1. app.py +30 -9
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import os
2
  import re
3
  import torch
 
4
  import gradio as gr
5
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
6
 
7
  HF_TOKEN = os.environ.get("HF_TOKEN")
8
  repo_id = "ianro04/ScandiProb"
@@ -14,6 +16,20 @@ tokenizer = AutoTokenizer.from_pretrained(repo_id, token=HF_TOKEN)
14
  model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
15
  model.eval()
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
18
  text = text.strip()
19
  if len(text) < 2:
@@ -68,7 +84,7 @@ def ScandiProb(text):
68
  if not text:
69
  return "None", {label: 0.0 for label in labels}
70
 
71
- inputs = tokenizer(text, return_tensors="pt")
72
 
73
  with torch.no_grad():
74
  outputs = model(**inputs)
@@ -104,28 +120,33 @@ def ScandiProb(text):
104
 
105
  return top_labels_str, final_probs
106
 
 
 
 
 
 
107
  with gr.Blocks() as demo:
108
  gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
109
  gr.Markdown("### By Ian Rodriguez")
110
- gr.Markdown("Enter text to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**.")
111
- gr.Markdown("This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), combined with regex-enforced heuristics. Achieves ~84% macro-F1 score on the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper.")
 
112
  gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
113
  gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")
114
 
115
  with gr.Row():
116
  with gr.Column():
117
- input_text = gr.Textbox(
118
- lines=5,
119
- placeholder="Enter text...",
120
- label="Input Text"
121
- )
122
  submit_btn = gr.Button("Classify")
123
 
124
  with gr.Column():
125
  top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
126
  output_labels = gr.Label(num_top_classes=4, label="All Probabilities")
127
 
128
- submit_btn.click(fn=ScandiProb, inputs=input_text, outputs=[top_prediction, output_labels])
129
 
130
  if __name__ == "__main__":
131
  demo.launch()
 
1
  import os
2
  import re
3
  import torch
4
+ import pypdf
5
  import gradio as gr
6
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
7
+ from docx import Document
8
 
9
  HF_TOKEN = os.environ.get("HF_TOKEN")
10
  repo_id = "ianro04/ScandiProb"
 
16
  model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=HF_TOKEN)
17
  model.eval()
18
 
19
def read_file(file_path):
    """Extract plain text from an uploaded ``.txt``, ``.docx`` or ``.pdf`` file.

    Args:
        file_path: Path of the uploaded file (``str`` or ``os.PathLike``), an
            object that exposes the path through a ``name`` attribute (for
            example a temporary-file wrapper), or ``None`` when nothing was
            uploaded.

    Returns:
        The extracted text. An empty string is returned when ``file_path`` is
        ``None``, when no usable path can be derived from it, or when the
        extension is not one of the supported types.
    """
    if file_path is None:
        return ""

    # Accept either a plain path or an upload object carrying its path in
    # ``.name``; a Path is checked first because its own ``.name`` is only
    # the basename.
    if isinstance(file_path, (str, os.PathLike)):
        path = os.fspath(file_path)
    else:
        path = getattr(file_path, "name", "")
    if not isinstance(path, str) or not path:
        return ""

    # Match extensions case-insensitively so "REPORT.PDF" is not ignored.
    lowered = path.lower()

    if lowered.endswith(".txt"):
        # "utf-8-sig" strips a leading BOM if present; errors="replace" keeps
        # a stray non-UTF-8 byte from aborting the whole request.
        with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()

    if lowered.endswith(".docx"):
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs)

    if lowered.endswith(".pdf"):
        reader = pypdf.PdfReader(path)
        # extract_text() may yield nothing for a page; substitute "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    return ""
32
+
33
  def nonscandi_penalty(text): # Copy-pasting everything that isn't the raw model here
34
  text = text.strip()
35
  if len(text) < 2:
 
84
  if not text:
85
  return "None", {label: 0.0 for label in labels}
86
 
87
+ inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
88
 
89
  with torch.no_grad():
90
  outputs = model(**inputs)
 
120
 
121
  return top_labels_str, final_probs
122
 
123
def classify(text, file):
    """Classify the uploaded file's text if a file is given, else the typed text.

    Args:
        text: Text typed into the input box.
        file: Uploaded file handed over by the UI, or ``None`` if absent.

    Returns:
        Whatever ``ScandiProb`` returns for the selected text.
    """
    # An upload always takes precedence over the typed text.
    chosen_text = text if file is None else read_file(file)
    return ScandiProb(chosen_text)
127
+
# Gradio UI: a text tab and a file-upload tab feed one "Classify" button.
with gr.Blocks() as demo:
    gr.Markdown("# ScandiProb: Hybrid Language ID Classifier")
    gr.Markdown("### By Ian Rodriguez")
    gr.Markdown("Enter text or upload a file to output independent probabilities that it is written in **Norwegian**, **Swedish**, **Danish**, or **None of the Above / Non-Scandinavian**. Only first 512 tokens of input will be used.")
    # Adjacent literals are concatenated at compile time. The original spread
    # one plain double-quoted string over two source lines, which is an
    # unterminated string literal and stops the module from importing.
    gr.Markdown(
        "This model utilizes a fine-tuned [ScandiBERT](https://huggingface.co/vesteinn/ScandiBERT), trained on limited amounts of [OPUS-100](https://huggingface.co/datasets/Helsinki-NLP/opus-100/), and combined with regex-enforced heuristics.\n"
        "Achieves ~93% macro-F1 score on OPUS-100 test set and ~84% macro-F1 score against the comprehensive [SLIDE eval set](https://huggingface.co/datasets/ltg/slide) with a fraction of the training data used in the 2025 SLIDE paper."
    )
    gr.Markdown("[This project is licensed under AGPL-3.0.](https://www.gnu.org/licenses/agpl-3.0.en.html)")
    gr.Markdown("([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Raw Model Page](https://huggingface.co/ianro04/ScandiProb))")

    with gr.Row():
        with gr.Column():
            with gr.Tab("Text Input"):
                input_text = gr.Textbox(lines=5, placeholder="Enter text...", label="Input Text")
            with gr.Tab("File Upload"):
                input_file = gr.File(file_types=[".txt", ".docx", ".pdf"])

            submit_btn = gr.Button("Classify")

        with gr.Column():
            top_prediction = gr.Textbox(label="Probable Languages (>50%)", interactive=False)
            output_labels = gr.Label(num_top_classes=4, label="All Probabilities")

    # Both inputs are always passed; classify() decides which one to use.
    submit_btn.click(fn=classify, inputs=[input_text, input_file], outputs=[top_prediction, output_labels])

if __name__ == "__main__":
    demo.launch()