metadata
license: agpl-3.0
datasets:
- Helsinki-NLP/opus-100
language:
- 'no'
- sv
- da
base_model:
- vesteinn/ScandiBERT
pipeline_tag: text-classification
citation: |
@misc{vésteinn_snæbjarnarson_2023,
  author    = {Sn{\ae}bjarnarson, V{\'e}steinn},
  title     = {{ScandiBERT} (Revision 0f86e40)},
  year      = 2023,
  url       = {https://huggingface.co/vesteinn/ScandiBERT},
  doi       = {10.57967/hf/0382},
  publisher = {Hugging Face},
}
|
@inproceedings{zhang-etal-2020-improving,
  title     = {Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation},
  author    = {Zhang, Biao and
               Williams, Philip and
               Titov, Ivan and
               Sennrich, Rico},
  editor    = {Jurafsky, Dan and
               Chai, Joyce and
               Schluter, Natalie and
               Tetreault, Joel},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.acl-main.148},
  doi       = {10.18653/v1/2020.acl-main.148},
  pages     = {1628--1639},
}
|
@inproceedings{snaebjarnarson-etal-2023-transfer,
  title     = {Transfer to a Low-Resource Language via Close Relatives: The Case Study on {Faroese}},
  author    = {Sn{\ae}bjarnarson, V{\'e}steinn and
               Simonsen, Annika and
               Glava{\v{s}}, Goran and
               Vuli{\'c}, Ivan},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
  month     = may # " 22--24",
  year      = {2023},
  address   = {T{\'o}rshavn, Faroe Islands},
  publisher = {Link{\"o}ping University Electronic Press, Sweden},
}
|
@inproceedings{fedorova-etal-2025-multi,
  title     = {Multi-label {Scandinavian} Language Identification ({SLIDE})},
  author    = {Fedorova, Mariia and
               Frydenberg, Jonas Sebulon and
               Handford, Victoria and
               Lang{\o}, Victoria Ovedie Chruickshank and
               Willoch, Solveig Helene and
               Midtgaard, Marthe L{\o}ken and
               Scherrer, Yves and
               M{\ae}hlum, Petter and
               Samuel, David},
  editor    = {Holdt, {\v{S}}pela Arhar and
               Ilinykh, Nikolai and
               Scalvini, Barbara and
               Bruton, Micaella and
               Debess, Iben Nyholm and
               Tudor, Crina Madalina},
  booktitle = {Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)},
  month     = mar,
  year      = {2025},
  address   = {Tallinn, Estonia},
  publisher = {University of Tartu Library, Estonia},
  url       = {https://aclanthology.org/2025.resourceful-1.33/},
  pages     = {179--189},
  isbn      = {978-9908-53-121-2},
}
|
@inproceedings{tiedemann-2012-parallel,
  title     = {Parallel Data, Tools and Interfaces in {OPUS}},
  author    = {Tiedemann, J{\"o}rg},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Declerck, Thierry and
               Do{\u{g}}an, Mehmet U{\u{g}}ur and
               Maegaard, Bente and
               Mariani, Joseph and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  month     = may,
  year      = {2012},
  address   = {Istanbul, Turkey},
  publisher = {European Language Resources Association (ELRA)},
  url       = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf},
  pages     = {2214--2218},
}
|
@inproceedings{haas-derczynski-2021-discriminating,
  title     = {Discriminating Between Similar {Nordic} Languages},
  author    = {Haas, Ren{\'e} and
               Derczynski, Leon},
  editor    = {Zampieri, Marcos and
               Nakov, Preslav and
               Ljube{\v{s}}i{\'c}, Nikola and
               Tiedemann, J{\"o}rg and
               Scherrer, Yves and
               Jauhiainen, Tommi},
  booktitle = {Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects},
  month     = apr,
  year      = {2021},
  address   = {Kyiv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.vardial-1.8/},
  pages     = {67--75},
  abstract  = {Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\r{a}}l), Faroese and Icelandic.},
}
Note: This hybrid program is intended to be used in its corresponding Space.
ScandiProb is an intentionally data-constrained, multi-label hybrid text classifier for language identification of Norwegian, Swedish, and Danish, based on ScandiBERT. It was developed as an undergraduate final project for a Spring 2026 NLP course at the University of Alaska Fairbanks. It is licensed under AGPL-3.0.
The full program uses a fine-tuned ScandiBERT, trained on a limited amount of OPUS-100 data and combined with regex-enforced heuristics. It achieves a ~93% macro-F1 score on the OPUS-100 test set and a ~84% macro-F1 score on the comprehensive SLIDE evaluation set, with a fraction of the training data used in SLIDE.
(GitHub | Kaggle Notebooks | Space)