| --- |
| license: agpl-3.0 |
| datasets: |
| - Helsinki-NLP/opus-100 |
| language: |
| - 'no' |
| - sv |
| - da |
| base_model: |
| - vesteinn/ScandiBERT |
| pipeline_tag: text-classification |
| citation: > |
| @misc{vésteinn_snæbjarnarson_2023, |
| author = {Snæbjarnarson, Vésteinn}, |
| title = {{ScandiBERT} (Revision 0f86e40)}, |
| year = 2023, |
| url = {https://huggingface.co/vesteinn/ScandiBERT}, |
| doi = {10.57967/hf/0382}, |
| publisher = {Hugging Face} |
| } |
| |
| @inproceedings{zhang-etal-2020-improving, |
| title = "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation", |
| author = "Zhang, Biao and |
| Williams, Philip and |
| Titov, Ivan and |
| Sennrich, Rico", |
| editor = "Jurafsky, Dan and |
| Chai, Joyce and |
| Schluter, Natalie and |
| Tetreault, Joel", |
| booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", |
| month = jul, |
| year = "2020", |
| address = "Online", |
| publisher = "Association for Computational Linguistics", |
| url = "https://aclanthology.org/2020.acl-main.148", |
| doi = "10.18653/v1/2020.acl-main.148", |
| pages = "1628--1639", |
| } |
|
|
| @inproceedings{snaebjarnarson-etal-2023-transfer, |
| title = "{T}ransfer to a Low-Resource Language via Close Relatives: The Case Study on {F}aroese", |
| author = "Snæbjarnarson, Vésteinn and |
| Simonsen, Annika and |
| Glavaš, Goran and |
| Vulić, Ivan", |
| booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)", |
| month = may, |
| year = "2023", |
| address = "Tórshavn, Faroe Islands", |
| publisher = {Link{\"o}ping University Electronic Press, Sweden}, |
| } |
|
|
| @inproceedings{fedorova-etal-2025-multi, |
| title = "Multi-label {S}candinavian Language Identification ({SLIDE})", |
| author = "Fedorova, Mariia and |
| Frydenberg, Jonas Sebulon and |
| Handford, Victoria and |
| Lang{\o}, Victoria Ovedie Chruickshank and |
| Willoch, Solveig Helene and |
| Midtgaard, Marthe L{\o}ken and |
| Scherrer, Yves and |
| M{\ae}hlum, Petter and |
| Samuel, David", |
| editor = "Holdt, {\v{S}}pela Arhar and |
| Ilinykh, Nikolai and |
| Scalvini, Barbara and |
| Bruton, Micaella and |
| Debess, Iben Nyholm and |
| Tudor, Crina Madalina", |
| booktitle = "Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)", |
| month = mar, |
| year = "2025", |
| address = "Tallinn, Estonia", |
| publisher = "University of Tartu Library, Estonia", |
| url = "https://aclanthology.org/2025.resourceful-1.33/", |
| pages = "179--189", |
| ISBN = "978-9908-53-121-2", |
| } |
|
|
| @inproceedings{tiedemann-2012-parallel, |
| title = "Parallel Data, Tools and Interfaces in {OPUS}", |
| author = {Tiedemann, J{\"o}rg}, |
| editor = "Calzolari, Nicoletta and |
| Choukri, Khalid and |
| Declerck, Thierry and |
| Do{\u{g}}an, Mehmet U{\u{g}}ur and |
| Maegaard, Bente and |
| Mariani, Joseph and |
| Moreno, Asuncion and |
| Odijk, Jan and |
| Piperidis, Stelios", |
| booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)", |
| month = may, |
| year = "2012", |
| address = "Istanbul, Turkey", |
| publisher = "European Language Resources Association (ELRA)", |
| url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", |
| pages = "2214--2218", |
| } |
|
|
| @inproceedings{haas-derczynski-2021-discriminating, |
| title = "Discriminating Between Similar {N}ordic Languages", |
| author = "Haas, Ren{\'e} and |
| Derczynski, Leon", |
| editor = {Zampieri, Marcos and |
| Nakov, Preslav and |
| Ljube{\v{s}}i{\'c}, Nikola and |
| Tiedemann, J{\"o}rg and |
| Scherrer, Yves and |
| Jauhiainen, Tommi}, |
| booktitle = "Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects", |
| month = apr, |
| year = "2021", |
| address = "Kyiv, Ukraine", |
| publisher = "Association for Computational Linguistics", |
| url = "https://aclanthology.org/2021.vardial-1.8/", |
| pages = "67--75", |
| abstract = "Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\r{a}}l), Faroese and Icelandic."
| } |
|
|
| --- |
| **Note: This hybrid program is intended to be used in [its corresponding Space](https://huggingface.co/spaces/ianro04/ScandiProb).** |
|
|
| ScandiProb is an intentionally data-constrained, multi-label language ID hybrid text classifier for Norwegian, Swedish, and Danish, based on ScandiBERT. |
| It was developed as an undergraduate final project for a Spring 2026 NLP course at the University of Alaska Fairbanks. It is licensed under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html). |
|
|
| The full program combines a ScandiBERT model, fine-tuned on a limited amount of OPUS-100 data, with regex-enforced heuristics. |
| It achieves a macro-F1 score of ~93% on the OPUS-100 test set and ~84% on the comprehensive SLIDE evaluation set, using only a fraction of the training data used in SLIDE. |
|
|
| ([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Space](https://huggingface.co/spaces/ianro04/ScandiProb)) |