---
# Hugging Face model-card metadata (YAML front matter).
license: agpl-3.0
datasets:
- Helsinki-NLP/opus-100
# Language codes covered by the classifier (Norwegian, Swedish, Danish).
language:
# Quoted so that YAML parsers applying the YAML 1.1 boolean rules do not read a bare `no` as false.
- 'no'
- sv
- da
base_model:
- vesteinn/ScandiBERT
pipeline_tag: text-classification
citation: >
@misc{vésteinn_snæbjarnarson_2023,
author = {Snæbjarnarson, Vésteinn},
title = {{ScandiBERT} (Revision 0f86e40)},
year = 2023,
url = {https://huggingface.co/vesteinn/ScandiBERT},
doi = {10.57967/hf/0382},
publisher = {Hugging Face}
}
@inproceedings{zhang-etal-2020-improving,
title = "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation",
author = "Zhang, Biao and
Williams, Philip and
Titov, Ivan and
Sennrich, Rico",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.148",
doi = "10.18653/v1/2020.acl-main.148",
pages = "1628--1639",
}
@inproceedings{snaebjarnarson-etal-2023-transfer,
title = "Transfer to a Low-Resource Language via Close Relatives: The Case Study on {Faroese}",
author = "Sn{\ae}bjarnarson, V{\'e}steinn and
Simonsen, Annika and
Glava{\v{s}}, Goran and
Vuli{\'c}, Ivan",
booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
month = may,
year = "2023",
address = "T{\'o}rshavn, Faroe Islands",
publisher = {Link{\"o}ping University Electronic Press, Sweden},
}
@inproceedings{fedorova-etal-2025-multi,
title = "Multi-label {Scandinavian} Language Identification ({SLIDE})",
author = "Fedorova, Mariia and
Frydenberg, Jonas Sebulon and
Handford, Victoria and
Lang{\o}, Victoria Ovedie Chruickshank and
Willoch, Solveig Helene and
Midtgaard, Marthe L{\o}ken and
Scherrer, Yves and
M{\ae}hlum, Petter and
Samuel, David",
editor = "Holdt, {\v{S}}pela Arhar and
Ilinykh, Nikolai and
Scalvini, Barbara and
Bruton, Micaella and
Debess, Iben Nyholm and
Tudor, Crina Madalina",
booktitle = "Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library, Estonia",
url = "https://aclanthology.org/2025.resourceful-1.33/",
pages = "179--189",
isbn = "978-9908-53-121-2",
}
@inproceedings{tiedemann-2012-parallel,
title = "Parallel Data, Tools and Interfaces in {OPUS}",
author = {Tiedemann, J{\"o}rg},
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf",
pages = "2214--2218",
}
@inproceedings{haas-derczynski-2021-discriminating,
title = "Discriminating Between Similar {Nordic} Languages",
author = "Haas, Ren{\'e} and
Derczynski, Leon",
editor = {Zampieri, Marcos and
Nakov, Preslav and
Ljube{\v{s}}i{\'c}, Nikola and
Tiedemann, J{\"o}rg and
Scherrer, Yves and
Jauhiainen, Tommi},
booktitle = "Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects",
month = apr,
year = "2021",
address = "Kyiv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.vardial-1.8/",
pages = "67--75",
abstract = "Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\r{a}}l), Faroese and Icelandic."
}
---
**Note: This hybrid program is intended to be used in [its corresponding Space](https://huggingface.co/spaces/ianro04/ScandiProb).**
ScandiProb is an intentionally data-constrained, hybrid multi-label language-identification text classifier for Norwegian, Swedish, and Danish, based on ScandiBERT.
It was developed as an undergraduate final project for a Spring 2026 NLP course at the University of Alaska Fairbanks. It is licensed under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html).
The full program combines a ScandiBERT model, fine-tuned on a limited amount of OPUS-100 data, with regex-enforced heuristics.
It achieves a macro-F1 score of ~93% on the OPUS-100 test set and ~84% on the comprehensive SLIDE evaluation set, using only a fraction of the training data used in SLIDE.
([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Space](https://huggingface.co/spaces/ianro04/ScandiProb))