---
# Model card metadata (YAML front matter).
license: agpl-3.0
datasets:
- Helsinki-NLP/opus-100
language:
- 'no'
- sv
- da
base_model:
- vesteinn/ScandiBERT
pipeline_tag: text-classification
# BibTeX entries for the works cited by this model card.
# The value is a YAML block scalar, so everything indented below is literal
# BibTeX text. Every entry must end with its own closing "}"; otherwise the
# following entries are swallowed into the unterminated one.
citation: >
  @misc{vésteinn_snæbjarnarson_2023,
    author    = {Sn{\ae}bjarnarson, V{\'e}steinn},
    title     = {{ScandiBERT} (Revision 0f86e40)},
    year      = 2023,
    url       = {https://huggingface.co/vesteinn/ScandiBERT},
    doi       = {10.57967/hf/0382},
    publisher = {Hugging Face},
  }

  @inproceedings{zhang-etal-2020-improving,
    title     = {Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation},
    author    = {Zhang, Biao and Williams, Philip and Titov, Ivan and Sennrich, Rico},
    editor    = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel},
    booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
    month     = jul,
    year      = {2020},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2020.acl-main.148},
    doi       = {10.18653/v1/2020.acl-main.148},
    pages     = {1628--1639},
  }

  @inproceedings{snaebjarnarson-etal-2023-transfer,
    title     = {Transfer to a Low-Resource Language via Close Relatives: The Case Study on {Faroese}},
    author    = {Sn{\ae}bjarnarson, V{\'e}steinn and Simonsen, Annika and Glava{\v{s}}, Goran and Vuli{\'c}, Ivan},
    booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
    month     = may,
    year      = {2023},
    address   = {T{\'o}rshavn, Faroe Islands},
    publisher = {Link{\"o}ping University Electronic Press, Sweden},
  }

  @inproceedings{fedorova-etal-2025-multi,
    title     = {Multi-label {Scandinavian} Language Identification ({SLIDE})},
    author    = {Fedorova, Mariia and Frydenberg, Jonas Sebulon and Handford, Victoria and Lang{\o}, Victoria Ovedie Chruickshank and Willoch, Solveig Helene and Midtgaard, Marthe L{\o}ken and Scherrer, Yves and M{\ae}hlum, Petter and Samuel, David},
    editor    = {Holdt, {\v{S}}pela Arhar and Ilinykh, Nikolai and Scalvini, Barbara and Bruton, Micaella and Debess, Iben Nyholm and Tudor, Crina Madalina},
    booktitle = {Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)},
    month     = mar,
    year      = {2025},
    address   = {Tallinn, Estonia},
    publisher = {University of Tartu Library, Estonia},
    url       = {https://aclanthology.org/2025.resourceful-1.33/},
    pages     = {179--189},
    isbn      = {978-9908-53-121-2},
  }

  @inproceedings{tiedemann-2012-parallel,
    title     = {Parallel Data, Tools and Interfaces in {OPUS}},
    author    = {Tiedemann, J{\"o}rg},
    editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Do{\u{g}}an, Mehmet U{\u{g}}ur and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
    booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
    month     = may,
    year      = {2012},
    address   = {Istanbul, Turkey},
    publisher = {European Language Resources Association (ELRA)},
    url       = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf},
    pages     = {2214--2218},
  }

  @inproceedings{haas-derczynski-2021-discriminating,
    title     = {Discriminating Between Similar {Nordic} Languages},
    author    = {Haas, Ren{\'e} and Derczynski, Leon},
    editor    = {Zampieri, Marcos and Nakov, Preslav and Ljube{\v{s}}i{\'c}, Nikola and Tiedemann, J{\"o}rg and Scherrer, Yves and Jauhiainen, Tommi},
    booktitle = {Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects},
    month     = apr,
    year      = {2021},
    address   = {Kyiv, Ukraine},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2021.vardial-1.8/},
    pages     = {67--75},
    abstract  = {Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\r{a}}l), Faroese and Icelandic.},
  }
---

**Note: This hybrid program is intended to be used in [its corresponding Space](https://huggingface.co/spaces/ianro04/ScandiProb).**

ScandiProb is an intentionally data-constrained, multi-label language ID hybrid text classifier for Norwegian, Swedish, and Danish, based on ScandiBERT. It was developed as an undergraduate final project for a Spring 2026 NLP course at the University of Alaska Fairbanks. It is licensed under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html).

The full program uses a fine-tuned ScandiBERT, trained on a limited amount of OPUS-100 data and combined with regex-enforced heuristics. It achieves a ~93% macro-F1 score on the OPUS-100 test set and a ~84% macro-F1 score against the comprehensive SLIDE eval set, with a fraction of the training data used in SLIDE.

([GitHub](https://github.com/cloudeerie/scandiprob) | [Kaggle Notebooks](https://www.kaggle.com/code/cloudeerie/scandiprob/) | [Space](https://huggingface.co/spaces/ianro04/ScandiProb))