Added YAML citations
Browse files
README.md
CHANGED
|
@@ -9,10 +9,120 @@ language:
|
|
| 9 |
base_model:
|
| 10 |
- vesteinn/ScandiBERT
|
| 11 |
pipeline_tag: text-classification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
**Note: This hybrid program is intended to be used in [its corresponding Space](https://huggingface.co/spaces/ianro04/ScandiProb).**
|
| 14 |
|
| 15 |
-
ScandiProb is an intentionally data-constrained, multi-label language ID hybrid text classifier for Norwegian, Swedish, and Danish, based on
|
| 16 |
It was done as an undergraduate final project for a Spring 2026 NLP course at the University of Alaska Fairbanks. It is licensed under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html).
|
| 17 |
|
| 18 |
The full program utilizes a fine-tuned ScandiBERT, trained on limited amounts of OPUS-100, and combined with regex-enforced heuristics.
|
|
|
|
| 9 |
base_model:
|
| 10 |
- vesteinn/ScandiBERT
|
| 11 |
pipeline_tag: text-classification
|
| 12 |
+
citation: >
|
| 13 |
+
@misc{vesteinn_snaebjarnarson_2023,
|
| 14 |
+
author = { Vésteinn Snæbjarnarson },
|
| 15 |
+
title = { ScandiBERT (Revision 0f86e40) },
|
| 16 |
+
year = 2023,
|
| 17 |
+
url = { https://huggingface.co/vesteinn/ScandiBERT },
|
| 18 |
+
doi = { 10.57967/hf/0382 },
|
| 19 |
+
publisher = { Hugging Face }
|
| 20 |
+
|
|
| 21 |
+
|
| 22 |
+
@inproceedings{zhang-etal-2020-improving,
|
| 23 |
+
title = "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation",
|
| 24 |
+
author = "Zhang, Biao and
|
| 25 |
+
Williams, Philip and
|
| 26 |
+
Titov, Ivan and
|
| 27 |
+
Sennrich, Rico",
|
| 28 |
+
editor = "Jurafsky, Dan and
|
| 29 |
+
Chai, Joyce and
|
| 30 |
+
Schluter, Natalie and
|
| 31 |
+
Tetreault, Joel",
|
| 32 |
+
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
|
| 33 |
+
month = jul,
|
| 34 |
+
year = "2020",
|
| 35 |
+
address = "Online",
|
| 36 |
+
publisher = "Association for Computational Linguistics",
|
| 37 |
+
url = "https://aclanthology.org/2020.acl-main.148",
|
| 38 |
+
doi = "10.18653/v1/2020.acl-main.148",
|
| 39 |
+
pages = "1628--1639",
|
| 40 |
+
|
|
| 41 |
+
|
| 42 |
+
@inproceedings{snaebjarnarson-etal-2023-transfer,
|
| 43 |
+
title = "{T}ransfer to a Low-Resource Language via Close Relatives: The Case Study on Faroese",
|
| 44 |
+
author = "Snæbjarnarson, Vésteinn and
|
| 45 |
+
Simonsen, Annika and
|
| 46 |
+
Glavaš, Goran and
|
| 47 |
+
Vulić, Ivan",
|
| 48 |
+
booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
|
| 49 |
+
month = may # " 22--24",
|
| 50 |
+
year = "2023",
|
| 51 |
+
address = "Tórshavn, Faroe Islands",
|
| 52 |
+
publisher = {Link{\"o}ping University Electronic Press, Sweden},
|
| 53 |
+
|
|
| 54 |
+
|
| 55 |
+
@inproceedings{fedorova-etal-2025-multi,
|
| 56 |
+
title = "Multi-label {S}candinavian Language Identification ({SLIDE})",
|
| 57 |
+
author = "Fedorova, Mariia and
|
| 58 |
+
Frydenberg, Jonas Sebulon and
|
| 59 |
+
Handford, Victoria and
|
| 60 |
+
Lang{\o}, Victoria Ovedie Chruickshank and
|
| 61 |
+
Willoch, Solveig Helene and
|
| 62 |
+
Midtgaard, Marthe L{\o}ken and
|
| 63 |
+
Scherrer, Yves and
|
| 64 |
+
M{\ae}hlum, Petter and
|
| 65 |
+
Samuel, David",
|
| 66 |
+
editor = "Holdt, {\v{S}}pela Arhar and
|
| 67 |
+
Ilinykh, Nikolai and
|
| 68 |
+
Scalvini, Barbara and
|
| 69 |
+
Bruton, Micaella and
|
| 70 |
+
Debess, Iben Nyholm and
|
| 71 |
+
Tudor, Crina Madalina",
|
| 72 |
+
booktitle = "Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)",
|
| 73 |
+
month = mar,
|
| 74 |
+
year = "2025",
|
| 75 |
+
address = "Tallinn, Estonia",
|
| 76 |
+
publisher = "University of Tartu Library, Estonia",
|
| 77 |
+
url = "https://aclanthology.org/2025.resourceful-1.33/",
|
| 78 |
+
pages = "179--189",
|
| 79 |
+
ISBN = "978-9908-53-121-2",
|
| 80 |
+
|
|
| 81 |
+
|
| 82 |
+
@inproceedings{tiedemann-2012-parallel,
|
| 83 |
+
title = "Parallel Data, Tools and Interfaces in {OPUS}",
|
| 84 |
+
author = {Tiedemann, J{\"o}rg},
|
| 85 |
+
editor = "Calzolari, Nicoletta and
|
| 86 |
+
Choukri, Khalid and
|
| 87 |
+
Declerck, Thierry and
|
| 88 |
+
Do{\u{g}}an, Mehmet U{\u{g}}ur and
|
| 89 |
+
Maegaard, Bente and
|
| 90 |
+
Mariani, Joseph and
|
| 91 |
+
Moreno, Asuncion and
|
| 92 |
+
Odijk, Jan and
|
| 93 |
+
Piperidis, Stelios",
|
| 94 |
+
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
|
| 95 |
+
month = may,
|
| 96 |
+
year = "2012",
|
| 97 |
+
address = "Istanbul, Turkey",
|
| 98 |
+
publisher = "European Language Resources Association (ELRA)",
|
| 99 |
+
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf",
|
| 100 |
+
pages = "2214--2218",
|
| 101 |
+
|
|
| 102 |
+
|
| 103 |
+
@inproceedings{haas-derczynski-2021-discriminating,
|
| 104 |
+
title = "Discriminating Between Similar {N}ordic Languages",
|
| 105 |
+
author = "Haas, Ren{\'e} and
|
| 106 |
+
Derczynski, Leon",
|
| 107 |
+
editor = {Zampieri, Marcos and
|
| 108 |
+
Nakov, Preslav and
|
| 109 |
+
Ljube{\v{s}}i{\'c}, Nikola and
|
| 110 |
+
Tiedemann, J{\"o}rg and
|
| 111 |
+
Scherrer, Yves and
|
| 112 |
+
Jauhiainen, Tommi},
|
| 113 |
+
booktitle = "Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects",
|
| 114 |
+
month = apr,
|
| 115 |
+
year = "2021",
|
| 116 |
+
address = "Kyiv, Ukraine",
|
| 117 |
+
publisher = "Association for Computational Linguistics",
|
| 118 |
+
url = "https://aclanthology.org/2021.vardial-1.8/",
|
| 119 |
+
pages = "67--75",
|
| 120 |
+
abstract = "Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\r{a}}l), Faroese and Icelandic."
|
| 121 |
+
|
| 122 |
---
|
| 123 |
**Note: This hybrid program is intended to be used in [its corresponding Space](https://huggingface.co/spaces/ianro04/ScandiProb).**
|
| 124 |
|
| 125 |
+
ScandiProb is an intentionally data-constrained, multi-label language ID hybrid text classifier for Norwegian, Swedish, and Danish, based on ScandiBERT.
|
| 126 |
It was done as an undergraduate final project for a Spring 2026 NLP course at the University of Alaska Fairbanks. It is licensed under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html).
|
| 127 |
|
| 128 |
The full program utilizes a fine-tuned ScandiBERT, trained on limited amounts of OPUS-100, and combined with regex-enforced heuristics.
|