% Conference paper from the NLP4DH 2024 proceedings (exported record, cleaned up).
% NOTE(review): "address" holds a country rather than a publisher city -- confirm and replace.
% NOTE(review): "Dargis", "Hamalainen" and "Ohman" look like they lost diacritics in export -- verify against the published paper.
% NOTE(review): ISBN hyphenation is kept as exported -- confirm against the publisher record.
@inproceedings{734ba33182a546189217ba2816db3e6a,
author = "Dargis, Roberts and Bārzdiņš, Guntis and Skadiņa, Inguna and Grūzītis, Normunds and Saulīte, Baiba",
title = "Evaluating Open-Source {LLMs} in Low-Resource Languages: Insights from {Latvian} High School Exams",
booktitle = "{NLP4DH} 2024 - 4th International Conference on Natural Language Processing for Digital Humanities, Proceedings of the Conference",
editor = "Hamalainen, Mika and Ohman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
year = "2024",
pages = "289--293",
publisher = "Association for Computational Linguistics",
address = "United States",
doi = "10.18653/v1/2024.nlp4dh-1.28",
isbn = "979-889176181-0",
language = "English",
note = "Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics.",
abstract = "The latest large language models (LLM) have significantly advanced natural language processing (NLP) capabilities across various tasks. However, their performance in low-resource languages, such as Latvian with 1.5 million native speakers, remains substantially underexplored due to both limited training data and the absence of comprehensive evaluation benchmarks. This study addresses this gap by conducting a systematic assessment of prominent open-source LLMs on natural language understanding (NLU) and natural language generation (NLG) tasks in Latvian. We utilize standardized high school centralized graduation exams as a benchmark dataset, offering relatable and diverse evaluation scenarios that encompass multiple-choice questions and complex text analysis tasks. Our experimental setup involves testing models from the leading LLM families, including Llama, Qwen, Gemma, and Mistral, with OpenAI's GPT-4 serving as a performance reference. The results reveal that certain open-source models demonstrate competitive performance in NLU tasks, narrowing the gap with GPT-4. However, all models exhibit notable deficiencies in NLG tasks, specifically in generating coherent and contextually appropriate text analyses, highlighting persistent challenges in NLG for low-resource languages. These findings contribute to efforts to develop robust multilingual benchmarks and to improve LLM performance in diverse linguistic contexts.",
}