% Lemma-selection paper of the DLMDM project (volume 16 of the series, pp. 225--237).
% Non-ASCII letters are written as LaTeX accent escapes so the entry sorts and
% cases correctly under classic 8-bit BibTeX as well as under Biber.
@inproceedings{3c74b837962b458882dc97394905039f,
  author    = {Urbanovi{\v c}a, Inta and Balmane, Vanesa},
  title     = {Latvie{\v s}u valodas morf{\=e}mu un v{\=a}rddarin{\=a}{\v s}anas mode{\c l}u datub{\=a}zes lemmu atlase},
  booktitle = {Valoda: noz{\=\i}me un forma},
  editor    = {Kalna{\v c}a, Andra and Lokmane, Ilze and Horiguchi, Daiki},
  series    = {Valoda: Noz{\=\i}me un Forma},
  volume    = {16},
  pages     = {225--237},
  year      = {2025},
  isbn      = {978-9934-36-493-8},
  doi       = {10.22364/vnf.16.16},
  language  = {Latvie{\v s}u},
  keywords  = {datub{\=a}zes izstr{\=a}de, lemmu atlase, database{\textquoteright}s design, database{\textquoteright}s startup data, lemma selection, word formation, parallelism of lemmas, print error},
  abstract  = {The article offers an overview of the first working stage for the project ``Database of Latvian Morphemes and Derivational Models (DLMDM)'' (No. LZP-2022/1-0013), during which a set of the lemmas database was created. The register of the lemmas was made from The Balanced Corpus of Modern Latvian, dated to 2018. Originally, 165 090 lemmas had been obtained from corpus texts, and at the end of data revision, 77 124 lemmas were declared valid. The analysis of the lemmas took place in three steps: step 1 -- automated selection of the lemmas database, step 2 -- manual processing of the lemmas database, step 3 -- one more automated checking of the lemmas database. A total of 30 009 lemmas (steps 1 and 3) were invalidated during the automated selection of the lemmas database. These were words that contained characters or symbols that were not letters of the Latvian alphabet, as well as various duplicate shapes. During the manual processing of the lemmas database, 78 518 lemmas were selected and tested for spelling and usage context. At this step, 57 957 lemmas were declared invalid -- abbreviations, various words that do not exist in Latvian, etc. Other selected lemmas (total -- 20 561) were divided into three groups: (1) lemmas that have been corrected, (2) lemmas that have been left with parallel forms, and (3) lemmas that have not been corrected. These lemmas were included in the database. The final lemmas amount is 77 124, but this number is variable because the process of data revision still proceeds during the next steps of the project.},
  note      = {Publisher Copyright: {\textcopyright} 2025 University of Latvia. All rights reserved.},
}