% Conference paper in the LREC 2018 proceedings (Miyazaki, Japan, 7-12 May 2018).
@inproceedings{b2b181656ebf4ef390d5fc5ac3adb80d,
  author    = {Salimbajevs, Askars},
  title     = {Creating {Lithuanian} and {Latvian} Speech Corpora from Inaccurately Annotated {Web} Data},
  booktitle = {11th International Conference on Language Resources and Evaluation ({LREC} 2018), May 7--12, 2018, Miyazaki, Japan},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu},
  publisher = {European Language Resources Association ({ELRA})},
  year      = {2018},
  month     = may,
  pages     = {2871--2875},
  isbn      = {979-10-95546-00-9},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} LREC 2018 - 11th International Conference on Language Resources and Evaluation. All rights reserved.; 11th International Conference on Language Resources and Evaluation, LREC 2018 ; Conference date: 07-05-2018 Through 12-05-2018},
  abstract  = {This paper describes the method that was used to produce additional acoustic model training data for the less-resourced languages of Lithuanian and Latvian. The method uses existing baseline speech recognition systems for Latvian and Lithuanian to align audio data from the Web with imprecise non-normalised transcripts. From 690 hours of Web data (300h for Latvian, 390h for Lithuanian), we have created additional 378 hours of training data (186h for Latvian and 192 for Lithuanian). Combining this additional data with baseline training data allowed to significantly improve word error rate for Lithuanian from 40\% to 23\%. Word error rate for the Latvian system was improved from 19\% to 17\%.},
}