% Conference paper introducing the MultiLeg dataset (LREC-COLING 2024 main proceedings).
% Cleaned from a portal export: names are in "Last, First" form, accents use
% LaTeX escapes, and the proceedings title lives in booktitle only (the export
% had duplicated it into series, which is reserved for a named book series).
% NOTE(review): the export truncated the second author's surname to "Skadin";
% it is restored here as Skadina with n-cedilla -- confirm against the paper.
@inproceedings{2c77a794ffd4427a83e5acc1d8a4efc2,
  author    = {V{\={\i}}ksna, Rinalds and Skadi{\c{n}}a, Inguna and Rozis, Roberts},
  title     = {{MultiLeg}: Dataset for Text Sanitisation in Less-resourced Languages},
  booktitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, {LREC-COLING} 2024 -- Main Conference Proceedings},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  pages     = {11776--11782},
  year      = {2024},
  isbn      = {978-249381410-4},
  language  = {English},
  keywords  = {legal domain, multilingual, named entities, text sanitization},
  abstract  = {Text sanitization is the task of detecting and removing personal information from the text. While it has been well-studied in monolingual settings, today, there is also a need for multilingual text sanitization. In this paper, we introduce MultiLeg: a parallel, multilingual named entity (NE) dataset consisting of documents from the Court of Justice of the European Union annotated with semantic categories suitable for text sanitization. The dataset is available in 8 languages, and it contains 3082 parallel text segments for each language. We also show that the pseudonymized dataset remains useful for downstream tasks.},
  note      = {Publisher Copyright: {\textcopyright} 2024 ELRA Language Resource Association: CC BY-NC 4.0.},
}