% Conference paper from the Findings of EMNLP 2025 volume; the url field points to its ACL Anthology page.
@inproceedings{liu2025data,
  author    = {Liu, Zoey and
               Jasbi, Masoud and
               Grant, Christan and
               Sagae, Kenji and
               Prud{'}hommeaux, Emily},
  title     = {What data should {I} include in my {POS} tagging training set?},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  publisher = {Association for Computational Linguistics},
  address   = {Suzhou, China},
  month     = nov,
  year      = {2025},
  pages     = {8439--8455},
  isbn      = {979-8-89176-335-7},
  url       = {https://aclanthology.org/2025.findings-emnlp.448/},
  abstract  = {Building an NLP training set for understudied languages, including Indigenous and endangered languages, often faces challenges due to varying degrees of resource limitations in the speaker communities. What are some reasonable approaches for training set construction in these cases? We address this question with POS tagging as the test case. Although many might consider POS tagging ``a solved problem'', it remains a crucial task for descriptive linguistics and language documentation and requires laborious manual annotation. Drawing data from 12 language families, we compare in-context learning, active learning (AL), and random sampling. Our results suggest: (1) for communities whose language data can be ethically shared with an API, using only 1,000 randomly sampled tokens as prompt examples, the proprietary GPT-4.1-mini can deliver desirable performance (F1{\ensuremath{>}}0.83) on par with that from a training set of thousands of tokens in AL iterations; (2) in cases where communities prefer not to share data, 4,500-5,500 tokens selected from AL can yield reasonable results at a pace statistically significantly faster than random sampling, evidenced by growth curve modeling.},
}