% Conference paper from ACL 2024 (long papers). The citation key is the
% exporter-generated identifier and is kept unchanged so existing citations
% still resolve.
% NOTE(review): the export had booktitle "Long Papers"; the full proceedings
% title below is reconstructed from the note field and the DOI -- confirm the
% exact wording against the ACL Anthology record.
% NOTE(review): address is the value supplied by the exporter (publisher
% country), not the conference venue -- confirm or replace with a city.
@inproceedings{cf00d1afeea94ff7b59f5dc3bc2f9e1a,
  author    = {Yuan, Peiwen and Feng, Shaoxiong and Li, Yiwei and Wang, Xinglin and Pan, Boyuan and Wang, Heda and Hu, Yao and Li, Kan},
  title     = {{BatchEval}: Towards Human-like Text Evaluation},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  editor    = {Ku, Lun-Wei and Martins, Andre F. T. and Srikumar, Vivek},
  series    = {Proceedings of the Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  pages     = {15940--15958},
  year      = {2024},
  month     = aug,
  doi       = {10.18653/v1/2024.acl-long.846},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics.; 62nd Annual Meeting of the Association for Computational Linguistics, ACL 2024 ; Conference date: 11-08-2024 Through 16-08-2024},
  abstract  = {Significant progress has been made in automatic text evaluation with the introduction of large language models (LLMs) as evaluators. However, current sample-wise evaluation paradigm suffers from the following issues: (1) Sensitive to prompt design; (2) Poor resistance to context noise; (3) Inferior ensemble performance with static reference. Inspired by the fact that humans treat both criterion definition and inter sample comparison as references for evaluation, we propose BATCHEVAL, a paradigm that conducts batch-wise evaluation iteratively to alleviate the above problems. We explore variants under this paradigm and confirm the optimal settings are two stage procedure with heterogeneous batch composition strategy and decimal scoring format. Comprehensive experiments across 3 LLMs on 4 text evaluation tasks demonstrate that BATCHEVAL outperforms state-of-the-art methods by 10.5\% on Pearson correlations with only 64\% API cost on average. Further analyses have verified the robustness, generalization, and working mechanism of BATCHEVAL.},
}