% Conference paper in the DATE 2013 proceedings (citation key kept as exported so existing \cite commands keep working).
@inproceedings{df05bbdc7cfe47bcb2bf4b6b37607f48,
  author    = {Wang, Yizhuo and Ji, Weixing and Shi, Feng and Zuo, Qi},
  title     = {A work-stealing scheduling framework supporting fault tolerance},
  booktitle = {Proceedings - Design, Automation and Test in Europe, {DATE} 2013},
  series    = {Proceedings - Design, Automation and Test in Europe, {DATE}},
  year      = {2013},
  pages     = {695--700},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  doi       = {10.7873/date.2013.150},
  isbn      = {9783981537000},
  language  = {English},
  keywords  = {Cluster, Fault tolerance, Multicore, Work-stealing},
  abstract  = {Fault tolerance and load balancing are critical points for executing long-running parallel applications on multicore clusters. This paper addresses both fault tolerance and load balancing on multicore clusters by presenting a novel work-stealing task scheduling framework which supports hardware fault tolerance. In this framework, both transient and permanent faults are detected and recovered at task granularity. We incorporate task-based fault detection and recovery mechanisms into a hierarchical work-stealing scheme to establish the framework. This framework provides low-overhead fault-tolerance and optimal load balancing by fully exploiting task parallelism.},
  note      = {16th Design, Automation and Test in Europe Conference and Exhibition, DATE 2013 ; Conference date: 18-03-2013 Through 22-03-2013},
}