% Conference paper in the ICA3PP 2015 proceedings (Lecture Notes in Computer Science).
% Cleaned from an automatic export:
%   - names are stored in the unambiguous "Last, First" form;
%   - the series field holds only the series name (indexer boilerplate removed);
%   - export metadata (copyright line, conference dates) lives in annote, which
%     standard styles do not print, instead of note, which they do.
% NOTE(review): address holds a country, not the publisher's city, and the copyright
%   line names a Switzerland-based Springer imprint -- confirm publisher/address
%   against the printed volume before relying on them.
% NOTE(review): no series volume number is recorded -- add volume once verified.
@inproceedings{5c690e479ba6430b9392af9085b6e0bc,
  author    = {Wang, Yizhuo and Ji, Weixing and Chen, Xu and Hu, Sensen},
  title     = {Task parallel implementation of matrix multiplication on multi-socket multi-core architectures},
  booktitle = {Algorithms and Architectures for Parallel Processing -- 15th International Conference, {ICA3PP} 2015, Proceedings},
  editor    = {Wang, Guojun and Perez, {Gregorio Martinez} and Zomaya, Albert and Li, Kenli},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  address   = {Germany},
  year      = {2015},
  pages     = {93--104},
  doi       = {10.1007/978-3-319-27137-8_8},
  isbn      = {9783319271361},
  language  = {English},
  keywords  = {Fast algorithms, Matrix multiplications, Multi-socket, Winograd, Work-stealing},
  abstract  = {Matrix multiplication is a very important computation kernel in many science and engineering applications. This paper presents a parallel implementation framework for dense matrix multiplication on multi-socket multi-core architectures. Our framework first partitions the computation between the multi-core processors. Then a hybrid matrix multiplication algorithm is used on each processor, which combines the Winograd algorithm and the classical algorithm. In addition, a hierarchical work-stealing scheme is applied to achieve dynamic load balancing and enforce data locality in our framework. Performance experiments on two platforms show that our implementation gets significant performance gains compared with the state-of-the-art implementations.},
  annote    = {Publisher Copyright: {\textcopyright} Springer International Publishing Switzerland 2015.; 15th International Conference on Algorithms and Architectures for Parallel Processing, ICA3PP 2015 ; Conference date: 18-11-2015 Through 20-11-2015},
}