@inproceedings{b9920741e6f5439bb5c515cfc851c912,
title = "Interference-Aware Latency Prediction With Kernels For Deep Neural Network",
abstract = "With the growing popularity of artificial intelligence applications, deep neural network (DNN) inference workloads are becoming increasingly common on cloud servers. To improve GPU utilization, a GPU executes multiple workloads simultaneously, which inevitably leads to resource contention and increased inference latency. We propose a kernel-based latency prediction method that predicts latency more accurately when multiple workloads interfere with one another. The method decomposes DNN inference into kernels and uses each kernel's parameters to predict its latency. It estimates the impact of interference on each model from the amount of data exchanged among the L1 cache, L2 cache, and GPU memory during that model's execution. We conduct experiments on popular models. The results show that, compared with the state-of-the-art multi-model coexistence prediction method, our method reduces the average error by 52% when predicting the latency of a single model, and by 62%, 51%, and 58% when predicting the co-location of two, three, and four models, respectively.",
keywords = "DNN, Deep learning, GPU, Kernel, Latency prediction, MPS",
author = "Huang, {Pei Jie} and Xiufeng Sui and Dawei Liu and Liyue Zhu",
note = "Publisher Copyright: {\textcopyright} 2022 IEEE. 4th International Academic Exchange Conference on Science and Technology Innovation, IAECST 2022; Conference date: 9--11 December 2022",
year = "2022",
doi = "10.1109/IAECST57965.2022.10062171",
language = "English",
series = "2022 4th International Academic Exchange Conference on Science and Technology Innovation, IAECST 2022",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "1232--1238",
booktitle = "2022 4th International Academic Exchange Conference on Science and Technology Innovation, IAECST 2022",
address = "United States",
}