@inproceedings{69714cf7409c418f97f5ef5389922b4c,
  title         = {Partitioned Scheduling and Parallelism Assignment for Real-Time {DNN} Inference Tasks on {Multi-TPU}},
  abstract      = {Pipelining on Edge Tensor Processing Units (TPUs) optimizes the deep neural network (DNN) inference by breaking it down into multiple stages processed concurrently on multiple accelerators. Such DNN inference tasks can be modeled as sporadic non-preemptive gangs with execution times that vary with their parallelism levels. This paper proposes a strict partitioning strategy for deploying DNN inferences in real-time systems. The strategy determines tasks' parallelism levels and assigns tasks to disjoint processor partitions. Configuring the tasks in the same partition with a uniform parallelism level avoids scheduling anomalies and enables schedulability verification using well-understood uniprocessor analyses. Evaluation using real-world Edge TPU benchmarks demonstrated that the proposed method achieves a higher schedulability ratio than state-of-the-art gang scheduling techniques.},
  author        = {Sun, Binqi and Kloda, Tomasz and Wu, Chu Ge and Caccamo, Marco},
  note          = {Publisher Copyright: {\textcopyright} 2024 Copyright is held by the owner/author(s). Publication rights licensed to ACM.; 61st ACM/IEEE Design Automation Conference, DAC 2024 ; Conference date: 23-06-2024 Through 27-06-2024},
  year          = {2024},
  month         = nov,
  day           = {7},
  doi           = {10.1145/3649329.3655979},
  language      = {English},
  series        = {Proceedings - Design Automation Conference},
  publisher     = {Institute of Electrical and Electronics Engineers Inc.},
  booktitle     = {Proceedings of the 61st {ACM/IEEE} Design Automation Conference, {DAC} 2024},
  address       = {United States},
  internal-note = {NOTE(review): address holds a country, not the publisher's city -- confirm the correct city or drop the field},
}