@inproceedings{fc96c2c610514c4c90943fb0fa9177d1,
  title     = {{LAGA}: Lagged {AllReduce} with Gradient Accumulation for Minimal Idle Time},
  abstract  = {Training neural networks on large distributed clusters has become a common practice due to the size and complexity of recent neural networks. These high-end clusters of advanced computational devices cooperate together to reduce the neural network training duration. In practice, training at linear scalability with respect to the number of devices is difficult, due to communication overheads. These communication overheads often cause long idle times for the computational devices. In this paper, we propose LAGA (Lagged AllReduce with Gradient Accumulation): a hybrid technique that combines the best of synchronous and asynchronous approaches, that scales linearly. LAGA reduces the device idle time by accumulating locally computed gradients and executing the communications in the background. We demonstrate the effectiveness of LAGA in both final accuracy and scalability on the ImageNet dataset, where LAGA achieves a speedup of up to 2.96x and 5.24x less idle time. Finally, we provide convergence guarantees for LAGA under the non-convex setting.},
  keywords  = {neural networks, non-convex, optimization},
  author    = {Hakimi, Ido and Aviv, Rotem Zamir and Levy, Kfir Y. and Schuster, Assaf},
  note      = {Publisher Copyright: {\textcopyright} 2021 IEEE.; 21st IEEE International Conference on Data Mining, ICDM 2021 ; Conference date: 07-12-2021 Through 10-12-2021},
  year      = {2021},
  doi       = {10.1109/ICDM51629.2021.00027},
  language  = {English},
  series    = {Proceedings - IEEE International Conference on Data Mining, ICDM},
  pages     = {171--180},
  publisher = {IEEE},
  editor    = {Bailey, James and Miettinen, Pauli and Koh, Yun Sing and Tao, Dacheng and Wu, Xindong},
  booktitle = {Proceedings - 21st IEEE International Conference on Data Mining, ICDM 2021},
}