@inproceedings{de667fb7d42c4d78973f26c6ea4cf42c,
title = "MPIrigen: MPI Code Generation through Domain-Specific Language Models",
abstract = "The imperative need to scale computation across numerous nodes highlights the significance of efficient parallel computing, particularly in the realm of Message Passing Interface (MPI) integration. While MPI serves as a cornerstone for large-scale parallelism, its seamless integration into codebases, especially concerning domain decomposition, has proven challenging. Static tools aimed at addressing this challenge have exhibited limited effectiveness and scalability. On the other hand, contemporary language models designed for programming problems have demonstrated utility in parallel programming tasks such as OpenMP pragma generation. However, the challenging parallel programming task of generating MPI-based parallel programs has remained unexplored. This study first investigates the performance of state-of-the-art language models in generating MPI-based parallel programs. Findings reveal that widely used models such as GPT-3.5 and PolyCoder (a specialized multi-lingual code model) exhibit notable performance degradation when generating MPI-based programs compared to general-purpose programs. In contrast, domain-specific models such as MonoCoder, which are pre-trained on the MPI-related programming languages C and C++, outperform larger models. Subsequently, we introduce a dedicated downstream task of MPI-based program generation by fine-tuning MonoCoder on HPCorpusMPI. We call the resulting model MPIrigen. We propose an innovative preprocessing step in which completion is performed only after the whole code has been observed, thus enabling better completion with a wider context. Comparative analysis against GPT-3.5 zero-shot performance, using a novel HPC-oriented evaluation method, demonstrates that MPIrigen excels in generating accurate MPI function calls. The success of this tailored solution underscores the importance of domain-specific fine-tuning in optimizing language models for parallel computing code generation, paving the way for a new generation of automatic parallelization tools. The sources of this work are available at our GitHub MPIrigen repository.",
keywords = "AI, LLM, MPI, code generation, domain decomposition, transformer",
author = "Nadav Schneider and Niranjan Hasabnis and Vo, {Vy A.} and Tal Kadosh and Neva Krien and Mihai Capota and Guy Tamir and Willke, {Theodore L.} and Nesreen Ahmed and Yuval Pinter and Timothy Mattson and Gal Oren",
note = "Publisher Copyright: {\textcopyright} 2024 is held by the owner/author(s).; 2024 Workshop on AI For Systems, AI4Sys 2024; Conference date: 03-06-2024",
year = "2024",
month = jun,
day = "3",
doi = "10.1145/3660605.3660944",
language = "American English",
series = "AI4Sys 2024 - Proceedings of the 2024 Workshop on AI For Systems, Part of: HPDC 2024 - 33rd International Symposium on High-Performance Parallel and Distributed Computing",
pages = "1--6",
booktitle = "AI4Sys 2024 - Proceedings of the 2024 Workshop on AI For Systems",
}