% Conference paper (WACV 2024): "PromptonomyViT" -- multi-task prompt learning
% for video transformers using synthetic scene data.
% The citation key is the exporter-generated identifier; keep it stable so
% existing \cite commands continue to resolve.
% abstract, keywords, language and day are not standard BibTeX fields; they are
% retained as metadata and are ignored by styles that do not know them.
@inproceedings{2970fa5e09ad46b590c60fd23ce0196d,
  author    = {Herzig, Roei and Abramovich, Ofir and {Ben Avraham}, Elad and Arbelle, Assaf and Karlinsky, Leonid and Shamir, Ariel and Darrell, Trevor and Globerson, Amir},
  title     = {{PromptonomyViT}: Multi-Task Prompt Learning Improves Video Transformers using Synthetic Scene Data},
  booktitle = {Proceedings - 2024 {IEEE} Winter Conference on Applications of Computer Vision, {WACV} 2024},
  pages     = {6789--6801},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2024},
  month     = jan,
  day       = {3},
  doi       = {10.1109/WACV57701.2024.00666},
  language  = {English},
  keywords  = {Algorithms, Image recognition and understanding, Video recognition and understanding},
  note      = {Publisher Copyright: {\textcopyright} 2024 IEEE.; 2024 IEEE Winter Conference on Applications of Computer Vision, WACV 2024; Conference date: 04-01-2024 Through 08-01-2024},
  abstract  = {Action recognition models have achieved impressive results by incorporating scene-level annotations, such as objects, their relations, 3D structure, and more. However, obtaining annotations of scene structure for videos requires a significant amount of effort to gather and annotate, making these methods expensive to train. In contrast, synthetic datasets generated by graphics engines provide powerful alternatives for generating scene-level annotations across multiple tasks. In this work, we propose an approach to leverage synthetic scene data for improving video understanding. We present a multi-task prompt learning approach for video transformers, where a shared video transformer backbone is enhanced by a small set of specialized parameters for each task. Specifically, we add a set of {"}task prompts{"}, each corresponding to a different task, and let each prompt predict task-related annotations. This design allows the model to capture information shared among synthetic scene tasks as well as information shared between synthetic scene tasks and a real video downstream task throughout the entire network. We refer to this approach as {"}Promptonomy{"}, since the prompts model task-related structure. We propose the PromptonomyViT model (PViT), a video transformer that incorporates various types of scene-level information from synthetic data using the {"}Promptonomy{"} approach. PViT shows strong performance improvements on multiple video understanding tasks and datasets. Project page: https://ofir1080.github.io/PromptonomyViT},
}