@misc{noauthor_frequently_nodate,
  title = {Frequently {Asked} {Questions} {\textbar} {TOP500}},
  url = {https://top500.org/resources/frequently-asked-questions/},
  urldate = {2025-12-08},
}

@misc{noauthor_history_nodate,
  title = {The {History} of the {Development} of {Parallel} {Computing} {\textbar} {PARALLEL}.{RU} - {Information} and {Analytical} {Center} for {Parallel} {Computing}},
  url = {https://parallel.ru/history/wilson_history.html},
  urldate = {2025-12-08},
}

@misc{noauthor_mpi_nodate,
  title = {{MPI} {Documents}},
  url = {https://www.mpi-forum.org/docs/},
  language = {en},
  urldate = {2025-12-08},
}

@misc{noauthor_parallel_nodate,
  title = {Parallel {Programming} - {HPC} {Wiki}},
  url = {https://hpc-wiki.info/hpc/Parallel_Programming},
  urldate = {2025-12-08},
}

@misc{noauthor_cuda_nodate,
  title = {{CUDA} {Toolkit} {Archive}},
  url = {https://developer.nvidia.com/cuda-toolkit-archive},
  language = {en},
  urldate = {2025-12-08},
  journal = {NVIDIA Developer},
}

@misc{noauthor_opencl_nodate,
  title = {{OpenCL} - an overview {\textbar} {ScienceDirect} {Topics}},
  url = {https://www.sciencedirect.com/topics/computer-science/opencl},
  urldate = {2025-12-08},
}

@incollection{balaji_chapel_2015,
  title = {Chapel},
  isbn = {9780262332248},
  url = {https://direct.mit.edu/books/book/4070/chapter/168849/Chapel},
  language = {en},
  urldate = {2025-12-08},
  booktitle = {Programming {Models} for {Parallel} {Computing}},
  publisher = {The MIT Press},
  author = {Chamberlain, Bradford L.},
  editor = {Balaji, Pavan},
  month = nov,
  year = {2015},
  doi = {10.7551/mitpress/9486.003.0008},
  pages = {129--160},
}

@misc{noauthor_fortran_nodate,
  title = {Fortran {\textbar} {IBM}},
  url = {https://www.ibm.com/history/fortran},
  abstract = {The world’s first programming language standard opened the door to modern computing.},
  language = {en},
  urldate = {2025-12-08},
}

@misc{noauthor_why_nodate,
  title = {Why {ALGOL} was an important programming language?},
  url = {https://bulldogjob.com/readme/why-algol-was-an-important-programming-language},
  abstract = {ALGOL is a more interesting language than you think, both in terms of its story and legacy.},
  language = {en},
  urldate = {2025-12-08},
}

@incollection{wexelblat_history_1978,
  address = {New York, NY, USA},
  title = {History of {LISP}},
  copyright = {https://www.acm.org/publications/policies/copyright\_policy\#Background},
  isbn = {9780127450407},
  url = {http://dl.acm.org/doi/10.1145/800025.1198360},
  language = {en},
  urldate = {2025-12-08},
  booktitle = {History of programming languages},
  publisher = {ACM},
  author = {McCarthy, John},
  editor = {Wexelblat, Richard L.},
  month = jun,
  year = {1978},
  doi = {10.1145/800025.1198360},
  pages = {173--185},
}

@book{hecht_flow_1977,
  address = {New York, NY},
  series = {Programming languages series},
  title = {Flow analysis of computer programs},
  isbn = {9780444002105 9780444002167},
  language = {eng},
  number = {5},
  publisher = {North-Holland},
  author = {Hecht, Matthew S.},
  year = {1977},
}

@misc{noauthor__nodate,
  title = {{Cray}-1 {Architecture} {\textbar} {PARALLEL}.{RU} - {Information} and {Analytical} {Center} for {Parallel} {Computing}},
  url = {https://parallel.ru/history/cray1.html},
  urldate = {2025-12-08},
}

@article{flynn_very_1966,
  title = {Very high-speed computing systems},
  volume = {54},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  issn = {0018-9219},
  url = {http://ieeexplore.ieee.org/document/1447203/},
  doi = {10.1109/PROC.1966.5273},
  number = {12},
  urldate = {2025-12-08},
  journal = {Proceedings of the IEEE},
  author = {Flynn, M.J.},
  year = {1966},
  pages = {1901--1909},
}
@article{chang_support_2004,
  title = {Support and optimization for parallel sparse programs with array intrinsics of {Fortran} 90},
  volume = {30},
  copyright = {https://www.elsevier.com/tdm/userlicense/1.0/},
  issn = {01678191},
  url = {https://linkinghub.elsevier.com/retrieve/pii/S0167819104000286},
  doi = {10.1016/j.parco.2004.02.004},
  language = {en},
  number = {4},
  urldate = {2025-12-08},
  journal = {Parallel Computing},
  author = {Chang, Rong-Guey and Chuang, Tyng-Ruey and Lee, Jenq Kuen},
  month = apr,
  year = {2004},
  pages = {527--550},
}

@article{allen_automatic_1987,
  title = {Automatic translation of {FORTRAN} programs to vector form},
  volume = {9},
  issn = {0164-0925, 1558-4593},
  url = {https://dl.acm.org/doi/10.1145/29873.29875},
  doi = {10.1145/29873.29875},
  abstract = {The recent success of vector computers such as the Cray-1 and array processors such as those manufactured by Floating Point Systems has increased interest in making vector operations available to the FORTRAN programmer. The FORTRAN standards committee is currently considering a successor to FORTRAN 77, usually called FORTRAN 8x, that will permit the programmer to explicitly specify vector and array operations. Although FORTRAN 8x will make it convenient to specify explicit vector operations in new programs, it does little for existing code. In order to benefit from the power of vector hardware, existing programs will need to be rewritten in some language (presumably FORTRAN 8x) that permits the explicit specification of vector operations. One way to avoid a massive manual recoding effort is to provide a translator that discovers the parallelism implicit in a FORTRAN program and automatically rewrites that program in FORTRAN 8x. Such a translation from FORTRAN to FORTRAN 8x is not straightforward because FORTRAN DO loops are not always semantically equivalent to the corresponding FORTRAN 8x parallel operation. The semantic difference between these two constructs is precisely captured by the concept of dependence. A translation from FORTRAN to FORTRAN 8x preserves the semantics of the original program if it preserves the dependences in that program. The theoretical background is developed here for employing data dependence to convert FORTRAN programs to parallel form. Dependence is defined and characterized in terms of the conditions that give rise to it; accurate tests to determine dependence are presented; and transformations that use dependence to uncover additional parallelism are discussed.},
  language = {en},
  number = {4},
  urldate = {2025-12-08},
  journal = {ACM Transactions on Programming Languages and Systems},
  author = {Allen, Randy and Kennedy, Ken},
  month = oct,
  year = {1987},
  pages = {491--542},
}

@inproceedings{mowry_design_1992,
  address = {Boston, Massachusetts, USA},
  title = {Design and evaluation of a compiler algorithm for prefetching},
  isbn = {9780897915342},
  url = {https://dl.acm.org/doi/10.1145/143365.143488},
  doi = {10.1145/143365.143488},
  language = {en},
  urldate = {2025-12-08},
  booktitle = {Proceedings of the fifth international conference on {Architectural} support for programming languages and operating systems},
  publisher = {ACM},
  author = {Mowry, Todd C. and Lam, Monica S. and Gupta, Anoop},
  month = sep,
  year = {1992},
  pages = {62--73},
}
@misc{morgan_compiling_2018,
  title = {Compiling {History} {To} {Understand} {The} {Future}},
  url = {https://www.nextplatform.com/2018/11/02/compiling-history-to-understand-the-future/},
  abstract = {If you want to understand where we are going with computer architectures and the compilers that drive them, it is instructive to look at how compilers},
  language = {en-US},
  urldate = {2025-12-08},
  journal = {The Next Platform},
  author = {Morgan, Timothy Prickett},
  month = nov,
  year = {2018},
}

@misc{noauthor_vectorization_nodate,
  title = {Vectorization {Directives} {\textbar} {Cray} {Fortran} {Reference} {Manual} 8.{7A} {S}-3901},
  url = {https://support.hpe.com/hpesc/public/docDisplay?docId=a00113909en_us&page=Vectorization_Directives.html&docLocale=en_US},
  urldate = {2025-12-08},
}

@article{hoare_communicating_1978,
  title = {Communicating sequential processes},
  volume = {21},
  issn = {0001-0782, 1557-7317},
  url = {https://dl.acm.org/doi/10.1145/359576.359585},
  doi = {10.1145/359576.359585},
  abstract = {This paper suggests that input and output are basic primitives of programming and that parallel composition of communicating sequential processes is a fundamental program structuring method. When combined with a development of Dijkstra's guarded command, these concepts are surprisingly versatile. Their use is illustrated by sample solutions of a variety of familiar programming exercises.},
  language = {en},
  number = {8},
  urldate = {2025-12-08},
  journal = {Communications of the ACM},
  author = {Hoare, C. A. R.},
  month = aug,
  year = {1978},
  pages = {666--677},
}

@article{nickolls_gpu_2010,
  title = {The {GPU} {Computing} {Era}},
  volume = {30},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  issn = {0272-1732},
  url = {http://ieeexplore.ieee.org/document/5446251/},
  doi = {10.1109/MM.2010.41},
  number = {2},
  urldate = {2025-12-08},
  journal = {IEEE Micro},
  author = {Nickolls, John and Dally, William J.},
  month = mar,
  year = {2010},
  pages = {56--69},
}

@article{nickolls_scalable_2008,
  title = {Scalable {Parallel} {Programming} with {CUDA}: {Is} {CUDA} the parallel programming model that application developers have been waiting for?},
  volume = {6},
  issn = {1542-7730, 1542-7749},
  shorttitle = {Scalable {Parallel} {Programming} with {CUDA}},
  url = {https://dl.acm.org/doi/10.1145/1365490.1365500},
  doi = {10.1145/1365490.1365500},
  abstract = {The advent of multicore CPUs and manycore GPUs means that mainstream processor chips are now parallel systems. Furthermore, their parallelism continues to scale with Moore’s law. The challenge is to develop mainstream application software that transparently scales its parallelism to leverage the increasing number of processor cores, much as 3D graphics applications transparently scale their parallelism to manycore GPUs with widely varying numbers of cores.},
  language = {en},
  number = {2},
  urldate = {2025-12-08},
  journal = {Queue},
  author = {Nickolls, John and Buck, Ian and Garland, Michael and Skadron, Kevin},
  month = mar,
  year = {2008},
  pages = {40--53},
}
@article{garland_parallel_2008,
  title = {Parallel {Computing} {Experiences} with {CUDA}},
  volume = {28},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  issn = {0272-1732},
  url = {http://ieeexplore.ieee.org/document/4626815/},
  doi = {10.1109/MM.2008.57},
  number = {4},
  urldate = {2025-12-08},
  journal = {IEEE Micro},
  author = {Garland, Michael and Le Grand, Scott and Nickolls, John and Anderson, Joshua and Hardwick, Jim and Morton, Scott and Phillips, Everett and Zhang, Yao and Volkov, Vasily},
  month = jul,
  year = {2008},
  pages = {13--27},
}

@inproceedings{munshi_opencl_2009,
  address = {Stanford, CA, USA},
  title = {The {OpenCL} specification},
  copyright = {https://doi.org/10.15223/policy-029},
  isbn = {9781467388733},
  url = {https://ieeexplore.ieee.org/document/7478342/},
  doi = {10.1109/HOTCHIPS.2009.7478342},
  urldate = {2025-12-08},
  booktitle = {2009 {IEEE} {Hot} {Chips} 21 {Symposium} ({HCS})},
  publisher = {IEEE},
  author = {Munshi, Aaftab},
  month = aug,
  year = {2009},
  pages = {1--314},
}

@misc{abadi_tensorflow:_2016,
  title = {{TensorFlow}: {A} system for large-scale machine learning},
  shorttitle = {{TensorFlow}},
  url = {http://arxiv.org/abs/1605.08695},
  doi = {10.48550/arXiv.1605.08695},
  abstract = {TensorFlow is a machine learning system that operates at large scale and in heterogeneous environments. TensorFlow uses dataflow graphs to represent computation, shared state, and the operations that mutate that state. It maps the nodes of a dataflow graph across many machines in a cluster, and within a machine across multiple computational devices, including multicore CPUs, general-purpose GPUs, and custom designed ASICs known as Tensor Processing Units (TPUs). This architecture gives flexibility to the application developer: whereas in previous "parameter server" designs the management of shared state is built into the system, TensorFlow enables developers to experiment with novel optimizations and training algorithms. TensorFlow supports a variety of applications, with particularly strong support for training and inference on deep neural networks. Several Google services use TensorFlow in production, we have released it as an open-source project, and it has become widely used for machine learning research. In this paper, we describe the TensorFlow dataflow model in contrast to existing systems, and demonstrate the compelling performance that TensorFlow achieves for several real-world applications.},
  urldate = {2025-12-08},
  publisher = {arXiv},
  author = {Abadi, Martín and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and Kudlur, Manjunath and Levenberg, Josh and Monga, Rajat and Moore, Sherry and Murray, Derek G. and Steiner, Benoit and Tucker, Paul and Vasudevan, Vijay and Warden, Pete and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
  month = may,
  year = {2016},
  note = {arXiv:1605.08695},
  keywords = {Computer Science - Distributed, Parallel, and Cluster Computing, Computer Science - Artificial Intelligence},
}
@misc{paszke_pytorch:_2019,
  title = {{PyTorch}: {An} {Imperative} {Style}, {High}-{Performance} {Deep} {Learning} {Library}},
  shorttitle = {{PyTorch}},
  url = {http://arxiv.org/abs/1912.01703},
  doi = {10.48550/arXiv.1912.01703},
  abstract = {Deep learning frameworks have often focused on either usability or speed, but not both. PyTorch is a machine learning library that shows that these two goals are in fact compatible: it provides an imperative and Pythonic programming style that supports code as a model, makes debugging easy and is consistent with other popular scientific computing libraries, while remaining efficient and supporting hardware accelerators such as GPUs. In this paper, we detail the principles that drove the implementation of PyTorch and how they are reflected in its architecture. We emphasize that every aspect of PyTorch is a regular Python program under the full control of its user. We also explain how the careful and pragmatic implementation of the key components of its runtime enables them to work together to achieve compelling performance. We demonstrate the efficiency of individual subsystems, as well as the overall speed of PyTorch on several common benchmarks.},
  urldate = {2025-12-08},
  publisher = {arXiv},
  author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Köpf, Andreas and Yang, Edward and DeVito, Zach and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  month = dec,
  year = {2019},
  note = {arXiv:1912.01703},
  keywords = {Computer Science - Machine Learning, Computer Science - Mathematical Software, Statistics - Machine Learning},
}

@misc{lattner_mlir:_2020,
  title = {{MLIR}: {A} {Compiler} {Infrastructure} for the {End} of {Moore}'s {Law}},
  shorttitle = {{MLIR}},
  url = {http://arxiv.org/abs/2002.11054},
  doi = {10.48550/arXiv.2002.11054},
  abstract = {This work presents MLIR, a novel approach to building reusable and extensible compiler infrastructure. MLIR aims to address software fragmentation, improve compilation for heterogeneous hardware, significantly reduce the cost of building domain specific compilers, and aid in connecting existing compilers together. MLIR facilitates the design and implementation of code generators, translators and optimizers at different levels of abstraction and also across application domains, hardware targets and execution environments. The contribution of this work includes (1) discussion of MLIR as a research artifact, built for extension and evolution, and identifying the challenges and opportunities posed by this novel design point in design, semantics, optimization specification, system, and engineering. (2) evaluation of MLIR as a generalized infrastructure that reduces the cost of building compilers -- describing diverse use-cases to show research and educational opportunities for future programming languages, compilers, execution environments, and computer architecture. The paper also presents the rationale for MLIR, its original design principles, structures and semantics.},
  urldate = {2025-12-08},
  publisher = {arXiv},
  author = {Lattner, Chris and Amini, Mehdi and Bondhugula, Uday and Cohen, Albert and Davis, Andy and Pienaar, Jacques and Riddle, River and Shpeisman, Tatiana and Vasilache, Nicolas and Zinenko, Oleksandr},
  month = mar,
  year = {2020},
  note = {arXiv:2002.11054},
  keywords = {Computer Science - Programming Languages, Computer Science - Machine Learning},
}
@misc{chen_tvm:_2018,
  title = {{TVM}: {An} {Automated} {End}-to-{End} {Optimizing} {Compiler} for {Deep} {Learning}},
  shorttitle = {{TVM}},
  url = {http://arxiv.org/abs/1802.04799},
  doi = {10.48550/arXiv.1802.04799},
  abstract = {There is an increasing need to bring machine learning to a wide diversity of hardware devices. Current frameworks rely on vendor-specific operator libraries and optimize for a narrow range of server-class GPUs. Deploying workloads to new platforms -- such as mobile phones, embedded devices, and accelerators (e.g., FPGAs, ASICs) -- requires significant manual effort. We propose TVM, a compiler that exposes graph-level and operator-level optimizations to provide performance portability to deep learning workloads across diverse hardware back-ends. TVM solves optimization challenges specific to deep learning, such as high-level operator fusion, mapping to arbitrary hardware primitives, and memory latency hiding. It also automates optimization of low-level programs to hardware characteristics by employing a novel, learning-based cost modeling method for rapid exploration of code optimizations. Experimental results show that TVM delivers performance across hardware back-ends that are competitive with state-of-the-art, hand-tuned libraries for low-power CPU, mobile GPU, and server-class GPUs. We also demonstrate TVM's ability to target new accelerator back-ends, such as the FPGA-based generic deep learning accelerator. The system is open sourced and in production use inside several major companies.},
  urldate = {2025-12-08},
  publisher = {arXiv},
  author = {Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Zheng, Lianmin and Yan, Eddie and Cowan, Meghan and Shen, Haichen and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and Guestrin, Carlos and Krishnamurthy, Arvind},
  month = oct,
  year = {2018},
  note = {arXiv:1802.04799},
  keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Programming Languages},
}
@inproceedings{ragan-kelley_halide:_2013,
  address = {Seattle, Washington, USA},
  title = {Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines},
  isbn = {9781450320146},
  shorttitle = {Halide},
  url = {https://dl.acm.org/doi/10.1145/2491956.2462176},
  doi = {10.1145/2491956.2462176},
  language = {en},
  urldate = {2025-12-08},
  booktitle = {Proceedings of the 34th {ACM} {SIGPLAN} {Conference} on {Programming} {Language} {Design} and {Implementation}},
  publisher = {ACM},
  author = {Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Frédo and Amarasinghe, Saman},
  month = jun,
  year = {2013},
  pages = {519--530},
}

@inproceedings{ansel_opentuner:_2014,
  address = {Edmonton, AB, Canada},
  title = {{OpenTuner}: an extensible framework for program autotuning},
  isbn = {9781450328098},
  shorttitle = {{OpenTuner}},
  url = {https://dl.acm.org/doi/10.1145/2628071.2628092},
  doi = {10.1145/2628071.2628092},
  language = {en},
  urldate = {2025-12-08},
  booktitle = {Proceedings of the 23rd international conference on {Parallel} architectures and compilation},
  publisher = {ACM},
  author = {Ansel, Jason and Kamil, Shoaib and Veeramachaneni, Kalyan and Ragan-Kelley, Jonathan and Bosboom, Jeffrey and O'Reilly, Una-May and Amarasinghe, Saman},
  month = aug,
  year = {2014},
  pages = {303--316},
}

@inproceedings{baghdadi_tiramisu:_2019,
  address = {Washington, DC, USA},
  title = {Tiramisu: {A} {Polyhedral} {Compiler} for {Expressing} {Fast} and {Portable} {Code}},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  isbn = {9781728114361},
  shorttitle = {Tiramisu},
  url = {https://ieeexplore.ieee.org/document/8661197/},
  doi = {10.1109/CGO.2019.8661197},
  urldate = {2025-12-08},
  booktitle = {2019 {IEEE}/{ACM} {International} {Symposium} on {Code} {Generation} and {Optimization} ({CGO})},
  publisher = {IEEE},
  author = {Baghdadi, Riyadh and Ray, Jessica and Romdhane, Malek Ben and Sozzo, Emanuele Del and Akkas, Abdurrahman and Zhang, Yunming and Suriana, Patricia and Kamil, Shoaib and Amarasinghe, Saman},
  month = feb,
  year = {2019},
  pages = {193--205},
}

@book{kuhn_structure_2009,
  address = {Chicago},
  edition = {3rd ed., reprint},
  title = {The structure of scientific revolutions},
  isbn = {9780226458083 9780226458076},
  language = {eng},
  publisher = {University of Chicago Press},
  author = {Kuhn, Thomas S.},
  year = {2009},
}
@book{reinders_data_2021,
  address = {New York},
  title = {Data parallel {C}++: mastering {DPC}++ for programming of heterogeneous systems using {C}++ and {SYCL}},
  isbn = {9781484255735},
  shorttitle = {Data parallel {C}++},
  language = {eng},
  publisher = {Apress Open},
  author = {Reinders, James and Ashbaugh, Ben and Brodman, James and Kinsner, Michael and Pennycook, John and Tian, Xinmin},
  year = {2021},
}

@article{mittal_survey_2015,
  title = {A {Survey} of {CPU}-{GPU} {Heterogeneous} {Computing} {Techniques}},
  volume = {47},
  issn = {0360-0300, 1557-7341},
  url = {https://dl.acm.org/doi/10.1145/2788396},
  doi = {10.1145/2788396},
  abstract = {As both CPUs and GPUs become employed in a wide range of applications, it has been acknowledged that both of these Processing Units (PUs) have their unique features and strengths and hence, CPU-GPU collaboration is inevitable to achieve high-performance computing. This has motivated a significant amount of research on heterogeneous computing techniques, along with the design of CPU-GPU fused chips and petascale heterogeneous supercomputers. In this article, we survey Heterogeneous Computing Techniques (HCTs) such as workload partitioning that enable utilizing both CPUs and GPUs to improve performance and/or energy efficiency. We review heterogeneous computing approaches at runtime, algorithm, programming, compiler, and application levels. Further, we review both discrete and fused CPU-GPU systems and discuss benchmark suites designed for evaluating Heterogeneous Computing Systems (HCSs). We believe that this article will provide insights into the workings and scope of applications of HCTs to researchers and motivate them to further harness the computational powers of CPUs and GPUs to achieve the goal of exascale performance.},
  language = {en},
  number = {4},
  urldate = {2025-12-08},
  journal = {ACM Computing Surveys},
  author = {Mittal, Sparsh and Vetter, Jeffrey S.},
  month = jul,
  year = {2015},
  pages = {1--35},
}

@book{kirk_programming_2017,
  address = {Amsterdam, Boston, Heidelberg},
  edition = {Third edition},
  title = {Programming massively parallel processors: a hands-on approach},
  isbn = {9780128119860},
  shorttitle = {Programming massively parallel processors},
  language = {eng},
  publisher = {Elsevier, Morgan Kaufmann},
  author = {Kirk, David and Hwu, Wen-mei W.},
  year = {2017},
}
@misc{jia_beyond_2018,
  title = {Beyond {Data} and {Model} {Parallelism} for {Deep} {Neural} {Networks}},
  url = {http://arxiv.org/abs/1807.05358},
  doi = {10.48550/arXiv.1807.05358},
  abstract = {The computational requirements for training deep neural networks (DNNs) have grown to the point that it is now standard practice to parallelize training. Existing deep learning systems commonly use data or model parallelism, but unfortunately, these strategies often result in suboptimal parallelization performance. In this paper, we define a more comprehensive search space of parallelization strategies for DNNs called SOAP, which includes strategies to parallelize a DNN in the Sample, Operation, Attribute, and Parameter dimensions. We also propose FlexFlow, a deep learning framework that uses guided randomized search of the SOAP space to find a fast parallelization strategy for a specific parallel machine. To accelerate this search, FlexFlow introduces a novel execution simulator that can accurately predict a parallelization strategy's performance and is three orders of magnitude faster than prior approaches that have to execute each strategy. We evaluate FlexFlow with six real-world DNN benchmarks on two GPU clusters and show that FlexFlow can increase training throughput by up to 3.8x over state-of-the-art approaches, even when including its search time, and also improves scalability.},
  urldate = {2025-12-08},
  publisher = {arXiv},
  author = {Jia, Zhihao and Zaharia, Matei and Aiken, Alex},
  month = jul,
  year = {2018},
  note = {arXiv:1807.05358},
  keywords = {Computer Science - Distributed, Parallel, and Cluster Computing},
}