docs: add new section on high-level models and DSLs for HPC

Andrew 2025-12-09 00:52:52 +07:00
parent 1a7a00609a
commit 1dd2cd94a4
2 changed files with 159 additions and 0 deletions


@@ -278,3 +278,114 @@
year = {2009},
pages = {1--314},
}
@misc{abadi_tensorflow:_2016,
title = {{TensorFlow}: {A} system for large-scale machine learning},
shorttitle = {{TensorFlow}},
url = {http://arxiv.org/abs/1605.08695},
doi = {10.48550/arXiv.1605.08695},
abstract = {TensorFlow is a machine learning system that operates at large scale and in heterogeneous environments. TensorFlow uses dataflow graphs to represent computation, shared state, and the operations that mutate that state. It maps the nodes of a dataflow graph across many machines in a cluster, and within a machine across multiple computational devices, including multicore CPUs, general-purpose GPUs, and custom designed ASICs known as Tensor Processing Units (TPUs). This architecture gives flexibility to the application developer: whereas in previous "parameter server" designs the management of shared state is built into the system, TensorFlow enables developers to experiment with novel optimizations and training algorithms. TensorFlow supports a variety of applications, with particularly strong support for training and inference on deep neural networks. Several Google services use TensorFlow in production, we have released it as an open-source project, and it has become widely used for machine learning research. In this paper, we describe the TensorFlow dataflow model in contrast to existing systems, and demonstrate the compelling performance that TensorFlow achieves for several real-world applications.},
urldate = {2025-12-08},
publisher = {arXiv},
author = {Abadi, Martín and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and Kudlur, Manjunath and Levenberg, Josh and Monga, Rajat and Moore, Sherry and Murray, Derek G. and Steiner, Benoit and Tucker, Paul and Vasudevan, Vijay and Warden, Pete and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
month = may,
year = {2016},
note = {arXiv:1605.08695},
keywords = {Computer Science - Distributed, Parallel, and Cluster Computing, Computer Science - Artificial Intelligence},
}
@misc{paszke_pytorch:_2019,
title = {{PyTorch}: {An} {Imperative} {Style}, {High}-{Performance} {Deep} {Learning} {Library}},
shorttitle = {{PyTorch}},
url = {http://arxiv.org/abs/1912.01703},
doi = {10.48550/arXiv.1912.01703},
abstract = {Deep learning frameworks have often focused on either usability or speed, but not both. PyTorch is a machine learning library that shows that these two goals are in fact compatible: it provides an imperative and Pythonic programming style that supports code as a model, makes debugging easy and is consistent with other popular scientific computing libraries, while remaining efficient and supporting hardware accelerators such as GPUs. In this paper, we detail the principles that drove the implementation of PyTorch and how they are reflected in its architecture. We emphasize that every aspect of PyTorch is a regular Python program under the full control of its user. We also explain how the careful and pragmatic implementation of the key components of its runtime enables them to work together to achieve compelling performance. We demonstrate the efficiency of individual subsystems, as well as the overall speed of PyTorch on several common benchmarks.},
urldate = {2025-12-08},
publisher = {arXiv},
author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Köpf, Andreas and Yang, Edward and DeVito, Zach and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
month = dec,
year = {2019},
note = {arXiv:1912.01703},
keywords = {Computer Science - Machine Learning, Computer Science - Mathematical Software, Statistics - Machine Learning},
}
@misc{lattner_mlir:_2020,
title = {{MLIR}: {A} {Compiler} {Infrastructure} for the {End} of {Moore}'s {Law}},
shorttitle = {{MLIR}},
url = {http://arxiv.org/abs/2002.11054},
doi = {10.48550/arXiv.2002.11054},
abstract = {This work presents MLIR, a novel approach to building reusable and extensible compiler infrastructure. MLIR aims to address software fragmentation, improve compilation for heterogeneous hardware, significantly reduce the cost of building domain specific compilers, and aid in connecting existing compilers together. MLIR facilitates the design and implementation of code generators, translators and optimizers at different levels of abstraction and also across application domains, hardware targets and execution environments. The contribution of this work includes (1) discussion of MLIR as a research artifact, built for extension and evolution, and identifying the challenges and opportunities posed by this novel design point in design, semantics, optimization specification, system, and engineering. (2) evaluation of MLIR as a generalized infrastructure that reduces the cost of building compilers-describing diverse use-cases to show research and educational opportunities for future programming languages, compilers, execution environments, and computer architecture. The paper also presents the rationale for MLIR, its original design principles, structures and semantics.},
urldate = {2025-12-08},
publisher = {arXiv},
author = {Lattner, Chris and Amini, Mehdi and Bondhugula, Uday and Cohen, Albert and Davis, Andy and Pienaar, Jacques and Riddle, River and Shpeisman, Tatiana and Vasilache, Nicolas and Zinenko, Oleksandr},
month = mar,
year = {2020},
note = {arXiv:2002.11054},
keywords = {Computer Science - Programming Languages, Computer Science - Machine Learning},
}
@misc{chen_tvm:_2018,
title = {{TVM}: {An} {Automated} {End}-to-{End} {Optimizing} {Compiler} for {Deep} {Learning}},
shorttitle = {{TVM}},
url = {http://arxiv.org/abs/1802.04799},
doi = {10.48550/arXiv.1802.04799},
abstract = {There is an increasing need to bring machine learning to a wide diversity of hardware devices. Current frameworks rely on vendor-specific operator libraries and optimize for a narrow range of server-class GPUs. Deploying workloads to new platforms -- such as mobile phones, embedded devices, and accelerators (e.g., FPGAs, ASICs) -- requires significant manual effort. We propose TVM, a compiler that exposes graph-level and operator-level optimizations to provide performance portability to deep learning workloads across diverse hardware back-ends. TVM solves optimization challenges specific to deep learning, such as high-level operator fusion, mapping to arbitrary hardware primitives, and memory latency hiding. It also automates optimization of low-level programs to hardware characteristics by employing a novel, learning-based cost modeling method for rapid exploration of code optimizations. Experimental results show that TVM delivers performance across hardware back-ends that are competitive with state-of-the-art, hand-tuned libraries for low-power CPU, mobile GPU, and server-class GPUs. We also demonstrate TVM's ability to target new accelerator back-ends, such as the FPGA-based generic deep learning accelerator. The system is open sourced and in production use inside several major companies.},
urldate = {2025-12-08},
publisher = {arXiv},
author = {Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Zheng, Lianmin and Yan, Eddie and Cowan, Meghan and Shen, Haichen and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and Guestrin, Carlos and Krishnamurthy, Arvind},
month = oct,
year = {2018},
note = {arXiv:1802.04799},
keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Programming Languages},
}
@inproceedings{ragan-kelley_halide:_2013,
address = {Seattle, Washington, USA},
title = {Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines},
isbn = {9781450320146},
shorttitle = {Halide},
url = {https://dl.acm.org/doi/10.1145/2491956.2462176},
doi = {10.1145/2491956.2462176},
language = {en},
urldate = {2025-12-08},
booktitle = {Proceedings of the 34th {ACM} {SIGPLAN} {Conference} on {Programming} {Language} {Design} and {Implementation}},
publisher = {ACM},
author = {Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Frédo and Amarasinghe, Saman},
month = jun,
year = {2013},
pages = {519--530},
}
@inproceedings{ansel_opentuner:_2014,
address = {Edmonton, AB, Canada},
title = {{OpenTuner}: an extensible framework for program autotuning},
isbn = {9781450328098},
shorttitle = {{OpenTuner}},
url = {https://dl.acm.org/doi/10.1145/2628071.2628092},
doi = {10.1145/2628071.2628092},
language = {en},
urldate = {2025-12-08},
booktitle = {Proceedings of the 23rd {International} {Conference} on {Parallel} {Architectures} and {Compilation}},
publisher = {ACM},
author = {Ansel, Jason and Kamil, Shoaib and Veeramachaneni, Kalyan and Ragan-Kelley, Jonathan and Bosboom, Jeffrey and O'Reilly, Una-May and Amarasinghe, Saman},
month = aug,
year = {2014},
pages = {303--316},
}
@inproceedings{baghdadi_tiramisu:_2019,
address = {Washington, DC, USA},
title = {Tiramisu: {A} {Polyhedral} {Compiler} for {Expressing} {Fast} and {Portable} {Code}},
isbn = {9781728114361},
shorttitle = {Tiramisu},
url = {https://ieeexplore.ieee.org/document/8661197/},
doi = {10.1109/CGO.2019.8661197},
urldate = {2025-12-08},
booktitle = {2019 {IEEE}/{ACM} {International} {Symposium} on {Code} {Generation} and {Optimization} ({CGO})},
publisher = {IEEE},
author = {Baghdadi, Riyadh and Ray, Jessica and Ben Romdhane, Malek and Del Sozzo, Emanuele and Akkas, Abdurrahman and Zhang, Yunming and Suriana, Patricia and Kamil, Shoaib and Amarasinghe, Saman},
month = feb,
year = {2019},
pages = {193--205},
}