# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
.. _customize_opt:

Customize Optimization
======================
One main design goal of Apache TVM is to make it easy to customize the optimization pipeline
for both research and development purposes, and to iterate on engineering optimizations. In this
tutorial we will demonstrate how to customize the optimization pipeline step by step.

.. contents:: Table of Contents
    :local:
    :depth: 1
"""

######################################################################
# Review Overall Flow
# -------------------
# .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg
#    :align: center
#    :width: 80%
#
# The overall flow consists of the following steps:
#
# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
#   model from another framework (e.g. PyTorch or ONNX; a minimal import sketch is shown below),
#   and create the TVM IRModule, which contains all the information needed for compilation,
#   including high-level Relax functions for the computational graph and low-level TensorIR
#   functions for the tensor programs.
# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
#   such as graph optimizations, tensor program optimizations, and library dispatching.
# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
#   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.
#

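######################################################################
# For example, if we already have a pre-trained model in ONNX format, a minimal import sketch
# could look like the following (``model.onnx`` is a hypothetical file path, and the ONNX
# frontend requires the ``onnx`` package to be installed):
#
# .. code-block:: python
#
#     import onnx
#     from tvm.relax.frontend.onnx import from_onnx
#
#     onnx_model = onnx.load("model.onnx")
#     mod = from_onnx(onnx_model)
#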
import os
import tempfile
import numpy as np
import tvm
from tvm import IRModule, relax
from tvm.relax.frontend import nn

######################################################################
# Composable IRModule Optimization
# --------------------------------
# Apache TVM Unity provides a flexible way to optimize the IRModule. Since all optimizations are
# centered around the IRModule, they can be freely composed with existing pipelines. Note that
# each optimization can focus on **part of the computation graph**, enabling partial lowering or
# partial optimization.
#
# In this tutorial, we will demonstrate how to optimize a model with Apache TVM Unity.

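######################################################################
# Concretely, "composable" means that individual passes and whole pipelines are simply
# ``IRModule -> IRModule`` transformations, so they can be chained freely. A minimal sketch of
# the pattern used throughout this tutorial (``MyCustomPass`` is a hypothetical user-defined
# pass, standing in for the passes we write later):
#
# .. code-block:: python
#
#     pipeline = tvm.ir.transform.Sequential(
#         [
#             relax.get_pipeline("zero"),  # an existing built-in pipeline
#             MyCustomPass(),              # a hypothetical user-defined pass
#         ]
#     )
#     mod = pipeline(mod)
#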
######################################################################
# Prepare a Relax Module
# ~~~~~~~~~~~~~~~~~~~~~~
# We first prepare a Relax module. The module can be imported from other frameworks, constructed
# with the NN module frontend, or written directly in TVMScript. Here we use a simple neural
# network model as an example.


class RelaxModel(nn.Module):
    def __init__(self):
        super(RelaxModel, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 10, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x


input_shape = (1, 784)
mod, params = RelaxModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}})
mod.show()

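######################################################################
# The printed module is shown in TVMScript. As mentioned above, the same module could also be
# authored directly in TVMScript instead of through the NN frontend. A rough sketch of what the
# first linear layer might look like when written by hand (the printed output above is
# authoritative; the names and shapes here are illustrative only):
#
# .. code-block:: python
#
#     from tvm.script import ir as I
#     from tvm.script import relax as R
#
#     @I.ir_module
#     class HandWrittenModule:
#         @R.function
#         def forward(
#             x: R.Tensor((1, 784), "float32"),
#             w: R.Tensor((256, 784), "float32"),
#             b: R.Tensor((256,), "float32"),
#         ) -> R.Tensor((1, 256), "float32"):
#             with R.dataflow():
#                 xw = R.matmul(x, R.permute_dims(w))
#                 y = R.add(xw, b)
#                 out = R.nn.relu(y)
#                 R.output(out)
#             return out
#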
######################################################################
# Library Dispatch
# ~~~~~~~~~~~~~~~~
# We would like to quickly try out a variant of library optimization for a certain platform
# (e.g., GPU). We can write a dispatching pass for the specific platform and operator. Here we
# demonstrate how to dispatch certain patterns to the CUBLAS library.
#
# .. note::
#    This tutorial only demonstrates dispatching a single operator to CUBLAS, highlighting
#    the flexibility of the optimization pipeline. In real-world cases, we can import multiple
#    patterns and dispatch them to different kernels.


# Import cublas pattern
import tvm.relax.backend.contrib.cublas as _cublas


# Define a new pass for CUBLAS dispatch
@tvm.transform.module_pass(opt_level=0, name="CublasDispatch")
class CublasDispatch:
    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
        # Check if CUBLAS is enabled in this build of TVM
        if not tvm.get_global_func("relax.ext.cublas", True):
            raise Exception("CUBLAS is not enabled.")

        # Get the pattern we are interested in
        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
        # Note that in real-world cases, we usually get all registered patterns, e.g.
        # patterns = relax.backend.get_patterns_with_prefix("cublas")

        # Fuse ops by patterns and then run codegen
        mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
        mod = relax.transform.RunCodegen()(mod)
        return mod


mod = CublasDispatch()(mod)
mod.show()

######################################################################
# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are fused
# and rewritten to a ``call_dps_packed`` function, which calls the CUBLAS library. Notably, the
# rest of the module is left unchanged, which means we can selectively dispatch the optimization
# to certain computations.

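######################################################################
# As mentioned in the note above, a real-world pipeline would usually offload every supported
# pattern instead of a single one. A minimal sketch of that variant (assuming we start again
# from the un-dispatched module):
#
# .. code-block:: python
#
#     patterns = relax.backend.get_patterns_with_prefix("cublas")
#     mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
#     mod = relax.transform.RunCodegen()(mod)
#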
######################################################################
# Auto Tuning
# ~~~~~~~~~~~
# Continuing from the previous example, we can further optimize the model with auto-tuning for
# the **rest of the computation**. Here we demonstrate how to use MetaSchedule to auto-tune
# the model.
#
# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
# ``MetaScheduleApplyDatabase`` pass to apply the best configurations to the model. The tuning
# step generates the search space and tunes the model, while the following step applies the best
# configurations found in the database. Before running the passes, we need to lower Relax
# operators into TensorIR functions via ``LegalizeOps``.
#
# .. note::
#
#    To save CI time and avoid flakiness, we skip the tuning process in the CI environment.
#

device = tvm.cuda(0)
target = tvm.target.Target.from_device(device)
if os.getenv("CI", "") != "true":
    trials = 2000
    with target, tempfile.TemporaryDirectory() as tmp_dir:
        mod = tvm.ir.transform.Sequential(
            [
                relax.get_pipeline("zero"),
                relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
                relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
            ]
        )(mod)

    mod.show()

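######################################################################
# In this example the tuning database lives in a temporary directory and is discarded afterwards.
# In practice, we usually keep the work directory so that the tuning results can be reused in
# later builds. A minimal sketch of that workflow, assuming a hypothetical ``./tuning_logs``
# directory preserved from a previous tuning run:
#
# .. code-block:: python
#
#     # Re-apply previously tuned schedules without tuning again
#     with target:
#         mod = relax.transform.MetaScheduleApplyDatabase(work_dir="./tuning_logs")(mod)
#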
######################################################################
# DLight Rules
# ~~~~~~~~~~~~
# DLight rules are a set of default rules for scheduling and optimizing kernels.
# DLight rules are designed for fast compilation and **fair** performance. In some cases,
# e.g. language models, DLight provides excellent performance, while for generic models
# it achieves a balance between performance and compilation time.

from tvm import dlight as dl

# Apply DLight rules
with target:
    mod = tvm.ir.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            dl.ApplyDefaultSchedule(  # pylint: disable=not-callable
                dl.gpu.Matmul(),
                dl.gpu.GEMV(),
                dl.gpu.Reduction(),
                dl.gpu.GeneralReduction(),
                dl.gpu.Fallback(),
            ),
        ]
    )(mod)

mod.show()

######################################################################
# .. note::
#
#    This tutorial focuses on demonstrating the optimization pipeline, instead of pushing the
#    performance to the limit. The current optimization may not be the best.


######################################################################
# Deploy the Optimized Model
# --------------------------
# We can build and deploy the optimized model to the TVM runtime.

ex = relax.build(mod, target="cuda")
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)
# Need to allocate data and params on GPU device
data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev)
gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
gpu_out = vm["forward"](data, *gpu_params).numpy()
print(gpu_out)


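######################################################################
# Beyond running the module in the current process, we can also export the built artifact for
# deployment elsewhere. A minimal sketch, assuming a hypothetical output path ``compiled.so``:
#
# .. code-block:: python
#
#     # Export the compiled module to a shared library ...
#     ex.export_library("compiled.so")
#
#     # ... and load it back, e.g. in another process on the target machine
#     loaded = tvm.runtime.load_module("compiled.so")
#     vm = relax.VirtualMachine(loaded, tvm.device("cuda", 0))
#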
######################################################################
# Summary
# -------
# This tutorial demonstrates how to customize the optimization pipeline for ML models in Apache TVM.
# We can easily compose optimization passes and customize the optimization for different parts
# of the computation graph. The flexibility of the optimization pipeline enables us to quickly
# iterate on optimizations and improve the performance of the model.
#