From b9b15303f890e11e2a31266828c0d45b1a3008fc Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Tue, 3 Sep 2024 00:03:59 +0800 Subject: [PATCH] [Doc] How to Optimize a Language Model This tutorial demonstrates how to optimize a language model using TVM. --- docs/conf.py | 1 - docs/how_to/index.rst | 24 - docs/how_to/tutorials/optimize_llm.py | 614 ++++++++++++++++++ docs/index.rst | 6 +- docs/legacy_redirect.py | 1 - .../how_to/work_with_schedules/intrin_math.py | 173 ----- 6 files changed, 619 insertions(+), 200 deletions(-) delete mode 100644 docs/how_to/index.rst create mode 100644 docs/how_to/tutorials/optimize_llm.py delete mode 100644 gallery/how_to/work_with_schedules/intrin_math.py diff --git a/docs/conf.py b/docs/conf.py index c933653233b1..1ffc4dcafdb2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -488,7 +488,6 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): "work_with_schedules": [ "schedule_primitives.py", "reduction.py", - "intrin_math.py", "scan.py", "extern_op.py", "tensorize.py", diff --git a/docs/how_to/index.rst b/docs/how_to/index.rst deleted file mode 100644 index c5b9d703f032..000000000000 --- a/docs/how_to/index.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. toctree:: - :maxdepth: 1 - - tutorials/e2e_opt_model - tutorials/customize_opt - tutorials/cross_compilation_and_rpc - dev/index diff --git a/docs/how_to/tutorials/optimize_llm.py b/docs/how_to/tutorials/optimize_llm.py new file mode 100644 index 000000000000..9311c0557fe7 --- /dev/null +++ b/docs/how_to/tutorials/optimize_llm.py @@ -0,0 +1,614 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +.. _opt_llm: + +Optimize Large Language Model +============================= +As large language models (LLMs) have become a popular research topic in many different fields, +deploying them on cloud and edge devices has become a challenging task. In this tutorial, we will +demonstrate how to optimize a large language model using Apache TVM. 
We will use a pre-trained
+TinyLlama model from Hugging Face and deploy it on various devices.
+"""
+
+######################################################################
+# Review Overall Flow
+# -------------------
+# .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg
+#    :align: center
+#    :width: 80%
+#
+# The overall flow consists of the following steps:
+#
+# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
+#   model from other frameworks (e.g., PyTorch, ONNX), and create the TVM IRModule, which contains
+#   all the information needed for compilation, including high-level Relax functions for the
+#   computational graph and low-level TensorIR functions for tensor programs.
+# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
+#   such as graph optimizations, tensor program optimizations, and library dispatching.
+# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
+#   universal runtime, and execute it on different devices, such as CPUs, GPUs, or other
+#   accelerators.
+#
+
+
+######################################################################
+# Construct the model architecture
+# --------------------------------
+# We will use a pre-trained TinyLlama model from Hugging Face. However, we usually load only the
+# pre-trained weights from Hugging Face, not the model architecture, so we need to construct the
+# architecture ourselves. Apache TVM provides a PyTorch-like API for this purpose, which we use
+# below to define the model.
+
+
+import dataclasses
+import enum
+import os
+from pathlib import Path
+from pprint import pprint
+from typing import List, Optional
+
+import tvm
+from tvm import dlight, relax, te, tir
+from tvm.relax import register_pipeline
+from tvm.relax.frontend import nn
+from tvm.relax.frontend.nn import Tensor, op
+from tvm.relax.frontend.nn.llm.kv_cache import PagedKVCache, TIRPagedKVCache
+from tvm.runtime import ShapeTuple
+
+######################################################################
+# First, we define the model configuration. The configuration includes the key parameters of the
+# model, such as the hidden size and the intermediate size. For convenience, we define a constant
+# configuration specifically for the TinyLlama model.
+
+
+@dataclasses.dataclass
+class LlamaConfig:
+    hidden_size: int = 2048
+    intermediate_size: int = 5632
+    num_attention_heads: int = 32
+    num_hidden_layers: int = 22
+    rms_norm_eps: float = 1e-05
+    vocab_size: int = 32000
+    rope_theta: int = 10000
+    context_window_size: int = 2048
+    prefill_chunk_size: int = 2048
+    num_key_value_heads: int = 4
+    head_dim: int = 64  # hidden_size // num_attention_heads
+
+
+dev = tvm.device("cuda", 0)
+target = tvm.target.Target.from_device(dev)
+
+
+######################################################################
+# Next, we define the RoPE mode of the Paged KV cache. The RoPE mode controls how Rotary Position
+# Embedding (RoPE) is applied to the query and key tensors. It can be set to `NONE`, `NORMAL`, or
+# `INLINE`. If the RoPE mode is `NONE`, the KV cache does not apply RoPE to the query and key
+# tensors. If it is `NORMAL`, RoPE is applied to the key tensor before the key tensor is added to
+# the cache. If it is `INLINE`, RoPE is applied to the query and key tensors in the attention
+# kernel on-the-fly.
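+
+######################################################################
+# To build some intuition, the sketch below (plain NumPy, not the TIR kernel that TVM generates)
+# shows roughly how RoPE rotates a single query or key head vector at position ``pos``, using the
+# split-half convention adopted by Llama-style models. The helper name ``rope_rotate`` is only for
+# illustration and is not part of the tutorial code:
+#
+# .. code-block:: python
+#
+#     import numpy as np
+#
+#     def rope_rotate(x, pos, theta=10000.0):
+#         half = x.shape[0] // 2
+#         freqs = theta ** (-np.arange(half) / half)  # per-dimension rotation frequencies
+#         cos, sin = np.cos(pos * freqs), np.sin(pos * freqs)
+#         x1, x2 = x[:half], x[half:]
+#         return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos])
+#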
+
+
+class RopeMode(enum.IntEnum):
+    """The RoPE mode of the Paged KV cache.
+    If it is none, the KV cache will not apply RoPE to q and k.
+    If it is normal, RoPE will be applied to k before adding k to cache.
+    Otherwise, RoPE will be applied to q/k in attention kernel on-the-fly.
+    """
+
+    NONE = 0
+    NORMAL = 1
+    INLINE = 2
+
+
+######################################################################
+# Now we can define the model architecture, which consists of three parts:
+#
+# - Embedding layer: converts the input token IDs to hidden states.
+# - Decoder layers: the core of the model. Each decoder layer consists of a self-attention layer
+#   and a feed-forward network (FFN) layer.
+# - Output layer: converts the hidden states to logits.
+#
+# We start with the FFN layer. Note that the following FFN layer is an optimized implementation
+# in which we fuse the gate and up projections into one kernel.
+# A naive implementation of the FFN layer is ``FFN(x) = down_proj(silu(gate(x)) * up(x))``.
+# We can combine the ``gate`` and ``up`` projections into one kernel for better performance,
+# which gives the optimized implementation:
+#
+# .. code-block:: python
+#
+#     concat_x = gate_up(x)
+#     gate_x, up_x = split(concat_x, 2, axis=-1)
+#     FFN(x) = down_proj(silu(gate_x) * up_x)
+#
+
+
+class LlamaFFN(nn.Module):
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.gate_up_proj = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=2 * config.intermediate_size,
+            bias=False,
+        )
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+
+    def forward(self, x: Tensor):
+        concat_x1_x2 = self.gate_up_proj(x)
+        x1, x2 = op.split(concat_x1_x2, 2, axis=-1)
+        return self.down_proj(op.silu(x1) * x2)
+
+
+######################################################################
+# Then we define the self-attention layer, which consists of three parts:
+#
+# - QKV projection: converts the input hidden states to the query, key, and value tensors.
+# - Attention: computes the attention scores and applies the softmax operation.
+# - Output projection: converts the attention output back to hidden states.
+#
+# We perform optimizations on the different parts of the self-attention layer:
+#
+# - QKV projection: We leverage horizontal fusion and fuse the query, key, and value projections
+#   into one kernel.
+# - Attention: We leverage the horizontal fusion on attention and fuse the QKV projection and + + +class LlamaAttention(nn.Module): # pylint: disable=too-many-instance-attributes + def __init__(self, config: LlamaConfig): + self.head_dim = config.head_dim + self.num_q_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + # horizontal fusion on QKV projection + self.qkv_proj = nn.Linear( + in_features=config.hidden_size, + out_features=(self.num_q_heads + 2 * self.num_kv_heads) * self.head_dim, + bias=False, + ) + self.o_proj = nn.Linear(self.num_q_heads * self.head_dim, config.hidden_size, bias=False) + + def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache, layer_id: int): + d, h_q, h_kv = self.head_dim, self.num_q_heads, self.num_kv_heads + b, s, _ = hidden_states.shape + # QKV Projection + qkv = self.qkv_proj(hidden_states) + qkv = op.reshape(qkv, (b, s, h_q + h_kv + h_kv, d)) + # Attention + output = op.reshape( + paged_kv_cache.attention_with_fused_qkv(layer_id, qkv, self.num_q_heads), + (b, s, h_q * d), + ) + # Output Projection + return self.o_proj(output) + + +###################################################################### +# Finally, we define the model architecture with FFN and self-attention layers. + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig): + rms_norm_eps = config.rms_norm_eps + self.self_attn = LlamaAttention(config) + self.mlp = LlamaFFN(config) + self.input_layernorm = nn.RMSNorm(config.hidden_size, -1, rms_norm_eps, bias=False) + self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, -1, rms_norm_eps, bias=False) + + def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache, layer_id: int): + hidden_states += self.self_attn( + self.input_layernorm(hidden_states), paged_kv_cache, layer_id + ) + hidden_states += self.mlp(self.post_attention_layernorm(hidden_states)) + return hidden_states + + +class LlamaModel(nn.Module): + def __init__(self, config: LlamaConfig): + assert config.hidden_size % config.num_attention_heads == 0 + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList( + [LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + self.norm = nn.RMSNorm(config.hidden_size, -1, config.rms_norm_eps, bias=False) + + def forward(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): + hidden_states = input_embed + for layer_id, layer in enumerate(self.layers): + hidden_states = layer(hidden_states, paged_kv_cache, layer_id) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class LlamaForCasualLM(nn.Module): + def __init__(self, config: LlamaConfig): + self.model = LlamaModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.num_hidden_layers = config.num_hidden_layers + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.head_dim = config.head_dim + self.hidden_size = config.hidden_size + self.vocab_size = config.vocab_size + self.rope_theta = config.rope_theta + self.dtype = "float32" + + def to(self, dtype: Optional[str] = None): + super().to(dtype=dtype) + if dtype is not None: + self.dtype = dtype + + def embed(self, input_ids: Tensor): + return self.model.embed_tokens(input_ids) + + def get_logits(self, hidden_states: Tensor): + logits = self.lm_head(hidden_states) + if logits.dtype != "float32": + logits = logits.astype("float32") + return logits + + def 
prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): + def _index(x: te.Tensor): # x[:-1,:] + b, s, d = x.shape + return te.compute((b, 1, d), lambda i, _, k: x[i, s - 1, k], name="index") + + hidden_states = self.model(input_embed, paged_kv_cache) + hidden_states = op.tensor_expr_op(_index, name_hint="index", args=[hidden_states]) + logits = self.get_logits(hidden_states) + return logits, paged_kv_cache + + def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache): + hidden_states = self.model(input_embed, paged_kv_cache) + logits = self.get_logits(hidden_states) + return logits, paged_kv_cache + + def create_tir_paged_kv_cache( + self, + max_batch_size: tir.Var, + max_total_seq_len: tir.Var, + prefill_chunk_size: tir.Var, + page_size: tir.Var, + ) -> PagedKVCache: + return TIRPagedKVCache( + max_batch_size=max_batch_size, + max_total_seq_len=max_total_seq_len, + prefill_chunk_size=prefill_chunk_size, + page_size=page_size, + support_sliding_window=0, + layer_partition=relax.ShapeExpr([0, self.num_hidden_layers]), + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + head_dim=self.head_dim, + rope_mode=RopeMode.NORMAL, + rope_scale=1, + rope_theta=self.rope_theta, + rope_scaling={}, + rope_ext_factors=relax.PrimValue(0), + rotary_dim=self.head_dim, + dtype=self.dtype, + target=target, + ) + + def get_default_spec(self): + mod_spec = { + "embed": { + "input_ids": nn.spec.Tensor(["seq_len"], "int32"), + "$": { + "param_mode": "packed", + "effect_mode": "none", + }, + }, + "prefill": { + "input_embed": nn.spec.Tensor([1, "seq_len", self.hidden_size], self.dtype), + "paged_kv_cache": nn.spec.Object(object_type=PagedKVCache), + "$": { + "param_mode": "packed", + "effect_mode": "none", + }, + }, + "decode": { + "input_embed": nn.spec.Tensor([1, 1, self.hidden_size], self.dtype), + "paged_kv_cache": nn.spec.Object(object_type=PagedKVCache), + "$": { + "param_mode": "packed", + "effect_mode": "none", + }, + }, + "create_tir_paged_kv_cache": { + "max_batch_size": int, + "max_total_seq_len": int, + "prefill_chunk_size": int, + "page_size": int, + "$": { + "param_mode": "none", + "effect_mode": "none", + }, + }, + } + return nn.spec.ModuleSpec.from_raw(mod_spec, self) + + +###################################################################### +# Export the model to Relax IRModule +# ---------------------------------- +# After defining the model architecture, we can export the model to the Relax IRModule. +# For demonstration, we only show the part of the model architecture. and parameters. + +model_config = LlamaConfig() +model = LlamaForCasualLM(model_config) +model.to("float16") +mod, named_params = model.export_tvm(spec=model.get_default_spec()) +prefill_str = mod["prefill"].script() +print(*prefill_str.split("\n")[3:20], sep="\n") # Only show the first 10 lines for demonstration +print(" ...") + +print("\nParameters:") +pprint(named_params[:5]) # Only show the first 5 parameters for demonstration + +###################################################################### +# Define Optimization Pipeline +# ---------------------------- +# We define a series of optimization passes to optimize the model. The optimization pipeline +# is designed specifically for the LLMs. 
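+
+######################################################################
+# .. note::
+#
+#   Each transform used below is an ``IRModule``-to-``IRModule`` pass, so you can also run a
+#   single pass in isolation to inspect its effect before composing the full pipeline. A minimal
+#   sketch, assuming the ``mod`` exported above (the variable name ``legalized_mod`` is only for
+#   illustration; this snippet is not executed in the tutorial):
+#
+#   .. code-block:: python
+#
+#       legalized_mod = relax.transform.LegalizeOps()(mod)
+#       print(legalized_mod.script())
+#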
+ + +@register_pipeline("opt_llm") +def _pipeline( # pylint: disable=too-many-arguments + ext_mods: List[nn.ExternModule] = None, +): + ext_mods = ext_mods or [] + + @tvm.transform.module_pass(opt_level=0) + def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule: + seq = tvm.transform.Sequential( + [ + # Phase 1. Passes on high-level operator graph + # We can enable cublas for further optimization + relax.transform.FuseTransposeMatmul(), + # Phase 2. Lowering to TIR, inherited TVM Relax's official "zero" pipeline + relax.transform.LegalizeOps(), + relax.transform.AnnotateTIROpPattern(), + relax.transform.FoldConstant(), + relax.transform.FuseOps(), + relax.transform.FuseTIR(), + # Phase 3. Passes on TIR + relax.transform.DeadCodeElimination(), + # Phase 4. Low-level Optimizations + dlight.ApplyDefaultSchedule( + dlight.gpu.Matmul(), + dlight.gpu.GEMV(), + dlight.gpu.Reduction(), + dlight.gpu.GeneralReduction(), + dlight.gpu.Fallback(), + ), + # Phase 5. Lowering to VM bytecode + relax.transform.RewriteDataflowReshape(), + relax.transform.ToNonDataflow(), + relax.transform.RemovePurityChecking(), + relax.transform.CallTIRRewrite(), + relax.transform.StaticPlanBlockMemory(), + relax.transform.RewriteCUDAGraph(), + relax.transform.LowerAllocTensor(), + relax.transform.KillAfterLastUse(), + relax.transform.LowerRuntimeBuiltin(), + relax.transform.VMShapeLower(), + relax.transform.AttachGlobalSymbol(), + relax.transform.AttachExternModules(ext_mods), + ] + ) + mod = seq(mod) + return mod + + return _pipeline + + +with target: + ex = relax.build(mod, target, pipeline=relax.get_pipeline("opt_llm")) + vm = relax.VirtualMachine(ex, dev) + + +###################################################################### +# Prepare the model weights +# ------------------------- +# We load the pre-trained weights from Hugging Face and prepare the model weights. +# The pre-trained weights are stored in the Hugging Face format. We need to load the weights +# and prepare the model parameters. +# +# .. note:: +# +# Note that we won't execute the following code in this tutorial because the pre-trained weights +# are not available in the CI environment. 
+#
+
+
+IS_IN_CI = os.getenv("CI", "") == "true"
+
+HF_WEIGHT_PATH = None
+# HF_WEIGHT_PATH = Path("/path/to/TinyLlama-1.1B-Chat-v1.0/")
+
+if not IS_IN_CI:
+    import numpy as np
+    import safetensors.torch
+    import torch
+
+    if HF_WEIGHT_PATH is None or not HF_WEIGHT_PATH.exists():
+        raise ValueError("Please set the HF_WEIGHT_PATH to the path of the pre-trained weights.")
+
+    # Torch format weights
+    param_dict = safetensors.torch.load_file(HF_WEIGHT_PATH / "model.safetensors", device="cpu")
+    # Numpy format weights
+    param_dict = {
+        k: v.half().numpy() if v.dtype == torch.bfloat16 else v.numpy()
+        for k, v in param_dict.items()
+    }
+
+    named_params = dict(named_params)
+    for i in range(model_config.num_hidden_layers):
+        # Concatenate Q/K/V weights for the fused QKV projection in self-attention
+        attn = f"model.layers.{i}.self_attn"
+        param_dict[f"{attn}.qkv_proj.weight"] = np.concatenate(
+            [
+                param_dict.pop(f"{attn}.q_proj.weight"),  # Pop the old parameters to save memory
+                param_dict.pop(f"{attn}.k_proj.weight"),
+                param_dict.pop(f"{attn}.v_proj.weight"),
+            ],
+            axis=0,
+        )
+        # Concatenate gate and up projection weights for the fused gate_up projection in the MLP
+        mlp = f"model.layers.{i}.mlp"
+        param_dict[f"{mlp}.gate_up_proj.weight"] = np.concatenate(
+            [
+                param_dict.pop(f"{mlp}.gate_proj.weight"),
+                param_dict.pop(f"{mlp}.up_proj.weight"),
+            ],
+            axis=0,
+        )
+
+    # Convert params into ndarray
+    params = [
+        tvm.nd.array(param_dict[k].astype("float16"), device=dev) for k in named_params.keys()
+    ]
+
+
+######################################################################
+# Deploy the compiled model
+# -------------------------
+# After the model and weights are ready, we can deploy the compiled model on the target device.
+# Language model inference has two steps: prefill and decode. The prefill step processes the
+# input tokens and populates the KVCache, while the decode step generates tokens one at a time
+# until the end token is produced.
+
+
+######################################################################
+# Tokenization
+# ~~~~~~~~~~~~
+# The first step is to tokenize the input prompt and embed the tokens into hidden states.
+# The tokenization is the same as in the original model: we use the Hugging Face tokenizer to
+# convert the input prompt into token IDs, which are later embedded into hidden states by the
+# compiled ``embed`` function. Note that different models require different tokenization and
+# prompt formats; please refer to the model documentation for the correct ones.
+
+
+if not IS_IN_CI:
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(HF_WEIGHT_PATH)
+    messages = [
+        {"role": "user", "content": "What's your name?"},
+    ]
+    prompt = tokenizer.apply_chat_template(messages)
+    input_len = len(prompt)
+
+    # Load prompt tokens into TVM ndarray on the target device
+    tokens = tvm.nd.array(np.array(prompt).astype("int32"), device=dev)
+
+######################################################################
+# Create the KVCache
+# ~~~~~~~~~~~~~~~~~~
+# Before starting inference, we need to create the KVCache, which stores the key and value
+# tensors for the attention layers. Apache TVM provides a PagedKVCache for this purpose, and we
+# create it with the specified parameters.
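+
+######################################################################
+# .. note::
+#
+#   The cache is "paged": key and value tensors are stored in fixed-size pages of ``page_size``
+#   token slots each, so memory is allocated page by page as a sequence grows instead of being
+#   reserved up front for the full context window. A rough sketch of the bookkeeping, using the
+#   same numbers passed to the creation call below (illustrative only):
+#
+#   .. code-block:: python
+#
+#       max_total_seq_len, page_size = 2048, 16
+#       pages_for_full_sequence = -(-max_total_seq_len // page_size)  # ceil division -> 128 pages
+#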
+ +if not IS_IN_CI: + kv_cache = vm["create_tir_paged_kv_cache"]( + ShapeTuple([1]), # max_batch_size=1 + ShapeTuple([2048]), # max_total_seq_len=2048 + ShapeTuple([2048]), # prefill_chunk_size=2048 + ShapeTuple([16]), # page_size=16 + ) + + +###################################################################### +# Embedding +# ~~~~~~~~~ +# The next step is to embed the tokens into the hidden states. We use the `embed` function +# compiled in the Relax IRModule to embed the tokens into the hidden states. + +nd_view_func = tvm.get_global_func("vm.builtin.reshape") + + +def embed(tokens, params): + _embed = vm["embed"](tokens, params) + # Reshape hidden from [seq_len, hidden_size] to [1, seq_len, hidden_size] + _embed = nd_view_func(_embed, ShapeTuple([1, _embed.shape[0], _embed.shape[1]])) + return _embed + + +###################################################################### +# Prefill +# ~~~~~~~ +# Before running the forward pass, we first get some help functions for preparation. + +add_sequence_func = tvm.get_global_func("vm.builtin.kv_state_add_sequence") +begin_forward_func = tvm.get_global_func("vm.builtin.kv_state_begin_forward") +end_forward_func = tvm.get_global_func("vm.builtin.kv_state_end_forward") + +###################################################################### +# As we are creating a new sequence, we need to call `add_sequence_func` to initialize +# the request. Additionally, we need to call `begin_forward_func` to start the forward pass, +# and `end_forward_func` to end the forward pass. + +if not IS_IN_CI: + seq_id = 0 + add_sequence_func(kv_cache, seq_id) + hidden_states = embed(tokens, params) + begin_forward_func(kv_cache, ShapeTuple([seq_id]), ShapeTuple([input_len])) + logits, kv_cache = vm["prefill"](hidden_states, kv_cache, params) + end_forward_func(kv_cache) + +###################################################################### +# Now we have the output logits from the prefill step. The logits are used to generate the token +# via sampling. Let's sample the token from the logits. +# +# In this tutorial, we simplify the sampling process and pick the token with the highest +# probability. In practice, we should sample the token based on the probability distribution. +# Also, to make the tutorial concise, we execute the sample process on CPU. + + +def sample_token(logits): + logits_np = logits.numpy() + return np.argmax(logits_np) + + +if not IS_IN_CI: + last_token = sample_token(logits) + output_tokens = [last_token] + + +###################################################################### +# Decode +# ~~~~~~ +# After the prefill step, we can start the decode step. The decode step is used to generate the +# token until the end token is generated. We use the `decode` function compiled in the Relax +# IRModule to generate the token. + +if not IS_IN_CI: + print("The generated token:") + + while last_token != tokenizer.eos_token_id: + tokens = tvm.nd.array(np.array([last_token]).astype("int32"), device=dev) + hidden_states = embed(tokens, params) + begin_forward_func(kv_cache, ShapeTuple([seq_id]), ShapeTuple([1])) + logits, kv_cache = vm["decode"](hidden_states, kv_cache, params) + + end_forward_func(kv_cache) + last_token = sample_token(logits) + output_tokens.append(last_token) + + print(tokenizer.decode(output_tokens)) diff --git a/docs/index.rst b/docs/index.rst index fdfaa56f7454..5d5d07640134 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -41,7 +41,11 @@ driving its costs down. 
:maxdepth: 1 :caption: How To - how_to/index + how_to/tutorials/e2e_opt_model + how_to/tutorials/customize_opt + how_to/tutorials/optimize_llm + how_to/tutorials/cross_compilation_and_rpc + how_to/dev/index .. toctree:: :maxdepth: 1 diff --git a/docs/legacy_redirect.py b/docs/legacy_redirect.py index 5e4bdd7430d6..502c7dd0b5bf 100644 --- a/docs/legacy_redirect.py +++ b/docs/legacy_redirect.py @@ -206,7 +206,6 @@ "../../how_to/work_with_relay/using_external_lib.html", ], ["tutorials/language/extern_op.html", "../../how_to/work_with_schedules/extern_op.html"], - ["tutorials/language/intrin_math.html", "../../how_to/work_with_schedules/intrin_math.html"], ["tutorials/language/reduction.html", "../../how_to/work_with_schedules/reduction.html"], ["tutorials/language/scan.html", "../../how_to/work_with_schedules/scan.html"], [ diff --git a/gallery/how_to/work_with_schedules/intrin_math.py b/gallery/how_to/work_with_schedules/intrin_math.py deleted file mode 100644 index 5a35ae1cbd8e..000000000000 --- a/gallery/how_to/work_with_schedules/intrin_math.py +++ /dev/null @@ -1,173 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Intrinsics and Math Functions -============================= -**Author**: `Tianqi Chen `_ - -While TVM supports basic arithmetic operations. In many cases -usually we will need more complicated builtin functions. -For example :code:`exp` to take the exponential of the function. - -These functions are target system dependent and may have different -names of different target platforms. In this tutorial, we will learn -how we can invoke these target specific functions, and how we can unify -the interface via TVM's intrinsic API. -""" -from __future__ import absolute_import, print_function - -import numpy as np - -import tvm -from tvm import te -from tvm.ir import register_op_attr, register_intrin_lowering - -###################################################################### -# Direct Declare Extern Math Call -# ------------------------------- -# The most straight-forward way to call target specific function is via -# extern function call construct in tvm. -# In the following example, we use :any:`tvm.tir.call_pure_extern` to call -# :code:`__expf` function, which is only available under CUDA. 
-# -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.compute(A.shape, lambda i: tvm.tir.call_pure_extern("float32", "__expf", A[i]), name="B") -s = te.create_schedule(B.op) -num_thread = 64 -bx, tx = s[B].split(B.op.axis[0], factor=num_thread) -s[B].bind(bx, te.thread_axis("blockIdx.x")) -s[B].bind(tx, te.thread_axis("threadIdx.x")) -f = tvm.build(s, [A, B], "cuda", name="myexp") -print(f.imported_modules[0].get_source()) - -###################################################################### -# Unified Intrinsic Call -# ---------------------- -# The above code verifies that direct external call can be used to -# call into device specific functions. -# However, the above way only works for CUDA target with float type. -# Ideally, we want to write same code for any device and any data type. -# -# TVM intrinsic provides the user a mechanism to achieve this, and this -# is the recommended way to solve the problem. -# The following code use te.exp instead, which create an intrinsic call -# :py::func:`tvm.te.exp` to do the exponential. -# -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.compute(A.shape, lambda i: te.exp(A[i]), name="B") -s = te.create_schedule(B.op) -num_thread = 64 -bx, tx = s[B].split(B.op.axis[0], factor=num_thread) -s[B].bind(bx, te.thread_axis("blockIdx.x")) -s[B].bind(tx, te.thread_axis("threadIdx.x")) -fcuda = tvm.build(s, [A, B], "cuda", name="myexp") -print(fcuda.imported_modules[0].get_source()) -###################################################################### -# We can find that the code works for both CUDA and opencl. -# The same te.exp can also be used for float64 data types. -# -fopencl = tvm.build(s, [A, B], "opencl", name="myexp") -print(fopencl.imported_modules[0].get_source()) - -###################################################################### -# Intrinsic Lowering Rule -# ----------------------- -# When :py:func:`tvm.te.exp` is called, TVM creates an intrinsic Call Expr. -# TVM uses transformation rules to transform the intrinsic -# call to device specific extern calls. -# -# TVM also allows user to customize the rules during runtime. -# The following example customizes CUDA lowering rule for :code:`exp`. -# - - -def my_cuda_math_rule(op): - """Customized CUDA intrinsic lowering rule""" - assert isinstance(op, tvm.tir.Call) - name = op.op.name - assert name.startswith("tir.") - dispatch_name = name[4:] - if op.dtype == "float32": - # call float function - return tvm.tir.call_pure_extern("float32", "%sf" % dispatch_name, op.args[0]) - elif op.dtype == "float64": - # call double function - return tvm.tir.call_pure_extern("float32", dispatch_name, op.args[0]) - else: - # cannot do translation, return self. - return op - - -register_intrin_lowering("tir.exp", target="cuda", f=my_cuda_math_rule, level=99) -###################################################################### -# Register the rule to TVM with override option to override existing rule. -# Notice the difference between the printed code from previous one: -# our new rule uses math function :code:`expf` instead of -# fast math version :code:`__expf`. -# -fcuda = tvm.build(s, [A, B], "cuda", name="myexp") -print(fcuda.imported_modules[0].get_source()) - -###################################################################### -# Add Your Own Intrinsic -# ---------------------- -# If there is an intrinsic that is not provided by TVM. -# User can easily add new intrinsic by using the intrinsic rule system. 
-# The following example add an intrinsic :code:`mylog` to the system. -# - - -def mylog(x): - """customized log intrinsic function""" - return tvm.tir.call_intrin(x.dtype, "tir.mylog", x) - - -def my_cuda_mylog_rule(op): - """CUDA lowering rule for log""" - if op.dtype == "float32": - return tvm.tir.call_pure_extern("float32", "logf", op.args[0]) - elif op.dtype == "float64": - return tvm.tir.call_pure_extern("float64", "log", op.args[0]) - else: - return op - - -# new op registration is triggered by registering an attribute of the op -register_op_attr("tir.mylog", "TCallEffectKind", tvm.tir.CallEffectKind.Pure) -register_intrin_lowering("tir.mylog", target="cuda", f=my_cuda_mylog_rule, level=99) - -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.compute(A.shape, lambda i: mylog(A[i]), name="B") -s = te.create_schedule(B.op) -num_thread = 64 -bx, tx = s[B].split(B.op.axis[0], factor=num_thread) -s[B].bind(bx, te.thread_axis("blockIdx.x")) -s[B].bind(tx, te.thread_axis("threadIdx.x")) -fcuda = tvm.build(s, [A, B], "cuda", name="mylog") -print(fcuda.imported_modules[0].get_source()) - -###################################################################### -# Summary -# ------- -# - TVM can call extern target dependent math function. -# - Use intrinsic to defined a unified interface for the functions. -# - For more intrinsics available in tvm, take a look at :any:`tvm.tir` -# - You can customize the intrinsic behavior by defining your own rules. -#