Commit 1053886
[KVCache] PagedKVCache refactor, FlashInfer JIT and MLA integration
This PR consists of the following parts:

* We reorganized `paged_kv_cache.cc` by moving some of its utilities into `attn_utils.h`.
* To integrate with the JIT kernel compilation in the latest FlashInfer project, while still supporting attention kernels written in TIR, we introduced `AttnBackendFunc` in `attn_backend.h`, which exposes attention interfaces (e.g., `MHA`, `MLA`) to PagedKVCache. We subclass `AttnBackendFunc` to implement the FlashInfer and TIR backends respectively.
* Building on `AttnBackendFunc`, we refactored the PagedKVCache constructor. The new constructor is not backward compatible and will break existing compiled model libraries.
* Both the TIR and FlashInfer attention implementations now require an explicit attention softmax scale factor `sm_scale` to be passed in. Previously, `sm_scale` was inlined as `head_dim ** -0.5`. With recent LLM inference techniques such as MLA weight absorption in DeepSeek models, the inlined `sm_scale` causes confusion and inconvenience, so we now require `sm_scale` to be passed explicitly to keep the attention interface standard and clear. A small sketch of the distinction follows this list.
* We refactored the existing GPU unit tests of the PagedKVCache, switching the standard (reference) result computation from NumPy to PyTorch. This significantly reduces the test run time.
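To make the `sm_scale` point concrete, here is a minimal, self-contained sketch (not taken from this PR) of why the scale can no longer be inferred from the cached head dimension alone. The MLA dimensions below are illustrative placeholders only.

```python
import math

# Standard MHA: the softmax scale is derived directly from the per-head
# dimension of the queries/keys fed to attention.
head_dim = 128
sm_scale_mha = head_dim ** -0.5  # == 1 / sqrt(128)

# MLA with weight absorption (illustrative numbers only): the kernel operates
# on queries/keys in a compressed latent space, while the scale should come
# from the original (pre-absorption) head dimension. An inlined
# `head_dim ** -0.5` inside the KV cache would silently use the wrong
# dimension, which is why the scale must now be passed explicitly.
compressed_qk_head_dim = 576   # dimension the kernel sees (example)
original_qk_head_dim = 192     # dimension the scale is derived from (example)
sm_scale_mla = original_qk_head_dim ** -0.5

print(f"MHA sm_scale: {sm_scale_mha:.6f}")
print(f"MLA (weight-absorbed) sm_scale: {sm_scale_mla:.6f}")
```

This mirrors the tutorial change below, where the MHA call site now passes `sm_scale=self.head_dim**-0.5` explicitly instead of relying on a scale inlined inside the KV cache.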
1 parent 432ccfa commit 1053886

18 files changed: +4181 / -2909 lines

docs/how_to/tutorials/optimize_llm.py

Lines changed: 8 additions & 2 deletions
@@ -191,7 +191,9 @@ def forward(self, hidden_states: Tensor, paged_kv_cache: PagedKVCache, layer_id:
         qkv = op.reshape(qkv, (b, s, h_q + h_kv + h_kv, d))
         # Attention
         output = op.reshape(
-            paged_kv_cache.attention_with_fused_qkv(layer_id, qkv, self.num_q_heads),
+            paged_kv_cache.attention_with_fused_qkv(
+                layer_id, qkv, self.num_q_heads, sm_scale=self.head_dim**-0.5
+            ),
             (b, s, h_q * d),
         )
         # Output Projection
@@ -285,6 +287,7 @@ def create_tir_paged_kv_cache(
         page_size: tir.Var,
     ) -> PagedKVCache:
         return TIRPagedKVCache(
+            attn_kind="mha",
            max_batch_size=max_batch_size,
            max_total_seq_len=max_total_seq_len,
            prefill_chunk_size=prefill_chunk_size,
@@ -294,7 +297,10 @@
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            num_key_value_heads=self.num_key_value_heads,
-           head_dim=self.head_dim,
+           qk_head_dim=self.head_dim,
+           v_head_dim=self.head_dim,
+           mla_original_qk_head_dim=0,
+           mla_original_v_head_dim=0,
            rope_mode=RopeMode.NORMAL,
            rope_scale=1,
            rope_theta=self.rope_theta,

python/tvm/relax/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@
 )
 
 # pipeline
+from .pipeline import get_default_pipeline
 from .pipeline import get_pipeline
 from .pipeline import register_pipeline

python/tvm/relax/backend/cuda/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """The Relax CUDA backend compilation pipeline and other passes."""
+from . import flashinfer
 from .pipeline import (
     finalize_passes,
     get_default_pipeline,
