diff --git a/docs/examples/isa_optimization/Dockerfile b/docs/examples/isa_optimization/Dockerfile new file mode 100644 index 0000000000..ff7f7de578 --- /dev/null +++ b/docs/examples/isa_optimization/Dockerfile @@ -0,0 +1,30 @@ +FROM rocm/dev-ubuntu-24.04:7.2.1-complete + +# LLVM tools are already at /opt/rocm/llvm/bin/ in this image +# rocprofv3 is already at /opt/rocm/bin/ + +# Install Python deps and AITER +RUN pip install --no-cache-dir aiter && \ + pip install --no-cache-dir pandas matplotlib + +# Build ATT trace decoder from source (not shipped as pre-built binary) +RUN git clone --depth 1 --branch develop \ + https://github.com/ROCm/rocm-systems.git /tmp/rocm-systems && \ + cd /tmp/rocm-systems/projects/rocprof-trace-decoder && \ + cmake -B build -DCMAKE_INSTALL_PREFIX=/opt/rocm && \ + cmake --build build -j$(nproc) && \ + cmake --install build && \ + rm -rf /tmp/rocm-systems + +# Copy example scripts +COPY extract_asm.py analyze_kernel.py roundtrip.sh /workspace/ +RUN chmod +x /workspace/roundtrip.sh + +WORKDIR /workspace + +# Verify tools are available +RUN /opt/rocm/llvm/bin/llvm-objdump --version | head -1 && \ + /opt/rocm/llvm/bin/clang++ --version | head -1 && \ + rocprofv3 --version 2>&1 | head -1 || true + +CMD ["/bin/bash"] diff --git a/docs/examples/isa_optimization/README.md b/docs/examples/isa_optimization/README.md new file mode 100644 index 0000000000..cbf7e99c9e --- /dev/null +++ b/docs/examples/isa_optimization/README.md @@ -0,0 +1,69 @@ +# ISA Kernel Optimization Examples + +Code examples for the [ISA-Level Kernel Optimization Guide](../../isa_kernel_optimization.md). + +## Quick Start + +### Using Docker (recommended) + +```bash +docker build -t aiter-isa-opt . 
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video \ + aiter-isa-opt +``` + +Inside the container: + +```bash +# Full round-trip on a PA kernel +./roundtrip.sh /path/to/kernel.co --mcpu gfx942 + +# Analyze instruction mix of a .co file +python3 analyze_kernel.py isa /path/to/kernel.co --mcpu gfx942 + +# Profile with rocprofv3 and analyze results +rocprofv3 --kernel-trace -d ./profile_out -- python3 your_benchmark.py +python3 analyze_kernel.py profile ./profile_out --filter "pa_" +``` + +### Without Docker + +Requires ROCm 6.x+ installed with LLVM tools and rocprofv3. + +```bash +# Run the round-trip script directly +./roundtrip.sh kernel.co --mcpu gfx942 + +# Or use the individual scripts +python3 extract_asm.py kernel.isa SYMBOL_NAME --target amdgcn-amd-amdhsa--gfx942 -o kernel.s +python3 analyze_kernel.py isa kernel.co +``` + +## Scripts + +| Script | Purpose | +|--------|---------| +| `extract_asm.py` | Extract reassemblable `.s` from `llvm-objdump -d` output | +| `analyze_kernel.py` | ISA instruction mix analysis and rocprofv3 profile parsing | +| `roundtrip.sh` | End-to-end round-trip: disassemble, extract, recompile, verify | +| `Dockerfile` | Development environment with all tools pre-installed | + +## Workflow + +``` +kernel.co + │ + ├─ llvm-objdump -d ──► kernel.isa + │ │ + │ extract_asm.py ──► kernel.s + │ │ + │ (edit ISA here) + │ │ + │ clang++ -x assembler ──► recompiled.co + │ │ + │ llvm-objcopy -O binary -j .text ──► recompiled_text.bin + │ │ + └─── cp ──► modified.co ◄── llvm-objcopy --update-section .text=recompiled_text.bin + │ + (loadable, with original metadata preserved) +``` diff --git a/docs/examples/isa_optimization/analyze_kernel.py b/docs/examples/isa_optimization/analyze_kernel.py new file mode 100644 index 0000000000..6cc7fd82a9 --- /dev/null +++ b/docs/examples/isa_optimization/analyze_kernel.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +"""Analyze AITER kernel ISA and rocprofv3 profiling results. + +Two modes: + 1. 
ISA analysis — instruction mix breakdown from a .co file + 2. Profile analysis — parse rocprofv3 --kernel-trace SQLite output + +Usage: + # Analyze a .co file directly + python3 analyze_kernel.py isa kernel.co --mcpu gfx942 + + # Analyze rocprofv3 results + rocprofv3 --kernel-trace -d ./profile_out -- python bench.py + python3 analyze_kernel.py profile ./profile_out + + # Filter profile results by kernel name pattern + python3 analyze_kernel.py profile ./profile_out --filter "pa_" +""" + +import argparse +import glob +import os +import re +import sqlite3 +import subprocess +import sys + + +def analyze_isa(co_path: str, mcpu: str): + """Disassemble a .co and print instruction mix analysis.""" + objdump = "/opt/rocm/llvm/bin/llvm-objdump" + if not os.path.exists(objdump): + print(f"Error: {objdump} not found. Is ROCm installed?", file=sys.stderr) + sys.exit(1) + + result = subprocess.run( + [objdump, "-d", f"--mcpu={mcpu}", co_path], capture_output=True, text=True + ) + if result.returncode != 0: + print(f"Error: llvm-objdump failed:\n{result.stderr}", file=sys.stderr) + sys.exit(1) + + lines = result.stdout.splitlines() + + # Find kernel symbols + symbols = [] + for line in lines: + m = re.match(r"[0-9a-fA-F]+ <(.+)>:", line) + if m: + symbols.append(m.group(1)) + + # Count instructions by category + instructions = [ln for ln in lines if re.match(r"\s+[0-9a-f]+:", ln)] + categories = { + "MFMA (matrix)": [ln for ln in instructions if "v_mfma_" in ln], + "Buffer load": [ln for ln in instructions if "buffer_load" in ln], + "Buffer store": [ln for ln in instructions if "buffer_store" in ln], + "Global load": [ln for ln in instructions if "global_load" in ln], + "Global store": [ln for ln in instructions if "global_store" in ln], + "LDS (ds_*)": [ln for ln in instructions if re.search(r"\bds_", ln)], + "DPP (*_dpp)": [ln for ln in instructions if "_dpp" in ln], + "Scalar (s_*)": [ln for ln in instructions if re.search(r"\bs_\w+", ln)], + "Vector ALU": [ln for ln in 
instructions if re.search(r"\bv_(?!mfma_)", ln)], + "Wait states": [ + ln for ln in instructions if "s_waitcnt" in ln or "s_nop" in ln + ], + } + + print(f"File: {co_path}") + print(f"Architecture: {mcpu}") + print(f"Kernel symbols: {len(symbols)}") + for s in symbols: + print(f" {s}") + print(f"\nTotal instructions: {len(instructions)}") + print(f"\n{'Category':<25s} {'Count':>8s} {'Pct':>8s}") + print("-" * 43) + for cat, matches in categories.items(): + pct = len(matches) / len(instructions) * 100 if instructions else 0 + print(f" {cat:<23s} {len(matches):>8d} {pct:>7.1f}%") + + # Compute-to-memory ratio + n_compute = len(categories["MFMA (matrix)"]) + len(categories["Vector ALU"]) + n_memory = ( + len(categories["Buffer load"]) + + len(categories["Buffer store"]) + + len(categories["Global load"]) + + len(categories["Global store"]) + ) + if n_memory > 0: + print(f"\nCompute/Memory ratio: {n_compute / n_memory:.2f}") + print() + + +def analyze_profile(profile_dir: str, name_filter: str | None = None): + """Parse rocprofv3 --kernel-trace SQLite output.""" + db_files = glob.glob(os.path.join(profile_dir, "**/*results.db"), recursive=True) + if not db_files: + # Try flat directory + db_files = glob.glob(os.path.join(profile_dir, "*.db")) + if not db_files: + print(f"Error: no .db file found in {profile_dir}", file=sys.stderr) + sys.exit(1) + + db_path = db_files[0] + print(f"Database: {db_path}\n") + + conn = sqlite3.connect(db_path) + c = conn.cursor() + + # Find table names (they have UUID suffixes) + c.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = [row[0] for row in c.fetchall()] + + dispatch_table = next((t for t in tables if "kernel_dispatch" in t.lower()), None) + symbol_table = next((t for t in tables if "kernel_symbol" in t.lower()), None) + + if not dispatch_table or not symbol_table: + print( + "Error: expected kernel_dispatch and kernel_symbol tables", file=sys.stderr + ) + print(f"Available tables: {tables}", file=sys.stderr) 
+ sys.exit(1) + + # Build query. The --filter text is bound as a SQL parameter (never spliced into the statement), so quotes or "ks." inside it cannot alter the query. + where_clause, params = "", () + if name_filter: + where_clause, params = "WHERE ks.kernel_name LIKE ?", (f"%{name_filter}%",) + + c.execute(f""" + SELECT ks.kernel_name, COUNT(*) as cnt, + AVG(d.end - d.start) as avg_ns, + MIN(d.end - d.start) as min_ns, + MAX(d.end - d.start) as max_ns + FROM {dispatch_table} d + JOIN {symbol_table} ks ON d.kernel_id = ks.id + {where_clause} + GROUP BY ks.kernel_name + ORDER BY avg_ns DESC + """, params) + rows = c.fetchall() + + if not rows: + print("No kernel dispatches found.") + conn.close() + return + + print( + f"{'Kernel':<70s} {'Count':>6s} {'Avg(us)':>10s} {'Min(us)':>10s} {'Max(us)':>10s}" + ) + print("-" * 100) + total_ns = 0 + total_dispatches = 0 + for name, cnt, avg, mn, mx in rows: + display = name[:68] + print( + f" {display:<70s} {cnt:>6d} {avg/1000:>10.1f} {mn/1000:>10.1f} {mx/1000:>10.1f}" + ) + total_ns += avg * cnt + total_dispatches += cnt + + print(f"\nTotal dispatches: {total_dispatches}") + print(f"Total GPU time: {total_ns/1e6:.2f} ms") + + # Register usage (if available in symbol table) + try: + c.execute(f"PRAGMA table_info({symbol_table})") + columns = [row[1] for row in c.fetchall()] + if "arch_vgpr_count" in columns: + print( + f"\n{'Kernel':<50s} {'VGPR':>6s} {'AGPR':>6s} {'SGPR':>6s} {'LDS':>8s}" + ) + print("-" * 78) + c.execute(f""" + SELECT DISTINCT kernel_name, arch_vgpr_count, accum_vgpr_count, + sgpr_count, group_segment_size + FROM {symbol_table} + {where_clause.replace('ks.', '')} + ORDER BY kernel_name + """, params) + for name, vgpr, agpr, sgpr, lds in c.fetchall(): + display = name[:48] + print(f" {display:<50s} {vgpr:>6d} {agpr:>6d} {sgpr:>6d} {lds:>8d}") + except Exception as exc: # best-effort section: keep going, but report why it was skipped + print(f"Warning: register usage unavailable: {exc}", file=sys.stderr) + + conn.close() + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze AITER kernel ISA or rocprofv3 profile results" + ) + sub = parser.add_subparsers(dest="mode", required=True) + + # ISA mode + p_isa = sub.add_parser("isa", help="Analyze .co file instruction mix") + 
p_isa.add_argument("co_file", help="Path to .co kernel object") + p_isa.add_argument( + "--mcpu", default="gfx942", help="GPU architecture (default: gfx942)" + ) + + # Profile mode + p_prof = sub.add_parser("profile", help="Analyze rocprofv3 --kernel-trace output") + p_prof.add_argument("profile_dir", help="rocprofv3 output directory") + p_prof.add_argument("--filter", help="Filter kernels by name substring") + + args = parser.parse_args() + + if args.mode == "isa": + analyze_isa(args.co_file, args.mcpu) + elif args.mode == "profile": + analyze_profile(args.profile_dir, args.filter) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/isa_optimization/extract_asm.py b/docs/examples/isa_optimization/extract_asm.py new file mode 100644 index 0000000000..b25de3c583 --- /dev/null +++ b/docs/examples/isa_optimization/extract_asm.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Extract reassemblable .s from llvm-objdump -d output. + +Handles branch label resolution using word-offset addressing: + target_address = base_address + label_value * 4 + +Usage: + # Step 1: Disassemble a .co file + /opt/rocm/llvm/bin/llvm-objdump -d --mcpu=gfx942 kernel.co > kernel.isa + + # Step 2: Find the kernel symbol + grep "^[0-9a-f]" kernel.isa | head -1 + # Example: 0000000000001000 <_ZN5aiter...E>: + + # Step 3: Extract reassemblable .s + python3 extract_asm.py kernel.isa _ZN5aiter...E > kernel.s + + # Step 4: Recompile + /opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn-amd-amdhsa \\ + -mcpu=gfx942 -o kernel_recompiled.co kernel.s +""" + +import re +import sys +import argparse + + +def extract(isa_path: str, kernel_symbol: str, target: str) -> str: + """Parse llvm-objdump output and emit a reassemblable .s file.""" + with open(isa_path) as f: + lines = f.readlines() + + # Find kernel section + section_start = None + for i, line in enumerate(lines): + if f"<{kernel_symbol}>:" in line: + section_start = i + break + if section_start is None: + print( + f"Error: kernel 
symbol '{kernel_symbol}' not found in {isa_path}", + file=sys.stderr, + ) + # List available symbols + symbols = [] + for line in lines: + m = re.match(r"[0-9a-fA-F]+ <(.+)>:", line) + if m: + symbols.append(m.group(1)) + if symbols: + print("Available symbols:", file=sys.stderr) + for s in symbols: + print(f" {s}", file=sys.stderr) + sys.exit(1) + + # Parse base address from first instruction + first_instr_line = lines[section_start + 1].strip() + base_addr = int(first_instr_line.split(":")[0].strip(), 16) + + # Collect instructions + instructions = [] + for line in lines[section_start + 1 :]: + stripped = line.strip() + if not stripped or stripped.startswith("Disassembly"): + break + # Match: " addr: hex_bytes instruction" + m = re.match(r"\s*([0-9a-fA-F]+):\s+(?:[0-9a-fA-F]+\s+)+(.+)", stripped) + if m: + addr = int(m.group(1), 16) + instr = m.group(2).strip() + # Remove trailing hex comments (e.g., "// 000000001234") + instr = re.sub(r"\s*//\s*[0-9A-Fa-f]+$", "", instr) + instructions.append((addr, instr)) + + if not instructions: + print(f"Error: no instructions found for '{kernel_symbol}'", file=sys.stderr) + sys.exit(1) + + # Resolve branch labels (word offset: label_val * 4 + base_addr) + branch_targets = {} + for addr, instr in instructions: + m = re.search(r"label_([0-9A-Fa-f]+)", instr) + if m: + label_val = int(m.group(1), 16) + target_addr = base_addr + label_val * 4 + branch_targets[target_addr] = f"label_{m.group(1)}" + + # Emit .s file + output = [] + output.append(f' .amdgcn_target "{target}"') + output.append(f" .globl {kernel_symbol}") + output.append(f" .type {kernel_symbol}, @function") + output.append(f"{kernel_symbol}:") + + for addr, instr in instructions: + if addr in branch_targets: + output.append(f"{branch_targets[addr]}:") + output.append(f" {instr}") + + output.append(f" .size {kernel_symbol}, .-{kernel_symbol}") + + # Summary stats + n_mfma = sum(1 for _, i in instructions if "v_mfma_" in i) + n_buf = sum(1 for _, i in instructions if 
"buffer_load" in i) + n_lds = sum(1 for _, i in instructions if i.startswith("ds_")) + n_branch = len(branch_targets) + print( + f"Extracted {len(instructions)} instructions " + f"(MFMA={n_mfma}, buf_load={n_buf}, LDS={n_lds}, branches={n_branch})", + file=sys.stderr, + ) + + return "\n".join(output) + + +def list_symbols(isa_path: str) -> list[str]: + """List all kernel symbols in an llvm-objdump output file.""" + symbols = [] + with open(isa_path) as f: + for line in f: + m = re.match(r"[0-9a-fA-F]+ <(.+)>:", line) + if m: + symbols.append(m.group(1)) + return symbols + + +def main(): + parser = argparse.ArgumentParser( + description="Extract reassemblable .s from llvm-objdump output" + ) + parser.add_argument("isa_file", help="llvm-objdump -d output file") + parser.add_argument( + "kernel_symbol", + nargs="?", + help="Kernel symbol name (omit to list available symbols)", + ) + parser.add_argument( + "--target", + default="amdgcn-amd-amdhsa--gfx942", + help="AMDGCN target triple (default: amdgcn-amd-amdhsa--gfx942)", + ) + parser.add_argument("-o", "--output", help="Output .s file (default: stdout)") + args = parser.parse_args() + + if not args.kernel_symbol: + symbols = list_symbols(args.isa_file) + print("Available kernel symbols:") + for s in symbols: + print(f" {s}") + return + + result = extract(args.isa_file, args.kernel_symbol, args.target) + + if args.output: + with open(args.output, "w") as f: + f.write(result + "\n") + print(f"Written to {args.output}", file=sys.stderr) + else: + print(result) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/isa_optimization/roundtrip.sh b/docs/examples/isa_optimization/roundtrip.sh new file mode 100644 index 0000000000..8d5ce400f3 --- /dev/null +++ b/docs/examples/isa_optimization/roundtrip.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# ISA Round-Trip: disassemble -> extract .s -> recompile -> verify +# +# Demonstrates the full workflow from the isa_kernel_optimization.md guide. 
+# Uses a PA kernel from AITER as a concrete example. +# +# Usage: +# ./roundtrip.sh [--mcpu gfx942] +# +# The script will: +# 1. Disassemble the .co to ISA text +# 2. List available kernel symbols +# 3. Extract reassemblable .s for each symbol +# 4. Recompile to a new .co +# 5. Verify .text section is binary-identical +# 6. Produce a loadable .co via llvm-objcopy --update-section + +set -euo pipefail + +LLVM_BIN="${ROCM_PATH:-/opt/rocm}/llvm/bin" +OBJDUMP="$LLVM_BIN/llvm-objdump" +OBJCOPY="$LLVM_BIN/llvm-objcopy" +CLANGXX="$LLVM_BIN/clang++" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# ---------- Parse args ---------- +CO_FILE="" +MCPU="gfx942" +while [[ $# -gt 0 ]]; do + case $1 in + --mcpu) MCPU="$2"; shift 2 ;; + --help|-h) + echo "Usage: $0 [--mcpu gfx942]" + exit 0 ;; + *) CO_FILE="$1"; shift ;; + esac +done + +if [[ -z "$CO_FILE" ]]; then + # Default: find a PA kernel from AITER + AITER_PATH=$(python3 -c "import aiter; print(aiter.__path__[0])" 2>/dev/null || true) + if [[ -n "$AITER_PATH" ]]; then + CO_FILE=$(find "$AITER_PATH" -path "*/hsa/${MCPU}/pa/*.co" | head -1) + fi + if [[ -z "$CO_FILE" ]]; then + echo "Error: no .co file specified and no AITER PA kernel found for $MCPU" + echo "Usage: $0 [--mcpu gfx942]" + exit 1 + fi + echo "Using AITER kernel: $CO_FILE" +fi + +if [[ ! 
-f "$CO_FILE" ]]; then + echo "Error: $CO_FILE not found" + exit 1 +fi + +WORKDIR=$(mktemp -d -t isa_roundtrip.XXXXXX) +echo "Working directory: $WORKDIR" +echo "Target architecture: $MCPU" +echo + +# ---------- Step 1: Disassemble ---------- +echo "=== Step 1: Disassemble ===" +ISA_FILE="$WORKDIR/kernel.isa" +"$OBJDUMP" -d --mcpu="$MCPU" "$CO_FILE" > "$ISA_FILE" +TOTAL_INSTR=$(grep -cE '^\s+[0-9a-f]+:' "$ISA_FILE" || true) +echo "Total instructions: $TOTAL_INSTR" + +# Quick instruction stats +echo " MFMA ops: $(grep -c 'v_mfma_' "$ISA_FILE" || true)" +echo " Buffer loads: $(grep -c 'buffer_load' "$ISA_FILE" || true)" +echo " LDS ops: $(grep -c 'ds_' "$ISA_FILE" || true)" +echo " DPP ops: $(grep -c '_dpp' "$ISA_FILE" || true)" +echo + +# ---------- Step 2: List symbols ---------- +echo "=== Step 2: Kernel symbols ===" +SYMBOLS=$(grep -oP '(?<=<).+?(?=>:)' "$ISA_FILE" || true) +FIRST_SYMBOL=$(echo "$SYMBOLS" | head -1) +echo "$SYMBOLS" | while read -r sym; do + echo " $sym" +done +echo + +if [[ -z "$FIRST_SYMBOL" ]]; then + echo "Error: no kernel symbols found" + exit 1 +fi + +# ---------- Step 3: Extract .s ---------- +echo "=== Step 3: Extract reassemblable .s ===" +TARGET="amdgcn-amd-amdhsa--${MCPU}" +S_FILE="$WORKDIR/kernel.s" +python3 "$SCRIPT_DIR/extract_asm.py" "$ISA_FILE" "$FIRST_SYMBOL" \ + --target "$TARGET" -o "$S_FILE" +echo + +# ---------- Step 4: Recompile ---------- +echo "=== Step 4: Recompile ===" +RECOMP_CO="$WORKDIR/kernel_recompiled.co" +"$CLANGXX" -x assembler -target amdgcn-amd-amdhsa \ + -mcpu="$MCPU" -o "$RECOMP_CO" "$S_FILE" +echo "Recompiled: $RECOMP_CO" +echo + +# ---------- Step 5: Verify .text ---------- +echo "=== Step 5: Verify .text section ===" +ORIG_TEXT="$WORKDIR/original_text.bin" +RECOMP_TEXT="$WORKDIR/recompiled_text.bin" +"$OBJCOPY" -O binary -j .text "$CO_FILE" "$ORIG_TEXT" +"$OBJCOPY" -O binary -j .text "$RECOMP_CO" "$RECOMP_TEXT" + +ORIG_MD5=$(md5sum "$ORIG_TEXT" | cut -d' ' -f1) +RECOMP_MD5=$(md5sum "$RECOMP_TEXT" | cut -d' 
' -f1) +echo "Original .text: $ORIG_MD5 ($(wc -c < "$ORIG_TEXT") bytes)" +echo "Recompiled .text: $RECOMP_MD5 ($(wc -c < "$RECOMP_TEXT") bytes)" + +if [[ "$ORIG_MD5" == "$RECOMP_MD5" ]]; then + echo "PASS: .text sections are binary-identical" +else + echo "FAIL: .text sections differ" + echo " Run: diff <(xxd $ORIG_TEXT) <(xxd $RECOMP_TEXT) | head -20" + exit 1 +fi +echo + +# ---------- Step 6: Produce loadable .co ---------- +echo "=== Step 6: Produce loadable .co ===" +LOADABLE_CO="$WORKDIR/kernel_modified.co" +cp "$CO_FILE" "$LOADABLE_CO" +"$OBJCOPY" --update-section .text="$RECOMP_TEXT" "$LOADABLE_CO" +echo "Loadable kernel: $LOADABLE_CO" +echo " Original size: $(wc -c < "$CO_FILE") bytes" +echo " Modified size: $(wc -c < "$LOADABLE_CO") bytes" +echo + +echo "=== Round-trip complete ===" +echo +echo "Files in $WORKDIR:" +ls -la "$WORKDIR" +echo +echo "Next steps:" +echo " 1. Edit $S_FILE to modify ISA instructions" +echo " 2. Recompile: $CLANGXX -x assembler -target amdgcn-amd-amdhsa -mcpu=$MCPU -o new.co $S_FILE" +echo " 3. Inject: cp $CO_FILE modified.co && $OBJCOPY --update-section .text=new_text.bin modified.co" +echo " 4. Benchmark the modified kernel against the original" diff --git a/docs/isa_kernel_optimization.md b/docs/isa_kernel_optimization.md new file mode 100644 index 0000000000..abc80a6731 --- /dev/null +++ b/docs/isa_kernel_optimization.md @@ -0,0 +1,475 @@ +# ISA-Level Kernel Optimization with LLVM Tools + +A guide to inspecting, analyzing, modifying, and recompiling AITER GPU kernel ISA using the ROCm LLVM toolchain. + +> **Code examples and Dockerfile:** See [`docs/examples/isa_optimization/`](examples/isa_optimization/) for runnable scripts and a Docker development environment. + +## Overview + +AITER ships optimized GPU kernels as compiled code objects (`.co` files). Sometimes you need to go deeper than source-level optimization. This guide shows how to: + +1. Disassemble a `.co` kernel to human-readable ISA +2. 
Analyze instruction mix (MFMA, memory, LDS, DPP) +3. Extract a reassemblable `.s` file +4. Modify ISA instructions and recompile +5. Profile kernel performance with `rocprofv3` + +All tools used are open-source ROCm components. No proprietary tools required. + +## Prerequisites + +- ROCm 6.x or later (tested on ROCm 7.2.1) +- LLVM tools: `llvm-objdump`, `llvm-objcopy`, `clang++` (shipped with ROCm at `/opt/rocm/llvm/bin/`) +- `rocprofv3` (shipped with ROCm at `/opt/rocm/bin/`) +- Python 3.10+ (the example scripts use `X | None` and `list[str]` type annotations) +- An AMD GPU (gfx90a, gfx942, or newer) + +## Step 1: Locate the Kernel Object + +AITER kernel `.co` files are typically found in the build directory or installed package: + +```bash +# Find compiled kernel objects +find $(python -c "import aiter; print(aiter.__path__[0])") -name "*.co" | head -20 + +# Or look in the HSA directory +ls aiter/hsa/ +``` + +For this guide, we'll use a Paged Attention kernel as an example: + +```bash +KERNEL_CO="pa_bf16_pertokenFp8_gqa16_2tg_4w.co" +``` + +## Step 2: Disassemble to ISA + +Use `llvm-objdump` to produce a full disassembly: + +```bash +/opt/rocm/llvm/bin/llvm-objdump -d --mcpu=gfx942 $KERNEL_CO > kernel.isa +``` + +Replace `gfx942` with your target GPU architecture. 
+ +### Quick ISA Analysis + +Count key instruction types to understand the kernel profile: + +```bash +# Instruction statistics +echo "Total instructions: $(grep -cE '^\s+[0-9a-f]+:' kernel.isa)" +echo "MFMA (matrix) ops: $(grep -c 'v_mfma_' kernel.isa)" +echo "Buffer loads: $(grep -c 'buffer_load' kernel.isa)" +echo "LDS ops: $(grep -c 'ds_' kernel.isa)" +echo "DPP ops: $(grep -c '_dpp' kernel.isa)" +echo "Scalar ops: $(grep -c '^[[:space:]]*s_' kernel.isa)" +``` + +### Read Kernel Metadata + +```bash +# Extract register usage and resource requirements +/opt/rocm/llvm/bin/llvm-objdump --mcpu=gfx942 -s -j .note $KERNEL_CO +``` + +Key metrics to look for: +- **SGPRs / VGPRs**: Register pressure limits occupancy +- **LDS size**: Shared memory per workgroup +- **Wavefront size**: 32 or 64 + +## Step 3: Extract Reassemblable Assembly + +The raw `llvm-objdump` output is not directly reassemblable. A Python extraction script converts it to a valid `.s` file. + +Create `extract_asm.py`: + +```python +#!/usr/bin/env python3 +"""Extract reassemblable .s from llvm-objdump -d output. 
+ +Handles branch label resolution using word-offset addressing: + target_address = base_address + label_value * 4 +""" +import re +import sys + +def extract(isa_path, kernel_symbol, target="amdgcn-amd-amdhsa--gfx942"): + with open(isa_path) as f: + lines = f.readlines() + + # Find kernel section + section_start = None + for i, line in enumerate(lines): + if f"<{kernel_symbol}>:" in line: + section_start = i + break + if section_start is None: + print(f"Kernel symbol '{kernel_symbol}' not found", file=sys.stderr) + sys.exit(1) + + # Parse instruction address from first line + first_instr_line = lines[section_start + 1].strip() + base_addr = int(first_instr_line.split(":")[0].strip(), 16) + + # Collect instructions + instructions = [] + for line in lines[section_start + 1:]: + stripped = line.strip() + if not stripped or stripped.startswith("Disassembly"): + break + m = re.match(r"\s*([0-9a-fA-F]+):\s+(?:[0-9a-fA-F]+\s+)+(.+)", stripped) + if m: + addr = int(m.group(1), 16) + instr = m.group(2).strip() + # Remove trailing hex comments + instr = re.sub(r"\s*//\s*[0-9A-Fa-f]+$", "", instr) + instructions.append((addr, instr)) + + # Resolve branch labels (word offset: label_val * 4 + base_addr) + branch_targets = {} + for addr, instr in instructions: + m = re.search(r"label_([0-9A-Fa-f]+)", instr) + if m: + label_val = int(m.group(1), 16) + target_addr = base_addr + label_val * 4 + branch_targets[target_addr] = f"label_{m.group(1)}" + + # Emit .s file + output = [] + output.append(f' .amdgcn_target "{target}"') + output.append(f" .globl {kernel_symbol}") + output.append(f" .type {kernel_symbol}, @function") + output.append(f"{kernel_symbol}:") + + for addr, instr in instructions: + if addr in branch_targets: + output.append(f"{branch_targets[addr]}:") + output.append(f" {instr}") + + output.append(f" .size {kernel_symbol}, .-{kernel_symbol}") + return "\n".join(output) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} [target]") + 
sys.exit(1) + + isa_file = sys.argv[1] + symbol = sys.argv[2] + target = sys.argv[3] if len(sys.argv) > 3 else "amdgcn-amd-amdhsa--gfx942" + + result = extract(isa_file, symbol, target) + print(result) +``` + +Run the extraction: + +```bash +# Find the kernel symbol name +grep "^[0-9a-f]" kernel.isa | head -1 +# Example output: 0000000000001000 <_ZN5aiter32pa_bf16_pertokenFp8_gqa16_2tg_4wE>: + +# Extract reassemblable .s (pass the symbol exactly as printed between < and >, including the trailing E) +python3 extract_asm.py kernel.isa \ + _ZN5aiter32pa_bf16_pertokenFp8_gqa16_2tg_4wE > kernel_roundtrip.s +``` + +## Step 4: Recompile and Verify + +Recompile the `.s` file back to a `.co`: + +```bash +/opt/rocm/llvm/bin/clang++ \ + -x assembler \ + -target amdgcn-amd-amdhsa \ + -mcpu=gfx942 \ + -o kernel_recompiled.co \ + kernel_roundtrip.s +``` + +### Verify Binary Equivalence + +Compare the `.text` section of the original and recompiled kernels: + +```bash +# Extract .text sections +/opt/rocm/llvm/bin/llvm-objcopy -O binary -j .text $KERNEL_CO original_text.bin +/opt/rocm/llvm/bin/llvm-objcopy -O binary -j .text kernel_recompiled.co recompiled_text.bin + +# Compare +md5sum original_text.bin recompiled_text.bin +diff <(xxd original_text.bin) <(xxd recompiled_text.bin) && echo "IDENTICAL" || echo "DIFFERS" +``` + +A successful round-trip produces identical `.text` sections. Metadata sections may differ (they are regenerated by the assembler), but the executable code is bit-exact. + +### Producing a Loadable Kernel Object + +The recompiled `.co` from Step 4 has a minimal `.note` section and may fail to load with `hipModuleLoad` ("no kernel image available"). The original `.co` contains rich AMDHSA metadata (kernel arguments, register counts, LDS size) that the HIP runtime requires. 
+ +To produce a loadable `.co`, inject the modified `.text` section back into the original kernel object: + +```bash +# Copy original .co (preserves all metadata) +cp $KERNEL_CO kernel_modified.co + +# Replace only the .text section with recompiled code +/opt/rocm/llvm/bin/llvm-objcopy --update-section .text=recompiled_text.bin kernel_modified.co +``` + +This preserves the original kernel descriptor, argument metadata, and ELF structure while swapping in the new executable code. + +### Verifying Performance Equivalence + +After swapping, benchmark both versions to confirm identical performance: + +```bash +# Benchmark original +cp original_kernel.co $INSTALL_PATH/kernel.co +python benchmark.py # record time + +# Benchmark modified +cp kernel_modified.co $INSTALL_PATH/kernel.co +python benchmark.py # compare time +``` + +On a PA decode kernel (bf16+fp8, GQA16, SEQ=4096), 3 runs of 500 iterations each showed original vs recompiled within ±3% noise — confirming zero performance regression from the round-trip. + +## Step 5: Modify and Iterate + +With a working round-trip established, you can now modify the `.s` file: + +### Common ISA Optimizations + +**Instruction scheduling** — Fill MFMA co-execution slots with independent operations: + +```asm +; Before: MFMA followed by wait +v_mfma_f32_32x32x16_bf16 a[0:15], v[0:1], v[2:3], a[0:15] +s_nop 7 ; wasted cycles + +; After: Fill with independent work +v_mfma_f32_32x32x16_bf16 a[0:15], v[0:1], v[2:3], a[0:15] +buffer_load_dwordx4 v[8:11], v4, s[0:3], 0 ; prefetch next tile +ds_read_b128 v[12:15], v5 ; load from LDS +``` + +**Register pressure reduction** — Reuse registers to improve occupancy: + +```asm +; Identify dead registers after their last use +; and reassign them for new values +``` + +**Memory access patterns** — Optimize buffer load/store coalescing and LDS bank conflicts. 
+ +After modifying, recompile, re-inject the `.text` section into the original kernel object, and benchmark: + +```bash +# Recompile modified kernel +/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn-amd-amdhsa \ + -mcpu=gfx942 -o kernel_recompiled.co kernel_modified.s + +# Inject its .text into a copy of the original .co (see "Producing a Loadable Kernel Object"), install that copy in AITER, and re-run the benchmark +``` + +## Step 6: Profile with rocprofv3 + +### Kernel-Level Tracing + +Collect per-kernel dispatch timing with `--kernel-trace`: + +```bash +rocprofv3 --kernel-trace -d ./profile_out -- python your_benchmark.py +``` + +This produces a SQLite database (`.db`) under the output directory with all kernel dispatches, including timestamps, grid dimensions, and kernel metadata. + +### Output Formats + +rocprofv3 supports multiple output formats: + +```bash +# CSV (human-readable, easy to grep) +rocprofv3 --kernel-trace -f csv -d ./profile_out -- python your_benchmark.py + +# JSON +rocprofv3 --kernel-trace -f json -d ./profile_out -- python your_benchmark.py + +# Perfetto trace (open in https://ui.perfetto.dev) +rocprofv3 --kernel-trace -f pftrace -d ./profile_out -- python your_benchmark.py +``` + +### Querying the SQLite Database + +The default output is a `.db` file. Table names include a UUID suffix. 
Use Python to query: + +```python +import sqlite3, glob + +db_path = glob.glob("profile_out/**/*results.db", recursive=True)[0] +conn = sqlite3.connect(db_path) +c = conn.cursor() + +# Find table names (they have UUID suffixes) +c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%kernel_dispatch%'") +dispatch_table = c.fetchone()[0] + +c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%kernel_symbol%'") +symbol_table = c.fetchone()[0] + +# Top kernels by average duration +c.execute(f""" + SELECT ks.kernel_name, COUNT(*) as cnt, + AVG(d.end - d.start) as avg_ns, + MIN(d.end - d.start) as min_ns, + MAX(d.end - d.start) as max_ns + FROM {dispatch_table} d + JOIN {symbol_table} ks ON d.kernel_id = ks.id + GROUP BY ks.kernel_name + ORDER BY avg_ns DESC LIMIT 10 +""") +print(f"{'Kernel':<60s} {'Count':>6s} {'Avg(us)':>10s} {'Min(us)':>10s} {'Max(us)':>10s}") +for name, cnt, avg, mn, mx in c.fetchall(): + print(f" {name[:58]:<60s} {cnt:>6d} {avg/1000:>10.1f} {mn/1000:>10.1f} {mx/1000:>10.1f}") +``` + +### Filtering by Kernel Name + +To focus on a specific kernel (e.g., paged attention): + +```python +# Filter dispatches for PA kernels only +c.execute(f""" + SELECT ks.kernel_name, COUNT(*) as cnt, + AVG(d.end - d.start) as avg_ns, + ks.arch_vgpr_count, ks.accum_vgpr_count, + ks.sgpr_count, ks.group_segment_size + FROM {dispatch_table} d + JOIN {symbol_table} ks ON d.kernel_id = ks.id + WHERE ks.kernel_name LIKE '%paged_attn%' + OR ks.kernel_name LIKE '%pa_%' + GROUP BY ks.kernel_name + ORDER BY avg_ns DESC +""") +for name, cnt, avg, vgpr, agpr, sgpr, lds in c.fetchall(): + print(f" {name[:70]}") + print(f" dispatches={cnt}, avg={avg/1000:.1f} us, " + f"VGPR={vgpr}, AGPR={agpr}, SGPR={sgpr}, LDS={lds}") +``` + +### Combining Tracing Modes + +Collect kernel, memory copy, and HIP runtime traces together for a full picture: + +```bash +rocprofv3 --kernel-trace --memory-copy-trace --hip-trace \ + -d ./profile_out -- python 
your_benchmark.py +``` + +### Comparing Original vs Modified Kernel + +After modifying the ISA and recompiling (Step 5), re-run the benchmark under `--kernel-trace` and compare: + +```bash +# Profile original +rocprofv3 --kernel-trace -d ./profile_original -- python benchmark.py + +# Swap in modified .co, profile again +rocprofv3 --kernel-trace -d ./profile_modified -- python benchmark.py + +# Compare average kernel durations +``` + +### Advanced Thread Trace (ATT) + +ATT captures instruction-level cycle counts, enabling precise bottleneck identification. It requires the `rocprof-trace-decoder` library, which is not shipped as a pre-built binary in ROCm 7.2.x but can be built from source: + +```bash +# Build and install rocprof-trace-decoder +git clone --depth 1 --branch develop https://github.com/ROCm/rocm-systems.git +cd rocm-systems/projects/rocprof-trace-decoder +cmake -B build -DCMAKE_INSTALL_PREFIX=/opt/rocm +cmake --build build -j$(nproc) +cmake --install build # installs librocprof-trace-decoder.so to /opt/rocm/lib +``` + +Once installed, run ATT: + +```bash +# Trace a single compute unit (CU 1) +rocprofv3 --att --att-target-cu 1 --kernel-trace \ + -d ./att_output -- python your_benchmark.py +``` + +This produces: +- `.att` files — raw per-wave binary trace data +- `.out` files — disassembled code objects for each kernel +- `results.db` — kernel dispatch database (same as `--kernel-trace`) + +ATT options: + +| Flag | Default | Description | +|------|---------|-------------| +| `--att-target-cu CU_ID` | 1 | Which compute unit (WGP) to trace | +| `--att-buffer-size BYTES` | 256MB | Trace buffer size per SE | +| `--att-shader-engine-mask MASK` | all | Bitmask of shader engines | +| `--att-gpu-index LIST` | all | Comma-separated GPU indices | + +## Key Technical Details + +### Branch Label Addressing + +In `llvm-objdump` output, branch instructions reference labels like `label_0694`. 
These use **word offsets**, not byte offsets: + +``` +target_address = base_address + label_value * 4 +``` + +For example, with `base_address = 0x1000` and `label_0694` (the label suffix is hexadecimal, so `label_value = 0x694`): +``` +target = 0x1000 + 0x694 * 4 = 0x1000 + 0x1A50 = 0x2A50 +``` + +The extraction script handles this automatically. + +### Architecture-Specific Considerations + +| Architecture | GPU | Notes | +|-------------|-----|-------| +| gfx90a | MI210, MI250 | CDNA2, wavefront 64 | +| gfx942 | MI300X | CDNA3, wavefront 64, MFMA co-execution | + +This workflow applies to all AMDGPU architectures supported by LLVM. Adjust the `--mcpu` flag (`-mcpu=` when invoking `clang++` directly) and `.amdgcn_target` string to match your target GPU. + +### Typical Kernel Instruction Profile + +For a well-optimized attention kernel on gfx942: + +| Instruction Category | Count | Purpose | +|---------------------|-------|---------| +| MFMA (v_mfma_*) | ~192 | Matrix multiply-accumulate | +| Buffer loads | ~100 | Global memory reads | +| LDS ops (ds_*) | ~300+ | Shared memory access | +| DPP ops (*_dpp) | ~300+ | Cross-lane data movement | +| Scalar ops (s_*) | ~200+ | Control flow, address calculation | + +## Troubleshooting + +**llvm-objdump not found**: Use the full path `/opt/rocm/llvm/bin/llvm-objdump`. + +**Recompiled `.text` differs from original**: Check that branch labels are resolved correctly. The extraction script must use word-offset addressing (multiply the label value by 4). + +**ATT trace fails with "trace-decoder not found"**: Build and install `librocprof-trace-decoder.so` from source (see the ATT section above). The library is not included in ROCm 7.2.x binary packages. + +**Metadata sections differ after round-trip**: This is expected. The `.text` (executable code) section should be identical. Metadata is regenerated by the assembler and may have different formatting. 
+ +## References + +- [LLVM AMDGPU Backend](https://llvm.org/docs/AMDGPUUsage.html) +- [ROCm Documentation](https://rocm.docs.amd.com/) +- [AMDGPU ISA Reference (GFX9)](https://llvm.org/docs/AMDGPU/AMDGPUAsmGFX9.html) +- [rocprofv3 Documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/) +- [rocprof-trace-decoder](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprof-trace-decoder) — ATT trace decoder library (build from source)