Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
Language: Cpp
AccessModifierOffset: 0
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: true
AfterControlStatement: true
AfterEnum: true
AfterFunction: true
AfterNamespace: false
AfterObjCDeclaration: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
- Regex: '^(<|"(gtest|isl|json)/)'
Priority: 3
- Regex: '.*'
Priority: 1
IndentCaseLabels: false
IndentWidth: 4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
ReflowComments: true
SortIncludes: false
SpaceAfterCStyleCast: false
# SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: Never
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...
3 changes: 3 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
CheckOptions:
- key: bugprone-reserved-identifier.AllowedIdentifiers
value: '__HIP_PLATFORM_HCC__;__HIP_PLATFORM_AMD__;__HIP_ROCclr__'
2 changes: 2 additions & 0 deletions .githooks/pre-commit
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ if command -v black >/dev/null; then
if [[ -e $file ]] && echo $file | grep -Eq '\.py$'; then
echo "black $file"
black "$file"
echo "ruff check --fix $file"
ruff check --fix "$file"
git add -u "$file"
fi
done
Expand Down
6 changes: 6 additions & 0 deletions CONTRIBUTE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Install the pre-commit hook before submitting a PR:
```
pip install black==25.1.0 ruff==0.11.11
apt install clang-format
bash ./.githooks/install
```
2 changes: 2 additions & 0 deletions aiter/aot/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
52 changes: 52 additions & 0 deletions aiter/aot/asm_mla_decode_fwd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from collections import namedtuple
import os
import concurrent.futures
from csrc.cpp_itfs.mla.asm_mla_decode_fwd import compile
from csrc.cpp_itfs.utils import AITER_CORE_DIR

# One ahead-of-time compile request for the MLA decode kernel.
# The field order is the same as the positional argument order that
# process_config() uses when it calls compile().
MLAConfig = namedtuple(
    "MLAConfig",
    "hsaco_path page_size q_itemsize kv_itemsize num_kv_splits v_head_dim",
)


def process_config(config):
    """Compile the kernel variant described by ``config``.

    ``compile`` here is the project function imported at the top of this
    file (it shadows the builtin of the same name).

    Args:
        config: Object exposing the ``MLAConfig`` attributes.

    Returns:
        Whatever the imported ``compile`` returns for this configuration.
    """
    positional_args = (
        config.hsaco_path,
        config.page_size,
        config.q_itemsize,
        config.kv_itemsize,
        config.num_kv_splits,
        config.v_head_dim,
    )
    return compile(*positional_args)


def main():
    """Compile the MLA decode kernel for ``num_kv_splits`` 1 through 16.

    One ``MLAConfig`` is built per split count (all other fields are fixed)
    and the configurations are compiled in a process pool.  The pool size is
    taken from the ``MAX_JOBS`` environment variable and defaults to 16.

    Raises:
        ValueError: If ``MAX_JOBS`` is set to something that is not an integer.
    """
    hsaco_path = f"{AITER_CORE_DIR}/hsa/mla_stage1_a16w16_bf16.co"
    configs = [
        MLAConfig(
            hsaco_path=hsaco_path,
            page_size=1,
            q_itemsize=2,
            kv_itemsize=2,
            num_kv_splits=num_kv_splits,
            v_head_dim=512,
        )
        for num_kv_splits in range(1, 17)
    ]

    # Environment values are always strings; ProcessPoolExecutor requires an
    # integer worker count and raises TypeError when handed e.g. "8".
    max_workers = int(os.environ.get("MAX_JOBS", 16))

    with concurrent.futures.ProcessPoolExecutor(max_workers) as executor:
        # NOTE: the iterator returned by map() is never consumed, so
        # exceptions raised inside workers are not re-raised here.  Leaving
        # the with-block still waits for every submitted task to finish.
        executor.map(process_config, configs)


if __name__ == "__main__":
    main()
103 changes: 103 additions & 0 deletions aiter/aot/pa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from collections import namedtuple
import os
import concurrent.futures
from csrc.cpp_itfs.pa.pa import compile

# One ahead-of-time compile request for the kernel built by this script.
# NOTE: the field order is NOT the positional order expected by compile();
# process_config() passes block_size and alibi_enabled last.
PAConfig = namedtuple(
    "PAConfig",
    "gqa_ratio head_size npar_loops block_size "
    "dtype kv_dtype fp8_kv_dtype out_dtype alibi_enabled",
)


def process_config(config):
    """Compile the kernel variant described by ``config``.

    ``compile`` here is the project function imported at the top of this
    file (it shadows the builtin of the same name).

    Args:
        config: Object exposing the ``PAConfig`` attributes.

    Returns:
        Whatever the imported ``compile`` returns for this configuration.
    """
    # The arguments are listed explicitly because this call takes block_size
    # and alibi_enabled last, which differs from the PAConfig field order.
    positional_args = (
        config.gqa_ratio,
        config.head_size,
        config.npar_loops,
        config.dtype,
        config.kv_dtype,
        config.fp8_kv_dtype,
        config.out_dtype,
        config.block_size,
        config.alibi_enabled,
    )
    return compile(*positional_args)


def main():
    """Compile every kernel variant in the configuration grid.

    The grid is the cross product of ``gqa_ratio`` (1-16), ``alibi_enabled``
    ("false"/"true"), ``block_size`` (1, 16, 32), ``npar_loops`` (1-8),
    ``head_size`` (64, 128) and four dtype combinations, i.e. 6144
    configurations.  They are compiled in a process pool whose size is taken
    from the ``MAX_JOBS`` environment variable and defaults to 16.

    Raises:
        ValueError: If ``MAX_JOBS`` is set to something that is not an integer.
    """
    import itertools

    # Each entry is (dtype, kv_dtype, fp8_kv_dtype, out_dtype).
    dtype_variants = [
        ("_Float16", "_Float16", "auto", "_Float16"),
        ("__hip_bfloat16", "__hip_bfloat16", "auto", "__hip_bfloat16"),
        ("_Float16", "uint8_t", "fp8", "_Float16"),
        ("__hip_bfloat16", "uint8_t", "fp8", "__hip_bfloat16"),
    ]

    # product() varies the right-most iterable fastest, so listing the axes
    # from outermost to innermost keeps the same ordering as nested loops.
    grid = itertools.product(
        range(1, 17),  # gqa_ratio
        ["false", "true"],  # alibi_enabled
        [1, 16, 32],  # block_size
        range(1, 9),  # npar_loops
        [64, 128],  # head_size
        dtype_variants,
    )

    configs = [
        PAConfig(
            gqa_ratio=gqa_ratio,
            head_size=head_size,
            npar_loops=npar_loops,
            dtype=dtype,
            kv_dtype=kv_dtype,
            fp8_kv_dtype=fp8_kv_dtype,
            out_dtype=out_dtype,
            block_size=block_size,
            alibi_enabled=alibi_enabled,
        )
        for (
            gqa_ratio,
            alibi_enabled,
            block_size,
            npar_loops,
            head_size,
            (dtype, kv_dtype, fp8_kv_dtype, out_dtype),
        ) in grid
    ]

    # Environment values are always strings; ProcessPoolExecutor requires an
    # integer worker count and raises TypeError when handed e.g. "8".
    max_workers = int(os.environ.get("MAX_JOBS", 16))

    with concurrent.futures.ProcessPoolExecutor(max_workers) as executor:
        # NOTE: the iterator returned by map() is never consumed, so
        # exceptions raised inside workers are not re-raised here.  Leaving
        # the with-block still waits for every submitted task to finish.
        executor.map(process_config, configs)


if __name__ == "__main__":
    main()
103 changes: 103 additions & 0 deletions aiter/aot/pa_ragged.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from collections import namedtuple
import os
import concurrent.futures
from csrc.cpp_itfs.pa.pa_ragged import compile

# One ahead-of-time compile request for the kernel built by this script.
# NOTE: the field order is NOT the positional order expected by compile();
# process_config() passes block_size and alibi_enabled last.
PAConfig = namedtuple(
    "PAConfig",
    "gqa_ratio head_size npar_loops block_size "
    "dtype kv_dtype fp8_kv_dtype out_dtype alibi_enabled",
)


def process_config(config):
    """Compile the kernel variant described by ``config``.

    ``compile`` here is the project function imported at the top of this
    file (it shadows the builtin of the same name).

    Args:
        config: Object exposing the ``PAConfig`` attributes.

    Returns:
        Whatever the imported ``compile`` returns for this configuration.
    """
    # The arguments are listed explicitly because this call takes block_size
    # and alibi_enabled last, which differs from the PAConfig field order.
    positional_args = (
        config.gqa_ratio,
        config.head_size,
        config.npar_loops,
        config.dtype,
        config.kv_dtype,
        config.fp8_kv_dtype,
        config.out_dtype,
        config.block_size,
        config.alibi_enabled,
    )
    return compile(*positional_args)


def main():
    """Compile every kernel variant in the configuration grid.

    The grid is the cross product of ``gqa_ratio`` (1-16), ``alibi_enabled``
    ("false"/"true"), ``block_size`` (1, 16, 32), ``npar_loops`` (1-8),
    ``head_size`` (64, 128) and four dtype combinations, i.e. 6144
    configurations.  They are compiled in a process pool whose size is taken
    from the ``MAX_JOBS`` environment variable and defaults to 16.

    Raises:
        ValueError: If ``MAX_JOBS`` is set to something that is not an integer.
    """
    import itertools

    # Each entry is (dtype, kv_dtype, fp8_kv_dtype, out_dtype).
    dtype_variants = [
        ("_Float16", "_Float16", "auto", "_Float16"),
        ("__hip_bfloat16", "__hip_bfloat16", "auto", "__hip_bfloat16"),
        ("_Float16", "uint8_t", "fp8", "_Float16"),
        ("__hip_bfloat16", "uint8_t", "fp8", "__hip_bfloat16"),
    ]

    # product() varies the right-most iterable fastest, so listing the axes
    # from outermost to innermost keeps the same ordering as nested loops.
    grid = itertools.product(
        range(1, 17),  # gqa_ratio
        ["false", "true"],  # alibi_enabled
        [1, 16, 32],  # block_size
        range(1, 9),  # npar_loops
        [64, 128],  # head_size
        dtype_variants,
    )

    configs = [
        PAConfig(
            gqa_ratio=gqa_ratio,
            head_size=head_size,
            npar_loops=npar_loops,
            dtype=dtype,
            kv_dtype=kv_dtype,
            fp8_kv_dtype=fp8_kv_dtype,
            out_dtype=out_dtype,
            block_size=block_size,
            alibi_enabled=alibi_enabled,
        )
        for (
            gqa_ratio,
            alibi_enabled,
            block_size,
            npar_loops,
            head_size,
            (dtype, kv_dtype, fp8_kv_dtype, out_dtype),
        ) in grid
    ]

    # Environment values are always strings; ProcessPoolExecutor requires an
    # integer worker count and raises TypeError when handed e.g. "8".
    max_workers = int(os.environ.get("MAX_JOBS", 16))

    with concurrent.futures.ProcessPoolExecutor(max_workers) as executor:
        # NOTE: the iterator returned by map() is never consumed, so
        # exceptions raised inside workers are not re-raised here.  Leaving
        # the with-block still waits for every submitted task to finish.
        executor.map(process_config, configs)


if __name__ == "__main__":
    main()
Loading