diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2846e58ef..facf1d620 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,6 +28,9 @@ repos: - id: check-ast fail_fast: true - id: debug-statements + - id: file-contents-sorter + args: [--ignore-case] + files: ^docs/spelling_wordlist\.txt$ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v15.0.7 # sync with requirements-lint.txt hooks: diff --git a/README.md b/README.md index 256acf6da..0ab62c46a 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,6 @@ Welcome to join our Discord community for discussions, support, and collaboratio [![Join our Discord](https://img.shields.io/badge/Discord-Join%20Us-blue?logo=discord&style=for-the-badge)](https://discord.gg/TUrHyJnKPG) -## Acknowledgements +## Acknowledgments We would like to express our gratitude to the [TVM](https://github.com/apache/tvm) community for their invaluable contributions. The initial version of this project was mainly developed by [LeiWang1999](https://github.com/LeiWang1999), [chengyupku](https://github.com/chengyupku) and [nox-410](https://github.com/nox-410) with supervision from Prof. [Zhi Yang](https://yangzhihome.github.io) at Peking University. Part of this work was carried out during an internship at Microsoft Research, where Dr. Lingxiao Ma, Dr. Yuqing Xia, Dr. Jilong Xue, and Dr. Fan Yang offered valuable advice and support. We deeply appreciate their mentorship and contributions. diff --git a/docs/compiler_internals/inject_fence_proxy.md b/docs/compiler_internals/inject_fence_proxy.md index df173bdf5..81f498e57 100644 --- a/docs/compiler_internals/inject_fence_proxy.md +++ b/docs/compiler_internals/inject_fence_proxy.md @@ -4,7 +4,7 @@ ## Why Fences Are Needed -Hopper separates memory instructions into generic and asynchronous proxy paths. When an asynchronous instruction (for example, `cp.async` or `tma.load`) issues after generic traffic (like `ldmatrix` or plain buffer stores), the hardware requires a `fence.proxy.async` to guarantee ordering. Missing fences can lead to race conditions or undefined behaviour. +Hopper separates memory instructions into generic and asynchronous proxy paths. When an asynchronous instruction (for example, `cp.async` or `tma.load`) issues after generic traffic (like `ldmatrix` or plain buffer stores), the hardware requires a `fence.proxy.async` to guarantee ordering. Missing fences can lead to race conditions or undefined behavior. ## What the Pass Does diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt new file mode 100644 index 000000000..e859d0e7b --- /dev/null +++ b/docs/spelling_wordlist.txt @@ -0,0 +1,8 @@ +cancelled +hsa +ist +LOD +nd +NotIn +offen +te diff --git a/examples/bitnet-1.58b/modeling_bitnet.py b/examples/bitnet-1.58b/modeling_bitnet.py index c78896c33..6e3c42b6f 100644 --- a/examples/bitnet-1.58b/modeling_bitnet.py +++ b/examples/bitnet-1.58b/modeling_bitnet.py @@ -1718,11 +1718,11 @@ def forward( ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. + Labels for position (index) of the start of the labeled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. + Labels for position (index) of the end of the labeled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ diff --git a/examples/bitnet-1.58b/tokenization_bitnet.py b/examples/bitnet-1.58b/tokenization_bitnet.py index 202559fae..6fea3252a 100644 --- a/examples/bitnet-1.58b/tokenization_bitnet.py +++ b/examples/bitnet-1.58b/tokenization_bitnet.py @@ -170,9 +170,9 @@ def __init__( if legacy is None: logger.warning_once( - f"You are using the default legacy behaviour of the {self.__class__}. This is" + f"You are using the default legacy behavior of the {self.__class__}. This is" " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." - " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it" + " If you want to use the new behavior, set `legacy=False`. This should only be set if you understand what it" " means, and thoroughly read the reason why this was added as explained in" " https://github.com/huggingface/transformers/pull/24565") legacy = True @@ -215,7 +215,7 @@ def get_spm_processor(self, from_slow=False): with open(self.vocab_file, "rb") as f: sp_model = f.read() model_pb2 = import_protobuf( - f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)") + f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") model = model_pb2.ModelProto.FromString(sp_model) normalizer_spec = model_pb2.NormalizerSpec() normalizer_spec.add_dummy_prefix = False diff --git a/examples/deepseek_mla/amd/README.md b/examples/deepseek_mla/amd/README.md index 32e869634..cc0fb576d 100644 --- a/examples/deepseek_mla/amd/README.md +++ b/examples/deepseek_mla/amd/README.md @@ -15,7 +15,7 @@ Key implementation differences between Hopper and MI300X architectures include: # Original shared memory allocation Q_shared = T.alloc_shared([block_H, dim], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) - + # Optimized register allocation Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) @@ -47,5 +47,6 @@ Notably, TileLang achieves performance parity with hand-optimized assembly kerne - Improve compute-to-memory access ratios - Enhance parallelism through dimension-wise task distribution -## Acknowledgement -We would like to express our sincere gratitude to the AMD ROCm and Composable Kernel team for their outstanding contributions. We have learned a great deal from the ROCm software stack. \ No newline at end of file +## Acknowledgment + +We would like to express our sincere gratitude to the AMD ROCm and Composable Kernel team for their outstanding contributions. We have learned a great deal from the ROCm software stack. diff --git a/examples/gdn/README.md b/examples/gdn/README.md index 23a125fae..31dd2361e 100644 --- a/examples/gdn/README.md +++ b/examples/gdn/README.md @@ -10,5 +10,6 @@ The [chunk_delta_h](common/chunk_delta_h.py) implements the most critical forward kernel of GDN. It's a good start to understand the GDN logic and the TileLang optimization. -## Acknowledgements -This kernel was developed by Yu Cheng and Zhengju Tang following in-depth discussions with Xiaomi's LLM-Core Team (MiMo). \ No newline at end of file +## Acknowledgments + +This kernel was developed by Yu Cheng and Zhengju Tang following in-depth discussions with Xiaomi's LLM-Core Team (MiMo). diff --git a/pyproject.toml b/pyproject.toml index 1d8d3b2e4..a7d5534f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,8 @@ column_limit = 100 indent_width = 4 [tool.codespell] -ignore-words-list = "nd, te, ist, LOD, offen, NotIn, HSA" +builtin = "clear,rare,en-GB_to_en-US" +ignore-words = "docs/spelling_wordlist.txt" skip = [ "build", "3rdparty", diff --git a/tilelang/language/overrides/__init__.py b/tilelang/language/overrides/__init__.py index 1b87b7d0c..c900642fa 100644 --- a/tilelang/language/overrides/__init__.py +++ b/tilelang/language/overrides/__init__.py @@ -1,7 +1,7 @@ """TileLang-specific runtime overrides. Importing this package registers custom handlers that extend or override -behaviour from upstream TVMScript for TileLang semantics. +behavior from upstream TVMScript for TileLang semantics. """ # Register parser overrides upon import.