8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -8,28 +8,28 @@ repos:
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']

- repo: https://github.com/pycqa/isort
rev: 5.13.2
rev: 6.0.1
hooks:
- id: isort
name: sort all imports (python)
args: ["--profile", "black"] # avoid conflict with black

- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.10.0
rev: 25.1.0
hooks:
- id: black
name: black formatter
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']

- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.5
rev: v21.1.0
hooks:
- id: clang-format
name: clang formatter
types_or: [c++, c]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: check-yaml
- id: check-merge-conflict
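To pick up these bumped hook revisions locally, the usual flow is pre-commit install once per clone, then pre-commit run --all-files after the bump (pre-commit autoupdate is what typically produces revision bumps like these). The formatting-only changes in the files below are consistent with such a re-run, though the exact commands used for this PR are not shown.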
4 changes: 2 additions & 2 deletions applications/ColossalQA/examples/webui_demo/webui.py
@@ -81,11 +81,11 @@ def restart(chatbot, txt):
)
with gr.Row():
btn = gr.UploadButton("📁", file_types=["file"], file_count="multiple", size="sm")
restart_btn = gr.Button(str("\u21BB"), elem_id="restart-btn", scale=1)
restart_btn = gr.Button(str("\u21bb"), elem_id="restart-btn", scale=1)
txt = gr.Textbox(
scale=8,
show_label=False,
placeholder="Enter text and press enter, or use 📁 to upload files, click \u21BB to clear loaded files and restart chat",
placeholder="Enter text and press enter, or use 📁 to upload files, click \u21bb to clear loaded files and restart chat",
container=True,
autofocus=True,
)
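The only change here is the case of the hex digits in the unicode escape; \u21BB and \u21bb denote the same character, so the button label and placeholder text render identically. A quick check in plain Python (illustrative, not project code):

    # Hex digits in \u escapes are case-insensitive: both spell U+21BB (↻).
    assert "\u21BB" == "\u21bb" == "↻"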
4 changes: 2 additions & 2 deletions colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -1,6 +1,6 @@
"""This code is adapted from Alpa
https://github.com/alpa-projects/alpa/
with some changes. """
https://github.com/alpa-projects/alpa/
with some changes."""

import multiprocessing
import time
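These docstring edits only remove padding spaces just inside the triple quotes; the same pattern repeats in the device_mesh.py, modeling_openmoe.py, layernorm.py, GaLore, ChatGLM2, and DeBERTa-v2 hunks below. Apart from that whitespace the docstring text is unchanged, so module behavior is unaffected. This is consistent with the newer Black docstring style pulled in by the hook bump above, though the diff itself does not state which hook produced it.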
4 changes: 2 additions & 2 deletions colossalai/device/device_mesh.py
@@ -1,6 +1,6 @@
"""This code is adapted from Alpa
https://github.com/alpa-projects/alpa/
with some changes. """
https://github.com/alpa-projects/alpa/
with some changes."""

import operator
from dataclasses import dataclass
2 changes: 1 addition & 1 deletion colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
@@ -17,7 +17,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch OpenMoE model."""
"""PyTorch OpenMoE model."""
import math
from typing import List, Optional, Tuple, Union

4 changes: 2 additions & 2 deletions colossalai/nn/layer/layernorm.py
@@ -1,6 +1,6 @@
"""This code is from NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes. """
https://github.com/NVIDIA/apex
with some changes."""

import numbers

2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/distributed_galore.py
@@ -1,4 +1,4 @@
""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""

import warnings
from collections import defaultdict
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/galore.py
@@ -1,4 +1,4 @@
""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""

import warnings
from typing import List
2 changes: 1 addition & 1 deletion colossalai/shardformer/modeling/chatglm2.py
@@ -1,4 +1,4 @@
""" PyTorch ChatGLM model. """
"""PyTorch ChatGLM model."""

from typing import List, Optional, Tuple

4 changes: 2 additions & 2 deletions examples/community/roberta/preprocessing/get_mask.py
@@ -34,8 +34,8 @@ def __init__(
self.do_whole_word_mask = do_whole_word_mask
self.max_predictions_per_seq = max_predictions_per_seq
self.vocab_words = list(tokenizer.vocab.keys())
self.rec = re.compile("[\u4E00-\u9FA5]")
self.whole_rec = re.compile("##[\u4E00-\u9FA5]")
self.rec = re.compile("[\u4e00-\u9fa5]")
self.whole_rec = re.compile("##[\u4e00-\u9fa5]")

self.mlm_p = 0.15
self.mlm_mask_p = 0.8
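As with the earlier escape change, lowercasing the hex digits does not alter the compiled pattern: [\u4e00-\u9fa5] is still the usual CJK Unified Ideographs range used to detect Chinese characters for whole-word masking. A small sanity check (illustrative only, not part of the codebase):

    import re

    rec_lower = re.compile("[\u4e00-\u9fa5]")
    rec_upper = re.compile("[\u4E00-\u9FA5]")
    assert rec_lower.pattern == rec_upper.pattern   # identical after escape decoding
    assert rec_lower.search("训练数据")               # matches Chinese text
    assert rec_lower.search("training data") is None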
10 changes: 5 additions & 5 deletions examples/community/roberta/preprocessing/mask.cpp
@@ -75,15 +75,15 @@ auto get_new_segment(
return new_segment;
}

bool startsWith(const std::string &s, const std::string &sub) {
bool startsWith(const std::string& s, const std::string& sub) {
return s.find(sub) == 0 ? true : false;
}

auto create_whole_masked_lm_predictions(
std::vector<std::string> &tokens,
const std::vector<std::string> &original_tokens,
const std::vector<std::string> &vocab_words,
std::map<std::string, int> &vocab, const int max_predictions_per_seq,
std::vector<std::string>& tokens,
const std::vector<std::string>& original_tokens,
const std::vector<std::string>& vocab_words,
std::map<std::string, int>& vocab, const int max_predictions_per_seq,
const double masked_lm_prob) {
// for (auto item : vocab) {
// std::cout << "key=" << std::string(py::str(item.first)) << ", "
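Here and in the C/C++/CUDA hunks that follow (cpu_adam_arm.cpp, cpu_adam_arm.h, vec_copy.h), the only change is moving the * or & from the variable name onto the type, e.g. const std::string &s becomes const std::string& s. That is a pointer-alignment formatting preference (clang-format's PointerAlignment: Left style); no signatures or behavior change. It presumably comes from the clang-format hook bump above, though the diff does not say so explicitly.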
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeBERTa-v2 model."""
"""PyTorch DeBERTa-v2 model."""

import math
from collections.abc import Sequence
18 changes: 9 additions & 9 deletions extensions/csrc/kernel/arm/cpu_adam_arm.cpp
@@ -1,7 +1,7 @@
#include "cpu_adam_arm.h"

void AdamOptimizer::Step_1(void *_params, void *grads, void *_exp_avg,
void *_exp_avg_sq, size_t _param_size,
void AdamOptimizer::Step_1(void* _params, void* grads, void* _exp_avg,
void* _exp_avg_sq, size_t _param_size,
at::ScalarType param_dtype,
at::ScalarType grad_dtype,
at::ScalarType exp_avg_dtype,
@@ -106,8 +106,8 @@ void AdamOptimizer::Step_1(void *_params, void *grads, void *_exp_avg,
}
}

void AdamOptimizer::Step_4(void *_params, void *grads, void *_exp_avg,
void *_exp_avg_sq, size_t _param_size,
void AdamOptimizer::Step_4(void* _params, void* grads, void* _exp_avg,
void* _exp_avg_sq, size_t _param_size,
at::ScalarType param_dtype,
at::ScalarType grad_dtype,
at::ScalarType exp_avg_dtype,
@@ -192,8 +192,8 @@ void AdamOptimizer::Step_4(void *_params, void *grads, void *_exp_avg,
}
}

void AdamOptimizer::Step_8(void *_params, void *grads, void *_exp_avg,
void *_exp_avg_sq, size_t _param_size,
void AdamOptimizer::Step_8(void* _params, void* grads, void* _exp_avg,
void* _exp_avg_sq, size_t _param_size,
at::ScalarType param_dtype,
at::ScalarType grad_dtype,
at::ScalarType exp_avg_dtype,
@@ -279,9 +279,9 @@ void AdamOptimizer::Step_8(void *_params, void *grads, void *_exp_avg,

void AdamOptimizer::step(size_t step, float lr, float beta1, float beta2,
float epsilon, float weight_decay,
bool bias_correction, torch::Tensor &params,
torch::Tensor &grads, torch::Tensor &exp_avg,
torch::Tensor &exp_avg_sq, float loss_scale) {
bool bias_correction, torch::Tensor& params,
torch::Tensor& grads, torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq, float loss_scale) {
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_c = exp_avg.contiguous();
44 changes: 22 additions & 22 deletions extensions/csrc/kernel/arm/cpu_adam_arm.h
@@ -11,15 +11,15 @@
#include <arm_neon.h>
#define SIMD_WIDTH 4

inline float32x4_t simd_load_offset(const void *ptr, at::ScalarType dtype,
inline float32x4_t simd_load_offset(const void* ptr, at::ScalarType dtype,
size_t offset) {
switch (dtype) {
case at::ScalarType::Float: {
auto ptr_f = reinterpret_cast<const float32_t *>(ptr);
auto ptr_f = reinterpret_cast<const float32_t*>(ptr);
return vld1q_f32(ptr_f + offset);
}
case at::ScalarType::Half: {
auto ptr_h = reinterpret_cast<const float16_t *>(ptr);
auto ptr_h = reinterpret_cast<const float16_t*>(ptr);
return vcvt_f32_f16(vld1_f16(ptr_h + offset));
}
// case at::ScalarType::BFloat16: {
@@ -31,20 +31,20 @@ inline float32x4_t simd_load_offset(const void *ptr, at::ScalarType dtype,
break;
}
}
inline float32x4_t simd_load(void const *ptr, at::ScalarType dtype) {
inline float32x4_t simd_load(void const* ptr, at::ScalarType dtype) {
return simd_load_offset(ptr, dtype, 0);
}

inline void simd_store_offset(void *ptr, at::ScalarType dtype, float32x4_t data,
inline void simd_store_offset(void* ptr, at::ScalarType dtype, float32x4_t data,
size_t offset) {
switch (dtype) {
case at::ScalarType::Float: {
auto ptr_f = reinterpret_cast<float32_t *>(ptr);
auto ptr_f = reinterpret_cast<float32_t*>(ptr);
vst1q_f32(ptr_f + offset, data);
break;
}
case at::ScalarType::Half: {
auto ptr_h = reinterpret_cast<float16_t *>(ptr);
auto ptr_h = reinterpret_cast<float16_t*>(ptr);
vst1_f16(ptr_h + offset, vcvt_f16_f32(data));
break;
}
@@ -59,7 +59,7 @@ inline void simd_store_offset(void *ptr, at::ScalarType dtype, float32x4_t data,
}
}

inline void simd_store(void *ptr, at::ScalarType dtype, float32x4_t data) {
inline void simd_store(void* ptr, at::ScalarType dtype, float32x4_t data) {
return simd_store_offset(ptr, dtype, data, 0);
}

@@ -70,14 +70,14 @@ inline float32x4_t simd_set(float value) {

#endif

inline float scalar_load_offset(const void *ptr, at::ScalarType dtype,
inline float scalar_load_offset(const void* ptr, at::ScalarType dtype,
size_t offset) {
switch (dtype) {
case at::ScalarType::Float:
return *(reinterpret_cast<const float *>(ptr) + offset);
return *(reinterpret_cast<const float*>(ptr) + offset);
case at::ScalarType::Half:
return static_cast<float>(
*(reinterpret_cast<const at::Half *>(ptr) + offset));
*(reinterpret_cast<const at::Half*>(ptr) + offset));
// case at::ScalarType::BFloat16:
// return static_cast<float>(
// *(reinterpret_cast<const at::BFloat16 *>(ptr) + offset));
@@ -87,14 +87,14 @@ inline float scalar_load_offset(const void *ptr, at::ScalarType dtype,
}
}

inline void scalar_store_offset(void *ptr, at::ScalarType dtype, float data,
inline void scalar_store_offset(void* ptr, at::ScalarType dtype, float data,
size_t offset) {
switch (dtype) {
case at::ScalarType::Float:
*(reinterpret_cast<float *>(ptr) + offset) = data;
*(reinterpret_cast<float*>(ptr) + offset) = data;
break;
case at::ScalarType::Half:
*(reinterpret_cast<at::Half *>(ptr) + offset) = data;
*(reinterpret_cast<at::Half*>(ptr) + offset) = data;
break;
// case at::ScalarType::BFloat16:
// *(reinterpret_cast<at::BFloat16 *>(ptr) + offset) = data;
@@ -105,13 +105,13 @@ inline void scalar_store_offset(void *ptr, at::ScalarType dtype, float data,
}
}

inline void *scalar_seek_offset(void *ptr, at::ScalarType dtype,
inline void* scalar_seek_offset(void* ptr, at::ScalarType dtype,
size_t offset) {
switch (dtype) {
case at::ScalarType::Float:
return reinterpret_cast<float *>(ptr) + offset;
return reinterpret_cast<float*>(ptr) + offset;
case at::ScalarType::Half:
return reinterpret_cast<at::Half *>(ptr) + offset;
return reinterpret_cast<at::Half*>(ptr) + offset;
// case at::ScalarType::BFloat16:
// return reinterpret_cast<at::BFloat16 *>(ptr) + offset;
default:
@@ -120,8 +120,8 @@ inline void *scalar_seek_offset(void *ptr, at::ScalarType dtype,
}
}
#define STEP(SPAN) \
void Step_##SPAN(void *_params, void *grads, void *_exp_avg, \
void *_exp_avg_sq, size_t _param_size, \
void Step_##SPAN(void* _params, void* grads, void* _exp_avg, \
void* _exp_avg_sq, size_t _param_size, \
at::ScalarType param_dtype, at::ScalarType grad_dtype, \
at::ScalarType exp_avg_dtype, \
at::ScalarType exp_avg_sq_dtype, float loss_scale = -1);
@@ -195,7 +195,7 @@ class AdamOptimizer {
}

void step(size_t step, float lr, float beta1, float beta2, float epsilon,
float weight_decay, bool bias_correction, torch::Tensor &params,
torch::Tensor &grads, torch::Tensor &exp_avg,
torch::Tensor &exp_avg_sq, float loss_scale);
float weight_decay, bool bias_correction, torch::Tensor& params,
torch::Tensor& grads, torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq, float loss_scale);
};
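For orientation, the Step_1/Step_4/Step_8 declarations above and the step() entry point implement a fused Adam update over 1, 4, or 8 elements per inner iteration, with the NEON path handling 4 floats per vector. Below is a minimal Python sketch of the per-element math such kernels typically vectorize, written against the standard Adam formulation. It is not this extension's API; in particular, whether weight decay is folded into the gradient or applied decoupled (AdamW-style) depends on optimizer settings that are not visible in this diff.

    def adam_step_reference(p, g, m, v, step, lr, beta1, beta2, eps,
                            weight_decay, bias_correction, loss_scale=-1.0):
        """One scalar Adam update; the C++ kernels apply the same math
        across SIMD lanes (SIMD_WIDTH = 4 floats per NEON vector)."""
        if loss_scale > 0:
            g = g / loss_scale                 # undo loss scaling on the gradient
        if weight_decay > 0:
            g = g + weight_decay * p           # L2-style decay folded into the gradient
        m = beta1 * m + (1.0 - beta1) * g      # first moment (exp_avg)
        v = beta2 * v + (1.0 - beta2) * g * g  # second moment (exp_avg_sq)
        if bias_correction:
            m_hat = m / (1.0 - beta1 ** step)
            v_hat = v / (1.0 - beta2 ** step)
        else:
            m_hat, v_hat = m, v
        p = p - lr * m_hat / (v_hat ** 0.5 + eps)
        return p, m, v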
24 changes: 12 additions & 12 deletions extensions/csrc/kernel/cuda/utils/vec_copy.h
@@ -9,36 +9,36 @@ namespace cuda {
namespace utils {

template <typename T, int VecSize>
__device__ __inline__ void copy_zero(T *dst) {
__device__ __inline__ void copy_zero(T* dst) {
using VT = typename common::VecTypeTrait<T, VecSize>::Type;
*(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
*(reinterpret_cast<VT*>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
}

template <typename SrcT, typename DstT, int VecSize>
__device__ __inline__ void copy(const SrcT *src, DstT *dst) {
__device__ __inline__ void copy(const SrcT* src, DstT* dst) {
using SrcVT = typename common::VecTypeTrait<SrcT, VecSize>::Type;
using DstVT = typename common::VecTypeTrait<DstT, VecSize>::Type;
*(reinterpret_cast<DstVT *>(dst)) = funcs::CastFunctor<SrcVT, DstVT>()(
*(reinterpret_cast<const SrcVT *>(src)));
*(reinterpret_cast<DstVT*>(dst)) = funcs::CastFunctor<SrcVT, DstVT>()(
*(reinterpret_cast<const SrcVT*>(src)));
}

template <typename T, int VecSize>
__device__ __inline__ void copy(const T *src, T *dst) {
__device__ __inline__ void copy(const T* src, T* dst) {
using VT = typename common::VecTypeTrait<T, VecSize>::Type;
*(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
*(reinterpret_cast<VT*>(dst)) = *(reinterpret_cast<const VT*>(src));
}

template <>
__device__ __inline__ void copy<float, float, 8>(const float *src, float *dst) {
__device__ __inline__ void copy<float, float, 8>(const float* src, float* dst) {
// Since the maximum memory alignment length is 128 bits, we choose float4
// here.
*(reinterpret_cast<float4 *>(dst)) = *(reinterpret_cast<const float4 *>(src));
*(reinterpret_cast<float4 *>(dst + 4)) =
*(reinterpret_cast<const float4 *>(src + 4));
*(reinterpret_cast<float4*>(dst)) = *(reinterpret_cast<const float4*>(src));
*(reinterpret_cast<float4*>(dst + 4)) =
*(reinterpret_cast<const float4*>(src + 4));
}

template <typename T>
int get_vec_size(const torch::Tensor &tensor) {
int get_vec_size(const torch::Tensor& tensor) {
uint64_t address = reinterpret_cast<uint64_t>(tensor.data_ptr());
const int max_aligned_size = 128;
const int dtype_size = sizeof(T) * 8;