diff --git a/csrc/xpu_kernels.cpp b/csrc/xpu_kernels.cpp index efc5e6fbe..8ee8add98 100644 --- a/csrc/xpu_kernels.cpp +++ b/csrc/xpu_kernels.cpp @@ -94,7 +94,7 @@ inline float dDequantizeNF4(unsigned char val) { } template -SYCL_EXTERNAL void kDequantizeBlockwise::operator()(sycl::and_item<1> item) const { +SYCL_EXTERNAL void kDequantizeBlockwise::operator()(sycl::nd_item<1> item) const { const int base_idx = item.get_group(0) * TILE_SIZE; size_t local_idx = item.get_local_id(0) * NUM_PER_TH; float local_abs_max = -FLT_MAX; @@ -172,7 +172,7 @@ SYCL_EXTERNAL void kDequantizeBlockwise::op template SYCL_EXTERNAL void - kgemv_4bit_inference::operator()(sycl::and_item<1> item) const { + kgemv_4bit_inference::operator()(sycl::nd_item<1> item) const { size_t idx = item.get_local_id(); const int sg_idx = idx / SUBG_SIZE; const int sg_lane = idx % SUBG_SIZE; diff --git a/csrc/xpu_kernels.h b/csrc/xpu_kernels.h index bad6d4ca8..caa7e6716 100644 --- a/csrc/xpu_kernels.h +++ b/csrc/xpu_kernels.h @@ -6,7 +6,7 @@ template class kDequantizeBlockwise { public: - SYCL_EXTERNAL void operator()(sycl::and_item<1> item) const; + SYCL_EXTERNAL void operator()(sycl::nd_item<1> item) const; kDequantizeBlockwise(float* code_, uint8_t* A_, float* absmax_, T* out_, const int blocksize_, const int n_) : code(code_), A(A_), absmax(absmax_), out(out_), blocksize(blocksize_), n(n_) {} @@ -22,7 +22,7 @@ template class kDequa template class kgemv_4bit_inference { public: - SYCL_EXTERNAL void operator()(sycl::and_item<1> item) const; + SYCL_EXTERNAL void operator()(sycl::nd_item<1> item) const; kgemv_4bit_inference( int M_, int N_, int K_, T* A_, unsigned char* B_, float* absmax_, const float* datatype_, T* out_, int lda_, diff --git a/csrc/xpu_ops.cpp b/csrc/xpu_ops.cpp index 37ef92973..aa6ac808f 100644 --- a/csrc/xpu_ops.cpp +++ b/csrc/xpu_ops.cpp @@ -16,7 +16,7 @@ void dequantizeBlockwise( sycl::range<1> global_range{(size_t)workgroup_num * (size_t)workgroup_size}; kDequantizeBlockwise kfn(code, A, absmax, out, blocksize / 2, n); sycl_kernel_submit( - sycl::and_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)), queue, kfn + sycl::nd_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)), queue, kfn ); } else { const int workgroup_num = (n + tile_size - 1) / tile_size; @@ -24,7 +24,7 @@ void dequantizeBlockwise( sycl::range<1> global_range{(size_t)workgroup_num * (size_t)workgroup_size}; kDequantizeBlockwise kfn(code, A, absmax, out, blocksize, n); sycl_kernel_submit( - sycl::and_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)), queue, kfn + sycl::nd_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)), queue, kfn ); } } @@ -47,7 +47,7 @@ void gemv_4bit_inference( ); sycl_comp_kernel_submit( - sycl::and_range<1>(sycl::range<1>(GROUP_SIZE * workgroup_num), sycl::range<1>(GROUP_SIZE)), queue, kfn + sycl::nd_range<1>(sycl::range<1>(GROUP_SIZE * workgroup_num), sycl::range<1>(GROUP_SIZE)), queue, kfn ); } diff --git a/csrc/xpu_ops.h b/csrc/xpu_ops.h index fa395fcc4..142d6c161 100644 --- a/csrc/xpu_ops.h +++ b/csrc/xpu_ops.h @@ -12,14 +12,14 @@ #include template -static inline void sycl_kernel_submit(sycl::and_range range, sycl::queue q, ker_t ker) { +static inline void sycl_kernel_submit(sycl::nd_range range, sycl::queue q, ker_t ker) { auto cgf = [&](::sycl::handler& cgh) [[sycl::reqd_sub_group_size(subgroup_size)]] { cgh.parallel_for(range, ker); }; q.submit(cgf); } template -static inline void sycl_comp_kernel_submit(sycl::and_range range, sycl::queue q, ker_t ker) { +static inline void sycl_comp_kernel_submit(sycl::nd_range range, sycl::queue q, ker_t ker) { auto cgf = [&](::sycl::handler& cgh) [[sycl::reqd_sub_group_size(subgroup_size)]] { ker.sycl_ker_local_memory_creation(cgh); cgh.parallel_for(range, ker); diff --git a/tests/test_functional.py b/tests/test_functional.py index d201bc8ec..25844d20f 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1238,8 +1238,8 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double max_errs3 = [] # Large number of iterations is excessive and slow on CPU. - # Keep for CUDA for now. - iters = 100 if device == "cuda" else 10 + # Keep for CUDA/XPU for now. + iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1":