@@ -528,6 +528,15 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
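+// IQ4_NL: 4-bit non-linear quantization in blocks of 32 weights; each block
+// stores one fp16 scale and 16 bytes of packed 4-bit indices into kvalues_iq4nl.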
+#define QK4_NL 32
+#define QR4_NL 2
+#define QI4_NL (QK4_NL / (4*QR4_NL))
+typedef struct {
+    half d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -1987,6 +1996,26 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
 
 }
 
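+// The 16-entry non-linear codebook shared by all IQ4_NL blocks: each 4-bit
+// index selects one of these int8 values, which is then scaled by the block's d.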
+static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
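+// Dequantizes one QK_K-sized slice (8 IQ4_NL sub-blocks) per thread block: each
+// of the 32 threads expands 4 bytes of one sub-block, writing the low nibbles
+// to y[0..3] and the corresponding high nibbles 16 elements further on.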
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
 
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
@@ -4732,6 +4761,56 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 #endif
 }
 
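+// Expands 8 packed 4-bit indices (one 32-bit load) into two 32-bit words of
+// codebook values, four int8 each, ready for __dp4a against q8_1 activations.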
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
+        int & val1, int & val2) {
+
+    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+    aux32 = q4 & 0x0f0f0f0f;
+    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val1 = v1 | (v2 << 16);
+    aux32 = (q4 >> 4) & 0x0f0f0f0f;
+    v1 = values[q8[0]] | (values[q8[1]] << 8);
+    v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val2 = v1 | (v2 << 16);
+}
+#endif
+
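+// Dot product of one IQ4_NL block with q8_1 activations: low and high nibbles
+// are accumulated separately, then the sum is scaled by d times the q8_1 scale.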
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
+
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+        get_int_from_table_16(aux, values, v1, v2);
+        sumi1 = __dp4a(v1, q8[l+0], sumi1);
+        sumi2 = __dp4a(v2, q8[l+4], sumi2);
+    }
+
+#else
+    const uint8_t * q4 = bq->qs + 4*iqs;
+    const int8_t  * q8 = bq8_1->qs + 4*iqs;
+
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
+        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
+    }
+#endif
+    const float d = (float)bq->d * __low2float(bq8_1->ds);
+    return d * (sumi1 + sumi2);
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
               allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
@@ -6777,6 +6856,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c
     dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }
 
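+// Launches one 32-thread block per QK_K values; rows are presumably padded to
+// a multiple of QK_K (see MATRIX_ROW_PADDING) so the rounding up stays in bounds.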
+template<typename dst_t>
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6818,6 +6903,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
         default:
@@ -6855,6 +6942,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
         default:
@@ -8599,6 +8688,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8623,6 +8713,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8724,6 +8815,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
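+        // IQ4_NL shares the Q4_0 block shape (32 weights, fp16 scale, 16 bytes
+        // of nibbles), so it reuses VDR_Q4_0_Q8_1_MMVQ for values per iteration.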
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -11446,7 +11541,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) {
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL) {
                     if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                         return false;
                     }