File tree Expand file tree Collapse file tree 2 files changed +13
-2
lines changed Expand file tree Collapse file tree 2 files changed +13
-2
lines changed Original file line number Diff line number Diff line change 22
33#include  " common.cuh" 
44
5+ 
6+ static  __device__  __forceinline__  unsigned  int  ggml_cuda_ctva_generic_to_shared (void  * generic_ptr) {
7+ #ifdef  CP_ASYNC_AVAILABLE
8+     return  __cvta_generic_to_shared (generic_ptr);
9+ #else 
10+     GGML_UNUSED (generic_ptr);
11+     NO_DEVICE_CODE;
12+     return  -1 ;
13+ #endif  //  CP_ASYNC_AVAILABLE
14+ }
15+ 
516//  Copies data from global to shared memory, cg == cache global.
617//  Both the src and dst pointers must be aligned to 16 bit.
718//  Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
Original file line number Diff line number Diff line change @@ -112,7 +112,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
112112    //  The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
113113
114114    if  (use_cp_async) {
115-         const  unsigned  int  tile_KV_32 = __cvta_generic_to_shared (tile_KV);
115+         const  unsigned  int  tile_KV_32 = ggml_cuda_cvta_generic_to_shared (tile_KV);
116116
117117        constexpr  int  preload = 64 ;
118118        constexpr  int  h2_per_chunk = 16 /sizeof (half2);
@@ -186,7 +186,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
186186        constexpr  int  cols_per_warp = 8 *WARP_SIZE/nbatch_fa;
187187        constexpr  int  stride_j = nwarps * cols_per_warp;
188188
189-         const  unsigned  int  tile_mask_32 = __cvta_generic_to_shared (tile_mask);
189+         const  unsigned  int  tile_mask_32 = ggml_cuda_cvta_generic_to_shared (tile_mask);
190190
191191#pragma  unroll
192192        for  (int  j0 = 0 ; j0 < ncols1; j0 += stride_j) {
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments