@@ -3122,23 +3122,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31223122 // inspired from resampler of Qwen-VL:
31233123 // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
31243124 // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
3125- struct ggml_tensor * pos_embed = ggml_graph_get_tensor (gf, " pos_embed" );
31263125 int embed_dim = clip_n_mmproj_embd (ctx);
31273126
31283127 // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
31293128 auto pos_embed_t = get_2d_sincos_pos_embed (embed_dim, std::make_pair (pos_w, pos_h));
31303129
3131- std::vector<float > pos_data (ggml_nelements (pos_embed));
3132- float * data = pos_data.data ();
3130+ std::vector<float > pos_embed (embed_dim * pos_w * pos_h);
31333131 for (int i = 0 ; i < pos_w * pos_h; ++i){
31343132 for (int j = 0 ; j < embed_dim; ++j){
3135- data [i * embed_dim + j] = pos_embed_t [i][j];
3133+ pos_embed [i * embed_dim + j] = pos_embed_t [i][j];
31363134 }
31373135 }
31383136
3139- ggml_backend_tensor_set ( pos_embed, data, 0 , ggml_nbytes ( pos_embed) );
3137+ set_input_f32 ( " pos_embed" , pos_embed);
31403138 } break ;
31413139 case PROJECTOR_TYPE_QWEN2VL:
3140+ {
3141+ const int pw = image_size_width / patch_size;
3142+ const int ph = image_size_height / patch_size;
3143+ std::vector<int > positions (num_positions * 4 );
3144+
3145+ int ptr = 0 ;
3146+ for (int y = 0 ; y < ph; y += 2 ) {
3147+ for (int x = 0 ; x < pw; x += 2 ) {
3148+ for (int dy = 0 ; dy < 2 ; dy++) {
3149+ for (int dx = 0 ; dx < 2 ; dx++) {
3150+ positions[ ptr] = y + dy;
3151+ positions[ num_patches + ptr] = x + dx;
3152+ positions[2 * num_patches + ptr] = y + dy;
3153+ positions[3 * num_patches + ptr] = x + dx;
3154+ ptr++;
3155+ }
3156+ }
3157+ }
3158+ }
3159+
3160+ set_input_i32 (" positions" , positions);
3161+ } break ;
31423162 case PROJECTOR_TYPE_QWEN25VL:
31433163 {
31443164 // pw * ph = number of tokens output by ViT after apply patch merger
@@ -3154,10 +3174,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31543174
31553175 if (use_window_attn) {
31563176 const int attn_window_size = 112 ;
3157- struct ggml_tensor * window_idx = ggml_graph_get_tensor (gf, " window_idx" );
3158- struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor (gf, " inv_window_idx" );
3159- struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
3160-
31613177 const int grid_window = attn_window_size / patch_size / merge_ratio;
31623178 int dst = 0 ;
31633179 // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3175,8 +3191,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31753191 for (int dy = 0 ; dy < win_h; dy++) {
31763192 for (int dx = 0 ; dx < win_w; dx++) {
31773193 const int src = (y + dy) * pw + (x + dx);
3178- assert (src < (int )idx.size ());
3179- assert (dst < (int )inv_idx.size ());
3194+ GGML_ASSERT (src < (int )idx.size ());
3195+ GGML_ASSERT (dst < (int )inv_idx.size ());
31803196 idx [src] = dst;
31813197 inv_idx[dst] = src;
31823198 dst++;
@@ -3194,40 +3210,37 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31943210 }
31953211 }
31963212
3197- ggml_backend_tensor_set ( window_idx, idx. data (), 0 , ggml_nbytes (window_idx) );
3198- ggml_backend_tensor_set ( inv_window_idx, inv_idx. data (), 0 , ggml_nbytes (inv_window_idx) );
3199- ggml_backend_tensor_set ( window_mask, mask. data (), 0 , ggml_nbytes (window_mask) );
3213+ set_input_i32 ( " window_idx" , idx);
3214+ set_input_i32 ( " inv_window_idx" , inv_idx);
3215+ set_input_f32 ( " window_mask" , mask);
32003216 } else {
3201- std::iota (idx.begin (), idx.end (), 0 );
3202- // std::iota(inv_idx.begin(), inv_idx.end(), 0);
3217+ for (int i = 0 ; i < ph * pw; i++) {
3218+ idx[i] = i;
3219+ }
32033220 }
32043221
3205- struct ggml_tensor * positions = ggml_graph_get_tensor (gf, " positions" );
32063222 const int mpow = merge_ratio * merge_ratio;
3207- std::vector<int > positions_data (ggml_nelements (positions));
3208- int * data = positions_data.data ();
3223+ std::vector<int > positions (num_positions * 4 );
32093224
32103225 int ptr = 0 ;
3211- for (int y = 0 ; y < iph; y += merge_ratio)
3212- {
3213- for (int x = 0 ; x < ipw; x += merge_ratio)
3214- {
3226+ for (int y = 0 ; y < iph; y += merge_ratio) {
3227+ for (int x = 0 ; x < ipw; x += merge_ratio) {
32153228 for (int dy = 0 ; dy < 2 ; dy++) {
32163229 for (int dx = 0 ; dx < 2 ; dx++) {
32173230 auto remap = idx[ptr / mpow];
3218- remap = remap * mpow + (ptr % mpow);
3231+ remap = ( remap * mpow) + (ptr % mpow);
32193232
3220- data [ remap] = y + dy;
3221- data [ num_patches + remap] = x + dx;
3222- data [2 * num_patches + remap] = y + dy;
3223- data [3 * num_patches + remap] = x + dx;
3233+ positions [ remap] = y + dy;
3234+ positions [ num_patches + remap] = x + dx;
3235+ positions [2 * num_patches + remap] = y + dy;
3236+ positions [3 * num_patches + remap] = x + dx;
32243237 ptr++;
32253238 }
32263239 }
32273240 }
32283241 }
32293242
3230- ggml_backend_tensor_set ( positions, data, 0 , ggml_nbytes ( positions) );
3243+ set_input_i32 ( " positions" , positions);
32313244 } break ;
32323245 case PROJECTOR_TYPE_PIXTRAL:
32333246 {
0 commit comments