      if (duration.count() > 1) \
          printf("Time taken by function %s: %lld ms\n", name, duration.count());*/

- struct ggml_tensor* tensor_shaped_like(struct ggml_context* ctx, ggml_type type, std::vector<int64_t> shape, float value) {
+ struct ggml_tensor* tensor_shaped_like(struct ggml_context* ctx, struct ggml_allocr* allocr, ggml_type type, std::vector<int64_t> shape, float value) {
      auto tensor = ggml_new_tensor(ctx, type, shape.size(), shape.data());
+     ALLOC(tensor)
      auto data_fp32 = (type == GGML_TYPE_F32)
          ? static_cast<float*>(tensor->data)
          : nullptr;
      auto data_fp16 = (type == GGML_TYPE_F16)
          ? static_cast<ggml_fp16_t*>(tensor->data)
          : nullptr;
-     auto size = ggml_nelements(tensor) ;
+     auto size = ggml_nelements(tensor);
      for (int i = 0; i < size; ++i) {
          if (type == GGML_TYPE_F16) {
              data_fp16[i] = ggml_fp16_t(value);
@@ -37,26 +38,30 @@ struct ggml_tensor* tensor_shaped_like(struct ggml_context* ctx, ggml_type type,
          return tensor;
      }

- struct ggml_tensor* tensor_zeros(struct ggml_context* ctx, std::vector<int64_t> shape) {
-     return tensor_shaped_like(ctx, DEFAULT_TENSOR_TYPE, std::move(shape), 0);
+ struct ggml_tensor* tensor_zeros(struct ggml_context* ctx, struct ggml_allocr* allocr, std::vector<int64_t> shape) {
+     return tensor_shaped_like(ctx, allocr, DEFAULT_TENSOR_TYPE, std::move(shape), 0);
      }
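
Note: the ALLOC macro used above is not defined in this hunk. A minimal sketch of what it is assumed to expand to, using ggml's graph allocator API from ggml-alloc.h, would be:

    // Assumed definition, not taken from this diff: reserve the tensor's buffer
    // through the graph allocator instead of the context's own memory pool.
    #define ALLOC(tensor) if (allocr != NULL) { ggml_allocr_alloc(allocr, tensor); }
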
+ void* allocate_temp_i32_array(const std::vector<int>& arr) {
+     void* ptr = malloc(arr.size() * sizeof(int32_t));
+     memcpy(ptr, arr.data(), arr.size() * sizeof(int));
+     return ptr;
+ }

- struct ggml_tensor* allocate_temp_i32_array(struct ggml_context* ctx, const std::vector<int>& arr) {
-     auto tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, arr.size());
-     memcpy(tensor->data, arr.data(), arr.size() * sizeof(int));
-     return tensor;
+ void* allocate_temp_f32_array(const std::vector<float>& arr) {
+     void* ptr = malloc(arr.size() * sizeof(float));
+     memcpy(ptr, arr.data(), arr.size() * sizeof(float));
+     return ptr;
      }

- struct ggml_tensor* allocate_temp_f32_array(struct ggml_context* ctx, const std::vector<float>& arr) {
-     auto tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, arr.size());
-     memcpy(tensor->data, arr.data(), arr.size() * sizeof(float));
-     return tensor;
+ struct ggml_tensor* cleanup(struct ggml_context* ctx, struct ggml_tensor* result, void* userdata) {
+     return ggml_map_custom1_inplace(ctx, result, [](struct ggml_tensor* dst, const struct ggml_tensor* src, int ith, int nth, void* userdata) {
+         free(userdata);
+     }, 1, userdata);
      }
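
These helpers replace the old tensor-backed scratch arrays: op parameters are now plain malloc'd buffers, and cleanup chains a trivial custom op behind a result so the buffer is freed once the graph has executed the node that consumed it. The recurring pattern in the hunks below is roughly:

    // Sketch of the pattern this diff introduces (some_flag is a placeholder).
    auto userdata = allocate_temp_i32_array({some_flag});                            // heap copy of the op parameters
    auto result   = ggml_map_custom1(ctx, tensor, func, GGML_N_TASKS_MAX, userdata);
    return cleanup(ctx, result, userdata);                                           // frees userdata when this node runs
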
  template<class T> T* get_temp_data(void* userdata) {
-     auto tensor = (struct ggml_tensor*) userdata;
-     return (T*) tensor->data;
+     return (T*) userdata;
  }

  inline int compute_index(const ggml_tensor* tensor, int i, int j, int k) {
@@ -165,8 +170,10 @@ struct ggml_tensor* name##suffix##_impl(struct ggml_context* ctx, struct ggml_te
              else \
                  dst_ptr[w] = name<T>(src_ptr[w]);\
          };\
-     };\
-     return ggml_map_custom1##suffix(ctx, tensor, func, ggml_nelements(tensor) >= get_thread_count() ? GGML_N_TASKS_MAX : 1, allocate_temp_f32_array(ctx, {(float) value})); \
+     }; \
+     auto userdata = allocate_temp_f32_array({(float) value}); \
+     auto result = ggml_map_custom1##suffix(ctx, tensor, func, ggml_nelements(tensor) >= get_thread_count() ? GGML_N_TASKS_MAX : 1, userdata); \
+     return cleanup(ctx, result, userdata); \
  } \
  \
  struct ggml_tensor* tensor_##name##suffix(struct ggml_context* ctx, struct ggml_tensor* tensor, double _extra = 0.0) {\
@@ -212,7 +219,9 @@ template<class T> struct ggml_tensor* flip_3d_impl(struct ggml_context* ctx, str
          });
      };

-     return ggml_map_custom1(ctx, tensor, func, GGML_N_TASKS_MAX, allocate_temp_i32_array(ctx, {along}));
+     auto userdata = allocate_temp_i32_array({along});
+     auto result = ggml_map_custom1(ctx, tensor, func, GGML_N_TASKS_MAX, userdata);
+     return cleanup(ctx, result, userdata);
  }

  template<class T> struct ggml_tensor* per_row_cumsum_impl(struct ggml_context* ctx, struct ggml_tensor* tensor) {
@@ -264,19 +273,18 @@ template <class T> struct ggml_tensor* max_element_impl(struct ggml_context* ctx
      return ggml_view_1d(ctx, max, 1, (ggml_nelements(max) - 1) * ggml_element_size(max));
  }

- template<class T> struct ggml_tensor* repeat_impl(struct ggml_context* ctx, struct ggml_tensor* tensor, int64_t new_dim_size, int across) {
+ template<class T> struct ggml_tensor* repeat_impl(struct ggml_context* ctx, struct ggml_allocr* allocr, struct ggml_tensor* tensor, int64_t new_dim_size, int across) {
      ASSERT(tensor->n_dims == 1, "Only 1d tensors supported");
      ASSERT(across == 0 || across == 1, "Only across == 0 || 1 supported");
      std::vector<int64_t> shape = {across == 0 ? new_dim_size : tensor->ne[0], across == 1 ? new_dim_size : tensor->ne[0]};
-     auto new_tensor = tensor_zeros(ctx, shape);
-
-     ggml_custom2_op_t func = [](struct ggml_tensor* dst, const struct ggml_tensor* src0, const struct ggml_tensor* src1, int ith, int nth, void* userdata) {
+     auto new_tensor = tensor_zeros(ctx, allocr, shape);
+     ggml_custom2_op_t func = [](struct ggml_tensor* dst, const struct ggml_tensor* _, const struct ggml_tensor* src1, int ith, int nth, void* userdata) {
          START_BENCH()
          auto across = *get_temp_data<int>(userdata);
          auto* dst_ptr = (T*)dst->data;
          auto* src1_ptr = (T*)src1->data;

-         size_t size = ggml_element_size(src0);
+         size_t size = ggml_element_size(dst);
          for_each_element_threaded(dst, ith, nth, [&](int i, int j, int k) {
              auto idx1 = ((across == 0 ? j : i) * src1->nb[0]) / size;
              auto dst_idx = (i * dst->nb[0] + j * dst->nb[1]) / size;
@@ -285,14 +293,17 @@ template<class T> struct ggml_tensor* repeat_impl(struct ggml_context* ctx, stru
          });
          PRINT_BENCH("repeat")
      };
-     return ggml_map_custom2_inplace(
+     auto userdata = allocate_temp_i32_array({across});
+     // inplace breaks?
+     auto result = ggml_map_custom2(
          ctx,
          new_tensor,
          tensor,
          func,
-         1,
-         allocate_temp_i32_array(ctx, {across})
+         GGML_N_TASKS_MAX,
+         userdata
      );
+     return cleanup(ctx, result, userdata);
  }
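
Reading the kernel above, the intended semantics appear to be dst(i, j) = src(across == 0 ? j : i). A small worked example (assumed, not part of the diff):

    // src = [1, 2, 3] (1-D), new_dim_size = 2, across = 0
    //   -> new_tensor has ne = {2, 3} and dst(i, j) = src(j),
    //      i.e. every source element is repeated new_dim_size times along dim 0:
    //      dst(0,0)=1 dst(1,0)=1   dst(0,1)=2 dst(1,1)=2   dst(0,2)=3 dst(1,2)=3
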

  template<class T> struct ggml_tensor* compare_impl(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, std::function<bool(float, float)> compare_op) {
@@ -364,14 +375,16 @@ template<class T> struct ggml_tensor* set_inplace_impl(struct ggml_context* ctx,
      * */
          PRINT_BENCH("set_inplace")
      };
-     return ggml_map_custom2_inplace(
+     auto userdata = allocate_temp_i32_array({start0, start1, start2});
+     auto result = ggml_map_custom2_inplace(
          ctx,
          tensor,
          values,
          func,
          1,
-         allocate_temp_i32_array(ctx, {start0, start1, start2})
+         userdata
      );
+     return cleanup(ctx, result, userdata);
  }

  template<class T> struct ggml_tensor* add_bias_inplace_impl(struct ggml_context* ctx, struct ggml_tensor* tensor, struct ggml_tensor* bias) {
@@ -566,27 +579,24 @@ struct ggml_tensor* conv_1d_inplace_impl_fp16(struct ggml_context* ctx, struct g
  void im2col_multi_channel(float* dst_data, const float* src_data, int num_channels, int input_length, int output_length, int kernel_size, int stride, int padding, int dilation, int ith, int nth) {
      // Precompute constants that are invariant across the inner loops
      int stride_times_dilation = stride * dilation;
-     int part_size = output_length / nth;
-     int offset = ith * part_size;
-     int lane = 4;
+     int input_length_times_num_channels = input_length * num_channels;
+
+     for (int c = 0; c < num_channels; ++c) {
+         int channel_base_index = c * input_length;
+         int channel_end_index = channel_base_index + input_length;

-     #pragma clang loop vectorize(enable)
-     for (int i = offset; i < offset + part_size; ++i) {
          for (int j = 0; j < kernel_size; ++j) {
-             #pragma unroll
-             for (int c = 0; c < num_channels; c += lane) {
-                 int channel_base_index = c * input_length;
-                 int channel_end_index = channel_base_index + input_length;
-                 int dilation_offset = j * dilation - padding;
+             int dilation_offset = j * dilation - padding;

+             for (int i = 0; i < output_length; ++i) {
                  int src_index = channel_base_index + i * stride_times_dilation + dilation_offset;
-                 int dst_index = (i * kernel_size + j) * num_channels + c; // Adjusted for transposition
+                 int dst_index = (c * kernel_size + j) * output_length + i;

                  // Check bounds only once per loop iteration
                  if (src_index >= channel_base_index && src_index < channel_end_index) {
                      dst_data[dst_index] = src_data[src_index];
-                     // float32x4_t input_data = vld1q_f32(src_data + src_index);
-                     // vst1q_f32(dst_data + dst_index, input_data);
+                 } else {
+                     dst_data[dst_index] = 0;
                  }
              }
          }
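
The new dst_index lays the im2col matrix out with the output position as the fastest-moving index, one row of length output_length per (channel, kernel tap) pair, and the explicit else branch zero-fills padded taps instead of relying on a pre-zeroed buffer. A quick index check (numbers chosen for illustration, not from the diff):

    // num_channels = 2, kernel_size = 3, output_length = 4
    // element (c = 1, j = 2, i = 3): dst_index = (1 * 3 + 2) * 4 + 3 = 23,
    // i.e. row index 5 (of 6 rows), column index 3 (of 4) -- matching a dst tensor
    // whose ne[0] is output_columns and ne[1] is in_channels * kernel_size,
    // as allocated in im2col_impl below.
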
@@ -603,8 +613,8 @@ struct ggml_tensor* im2col_impl(struct ggml_context* ctx, struct ggml_tensor* we
      int32_t batch_size = inputs->ne[2];

      int32_t output_columns = ((in_length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride) + 1;
-     auto dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, i_in_channels * kernel_size, output_columns, 1, 1);
-     memset(dst->data, 0, ggml_nbytes(dst));
+     auto dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, output_columns, i_in_channels * kernel_size, 1, 1);
+     // memset(dst->data, 0, ggml_nbytes(dst));

      // printf("Conv1d with kernel_size = %d, dilation = %d, padding = %d, channels = %d \n", kernel_size, dilation, padding, i_in_channels);
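
The output_columns expression is the standard 1-D convolution output-length formula, floor((L + 2*padding - dilation*(kernel_size - 1) - 1) / stride) + 1, and swapping the first two dimensions of dst matches the transposed layout now produced by im2col_multi_channel. A worked example (numbers chosen for illustration, not from the diff):

    // in_length = 10, padding = 1, dilation = 1, kernel_size = 3, stride = 2
    // output_columns = ((10 + 2*1 - 1*(3 - 1) - 1) / 2) + 1 = (9 / 2) + 1 = 5
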
@@ -623,14 +633,18 @@ struct ggml_tensor* im2col_impl(struct ggml_context* ctx, struct ggml_tensor* we
          im2col_multi_channel(dst_ptr, inputs_ptr, channel_count, in_length, output_columns, kernel_size, stride, padding, dilation, ith, nth);
      };

+     auto userdata = allocate_temp_i32_array({stride, padding, dilation, output_columns, kernel_size, in_length, w_in_channels});
      auto result = ggml_map_custom2_inplace(
          ctx,
          dst,
          inputs,
          func,
          GGML_N_TASKS_MAX,
-         allocate_temp_i32_array(ctx, {stride, padding, dilation, output_columns, kernel_size, in_length, w_in_channels})
+         userdata
      );
+     result = cleanup(ctx, result, userdata);
+     // result = ggml_permute(ctx, result, 1, 0, 2, 3);
+     // result = ggml_cont(ctx, result);
      return result;
  }
@@ -836,8 +850,8 @@ struct ggml_tensor* tensor_per_row_cumsum(struct ggml_context* ctx, struct ggml_
      TENSOR_OP_IMPL(per_row_cumsum, tensor, ctx, tensor);
  }

- struct ggml_tensor* tensor_repeat(struct ggml_context* ctx, struct ggml_tensor* tensor, size_t new_dim_size, int across) {
-     TENSOR_OP_IMPL(repeat, tensor, ctx, tensor, new_dim_size, across);
+ struct ggml_tensor* tensor_repeat(struct ggml_context* ctx, struct ggml_allocr* allocr, struct ggml_tensor* tensor, size_t new_dim_size, int across) {
+     TENSOR_OP_IMPL(repeat, tensor, ctx, allocr, tensor, new_dim_size, across);
  }
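
Since tensor_repeat now takes the allocator, existing call sites need to pass it through. A hypothetical call site (identifiers assumed, not from the diff):

    // "allocr" is whatever ggml_allocr the caller already uses while building its graph.
    struct ggml_tensor* repeated = tensor_repeat(ctx, allocr, some_1d_tensor, /*new_dim_size=*/4, /*across=*/0);
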

  struct ggml_tensor* tensor_compare(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, std::function<bool(float, float)> compare_op) {