Commit 9268c1a

improve memory usage
1 parent 89f767b commit 9268c1a

File tree

9 files changed: +186 -147 lines changed

CMakeLists.txt (+5 -10)

@@ -4,19 +4,14 @@ project(vits VERSION 0.1.0)
 set(CMAKE_C_STANDARD 20)
 set(CMAKE_CXX_STANDARD 20)
 
-if(CMAKE_BUILD_TYPE MATCHES debug)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
     # Set the macro VITS_DEBUG=True
     message("Building in debug mode")
-    add_compile_definitions(VITS_DEBUG=1)
-    add_compile_definitions(GGML_PERF=1)
-    add_compile_definitions(GGML_DEBUG=10)
-    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fsanitize=address")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
-    set(CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fsanitize=address")
+    add_compile_options(-fsanitize=address)
+    add_link_options(-fsanitize=address)
 else()
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -march=native -flto -fno-omit-frame-pointer")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto -fno-omit-frame-pointer")
-    set(CMAKE_LINKER_FLAGS_RELEASE "${CMAKE_LINKER_FLAGS_RELEASE} -O3 -march=native -flto -fno-omit-frame-pointer")
+    add_compile_options(-O3 -march=native -flto -fno-omit-frame-pointer)
+    add_link_options(-O3 -march=native -flto -fno-omit-frame-pointer)
 endif()
 
 #message("CMAKE_C_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")

scripts/export_vits.py (+5 -4)

@@ -69,7 +69,7 @@ def serialize_model_to_binary(config, state_dict, tokenizer, file_name):
         f.write(struct.pack('<I', len(tensor_bytes)))
         f.write(tensor_bytes)
 
-def remove_weight_norm_and_convert_to_fp16(module):
+def remove_weight_norm_and_convert_to_fp16(module, full_name=''):
     import torch
     import torch.nn.utils.parametrize as parametrize
 
@@ -83,11 +83,12 @@ def remove_weight_norm_and_convert_to_fp16(module):
             # Optionally print a message
            print(f"Removed weight norm")
 
-        submodule.weight.data = submodule.weight.data.to(torch.float16)
-        print(f"Converted {name} weights to float16")
+        #if not 'resblocks' in full_name:
+        #submodule.weight.data = submodule.weight.data.to(torch.float16)
+        #print(f"Converted {name} weights to float16")
 
         # Recursively apply to children modules
-        remove_weight_norm_and_convert_to_fp16(submodule)
+        remove_weight_norm_and_convert_to_fp16(submodule, full_name + '.' + name)
 
     return module

src/include/common.h (+10)

@@ -8,6 +8,8 @@
 #include <stdint.h>
 #include <thread>
 
+const int MEGABYTE = 1024 * 1024;
+
 static uint32_t read_number(std::ifstream& file) {
     uint32_t number;
     file.read(reinterpret_cast<char*>(&number), sizeof(uint32_t));
@@ -18,6 +20,14 @@ static int get_thread_count() {
     return std::max((int)std::thread::hardware_concurrency(), 6);
 }
 
+#define ALLOC(tensor) \
+    do { \
+        if (allocr) \
+            ggml_allocr_alloc(allocr, tensor); \
+        else \
+            ASSERT(!ggml_get_no_alloc(ctx), "Failed mem initialization") \
+    } while(0);
+
 #define DEFAULT_TENSOR_TYPE GGML_TYPE_F32
 #define DEFAULT_TYPE float

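The new ALLOC macro expands in place and picks up `ctx` and `allocr` from the enclosing scope, so call sites only name the tensor: with a measure allocator it records the tensor's size, with a real allocator it assigns tensor->data, and with no allocator it just asserts that the context allocates data itself. A minimal usage sketch, assuming the ggml-alloc API of this ggml vintage (ggml_allocr_alloc, ggml_allocr_is_measure); build_zeros is a hypothetical helper, not part of the commit:

#include "ggml.h"
#include "ggml-alloc.h"

// Hypothetical helper illustrating the ALLOC pattern. The macro reads
// `ctx` and `allocr` from this function's scope.
static struct ggml_tensor* build_zeros(struct ggml_context* ctx,
                                       struct ggml_allocr* allocr,
                                       int64_t n) {
    struct ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
    ALLOC(t)  // measure pass: size only; real pass: t->data is assigned
    if (allocr && !ggml_allocr_is_measure(allocr)) {
        ggml_set_f32(t, 0.0f);  // only touch data once it actually exists
    }
    return t;
}
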
src/include/custom-ops.h (+59 -45)
@@ -16,15 +16,16 @@
     if (duration.count() > 1) \
         printf("Time taken by function %s: %lld ms\n", name , duration.count());*/
 
-struct ggml_tensor* tensor_shaped_like(struct ggml_context* ctx, ggml_type type, std::vector<int64_t> shape, float value) {
+struct ggml_tensor* tensor_shaped_like(struct ggml_context* ctx, struct ggml_allocr* allocr, ggml_type type, std::vector<int64_t> shape, float value) {
     auto tensor = ggml_new_tensor(ctx, type, shape.size(), shape.data());
+    ALLOC(tensor)
     auto data_fp32 = (type == GGML_TYPE_F32)
         ? static_cast<float*>(tensor->data)
         : nullptr;
     auto data_fp16 = (type == GGML_TYPE_F16)
         ? static_cast<ggml_fp16_t*>(tensor->data)
         : nullptr;
-    auto size = ggml_nelements(tensor) ;
+    auto size = ggml_nelements(tensor);
     for (int i = 0; i < size; ++i) {
         if (type == GGML_TYPE_F16) {
             data_fp16[i] = ggml_fp16_t(value);
@@ -37,26 +38,30 @@ struct ggml_tensor* tensor_shaped_like(struct ggml_context* ctx, ggml_type type,
     return tensor;
 }
 
-struct ggml_tensor* tensor_zeros(struct ggml_context* ctx, std::vector<int64_t> shape) {
-    return tensor_shaped_like(ctx, DEFAULT_TENSOR_TYPE, std::move(shape), 0);
+struct ggml_tensor* tensor_zeros(struct ggml_context* ctx, struct ggml_allocr* allocr, std::vector<int64_t> shape) {
+    return tensor_shaped_like(ctx, allocr, DEFAULT_TENSOR_TYPE, std::move(shape), 0);
 }
 
+void* allocate_temp_i32_array(const std::vector<int>& arr) {
+    void* ptr = malloc(arr.size() * sizeof(int32_t));
+    memcpy(ptr, arr.data(), arr.size() * sizeof(int));
+    return ptr;
+}
 
-struct ggml_tensor* allocate_temp_i32_array(struct ggml_context* ctx, const std::vector<int>& arr) {
-    auto tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, arr.size());
-    memcpy(tensor->data, arr.data(), arr.size() * sizeof(int));
-    return tensor;
+void* allocate_temp_f32_array(const std::vector<float>& arr) {
+    void* ptr = malloc(arr.size() * sizeof(float));
+    memcpy(ptr, arr.data(), arr.size() * sizeof(float));
+    return ptr;
 }
 
-struct ggml_tensor* allocate_temp_f32_array(struct ggml_context* ctx, const std::vector<float>& arr) {
-    auto tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, arr.size());
-    memcpy(tensor->data, arr.data(), arr.size() * sizeof(float));
-    return tensor;
+struct ggml_tensor* cleanup(struct ggml_context* ctx, struct ggml_tensor* result, void* userdata) {
+    return ggml_map_custom1_inplace(ctx, result, [](struct ggml_tensor* dst, const struct ggml_tensor* src, int ith, int nth, void* userdata) {
+        free(userdata);
+    }, 1, userdata);
 }
 
 template<class T> T* get_temp_data(void* userdata) {
-    auto tensor = (struct ggml_tensor*) userdata;
-    return (T*) tensor->data;
+    return (T*) userdata;
 }
 
 inline int compute_index(const ggml_tensor* tensor, int i, int j, int k) {
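
The temp arrays are now plain malloc'ed buffers rather than tensors living in the ggml context, and cleanup chains one extra in-place custom op onto the result so that free(userdata) runs only after the graph has executed the op that reads it. A minimal end-to-end sketch of the pattern using the helpers above; scale_by is a hypothetical op written for illustration, not part of the commit:

struct ggml_tensor* scale_by(struct ggml_context* ctx, struct ggml_tensor* x, float factor) {
    // Heap-allocated userdata replaces the old tensor-backed temp arrays,
    // so nothing extra is kept in the ggml context.
    void* userdata = allocate_temp_f32_array({factor});
    ggml_custom1_op_t func = [](struct ggml_tensor* dst, const struct ggml_tensor* src,
                                int ith, int nth, void* userdata) {
        float f = *get_temp_data<float>(userdata);
        for (int64_t w = ith; w < ggml_nelements(dst); w += nth)
            ((float*)dst->data)[w] = ((const float*)src->data)[w] * f;
    };
    struct ggml_tensor* result = ggml_map_custom1(ctx, x, func, 1, userdata);
    // cleanup appends the node that frees userdata after `result` has run.
    return cleanup(ctx, result, userdata);
}
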
@@ -165,8 +170,10 @@ struct ggml_tensor* name##suffix##_impl(struct ggml_context* ctx, struct ggml_te
         else \
             dst_ptr[w] = name<T>(src_ptr[w]);\
         };\
-    };\
-    return ggml_map_custom1##suffix(ctx, tensor, func, ggml_nelements(tensor) >= get_thread_count() ? GGML_N_TASKS_MAX : 1, allocate_temp_f32_array(ctx, {(float) value})); \
+    }; \
+    auto userdata = allocate_temp_f32_array({(float) value}); \
+    auto result = ggml_map_custom1##suffix(ctx, tensor, func, ggml_nelements(tensor) >= get_thread_count() ? GGML_N_TASKS_MAX : 1, userdata); \
+    return cleanup(ctx, result, userdata); \
 } \
 \
 struct ggml_tensor* tensor_##name##suffix(struct ggml_context* ctx, struct ggml_tensor* tensor, double _extra = 0.0) {\
@@ -212,7 +219,9 @@ template<class T> struct ggml_tensor* flip_3d_impl(struct ggml_context* ctx, str
         });
     };
 
-    return ggml_map_custom1(ctx, tensor, func, GGML_N_TASKS_MAX, allocate_temp_i32_array(ctx, {along}));
+    auto userdata = allocate_temp_i32_array({along});
+    auto result = ggml_map_custom1(ctx, tensor, func, GGML_N_TASKS_MAX, userdata);
+    return cleanup(ctx, result, userdata);
 }
 
 template<class T> struct ggml_tensor* per_row_cumsum_impl(struct ggml_context* ctx, struct ggml_tensor* tensor) {
@@ -264,19 +273,18 @@ template <class T> struct ggml_tensor* max_element_impl(struct ggml_context* ctx
     return ggml_view_1d(ctx, max, 1, (ggml_nelements(max)-1) * ggml_element_size(max));
 }
 
-template<class T> struct ggml_tensor* repeat_impl(struct ggml_context* ctx, struct ggml_tensor* tensor, int64_t new_dim_size, int across) {
+template<class T> struct ggml_tensor* repeat_impl(struct ggml_context* ctx, struct ggml_allocr* allocr, struct ggml_tensor* tensor, int64_t new_dim_size, int across) {
     ASSERT(tensor->n_dims == 1, "Only 1d tensors supported");
     ASSERT(across == 0 || across == 1, "Only across == 0 || 1 supported");
     std::vector<int64_t> shape = {across == 0 ? new_dim_size : tensor->ne[0], across == 1 ? new_dim_size : tensor->ne[0]};
-    auto new_tensor = tensor_zeros(ctx, shape);
-
-    ggml_custom2_op_t func = [](struct ggml_tensor * dst, const struct ggml_tensor * src0, const struct ggml_tensor * src1, int ith, int nth, void * userdata) {
+    auto new_tensor = tensor_zeros(ctx, allocr, shape);
+    ggml_custom2_op_t func = [](struct ggml_tensor * dst, const struct ggml_tensor * _, const struct ggml_tensor * src1, int ith, int nth, void * userdata) {
         START_BENCH()
         auto across = *get_temp_data<int>(userdata);
         auto* dst_ptr = (T*)dst->data;
         auto* src1_ptr = (T*)src1->data;
 
-        size_t size = ggml_element_size(src0);
+        size_t size = ggml_element_size(dst);
         for_each_element_threaded(dst, ith, nth, [&] (int i, int j, int k) {
             auto idx1 = ((across == 0 ? j : i) * src1->nb[0]) / size;
             auto dst_idx = (i * dst->nb[0] + j * dst->nb[1]) / size;
@@ -285,14 +293,17 @@ template<class T> struct ggml_tensor* repeat_impl(struct ggml_context* ctx, stru
         });
         PRINT_BENCH("repeat")
     };
-    return ggml_map_custom2_inplace(
+    auto userdata = allocate_temp_i32_array({across});
+    // inplace breaks?
+    auto result = ggml_map_custom2(
         ctx,
         new_tensor,
         tensor,
         func,
-        1,
-        allocate_temp_i32_array(ctx, {across})
+        GGML_N_TASKS_MAX,
+        userdata
     );
+    return cleanup(ctx, result, userdata);
 }
 
 template<class T> struct ggml_tensor* compare_impl(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, std::function<bool(float, float)> compare_op) {
@@ -364,14 +375,16 @@ template<class T> struct ggml_tensor* set_inplace_impl(struct ggml_context* ctx,
     * */
     PRINT_BENCH("set_inplace")
     };
-    return ggml_map_custom2_inplace(
+    auto userdata = allocate_temp_i32_array({start0, start1, start2});
+    auto result = ggml_map_custom2_inplace(
         ctx,
         tensor,
         values,
         func,
         1,
-        allocate_temp_i32_array(ctx, {start0, start1, start2})
+        userdata
     );
+    return cleanup(ctx, result, userdata);
 }
 
 template<class T> struct ggml_tensor* add_bias_inplace_impl(struct ggml_context* ctx, struct ggml_tensor* tensor, struct ggml_tensor* bias) {
@@ -566,27 +579,24 @@ struct ggml_tensor* conv_1d_inplace_impl_fp16(struct ggml_context* ctx, struct g
 void im2col_multi_channel(float * dst_data, const float* src_data, int num_channels, int input_length, int output_length, int kernel_size, int stride, int padding, int dilation, int ith, int nth) {
     // Precompute constants that are invariant across the inner loops
     int stride_times_dilation = stride * dilation;
-    int part_size = output_length / nth;
-    int offset = ith * part_size;
-    int lane = 4;
+    int input_length_times_num_channels = input_length * num_channels;
+
+    for (int c = 0; c < num_channels; ++c) {
+        int channel_base_index = c * input_length;
+        int channel_end_index = channel_base_index + input_length;
 
-    #pragma clang loop vectorize(enable)
-    for (int i = offset; i < offset + part_size; ++i) {
         for (int j = 0; j < kernel_size; ++j) {
-            #pragma unroll
-            for (int c = 0; c < num_channels; c += lane) {
-                int channel_base_index = c * input_length;
-                int channel_end_index = channel_base_index + input_length;
-                int dilation_offset = j * dilation - padding;
+            int dilation_offset = j * dilation - padding;
 
+            for (int i = 0; i < output_length; ++i) {
                 int src_index = channel_base_index + i * stride_times_dilation + dilation_offset;
-                int dst_index = (i * kernel_size + j) * num_channels + c; // Adjusted for transposition
+                int dst_index = (c * kernel_size + j) * output_length + i;
 
                 // Check bounds only once per loop iteration
                 if (src_index >= channel_base_index && src_index < channel_end_index) {
                     dst_data[dst_index] = src_data[src_index];
-                    //float32x4_t input_data = vld1q_f32(src_data + src_index);
-                    //vst1q_f32(dst_data + dst_index, input_data);
+                } else {
+                    dst_data[dst_index] = 0;
                 }
             }
         }
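
The rewritten loop fills dst in a (channel, kernel-tap)-major layout: row c * kernel_size + j is a contiguous run of output_length values, matching the transposed dst shape in im2col_impl below, and out-of-range taps are written as explicit zeros instead of relying on a pre-zeroed buffer. A standalone, single-threaded sketch of the same indexing (plain C++, no ggml; im2col_ref is a name introduced here for illustration):

#include <vector>

// Mirrors im2col_multi_channel's indexing: src is num_channels rows of
// input_length samples; dst is (num_channels * kernel_size) rows of
// output_length columns, row-major.
std::vector<float> im2col_ref(const std::vector<float>& src,
                              int num_channels, int input_length, int output_length,
                              int kernel_size, int stride, int padding, int dilation) {
    std::vector<float> dst(num_channels * kernel_size * output_length);
    for (int c = 0; c < num_channels; ++c) {
        int base = c * input_length;
        for (int j = 0; j < kernel_size; ++j) {
            int off = j * dilation - padding;
            for (int i = 0; i < output_length; ++i) {
                int src_index = base + i * stride * dilation + off; // stride_times_dilation above
                int dst_index = (c * kernel_size + j) * output_length + i;
                dst[dst_index] = (src_index >= base && src_index < base + input_length)
                                     ? src[src_index]
                                     : 0.0f; // explicit zero instead of an upfront memset
            }
        }
    }
    return dst;
}
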
@@ -603,8 +613,8 @@ struct ggml_tensor* im2col_impl(struct ggml_context* ctx, struct ggml_tensor* we
     int32_t batch_size = inputs->ne[2];
 
     int32_t output_columns = ((in_length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride) + 1;
-    auto dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, i_in_channels * kernel_size, output_columns, 1, 1);
-    memset(dst->data, 0, ggml_nbytes(dst));
+    auto dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, output_columns, i_in_channels * kernel_size, 1, 1);
+    //memset(dst->data, 0, ggml_nbytes(dst));
 
     //printf("Conv1d with kernel_size = %d, dilation = %d, padding = %d, channels = %d \n", kernel_size, dilation, padding, i_in_channels);
 
@@ -623,14 +633,18 @@ struct ggml_tensor* im2col_impl(struct ggml_context* ctx, struct ggml_tensor* we
         im2col_multi_channel(dst_ptr, inputs_ptr, channel_count, in_length, output_columns, kernel_size, stride, padding, dilation, ith, nth);
     };
 
+    auto userdata = allocate_temp_i32_array({stride, padding, dilation, output_columns, kernel_size, in_length, w_in_channels});
     auto result = ggml_map_custom2_inplace(
         ctx,
         dst,
         inputs,
         func,
         GGML_N_TASKS_MAX,
-        allocate_temp_i32_array(ctx, {stride, padding, dilation, output_columns, kernel_size, in_length, w_in_channels})
+        userdata
     );
+    result = cleanup(ctx, result, userdata);
+    //result = ggml_permute(ctx, result, 1, 0, 2, 3);
+    //result = ggml_cont(ctx, result);
     return result;
 }
 
@@ -836,8 +850,8 @@ struct ggml_tensor* tensor_per_row_cumsum(struct ggml_context* ctx, struct ggml_
     TENSOR_OP_IMPL(per_row_cumsum, tensor, ctx, tensor);
 }
 
-struct ggml_tensor* tensor_repeat(struct ggml_context* ctx, struct ggml_tensor* tensor, size_t new_dim_size, int across) {
-    TENSOR_OP_IMPL(repeat, tensor, ctx, tensor, new_dim_size, across);
+struct ggml_tensor* tensor_repeat(struct ggml_context* ctx, struct ggml_allocr* allocr, struct ggml_tensor* tensor, size_t new_dim_size, int across) {
+    TENSOR_OP_IMPL(repeat, tensor, ctx, allocr, tensor, new_dim_size, across);
 }
 
 struct ggml_tensor* tensor_compare(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, std::function<bool(float, float)> compare_op) {
