Skip to content

Commit 579972e

Browse files
committed
test: split model loading into phases for measurement
This test separates model loading into two distinct phases: 1. Reading the model from disk and converting data types (stored in RAM). 2. Loading tensors into VRAM (device memory). This allows more precise measurement of how much time each phase contributes to the overall loading time. The results appear as additional log lines in the output: [INFO ] model.cpp:1882 - SD_TEST_POSTPONE_DEVICE_LOADING=2 |==================================================| 2641/2641 - 822.49it/s [INFO ] model.cpp:2107 - loaded 0 tensors to device memory in 3.211ms (phase 1) |==================================================| 2533/2533 - 572.43it/s [INFO ] model.cpp:2127 - loaded 2533 tensors to device memory in 4.425ms (phase 2) The behavior is controlled by the environment variable SD_TEST_POSTPONE_DEVICE_LOADING: - 0: original behavior: only counts tensors loaded to device. - 1: defer loading of tensors whose stored type doesn't match the device type (typically smaller and more numerous). - 2: load all tensors from disk into RAM first, then transfer them to VRAM in phase 2 (full separation). Note: Splitting the loading process into phases demands enough RAM to store all converted weights, and will likely increase total loading time.
1 parent 2eb3845 commit 579972e

File tree

1 file changed

+56
-3
lines changed

1 file changed

+56
-3
lines changed

model.cpp

Lines changed: 56 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1872,6 +1872,16 @@ std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& v
18721872
}
18731873

18741874
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) {
1875+
1876+
int postpone_device = -1;
1877+
const char * postpone_param = getenv("SD_TEST_POSTPONE_DEVICE_LOADING");
1878+
if (postpone_param != nullptr) {
1879+
postpone_device = atoi(postpone_param);
1880+
if (postpone_device < 0 || postpone_device > 2)
1881+
postpone_device = -1;
1882+
LOG_INFO("SD_TEST_POSTPONE_DEVICE_LOADING=%d", postpone_device);
1883+
}
1884+
18751885
std::vector<TensorStorage> processed_tensor_storages;
18761886
for (auto& tensor_storage : tensor_storages) {
18771887
// LOG_DEBUG("%s", name.c_str());
@@ -1942,11 +1952,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
19421952
return true;
19431953
};
19441954
int tensor_count = 0;
1955+
int tensor_cnt_1 = 0;
19451956
int64_t t0 = ggml_time_ms();
19461957
int64_t t1 = t0;
19471958
bool partial = true;
19481959
int tensor_max = (int)processed_tensor_storages.size();
19491960
pretty_progress(0, tensor_max, 0.0f);
1961+
1962+
std::vector<std::pair<ggml_tensor*, std::vector<uint8_t> > > postponed;
1963+
19501964
for (auto& tensor_storage : processed_tensor_storages) {
19511965
if (tensor_storage.file_index != file_index) {
19521966
++tensor_count;
@@ -2038,15 +2052,27 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
20382052
}
20392053

20402054
if (tensor_storage.type == dst_tensor->type) {
2041-
// copy to device memory
2042-
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
2055+
if (postpone_device >= 2) {
2056+
postponed.emplace_back(dst_tensor, std::move(read_buffer));
2057+
}
2058+
else {
2059+
// copy to device memory
2060+
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
2061+
tensor_cnt_1++;
2062+
}
20432063
} else {
20442064
// convert first, then copy to device memory
20452065
convert_buffer.resize(ggml_nbytes(dst_tensor));
20462066
convert_tensor((void*)read_buffer.data(), tensor_storage.type,
20472067
(void*)convert_buffer.data(), dst_tensor->type,
20482068
(int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
2049-
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
2069+
if (postpone_device >= 1) {
2070+
postponed.emplace_back(dst_tensor, std::move(convert_buffer));
2071+
}
2072+
else {
2073+
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
2074+
tensor_cnt_1++;
2075+
}
20502076
}
20512077
}
20522078
++tensor_count;
@@ -2075,6 +2101,33 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
20752101
if (!success) {
20762102
break;
20772103
}
2104+
2105+
if (postpone_device >= 0) {
2106+
2107+
LOG_INFO("loaded %d tensors to device memory in %.3fms (phase 1)", tensor_cnt_1, (t1 - t0) / 1000.0f);
2108+
2109+
if (postponed.size() > 0) {
2110+
int tensor_cnt_2 = 0;
2111+
int64_t t3 = t1;
2112+
t1 = t3;
2113+
int tensor_max = (int)postponed.size();
2114+
pretty_progress(0, tensor_max, 0.0f);
2115+
for (auto& tensor : postponed) {
2116+
ggml_backend_tensor_set(tensor.first, tensor.second.data(), 0, ggml_nbytes(tensor.first));
2117+
++tensor_cnt_2;
2118+
int64_t t2 = ggml_time_ms();
2119+
if ((t2 - t1) >= 200) {
2120+
t1 = t2;
2121+
pretty_progress(tensor_cnt_2, tensor_max, (t1 - t3) / (1000.0f * tensor_cnt_2));
2122+
}
2123+
}
2124+
2125+
t1 = ggml_time_ms();
2126+
pretty_progress(tensor_cnt_2, tensor_max, (t1 - t3) / (1000.0f * tensor_cnt_2));
2127+
LOG_INFO("loaded %d tensors to device memory in %.3fms (phase 2)", tensor_cnt_2, (t1 - t3) / 1000.0f);
2128+
}
2129+
}
2130+
20782131
}
20792132
return success;
20802133
}

0 commit comments

Comments
 (0)