Skip to content

Commit c342a78

Browse files
authored
[SYCL] Clear cache in case of PI_ERROR_OUT_OF_HOST_MEMORY (#14119)
I'm observing cache overflow when running heavy tests on OCL backend with gpu. Clear cache in case of PI_ERROR_OUT_OF_HOST_MEMORY as well as for PI_ERROR_OUT_OF_RESOURCES. Using as reference: #11987
1 parent da3b5df commit c342a78

File tree

3 files changed

+136
-4
lines changed

3 files changed

+136
-4
lines changed

Diff for: sycl/source/detail/kernel_program_cache.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,8 @@ class KernelProgramCache {
290290
} catch (const exception &Ex) {
291291
BuildResult->Error.Msg = Ex.what();
292292
BuildResult->Error.Code = Ex.get_cl_code();
293-
if (BuildResult->Error.Code == PI_ERROR_OUT_OF_RESOURCES) {
293+
if (BuildResult->Error.Code == PI_ERROR_OUT_OF_RESOURCES ||
294+
BuildResult->Error.Code == PI_ERROR_OUT_OF_HOST_MEMORY) {
294295
reset();
295296
BuildResult->updateAndNotify(BuildState::BS_Initial);
296297
continue;

Diff for: sycl/source/detail/program_manager/program_manager.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -1223,7 +1223,8 @@ ProgramManager::ProgramPtr ProgramManager::build(
12231223
nullptr, &LinkedProg);
12241224
};
12251225
sycl::detail::pi::PiResult Error = doLink();
1226-
if (Error == PI_ERROR_OUT_OF_RESOURCES) {
1226+
if (Error == PI_ERROR_OUT_OF_RESOURCES ||
1227+
Error == PI_ERROR_OUT_OF_HOST_MEMORY) {
12271228
Context->getKernelProgramCache().reset();
12281229
Error = doLink();
12291230
}
@@ -2118,7 +2119,8 @@ ProgramManager::link(const device_image_plain &DeviceImage,
21182119
/*user_data=*/nullptr, &LinkedProg);
21192120
};
21202121
sycl::detail::pi::PiResult Error = doLink();
2121-
if (Error == PI_ERROR_OUT_OF_RESOURCES) {
2122+
if (Error == PI_ERROR_OUT_OF_RESOURCES ||
2123+
Error == PI_ERROR_OUT_OF_HOST_MEMORY) {
21222124
ContextImpl->getKernelProgramCache().reset();
21232125
Error = doLink();
21242126
}

Diff for: sycl/unittests/kernel-and-program/OutOfResources.cpp

+130-1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ static sycl::unittest::PiImageArray<2> ImgArray{Img};
6868

6969
static int nProgramCreate = 0;
7070
static volatile bool outOfResourcesToggle = false;
71+
static volatile bool outOfHostMemoryToggle = false;
7172

7273
static pi_result redefinedProgramCreate(pi_context context, const void *il,
7374
size_t length,
@@ -80,6 +81,17 @@ static pi_result redefinedProgramCreate(pi_context context, const void *il,
8081
return PI_SUCCESS;
8182
}
8283

84+
static pi_result
85+
redefinedProgramCreateOutOfHostMemory(pi_context context, const void *il,
86+
size_t length, pi_program *res_program) {
87+
++nProgramCreate;
88+
if (outOfHostMemoryToggle) {
89+
outOfHostMemoryToggle = false;
90+
return PI_ERROR_OUT_OF_HOST_MEMORY;
91+
}
92+
return PI_SUCCESS;
93+
}
94+
8395
TEST(OutOfResourcesTest, piProgramCreate) {
8496
sycl::unittest::PiMock Mock;
8597
Mock.redefineBefore<detail::PiApiKind::piProgramCreate>(
@@ -141,6 +153,70 @@ TEST(OutOfResourcesTest, piProgramCreate) {
141153
}
142154
}
143155

156+
TEST(OutOfHostMemoryTest, piProgramCreate) {
157+
// Reset to zero.
158+
nProgramCreate = 0;
159+
160+
sycl::unittest::PiMock Mock;
161+
Mock.redefineBefore<detail::PiApiKind::piProgramCreate>(
162+
redefinedProgramCreateOutOfHostMemory);
163+
164+
sycl::platform Plt{Mock.getPlatform()};
165+
sycl::context Ctx{Plt};
166+
auto CtxImpl = detail::getSyclObjImpl(Ctx);
167+
queue q(Ctx, default_selector_v);
168+
169+
int runningTotal = 0;
170+
// Cache is empty, so one piProgramCreate call.
171+
q.single_task<class OutOfResourcesKernel1>([] {});
172+
EXPECT_EQ(nProgramCreate, runningTotal += 1);
173+
174+
// Now, we make the next piProgramCreate call fail with
175+
// PI_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch this,
176+
// clear the cache, and retry the piProgramCreate.
177+
outOfHostMemoryToggle = true;
178+
q.single_task<class OutOfResourcesKernel2>([] {});
179+
EXPECT_FALSE(outOfHostMemoryToggle);
180+
EXPECT_EQ(nProgramCreate, runningTotal += 2);
181+
{
182+
detail::KernelProgramCache::ProgramCache &Cache =
183+
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
184+
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
185+
}
186+
187+
// The next piProgramCreate call will fail with
188+
// PI_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in
189+
// the cache, so we expect no new piProgramCreate calls.
190+
outOfHostMemoryToggle = true;
191+
q.single_task<class OutOfResourcesKernel2>([] {});
192+
EXPECT_TRUE(outOfHostMemoryToggle);
193+
EXPECT_EQ(nProgramCreate, runningTotal);
194+
195+
// OutOfResourcesKernel1 is not in the cache, so we have to
196+
// build it. From what we set before, this call will fail,
197+
// the cache will clear out, and will try again.
198+
q.single_task<class OutOfResourcesKernel1>([] {});
199+
EXPECT_FALSE(outOfHostMemoryToggle);
200+
EXPECT_EQ(nProgramCreate, runningTotal += 2);
201+
{
202+
detail::KernelProgramCache::ProgramCache &Cache =
203+
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
204+
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
205+
}
206+
207+
// Finally, OutOfResourcesKernel1 will be in the cache, but
208+
// OutOfResourceKenel2 will not, so one more piProgramCreate.
209+
// Toggle is not set, so this should succeed.
210+
q.single_task<class OutOfResourcesKernel1>([] {});
211+
q.single_task<class OutOfResourcesKernel2>([] {});
212+
EXPECT_EQ(nProgramCreate, runningTotal += 1);
213+
{
214+
detail::KernelProgramCache::ProgramCache &Cache =
215+
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
216+
EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache";
217+
}
218+
}
219+
144220
static int nProgramLink = 0;
145221

146222
static pi_result
@@ -158,6 +234,20 @@ redefinedProgramLink(pi_context context, pi_uint32 num_devices,
158234
return PI_SUCCESS;
159235
}
160236

237+
static pi_result redefinedProgramLinkOutOfHostMemory(
238+
pi_context context, pi_uint32 num_devices, const pi_device *device_list,
239+
const char *options, pi_uint32 num_input_programs,
240+
const pi_program *input_programs,
241+
void (*pfn_notify)(pi_program program, void *user_data), void *user_data,
242+
pi_program *ret_program) {
243+
++nProgramLink;
244+
if (outOfHostMemoryToggle) {
245+
outOfHostMemoryToggle = false;
246+
return PI_ERROR_OUT_OF_HOST_MEMORY;
247+
}
248+
return PI_SUCCESS;
249+
}
250+
161251
TEST(OutOfResourcesTest, piProgramLink) {
162252
sycl::unittest::PiMock Mock;
163253
Mock.redefineBefore<detail::PiApiKind::piProgramLink>(redefinedProgramLink);
@@ -191,4 +281,43 @@ TEST(OutOfResourcesTest, piProgramLink) {
191281
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
192282
EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
193283
}
194-
}
284+
}
285+
286+
TEST(OutOfHostMemoryTest, piProgramLink) {
287+
// Reset to zero.
288+
nProgramLink = 0;
289+
290+
sycl::unittest::PiMock Mock;
291+
Mock.redefineBefore<detail::PiApiKind::piProgramLink>(
292+
redefinedProgramLinkOutOfHostMemory);
293+
294+
sycl::platform Plt{Mock.getPlatform()};
295+
sycl::context Ctx{Plt};
296+
auto CtxImpl = detail::getSyclObjImpl(Ctx);
297+
queue q(Ctx, default_selector_v);
298+
// Put some programs in the cache
299+
q.single_task<class OutOfResourcesKernel1>([] {});
300+
q.single_task<class OutOfResourcesKernel2>([] {});
301+
{
302+
detail::KernelProgramCache::ProgramCache &Cache =
303+
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
304+
EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache";
305+
}
306+
307+
auto b1 = sycl::get_kernel_bundle<OutOfResourcesKernel1,
308+
sycl::bundle_state::object>(Ctx);
309+
auto b2 = sycl::get_kernel_bundle<OutOfResourcesKernel2,
310+
sycl::bundle_state::object>(Ctx);
311+
outOfHostMemoryToggle = true;
312+
EXPECT_EQ(nProgramLink, 0);
313+
auto b3 = sycl::link({b1, b2});
314+
EXPECT_FALSE(outOfHostMemoryToggle);
315+
// one restart due to out of resources, one link per each of b1 and b2.
316+
EXPECT_EQ(nProgramLink, 3);
317+
// no programs should be in the cache due to out of resources.
318+
{
319+
detail::KernelProgramCache::ProgramCache &Cache =
320+
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
321+
EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
322+
}
323+
}

0 commit comments

Comments
 (0)