diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 2bbf87371e7f3..dd668d30352b6 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -28,6 +28,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images | | SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | | SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | +| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. | `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index d0cd5c3b91e34..47ca5a2f1ccbe 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -435,12 +435,46 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList, return PI_SUCCESS; } +static const pi_uint32 ZeCommandListBatchSize = [] { + // Default value of 4. This has been seen as a good tradeoff between + // lower overhead of number of enqueue and fence calls, and getting + // commands seen as soon as possible (i.e. lazy vs eager submission). 
+ pi_uint32 BatchSizeVal = 4; + const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + if (BatchSizeStr) { + pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); + // Level Zero may only support a limited number of commands per command + // list. The actual upper limit is not specified by the Level Zero + // Specification. For now we allow an arbitrary upper limit. + // Negative numbers will be silently ignored. + if (BatchSizeStrVal >= 0) + BatchSizeVal = BatchSizeStrVal; + } + return BatchSizeVal; +}(); + // Retrieve an available command list to be used in a PI call // Caller must hold a lock on the Queue passed in. -pi_result -_pi_device::getAvailableCommandList(pi_queue Queue, - ze_command_list_handle_t *ZeCommandList, - ze_fence_handle_t *ZeFence) { +pi_result _pi_device::getAvailableCommandList( + pi_queue Queue, ze_command_list_handle_t *ZeCommandList, + ze_fence_handle_t *ZeFence, bool AllowBatching) { + // First see if there is a command-list open for batching commands + // for this queue. + if (Queue->ZeOpenCommandList) { + if (AllowBatching) { + *ZeCommandList = Queue->ZeOpenCommandList; + *ZeFence = Queue->ZeOpenCommandListFence; + return PI_SUCCESS; + } + + // If this command isn't allowed to be batched, then we need to + // go ahead and execute what is already in the batched list, + // and then go on to process this. On exit from executeOpenCommandList + // ZeOpenCommandList will be nullptr. + if (auto Res = Queue->executeOpenCommandList()) + return Res; + } + // Create/Reuse the command list, because in Level Zero commands are added to // the command lists, and later are then added to the command queue. 
// Each command list is paired with an associated fence to track when the @@ -525,6 +559,55 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList, return PI_SUCCESS; } +bool _pi_queue::isBatchingAllowed() { + return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0)); +} + +pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList, + ze_fence_handle_t ZeFence) { + if (this->isBatchingAllowed()) { + assert(this->ZeOpenCommandList == nullptr || + this->ZeOpenCommandList == ZeCommandList); + + if (this->ZeOpenCommandListSize + 1 < QueueBatchSize) { + this->ZeOpenCommandList = ZeCommandList; + this->ZeOpenCommandListFence = ZeFence; + + // NOTE: we don't know here how many commands are in the ZeCommandList + // but most PI interfaces translate to a single Level-Zero command. + // Some do translate to multiple commands so we may be undercounting + // a bit here, but this is a heuristic, not an exact measure. + // + this->ZeOpenCommandListSize += 1; + + return PI_SUCCESS; + } + + this->ZeOpenCommandList = nullptr; + this->ZeOpenCommandListFence = nullptr; + this->ZeOpenCommandListSize = 0; + } + + return executeCommandList(ZeCommandList, ZeFence); +} + +pi_result _pi_queue::executeOpenCommandList() { + // If there are any commands still in the open command list for this + // queue, then close and execute that command list now. 
+ auto OpenList = this->ZeOpenCommandList; + if (OpenList) { + auto OpenListFence = this->ZeOpenCommandListFence; + + this->ZeOpenCommandList = nullptr; + this->ZeOpenCommandListFence = nullptr; + this->ZeOpenCommandListSize = 0; + + return executeCommandList(OpenList, OpenListFence); + } + + return PI_SUCCESS; +} + ze_event_handle_t *_pi_event::createZeEventList(pi_uint32 EventListLength, const pi_event *EventList) { try { @@ -1650,7 +1733,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, assert(Queue); try { - *Queue = new _pi_queue(ZeCommandQueue, Context, Device); + *Queue = + new _pi_queue(ZeCommandQueue, Context, Device, ZeCommandListBatchSize); } catch (const std::bad_alloc &) { return PI_OUT_OF_HOST_MEMORY; } catch (...) { @@ -1706,6 +1790,12 @@ pi_result piQueueRelease(pi_queue Queue) { std::lock_guard lock(Queue->PiQueueMutex); if (--(Queue->RefCount) == 0) { + // It is possible to get to here and still have an open command list + // if no wait or finish ever occurred for this queue. But still need + // to make sure commands get executed. + if (auto Res = Queue->executeOpenCommandList()) + return Res; + // Destroy all the fences created associated with this queue. for (const auto &MapEntry : Queue->ZeCommandListFenceMap) { ZE_CALL(zeFenceDestroy(MapEntry.second)); @@ -1724,6 +1814,10 @@ pi_result piQueueFinish(pi_queue Queue) { // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); + // execute any command list that may still be open. + if (auto Res = Queue->executeOpenCommandList()) + return Res; + ZE_CALL(zeCommandQueueSynchronize(Queue->ZeCommandQueue, UINT32_MAX)); return PI_SUCCESS; } @@ -1754,7 +1848,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, // Attach the queue to the "0" device. // TODO: see if we need to let user choose the device. 
pi_device Device = Context->Devices[0]; - *Queue = new _pi_queue(ZeQueue, Context, Device); + *Queue = new _pi_queue(ZeQueue, Context, Device, ZeCommandListBatchSize); return PI_SUCCESS; } @@ -3022,7 +3116,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, ze_command_list_handle_t ZeCommandList = nullptr; ze_fence_handle_t ZeFence = nullptr; if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList, - &ZeFence)) + &ZeFence, true)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -3059,7 +3153,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence)) + if (auto Res = Queue->batchCommandList(ZeCommandList, ZeFence)) return Res; _pi_event::deleteZeEventList(ZeEventWaitList); @@ -3194,6 +3288,19 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { return PI_INVALID_EVENT; } + // Submit dependent open command lists for execution, if any + for (uint32_t I = 0; I < NumEvents; I++) { + auto Queue = EventList[I]->Queue; + + // Lock automatically releases when this goes out of scope. + std::lock_guard lock(Queue->PiQueueMutex); + + if (Queue->RefCount > 0) { + if (auto Res = Queue->executeOpenCommandList()) + return Res; + } + } + for (uint32_t I = 0; I < NumEvents; I++) { ze_event_handle_t ZeEvent = EventList[I]->ZeEvent; zePrint("ZeEvent = %lx\n", pi_cast(ZeEvent)); diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 71e34a5b2a834..53036b4570df3 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -177,9 +177,13 @@ struct _pi_device : _pi_object { // caller must pass a command queue to create a new fence for the new command // list if a command list/fence pair is not available. 
All Command Lists & // associated fences are destroyed at Device Release. + // If AllowBatching is true, then the command list returned may already have + // commands in it; if AllowBatching is false, any open command lists that + // already exist in Queue will be closed and executed. pi_result getAvailableCommandList(pi_queue Queue, ze_command_list_handle_t *ZeCommandList, - ze_fence_handle_t *ZeFence); + ze_fence_handle_t *ZeFence, + bool AllowBatching = false); // Cache of the immutable device properties. ze_device_properties_t ZeDeviceProperties; @@ -268,8 +272,9 @@ struct _pi_context : _pi_object { struct _pi_queue : _pi_object { _pi_queue(ze_command_queue_handle_t Queue, pi_context Context, - pi_device Device) - : ZeCommandQueue{Queue}, Context{Context}, Device{Device} {} + pi_device Device, pi_uint32 QueueBatchSize) + : ZeCommandQueue{Queue}, Context{Context}, Device{Device}, + QueueBatchSize{QueueBatchSize} {} // Level Zero command queue handle. ze_command_queue_handle_t ZeCommandQueue; @@ -291,10 +296,27 @@ struct _pi_queue : _pi_object { // needed/used for the queue data structures. std::mutex PiQueueMutex; + // Open command list field for batching commands into this queue. + ze_command_list_handle_t ZeOpenCommandList = {nullptr}; + ze_fence_handle_t ZeOpenCommandListFence = {nullptr}; + pi_uint32 ZeOpenCommandListSize = {0}; + + // Approximate number of commands that are allowed to be batched for + // this queue. + // Added this member to the queue rather than using a global variable + // so that future implementation could use heuristics to change this on + // a queue specific basis. And by putting it in the queue itself, this + // is thread safe because of the locking of the queue that occurs. + pi_uint32 QueueBatchSize = {0}; + // Map of all Command lists created with their associated Fence used for // tracking when the command list is available for use again. 
std::map ZeCommandListFenceMap; + // Returns true if any commands for this queue are allowed to + // be batched together. + bool isBatchingAllowed(); + // Resets the Command List and Associated fence in the ZeCommandListFenceMap. // If the reset command list should be made available, then MakeAvailable // needs to be set to true. The caller must verify that this command list and @@ -302,14 +324,25 @@ struct _pi_queue : _pi_object { pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList, bool MakeAvailable); + // Attach a command list to this queue and allow it to remain open + // and used for further batching. It may be executed immediately, + // or it may be left open for other future commands to be batched into. + pi_result batchCommandList(ze_command_list_handle_t ZeCommandList, + ze_fence_handle_t ZeFence); + // Attach a command list to this queue, close, and execute it. // Note that this command list cannot be appended to after this. - // The "is_blocking" tells if the wait for completion is requested. + // The "IsBlocking" tells if the wait for completion is requested. // The "ZeFence" passed is used to track when the command list passed // has completed execution on the device and can be reused. pi_result executeCommandList(ze_command_list_handle_t ZeCommandList, ze_fence_handle_t ZeFence, - bool is_blocking = false); + bool IsBlocking = false); + + // If there is an open command list associated with this queue, + // close it, execute it, and reset ZeOpenCommandList, ZeOpenCommandListFence, + // and ZeOpenCommandListSize. + pi_result executeOpenCommandList(); }; struct _pi_mem : _pi_object {