diff --git a/layers/10_cmdbufemu/README.md b/layers/10_cmdbufemu/README.md
index 418af38..ee32b8e 100644
--- a/layers/10_cmdbufemu/README.md
+++ b/layers/10_cmdbufemu/README.md
@@ -33,6 +33,7 @@ The following environment variables can modify the behavior of the command buffe
 | Environment Variable | Behavior | Example Format |
 |----------------------|----------|-----------------|
 | `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br><br>`set CMDBUFEMU_EnhancedErrorChecking=1` |
+| `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br><br>`set CMDBUFEMU_KernelForProfiling=1` |
 
 ## Known Limitations
 
diff --git a/layers/10_cmdbufemu/emulate.cpp b/layers/10_cmdbufemu/emulate.cpp
index 6584eeb..119ad4a 100644
--- a/layers/10_cmdbufemu/emulate.cpp
+++ b/layers/10_cmdbufemu/emulate.cpp
@@ -39,6 +39,26 @@
 const cl_mutable_dispatch_fields_khr g_MutableDispatchCaps =
     CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR;
 
+static cl_int enqueueProfilingKernel(
+    cl_command_queue queue,
+    cl_kernel kernel,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event )
+{
+    const size_t one = 1;
+    return g_pNextDispatch->clEnqueueNDRangeKernel(
+        queue,
+        kernel,
+        1,
+        nullptr,
+        &one,
+        nullptr,
+        num_events_in_wait_list,
+        event_wait_list,
+        event );
+}
+
 typedef struct _cl_mutable_command_khr
 {
     static bool isValid( cl_mutable_command_khr command )
@@ -1229,6 +1249,7 @@ typedef struct _cl_command_buffer_khr
                 (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0 );
 
             cmdbuf->setupTestQueue(queue);
+            cmdbuf->setupProfilingKernel(queue);
         }
     }
 
@@ -1254,6 +1275,11 @@ typedef struct _cl_command_buffer_khr
         {
             g_pNextDispatch->clReleaseCommandQueue(queue);
         }
+
+        for( auto kernel : ProfilingKernels )
+        {
+            g_pNextDispatch->clReleaseKernel(kernel);
+        }
     }
 
     static bool isValid( cl_command_buffer_khr cmdbuf )
@@ -1297,20 +1323,17 @@ typedef struct _cl_command_buffer_khr
 
     cl_command_queue getQueue() const
     {
-        if( Queues.size() > 0 )
-        {
-            return Queues[0];
-        }
-        return nullptr;
+        return Queues.empty() ? nullptr : Queues[0];
    }
 
     cl_command_queue getTestQueue() const
     {
-        if( TestQueues.size() > 0 )
-        {
-            return TestQueues[0];
-        }
-        return nullptr;
+        return TestQueues.empty() ? nullptr : TestQueues[0];
+    }
+
+    cl_kernel getProfilingKernel() const
+    {
+        return ProfilingKernels.empty() ? nullptr : ProfilingKernels[0];
     }
 
     cl_mutable_dispatch_asserts_khr getMutableDispatchAsserts() const
@@ -1671,6 +1694,7 @@ typedef struct _cl_command_buffer_khr
     std::vector<cl_bool>    IsInOrder;
     std::vector<cl_command_queue>   TestQueues;
     std::vector<cl_event>   BlockingEvents;
+    std::vector<cl_kernel>  ProfilingKernels;
     std::vector<std::unique_ptr<Command>> Commands;
     std::atomic<cl_sync_point_khr> NextSyncPoint;
 
@@ -1747,6 +1771,52 @@ typedef struct _cl_command_buffer_khr
         }
     }
 
+    void setupProfilingKernel(cl_command_queue queue)
+    {
+        if( g_KernelForProfiling )
+        {
+            cl_context context = nullptr;
+            g_pNextDispatch->clGetCommandQueueInfo(
+                queue,
+                CL_QUEUE_CONTEXT,
+                sizeof(context),
+                &context,
+                nullptr );
+
+            cl_device_id device = nullptr;
+            g_pNextDispatch->clGetCommandQueueInfo(
+                queue,
+                CL_QUEUE_DEVICE,
+                sizeof(device),
+                &device,
+                nullptr );
+
+            const char* kernelString = "kernel void Empty() {}";
+            cl_program program = g_pNextDispatch->clCreateProgramWithSource(
+                context,
+                1,
+                &kernelString,
+                nullptr,
+                nullptr );
+            g_pNextDispatch->clBuildProgram(
+                program,
+                1,
+                &device,
+                nullptr,
+                nullptr,
+                nullptr );
+
+            cl_kernel kernel = g_pNextDispatch->clCreateKernel(
+                program,
+                "Empty",
+                nullptr );
+            g_pNextDispatch->clReleaseProgram(
+                program );
+
+            ProfilingKernels.push_back(kernel);
+        }
+    }
+
     _cl_command_buffer_khr(
         cl_command_buffer_flags_khr flags,
         cl_mutable_dispatch_asserts_khr mutableDispatchAsserts) :
@@ -1993,7 +2063,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
             queue,
             num_events_in_wait_list,
             event_wait_list,
-            event ? &startEvent : nullptr);
+            event == nullptr || g_KernelForProfiling ? nullptr : &startEvent );
+        if( errorCode == CL_SUCCESS && event && g_KernelForProfiling )
+        {
+            errorCode = enqueueProfilingKernel(
+                queue,
+                cmdbuf->getProfilingKernel(),
+                0,
+                nullptr,
+                &startEvent );
+        }
     }
 
     if( errorCode == CL_SUCCESS )
@@ -2007,7 +2086,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
             queue,
             0,
             nullptr,
-            event );
+            g_KernelForProfiling ? nullptr : event );
+        if( errorCode == CL_SUCCESS && g_KernelForProfiling )
+        {
+            errorCode = enqueueProfilingKernel(
+                queue,
+                cmdbuf->getProfilingKernel(),
+                0,
+                nullptr,
+                event );
+        }
     }
 
     if( event )
diff --git a/layers/10_cmdbufemu/emulate.h b/layers/10_cmdbufemu/emulate.h
index a2fc1dc..cdf0b67 100644
--- a/layers/10_cmdbufemu/emulate.h
+++ b/layers/10_cmdbufemu/emulate.h
@@ -10,6 +10,7 @@
 #include
 
 extern bool g_EnhancedErrorChecking;
+extern bool g_KernelForProfiling;
 
 extern const struct _cl_icd_dispatch* g_pNextDispatch;
 
diff --git a/layers/10_cmdbufemu/main.cpp b/layers/10_cmdbufemu/main.cpp
index 0b06f0a..0937489 100644
--- a/layers/10_cmdbufemu/main.cpp
+++ b/layers/10_cmdbufemu/main.cpp
@@ -34,6 +34,11 @@
 
 bool g_EnhancedErrorChecking = false;
 
+// Using kernels for profiling can fix issues with some implementations
+// that do not properly support event profiling on barriers.
+
+bool g_KernelForProfiling = false;
+
 const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
 
 static cl_int CL_API_CALL
@@ -283,6 +288,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
     _init_dispatch();
 
     getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
+    getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
 
     g_pNextDispatch = target_dispatch;
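
Not part of the patch: a minimal application-side sketch of how the new behavior might be observed, assuming a queue created with `CL_QUEUE_PROFILING_ENABLE`, an already-finalized command buffer, and the `clEnqueueCommandBufferKHR` entry point resolved via `clGetExtensionFunctionAddressForPlatform` (the `profileCommandBuffer` helper name is hypothetical; the `clEnqueueCommandBufferKHR_fn` typedef is the one from the standard cl_khr_command_buffer headers). With `CMDBUFEMU_KernelForProfiling=1` the returned event is associated with the layer's empty kernel rather than with a command-queue barrier, so the same profiling query works on implementations that do not report timestamps for barriers.

    // Sketch only: enqueue an emulated command buffer and read profiling
    // timestamps from the event returned by the layer.
    #define CL_TARGET_OPENCL_VERSION 300
    #include <CL/cl.h>
    #include <CL/cl_ext.h>
    #include <cstdio>

    static void profileCommandBuffer(
        cl_command_queue queue,             // created with CL_QUEUE_PROFILING_ENABLE
        cl_command_buffer_khr cmdbuf,       // finalized command buffer
        clEnqueueCommandBufferKHR_fn clEnqueueCommandBufferKHR )
    {
        cl_event event = nullptr;

        // With CMDBUFEMU_KernelForProfiling=1 this event comes from the empty
        // profiling kernel; otherwise it comes from a command-queue barrier.
        cl_int errorCode = clEnqueueCommandBufferKHR(
            1, &queue, cmdbuf, 0, nullptr, &event );
        if( errorCode != CL_SUCCESS || event == nullptr )
        {
            return;
        }

        clWaitForEvents( 1, &event );

        cl_ulong start = 0, end = 0;
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof(start), &start, nullptr );
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof(end), &end, nullptr );
        std::printf( "command buffer execution took %llu ns\n",
            (unsigned long long)(end - start) );

        clReleaseEvent( event );
    }

The tradeoff described in the README row applies here: the extra empty kernel adds a small per-enqueue cost, which is why the kernel-based profiling path is opt-in rather than the default.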