diff --git a/layers/10_cmdbufemu/README.md b/layers/10_cmdbufemu/README.md
index 418af38..ee32b8e 100644
--- a/layers/10_cmdbufemu/README.md
+++ b/layers/10_cmdbufemu/README.md
@@ -33,6 +33,7 @@ The following environment variables can modify the behavior of the command buffe
 | Environment Variable | Behavior | Example Format |
 |----------------------|----------|-----------------|
 | `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br><br>`set CMDBUFEMU_EnhancedErrorChecking=1` |
+| `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br><br>`set CMDBUFEMU_KernelForProfiling=1` |
 
 ## Known Limitations
 
diff --git a/layers/10_cmdbufemu/emulate.cpp b/layers/10_cmdbufemu/emulate.cpp
index 6584eeb..119ad4a 100644
--- a/layers/10_cmdbufemu/emulate.cpp
+++ b/layers/10_cmdbufemu/emulate.cpp
@@ -39,6 +39,26 @@
 const cl_mutable_dispatch_fields_khr g_MutableDispatchCaps =
     CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR;
 
+static cl_int enqueueProfilingKernel(
+    cl_command_queue queue,
+    cl_kernel kernel,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event )
+{
+    const size_t one = 1;
+    return g_pNextDispatch->clEnqueueNDRangeKernel(
+        queue,
+        kernel,
+        1,
+        nullptr,
+        &one,
+        nullptr,
+        num_events_in_wait_list,
+        event_wait_list,
+        event );
+}
+
 typedef struct _cl_mutable_command_khr
 {
     static bool isValid( cl_mutable_command_khr command )
@@ -1229,6 +1249,7 @@ typedef struct _cl_command_buffer_khr
                 (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0 );
 
             cmdbuf->setupTestQueue(queue);
+            cmdbuf->setupProfilingKernel(queue);
         }
     }
 
@@ -1254,6 +1275,11 @@ typedef struct _cl_command_buffer_khr
         {
             g_pNextDispatch->clReleaseCommandQueue(queue);
         }
+
+        for( auto kernel : ProfilingKernels )
+        {
+            g_pNextDispatch->clReleaseKernel(kernel);
+        }
     }
 
     static bool isValid( cl_command_buffer_khr cmdbuf )
@@ -1297,20 +1323,17 @@ typedef struct _cl_command_buffer_khr
 
     cl_command_queue getQueue() const
     {
-        if( Queues.size() > 0 )
-        {
-            return Queues[0];
-        }
-        return nullptr;
+        return Queues.empty() ? nullptr : Queues[0];
    }
 
     cl_command_queue getTestQueue() const
     {
-        if( TestQueues.size() > 0 )
-        {
-            return TestQueues[0];
-        }
-        return nullptr;
+        return TestQueues.empty() ? nullptr : TestQueues[0];
+    }
+
+    cl_kernel getProfilingKernel() const
+    {
+        return ProfilingKernels.empty() ? nullptr : ProfilingKernels[0];
     }
 
     cl_mutable_dispatch_asserts_khr getMutableDispatchAsserts() const
@@ -1671,6 +1694,7 @@ typedef struct _cl_command_buffer_khr
     std::vector<cl_bool>    IsInOrder;
     std::vector<cl_command_queue>   TestQueues;
     std::vector<cl_event>   BlockingEvents;
+    std::vector<cl_kernel>  ProfilingKernels;
     std::vector<std::unique_ptr<Command>> Commands;
     std::atomic<cl_sync_point_khr> NextSyncPoint;
 
@@ -1747,6 +1771,52 @@ typedef struct _cl_command_buffer_khr
         }
     }
 
+    void setupProfilingKernel(cl_command_queue queue)
+    {
+        if( g_KernelForProfiling )
+        {
+            cl_context context = nullptr;
+            g_pNextDispatch->clGetCommandQueueInfo(
+                queue,
+                CL_QUEUE_CONTEXT,
+                sizeof(context),
+                &context,
+                nullptr );
+
+            cl_device_id device = nullptr;
+            g_pNextDispatch->clGetCommandQueueInfo(
+                queue,
+                CL_QUEUE_DEVICE,
+                sizeof(device),
+                &device,
+                nullptr );
+
+            const char* kernelString = "kernel void Empty() {}";
+            cl_program program = g_pNextDispatch->clCreateProgramWithSource(
+                context,
+                1,
+                &kernelString,
+                nullptr,
+                nullptr );
+            g_pNextDispatch->clBuildProgram(
+                program,
+                1,
+                &device,
+                nullptr,
+                nullptr,
+                nullptr );
+
+            cl_kernel kernel = g_pNextDispatch->clCreateKernel(
+                program,
+                "Empty",
+                nullptr );
+            g_pNextDispatch->clReleaseProgram(
+                program );
+
+            ProfilingKernels.push_back(kernel);
+        }
+    }
+
     _cl_command_buffer_khr(
         cl_command_buffer_flags_khr flags,
         cl_mutable_dispatch_asserts_khr mutableDispatchAsserts) :
@@ -1993,7 +2063,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
             queue,
             num_events_in_wait_list,
             event_wait_list,
-            event ? &startEvent : nullptr);
+            event == nullptr || g_KernelForProfiling ? nullptr : &startEvent );
+        if( errorCode == CL_SUCCESS && event && g_KernelForProfiling )
+        {
+            errorCode = enqueueProfilingKernel(
+                queue,
+                cmdbuf->getProfilingKernel(),
+                0,
+                nullptr,
+                &startEvent );
+        }
     }
 
     if( errorCode == CL_SUCCESS )
@@ -2007,7 +2086,16 @@ cl_int CL_API_CALL clEnqueueCommandBufferKHR_EMU(
             queue,
             0,
             nullptr,
-            event );
+            g_KernelForProfiling ? nullptr : event );
+        if( errorCode == CL_SUCCESS && g_KernelForProfiling )
+        {
+            errorCode = enqueueProfilingKernel(
+                queue,
+                cmdbuf->getProfilingKernel(),
+                0,
+                nullptr,
+                event );
+        }
     }
 
     if( event )
diff --git a/layers/10_cmdbufemu/emulate.h b/layers/10_cmdbufemu/emulate.h
index a2fc1dc..cdf0b67 100644
--- a/layers/10_cmdbufemu/emulate.h
+++ b/layers/10_cmdbufemu/emulate.h
@@ -10,6 +10,7 @@
 #include
 
 extern bool g_EnhancedErrorChecking;
+extern bool g_KernelForProfiling;
 
 extern const struct _cl_icd_dispatch* g_pNextDispatch;
 
diff --git a/layers/10_cmdbufemu/main.cpp b/layers/10_cmdbufemu/main.cpp
index 0b06f0a..0937489 100644
--- a/layers/10_cmdbufemu/main.cpp
+++ b/layers/10_cmdbufemu/main.cpp
@@ -34,6 +34,11 @@
 
 bool g_EnhancedErrorChecking = false;
 
+// Using kernels for profiling can fix issues with some implementations
+// that do not properly support event profiling on barriers.
+
+bool g_KernelForProfiling = false;
+
 const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
 
 static cl_int CL_API_CALL
@@ -283,6 +288,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayer(
     _init_dispatch();
 
     getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
+    getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
 
     g_pNextDispatch = target_dispatch;
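
Not part of the patch: a minimal application-side sketch of how the new behavior might be observed, assuming a queue created with `CL_QUEUE_PROFILING_ENABLE`, an already-finalized command buffer, and the `clEnqueueCommandBufferKHR` entry point resolved via `clGetExtensionFunctionAddressForPlatform` (the `profileCommandBuffer` helper name is hypothetical; the `clEnqueueCommandBufferKHR_fn` typedef is the one from the standard cl_khr_command_buffer headers). With `CMDBUFEMU_KernelForProfiling=1` the returned event is associated with the layer's empty kernel rather than with a command-queue barrier, so the same profiling query works on implementations that do not report timestamps for barriers.

    // Sketch only: enqueue an emulated command buffer and read profiling
    // timestamps from the event returned by the layer.
    #define CL_TARGET_OPENCL_VERSION 300
    #include <CL/cl.h>
    #include <CL/cl_ext.h>
    #include <cstdio>

    static void profileCommandBuffer(
        cl_command_queue queue,             // created with CL_QUEUE_PROFILING_ENABLE
        cl_command_buffer_khr cmdbuf,       // finalized command buffer
        clEnqueueCommandBufferKHR_fn clEnqueueCommandBufferKHR )
    {
        cl_event event = nullptr;

        // With CMDBUFEMU_KernelForProfiling=1 this event comes from the empty
        // profiling kernel; otherwise it comes from a command-queue barrier.
        cl_int errorCode = clEnqueueCommandBufferKHR(
            1, &queue, cmdbuf, 0, nullptr, &event );
        if( errorCode != CL_SUCCESS || event == nullptr )
        {
            return;
        }

        clWaitForEvents( 1, &event );

        cl_ulong start = 0, end = 0;
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof(start), &start, nullptr );
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof(end), &end, nullptr );
        std::printf( "command buffer execution took %llu ns\n",
            (unsigned long long)(end - start) );

        clReleaseEvent( event );
    }

The tradeoff described in the README row applies here: the extra empty kernel adds a small per-enqueue cost, which is why the kernel-based profiling path is opt-in rather than the default.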