diff --git a/sdk/CMakeLists.txt b/sdk/CMakeLists.txt
index 0221f02..5e8e390 100644
--- a/sdk/CMakeLists.txt
+++ b/sdk/CMakeLists.txt
@@ -240,6 +240,7 @@ if(BUILD_TESTING AND PTI_BUILD_SAMPLES)
   add_subdirectory(samples/onemkl_gemm)
   add_subdirectory(samples/iso3dfd_dpcpp)
   add_subdirectory(samples/metrics_iso3dfd_dpcpp)
+  add_subdirectory(samples/callback)
 endif()

 if(BUILD_TESTING AND PTI_BUILD_TESTING)
diff --git a/sdk/include/pti/pti.h b/sdk/include/pti/pti.h
index 3bd6a3d..707e8a3 100644
--- a/sdk/include/pti/pti.h
+++ b/sdk/include/pti/pti.h
@@ -31,7 +31,8 @@ typedef enum {
                                 //!< PTI_VIEW_EXTERNAL_CORRELATION
   PTI_ERROR_BAD_TIMESTAMP = 6,  //!< error in timestamp conversion, might be related with the user
                                 //!< provided TimestampCallback
-  PTI_ERROR_BAD_API_ID = 7,     //!< invalid api_id when enable/disable runtime/driver specific api_id
+  PTI_ERROR_BAD_API_ID = 7,     //!< invalid api_id when enable/disable runtime/driver specific api_id
+  PTI_ERROR_AT_LEAST_ONE_GPU_VIEW_MUST_BE_ENABLED = 8,  //!< at least one GPU view must be enabled for kernel tracing
   PTI_ERROR_DRIVER = 50,        //!< unknown driver error
   PTI_ERROR_TRACING_NOT_INITIALIZED = 51,  //!< installed driver requires tracing enabling with
                                            //!< setting environment variable ZE_ENABLE_TRACING_LAYER
diff --git a/sdk/include/pti/pti_sync_callback.h b/sdk/include/pti/pti_sync_callback.h
new file mode 100644
index 0000000..6ed2986
--- /dev/null
+++ b/sdk/include/pti/pti_sync_callback.h
@@ -0,0 +1,227 @@
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_SYNC_CALLBACK_H_
+#define PTI_SYNC_CALLBACK_H_
+
+#include "pti/pti_metrics.h"
+#include "pti/pti_view.h"
+
+/**
+ * This file contains APIs that are so far experimental in PTI.
+ * APIs and data structures in this file are work-in-progress and subject to change!
+ *
+ * Everything in this file concerns the Callback API.
+ * The Callback API is useful for many things, including the implementation of the
+ * MetricsScope functionality, which wants to subscribe to kernel appends to a command list
+ * and possibly to other events.
+ *
+ * So the MetricsScope API is the first user of the [for now internal] Callback API.
+ *
+ */
+
+
+/* clang-format off */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef uint32_t pti_callback_subscriber_handle;
+
+#define PTI_CALLBACK_SUBSCRIBER_HANDLE_INVALID 0
+
+typedef enum _pti_callback_domain {
+  PTI_CB_DOMAIN_INVALID = 0,
+  PTI_CB_DOMAIN_DRIVER_CONTEXT_CREATED = 1,   //!< Not implemented yet
+                                              //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+  PTI_CB_DOMAIN_DRIVER_MODULE_LOADED = 2,     //!< Not implemented yet
+                                              //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+  PTI_CB_DOMAIN_DRIVER_MODULE_UNLOADED = 3,   //!< Not implemented yet
+                                              //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED = 4,    //!< This also serves as PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED
+                                                      //!< when appended to an Immediate Command List,
+                                                      //!< which means no separate PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED
+                                                      //!< callback is issued
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED = 5,  //!< Not implemented yet
+                                                      //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED = 6,
+  PTI_CB_DOMAIN_DRIVER_HOST_SYNCHRONIZATION = 7,      //!< Not implemented yet
+                                                      //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_API = 1023,  //!< Not implemented yet,
+                                    //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+                                    //!< Callback created for all Driver APIs
+  // The domains below inform the user about PTI internal events
+  PTI_CB_DOMAIN_INTERNAL_THREADS = 1024,  //!< Not implemented yet
+  PTI_CB_DOMAIN_INTERNAL_EVENT = 1025,    //!< Not implemented yet
+
+  PTI_CB_DOMAIN_MAX = 0x7fffffff
+} pti_callback_domain;
+
+typedef enum _pti_callback_phase {
+  PTI_CB_PHASE_INVALID = 0,
+  PTI_CB_PHASE_API_ENTER = 1,
+  PTI_CB_PHASE_API_EXIT = 2,
+  PTI_CB_PHASE_INTERNAL_THREAD_START = 3,
+  PTI_CB_PHASE_INTERNAL_THREAD_END = 4,
+  PTI_CB_PHASE_INTERNAL_EVENT = 5,
+
+  PTI_CB_PHASE_MAX = 0x7fffffff
+} pti_callback_phase;
+
+typedef enum _pti_backend_command_list_type {
+  PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN = (1<<0),
+  PTI_BACKEND_COMMAND_LIST_TYPE_IMMEDIATE = (1<<1),
+  PTI_BACKEND_COMMAND_LIST_TYPE_MUTABLE = (1<<2),
+
+  PTI_BACKEND_COMMAND_LIST_TYPE_MAX = 0x7fffffff
+} pti_backend_command_list_type;
+
+typedef void* pti_backend_command_list_t;  //!< Backend command list handle
+
+/**
+ * A user can subscribe to notifications about non-standard situations from PTI
+ * while it collects or processes the data
+ */
+typedef enum _pti_internal_event_type {
+  PTI_INTERNAL_EVENT_TYPE_INFO = 0,
+  PTI_INTERNAL_EVENT_TYPE_WARNING = 1,   // one or a few records have data inconsistencies, or
+                                         // similar; collection is safe to continue
+  PTI_INTERNAL_EVENT_TYPE_CRITICAL = 2,  // critical error after which further collected data are invalid
+
+  PTI_INTERNAL_EVENT_TYPE_MAX = 0x7fffffff
+} pti_internal_event_type;
+
+typedef enum _pti_gpu_operation_kind {
+  PTI_GPU_OPERATION_KIND_INVALID = 0,
+  PTI_GPU_OPERATION_KIND_KERNEL = 1,
+  PTI_GPU_OPERATION_KIND_MEMORY = 2,
+  PTI_GPU_OPERATION_KIND_OTHER = 3,
+
+  PTI_OPERATION_KIND_MAX = 0x7fffffff
+} pti_gpu_operation_kind;
+
+typedef struct _pti_gpu_op_details {
+  pti_gpu_operation_kind _operation_kind;  // 
+#include <level_zero/ze_api.h>
+#include <sycl/sycl.hpp>
+
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <vector>
+
+#include "pti/pti_sync_callback.h"
+#include "samples_utils.h"
+
+#define NSEC_IN_SEC 1'000'000'000
+#define A_VALUE 0.128f
+#define B_VALUE 0.256f
+#define MAX_EPS 1.0e-4f
+
+constexpr auto kRequestedRecordCount = 5'000'000ULL;
+constexpr auto kRequestedBufferSize = kRequestedRecordCount * sizeof(pti_view_record_kernel);
+char kEnterString[] = "I have seen ENTER";
+
+ze_event_handle_t global_time_stamp_event = nullptr;
+ze_event_pool_handle_t event_pool = nullptr;
+void *buff = nullptr;
+
+bool PrepareDataForGlobalEventAppend(ze_command_list_handle_t command_list) {
+  static bool ready = false;
+  if (!ready) {
+    ze_context_handle_t hContext = nullptr;
+    ze_result_t status = zeCommandListGetContextHandle(command_list, &hContext);
+    if (status != ZE_RESULT_SUCCESS) {
+      std::cerr << "zeCommandListGetContextHandle failed with error code: " << status << '\n';
+      return false;
+    }
+    ze_device_handle_t hDevice = nullptr;
+    status = zeCommandListGetDeviceHandle(command_list, &hDevice);
+    if (status != ZE_RESULT_SUCCESS) {
+      std::cerr << "zeCommandListGetDeviceHandle failed with error code: " << status << '\n';
+      return false;
+    }
+
+    ze_event_pool_desc_t event_pool_desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
+                                            ZE_EVENT_POOL_FLAG_IPC |
+                                            ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP |
+                                            ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
+                                            10};
+
+    status = zeEventPoolCreate(hContext, &event_pool_desc, 1, &hDevice, &event_pool);
+    if (status != ZE_RESULT_SUCCESS) {
+      std::cerr << "zeEventPoolCreate failed with error code: " << status << '\n';
+      return false;
+    }
+    ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0,
+                                             0};
+    status = zeMemAllocDevice(hContext, &alloc_desc, 64, 64, hDevice, &buff);
+    if (status != ZE_RESULT_SUCCESS) {
+      std::cerr << "zeMemAllocDevice failed with error code: " << status << '\n';
+      return false;
+    }
+    ze_event_desc_t event_desc = {};
+    event_desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
+    event_desc.index = 0;
+    event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;  // Event is signaled on host
+    event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;    // Event is waited on host
+    status = zeEventCreate(event_pool, &event_desc, &global_time_stamp_event);
+    if (status != ZE_RESULT_SUCCESS) {
+      std::cerr << "zeEventCreate failed with error code: " << status << '\n';
+      return false;
+    }
+    ready = true;
+  }
+  return true;
+}
+
+void StartTracing() {
+  PTI_CHECK_SUCCESS(ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL));
+  PTI_CHECK_SUCCESS(ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL));
+  PTI_CHECK_SUCCESS(ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY));
+}
+
+void StopTracing() {
+  PTI_CHECK_SUCCESS(ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL));
+  PTI_CHECK_SUCCESS(ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL));
+  PTI_CHECK_SUCCESS(ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY));
+}
+
+void ProvideBuffer(unsigned char **buf, std::size_t *buf_size) {
+  *buf = samples_utils::AlignedAlloc<unsigned char>(kRequestedBufferSize);
+  if (!*buf) {
+    std::cerr << "Unable to allocate buffer for PTI tracing " << '\n';
+    std::abort();
+  }
+  *buf_size = kRequestedBufferSize;
+}
+
+void ParseBuffer(unsigned char *buf, std::size_t buf_size, std::size_t valid_buf_size) {
+  if (!buf || !valid_buf_size || !buf_size) {
+    std::cerr << "Received empty buffer" << '\n';
+    if (valid_buf_size) {
+      samples_utils::AlignedDealloc(buf);
+    }
+    return;
+  }
+  pti_view_record_base *ptr = nullptr;
+  while (true) {
+    auto buf_status = ptiViewGetNextRecord(buf, valid_buf_size, &ptr);
+    if (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
+      std::cout << "Reached End of buffer" << '\n';
+      break;
+    }
+    if (buf_status != pti_result::PTI_SUCCESS) {
+      std::cerr << "Found Error Parsing Records from PTI" << '\n';
+      break;
+    }
+    switch (ptr->_view_kind) {
+      case pti_view_kind::PTI_VIEW_INVALID: {
+        std::cout << "Found Invalid Record" << '\n';
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_COLLECTION_OVERHEAD: {
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        samples_utils::DumpRecord(reinterpret_cast<pti_view_record_overhead *>(ptr));
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        samples_utils::DumpRecord(reinterpret_cast<pti_view_record_external_correlation *>(ptr));
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_RUNTIME_API: {
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        std::cout << "Found Sycl Runtime Record" << '\n';
+        samples_utils::DumpRecord(reinterpret_cast<pti_view_record_api *>(ptr));
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_DRIVER_API: {
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        std::cout << "Found Driver Api Record" << '\n';
+        samples_utils::DumpRecord(reinterpret_cast<pti_view_record_api *>(ptr));
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        std::cout << "Found Memory Record" << '\n';
+        samples_utils::DumpRecord(reinterpret_cast<pti_view_record_memory_copy *>(ptr));
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        std::cout << "Found Memory Record" << '\n';
+        samples_utils::DumpRecord(reinterpret_cast<pti_view_record_memory_fill *>(ptr));
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        break;
+      }
+      case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
+        pti_view_record_kernel *rec = reinterpret_cast<pti_view_record_kernel *>(ptr);
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        std::cout << "Found Kernel Record" << '\n';
+        samples_utils::DumpRecord(rec);
+        std::cout << "---------------------------------------------------"
+                     "-----------------------------"
+                  << '\n';
+        if (samples_utils::isMonotonic({rec->_sycl_task_begin_timestamp,
+                                        rec->_sycl_enqk_begin_timestamp, rec->_append_timestamp,
+                                        rec->_submit_timestamp, rec->_start_timestamp,
+                                        rec->_end_timestamp})) {
+          std::cout << "------------> All Monotonic" << std::endl;
+        } else {
+          std::cerr << "------------> Something wrong: NOT All monotonic" << std::endl;
+        }
+        if (rec->_sycl_task_begin_timestamp == 0) {
+          std::cerr << "------------> Something wrong: Sycl Task "
+                       "Begin Time is 0"
+                    << std::endl;
+        }
+        if (rec->_sycl_enqk_begin_timestamp == 0) {
+          std::cerr << "------------> Something wrong: Sycl Enq "
+                       "Launch Kernel Time is 0"
+                    << std::endl;
+        }
+        break;
+      }
+      default: {
+        std::cerr << "This shouldn't happen" << '\n';
+        break;
+      }
+    }
+  }
+  samples_utils::AlignedDealloc(buf);
+}
+
+void CallbackGPUOperationAppend([[maybe_unused]] pti_callback_domain domain,
+                                pti_api_group_id driver_group_id, uint32_t driver_api_id,
+                                [[maybe_unused]] pti_backend_ctx_t backend_context, void *cb_data,
+                                void *user_data, void **instance_user_data) {
+  std::cout << "In " << __func__
+            << " Subscriber: "
+            << *(reinterpret_cast<pti_callback_subscriber_handle *>(user_data))
+            << std::endl;
+
+  samples_utils::DumpCallbackData(domain, driver_group_id, driver_api_id, backend_context, cb_data,
+                                  user_data, instance_user_data);
+
+  pti_callback_gpu_op_data *callback_data = static_cast<pti_callback_gpu_op_data *>(cb_data);
+  if (callback_data == nullptr) {
+    std::cerr << "CallbackGPUOperationAppend: callback_data is null" << std::endl;
+    return;
+  }
+
+  ze_bool_t is_immediate = 0;
+  if (callback_data->_cmd_list_handle != nullptr) {
+    ze_result_t res = zeCommandListIsImmediate(
+        static_cast<ze_command_list_handle_t>(callback_data->_cmd_list_handle), &is_immediate);
+    if (ZE_RESULT_SUCCESS == res) {
+      std::cout << "Command List is " << (is_immediate ? "Immediate" : "Regular") << std::endl;
+    } else {
+      std::cout << "zeCommandListIsImmediate failed with error code: " << res << std::endl;
+    }
+  }
+
+  pti_callback_gpu_op_data *gpu_op_data = static_cast<pti_callback_gpu_op_data *>(cb_data);
+
+  pti_gpu_op_details *op_details =
+      (gpu_op_data->_operation_details != nullptr)
+          ? static_cast<pti_gpu_op_details *>(gpu_op_data->_operation_details)
+          : nullptr;
+  bool is_op_kernel = (op_details != nullptr)
+                          ? (op_details->_operation_kind == PTI_GPU_OPERATION_KIND_KERNEL)
+                          : false;
+  uint32_t operation_count = (gpu_op_data != nullptr) ? gpu_op_data->_operation_count : 0;
+
+  if (operation_count != 1) {
+    std::cout << "WARNING: Operation count is not 1, it is: " << operation_count
+              << ". This is unexpected for now!" << std::endl;
+  }
+
+  if (callback_data->_phase == PTI_CB_PHASE_API_ENTER) {
+    *instance_user_data = static_cast<void *>(kEnterString);
+    std::cout << "Append started...";
+    if (is_op_kernel) {
+      std::cout << "Operation is Kernel\n" << std::endl;
+      std::cout << " Preparing data to append something from here" << std::endl;
+      bool res = PrepareDataForGlobalEventAppend(
+          static_cast<ze_command_list_handle_t>(callback_data->_queue_handle));
+      if (res) {
+        std::cout << "Prepared data for Append" << std::endl;
+        ze_result_t append_status = zeCommandListAppendWriteGlobalTimestamp(
+            static_cast<ze_command_list_handle_t>(callback_data->_queue_handle),
+            static_cast<uint64_t *>(buff), global_time_stamp_event, 0, nullptr);
+
+        if (append_status == ZE_RESULT_SUCCESS) {
+          std::cout << "Appended Write Global Timestamp to Command List" << std::endl;
+        } else {
+          std::cout << "zeCommandListAppendWriteGlobalTimestamp failed with error code: "
+                    << append_status << std::endl;
+        }
+      } else {
+        std::cout << "Failed to prepare data for Append" << std::endl;
+      }
+    } else {
+      std::cout << "Operation is not Kernel" << std::endl;
+    }
+  } else if (callback_data->_phase == PTI_CB_PHASE_API_EXIT) {
+    std::cout << "Append ended. Data from ENTER: "
+              << static_cast<const char *>(*instance_user_data) << std::endl;
+  } else {
+    std::cout << "Unexpected phase: " << callback_data->_phase << std::endl;
+  }
+}
+
+void CallbackGPUOperationCompletion([[maybe_unused]] pti_callback_domain domain,
+                                    pti_api_group_id driver_group_id, uint32_t driver_api_id,
+                                    [[maybe_unused]] pti_backend_ctx_t backend_context,
+                                    void *cb_data, void *user_data,
+                                    [[maybe_unused]] void **instance_user_data) {
+  std::cout << "In " << __func__
+            << " Subscriber: " << *(reinterpret_cast<pti_callback_subscriber_handle *>(user_data))
+            << std::endl;
+
+  samples_utils::DumpCallbackData(domain, driver_group_id, driver_api_id, backend_context, cb_data,
+                                  user_data, instance_user_data);
+
+  pti_callback_gpu_op_data *callback_data = static_cast<pti_callback_gpu_op_data *>(cb_data);
+  if (callback_data == nullptr) {
+    std::cerr << "CallbackGPUOperationCompletion: callback_data is null" << std::endl;
+    return;
+  }
+
+  if (global_time_stamp_event != nullptr) {
+    auto result = zeEventQueryStatus(global_time_stamp_event);
+    if (result == ZE_RESULT_SUCCESS) {
+      std::cout << "Appended Global Time Stamp Signaled. Resetting the event" << std::endl;
+      auto result2 = zeEventHostReset(global_time_stamp_event);
+      if (result2 != ZE_RESULT_SUCCESS) {
+        std::cout << "zeEventHostReset failed with error code: " << result2 << std::endl;
+      }
+    } else if (result == ZE_RESULT_NOT_READY) {
+      std::cout << "Appended Global Time Stamp NOT Ready " << std::endl;
+    } else {
+      std::cout << "zeEventQueryStatus failed with error code: " << result << std::endl;
+    }
+  }
+}
+
+void CallbackCommon(pti_callback_domain domain, pti_api_group_id driver_group_id,
+                    uint32_t driver_api_id, pti_backend_ctx_t backend_context, void *cb_data,
+                    void *user_data, void **instance_user_data) {
+  switch (domain) {
+    case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED:
+      CallbackGPUOperationAppend(domain, driver_group_id, driver_api_id, backend_context, cb_data,
+                                 user_data, instance_user_data);
+      break;
+    case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED:
+      CallbackGPUOperationCompletion(domain, driver_group_id, driver_api_id, backend_context,
+                                     cb_data, user_data, instance_user_data);
+      break;
+    default: {
+      std::cout << "In " << __func__ << " (default case)" << std::endl;
+      samples_utils::DumpCallbackData(domain, driver_group_id, driver_api_id, backend_context,
+                                      cb_data, user_data, instance_user_data);
+      break;
+    }
+  }
+  std::cout << std::endl;
+}
+
+static float Check(const std::vector<float> &a, float value) {
+  assert(value > MAX_EPS);
+
+  float eps = 0.0f;
+  for (size_t i = 0; i < a.size(); ++i) {
+    eps += std::fabs((a[i] - value) / value);
+  }
+
+  return eps / a.size();
+}
+
+void GEMM(const float *a, const float *b, float *c, unsigned size, sycl::id<2> id) {
+  int i = id.get(0);
+  int j = id.get(1);
+  float sum = 0.0f;
+  for (unsigned k = 0; k < size; ++k) {
+    sum += a[i * size + k] * b[k * size + j];
+  }
+  c[i * size + j] = sum;
+}
+
+static float RunAndCheck(sycl::queue queue, const std::vector<float> &a,
+                         const std::vector<float> &b, std::vector<float> &c, unsigned size,
+                         float expected_result) {
+  assert(size > 0);
+  assert(a.size() == size * size);
+  assert(b.size() == size * size);
+  assert(c.size() == size * size);
+
+  try {
+    sycl::buffer<float> a_buf(a.data(), a.size());
+    sycl::buffer<float> b_buf(b.data(), b.size());
+    sycl::buffer<float> c_buf(c.data(), c.size());
+
+    [[maybe_unused]] sycl::event event = queue.submit([&](sycl::handler &cgh) {
+      auto a_acc = a_buf.get_access<sycl::access::mode::read>(cgh);
+      auto b_acc = b_buf.get_access<sycl::access::mode::read>(cgh);
+      auto c_acc = c_buf.get_access<sycl::access::mode::write>(cgh);
+
+      cgh.parallel_for(sycl::range<2>(size, size),
+                       [=](sycl::id<2> id) {
+                         auto a_acc_ptr = a_acc.get_multi_ptr<sycl::access::decorated::no>();
+                         auto b_acc_ptr = b_acc.get_multi_ptr<sycl::access::decorated::no>();
+                         auto c_acc_ptr = c_acc.get_multi_ptr<sycl::access::decorated::no>();
+                         GEMM(a_acc_ptr.get(), b_acc_ptr.get(), c_acc_ptr.get(), size, id);
+                       });
+    });
+    queue.wait_and_throw();
+  } catch (const sycl::exception &e) {
+    std::cout << "[ERROR] " << e.what() << std::endl;
+    throw;
+  }
+
+  std::cout << "Matrix multiplication done. Checking result..." << std::endl;
+
+  return Check(c, expected_result);
+}
+
+static void Compute(sycl::queue queue, const std::vector<float> &a, const std::vector<float> &b,
+                    std::vector<float> &c, unsigned size, unsigned repeat_count,
+                    float expected_result) {
+  for (unsigned i = 0; i < repeat_count; ++i) {
+    float eps = RunAndCheck(queue, a, b, c, size, expected_result);
+    std::cout << "Results are " << ((eps < MAX_EPS) ? "" : "IN") << "CORRECT with accuracy: " << eps
+              << std::endl;
+  }
+}
+
+const unsigned max_size = 8192;
+const unsigned min_size = 32;
+
+void Usage(const char *name) {
+  std::cout << " Calculating floating point matrix multiplication on GPU\n";
+  std::cout << name
+            << " [ [gpu|cpu|host, default=gpu], [matrix size, default=1024, max=" << max_size
+            << "], [repetition count, default=4]] \n";
+}
+
+int main(int argc, char *argv[]) {
+  int exit_code = EXIT_SUCCESS;
+  PTI_CHECK_SUCCESS(ptiViewSetCallbacks(ProvideBuffer, ParseBuffer));
+  StartTracing();
+  pti_callback_subscriber_handle subscriber1 = PTI_CALLBACK_SUBSCRIBER_HANDLE_INVALID;
+  pti_callback_subscriber_handle subscriber2 = PTI_CALLBACK_SUBSCRIBER_HANDLE_INVALID;
+  PTI_CHECK_SUCCESS(ptiCallbackSubscribe(&subscriber1, CallbackCommon, &subscriber1));
+  std::cout << "Initialized Subscriber: " << subscriber1 << std::endl << std::flush;
+  PTI_CHECK_SUCCESS(ptiCallbackSubscribe(&subscriber2, CallbackCommon, &subscriber2));
+  std::cout << "Initialized Subscriber: " << subscriber2 << std::endl << std::flush;
+  PTI_CHECK_SUCCESS(
+      ptiCallbackEnableDomain(subscriber1, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1));
+  PTI_CHECK_SUCCESS(
+      ptiCallbackEnableDomain(subscriber1, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1, 1));
+  PTI_CHECK_SUCCESS(
+      ptiCallbackEnableDomain(subscriber2, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1, 1));
+
+  unsigned repeat_count = 1;
+  unsigned size = 1024;
+  sycl::device dev;
+  try {
+    dev = sycl::device(sycl::gpu_selector_v);
+    if (argc > 1 && strcmp(argv[1], "cpu") == 0) {
+      dev = sycl::device(sycl::cpu_selector_v);
+      std::cerr << "PTI doesn't support cpu profiling yet" << '\n';
+      std::exit(EXIT_FAILURE);
+    } else if (argc > 1 && strcmp(argv[1], "host") == 0) {
+      dev = sycl::device(sycl::default_selector_v);
+      std::cerr << "PTI doesn't support host profiling yet" << '\n';
+      std::exit(EXIT_FAILURE);
+    }
+
+    unsigned temp = size;
+    if (argc > 2) {
+      temp = std::stoul(argv[2]);
+      size = (temp < min_size) ? min_size : (temp > max_size) ? max_size : temp;
+    }
+
+    if (argc > 3) {
+      temp = std::stoul(argv[3]);
+      repeat_count = (temp < 1) ? 1 : temp;
+    }
+  } catch (const sycl::exception &e) {
+    Usage(argv[0]);
+    std::cerr << "Error: Exception caught while executing SYCL " << e.what() << '\n';
+    std::cerr << "Unable to select valid sycl device" << '\n';
+    return EXIT_FAILURE;
+  } catch (...) {
+    Usage(argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  sycl::property_list prop_list{sycl::property::queue::in_order()};
+  sycl::queue queue(dev, sycl::async_handler{}, prop_list);  // Main runandcheck kernel
+
+  std::cout << "DPC++ Matrix Multiplication (matrix size: " << size << " x " << size << ", repeats "
+            << repeat_count << " times)" << std::endl;
+  std::cout << "Target device: "
+            << queue.get_info<sycl::info::queue::device>().get_info<sycl::info::device::name>()
+            << std::endl;
+
+  std::vector<float> a(size * size, A_VALUE);
+  std::vector<float> b(size * size, B_VALUE);
+  std::vector<float> c(size * size, 0.0f);
+
+  try {
+    auto start = std::chrono::steady_clock::now();
+    float expected_result = A_VALUE * B_VALUE * size;
+    Compute(queue, a, b, c, size, repeat_count, expected_result);
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<float> time = end - start;
+    std::cout << "Total execution time: " << time.count() << " sec" << std::endl;
+
+  } catch (const sycl::exception &e) {
+    std::cerr << "Error: Exception while executing SYCL " << e.what() << '\n';
+    std::cerr << "\tError code: " << e.code().value() << "\n\tCategory: " << e.category().name()
+              << "\n\tMessage: " << e.code().message() << '\n';
+    exit_code = EXIT_FAILURE;
+  } catch (const std::exception &e) {
+    std::cerr << "Error: Exception caught " << e.what() << '\n';
+    exit_code = EXIT_FAILURE;
+  } catch (...) {
+    std::cerr << "Error: Unknown exception caught." << '\n';
+    exit_code = EXIT_FAILURE;
+  }
+  PTI_CHECK_SUCCESS(ptiCallbackUnsubscribe(subscriber1));
+  PTI_CHECK_SUCCESS(ptiCallbackUnsubscribe(subscriber2));
+  StopTracing();
+  PTI_CHECK_SUCCESS(ptiFlushAllViews());
+
+  return exit_code;
+}
diff --git a/sdk/samples/samples_utilities/samples_utils.h b/sdk/samples/samples_utilities/samples_utils.h
index e62a1f9..114adf4 100644
--- a/sdk/samples/samples_utilities/samples_utils.h
+++ b/sdk/samples/samples_utilities/samples_utils.h
@@ -10,6 +10,7 @@
 #include
 #include

+#include "pti/pti_sync_callback.h"
 #include "pti/pti_view.h"

 namespace samples_utils {
@@ -329,5 +330,104 @@ inline pti_backend_queue_t GetLevelZeroBackendQueue(sycl::queue& queue) {
   return backend_queue;
 }

+inline std::string GetCommandListTypeString(pti_backend_command_list_type cmd_list_type) {
+  std::string result;
+  if (cmd_list_type & PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN) {
+    result += " | Unknown ";
+  }
+  if (cmd_list_type & PTI_BACKEND_COMMAND_LIST_TYPE_IMMEDIATE) {
+    result += " | Immediate";
+  }
+  if (cmd_list_type & PTI_BACKEND_COMMAND_LIST_TYPE_MUTABLE) {
+    result += " | Mutable";
+  }
+
+  if (!result.empty()) {
+    result = result.substr(3);  // remove leading " | "
+  } else {
+    result = "INVALID_VALUE";
+  }
+  return result;
+}
+
+inline void DumpCallbackData(pti_callback_domain domain, pti_api_group_id driver_api_group_id,
+                             uint32_t driver_api_id, pti_backend_ctx_t backend_context,
+                             void* cb_data, void* global_user_data, void** instance_user_data) {
+  std::cout << "=== Callback Data Dump ===" << std::endl;
+  std::cout << "Domain: " << ptiCallbackDomainTypeToString(domain) << " (" << domain << ")"
+            << std::endl;
+  std::cout << "Backend Context: " << backend_context << std::endl;
+
+  const char* api_name = nullptr;
+  if (PTI_SUCCESS == ptiViewGetApiIdName(driver_api_group_id, driver_api_id, &api_name)) {
+    std::cout << "Driver API Group ID/API ID/Name: " << driver_api_group_id << "/" << driver_api_id
+              << "/" << api_name << std::endl;
+  } else {
+    std::cout << "Driver API Group ID/API ID/Name: " << driver_api_group_id << "/" << driver_api_id
+              << "/Unknown" << std::endl;
+  }
+  if (cb_data != nullptr) {
+    switch (domain) {
+      case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED:
+      case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED:
+      case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED: {
+        pti_callback_gpu_op_data* gpu_op_data = static_cast<pti_callback_gpu_op_data*>(cb_data);
+        std::cout << "GPU Operation Data:" << std::endl;
+        std::cout << "  Command List Type: "
+                  << GetCommandListTypeString(gpu_op_data->_cmd_list_properties) << std::endl;
+        std::cout << "  Cmd List Handle: " << gpu_op_data->_cmd_list_handle << std::endl;
+        std::cout << "  Queue Handle: " << gpu_op_data->_queue_handle << std::endl;
+        std::cout << "  Device Handle: " << gpu_op_data->_device_handle << std::endl;
+        std::cout << "  Phase: " << ptiCallbackPhaseTypeToString(gpu_op_data->_phase) << " ("
+                  << gpu_op_data->_phase << ")" << std::endl;
+        std::cout << "  Return Code: " << gpu_op_data->_return_code << std::endl;
+        std::cout << "  Correlation ID: " << gpu_op_data->_correlation_id << std::endl;
+        std::cout << "  Operation Count: " << gpu_op_data->_operation_count << std::endl;
+
+        if (gpu_op_data->_operation_details != nullptr) {
+          pti_gpu_op_details* op_details =
+              static_cast<pti_gpu_op_details*>(gpu_op_data->_operation_details);
+          std::cout << "  Operation Details:" << std::endl;
+          std::cout << "    Operation Kind: " << op_details->_operation_kind << std::endl;
+          std::cout << "    Operation ID: " << op_details->_operation_id << std::endl;
+          std::cout << "    Kernel Handle: " << op_details->_kernel_handle << std::endl;
+          if (op_details->_name != nullptr) {
+            std::cout << "    Name: " << op_details->_name << std::endl;
+          }
+        }
+        break;
+      }
+      case PTI_CB_DOMAIN_INTERNAL_THREADS:
+      case PTI_CB_DOMAIN_INTERNAL_EVENT: {
+        pti_internal_callback_data* internal_data =
+            static_cast<pti_internal_callback_data*>(cb_data);
+        std::cout << "Internal Callback Data:" << std::endl;
+        std::cout << "  Phase: " << ptiCallbackPhaseTypeToString(internal_data->_phase) << " ("
+                  << internal_data->_phase << ")" << std::endl;
+        std::cout << "  Detail: " << internal_data->_detail << std::endl;
+        if (internal_data->_message != nullptr) {
+          std::cout << "  Message: " << internal_data->_message << std::endl;
+        }
+        break;
+      }
+      default:
+        std::cout << "Unknown domain type for callback data" << std::endl;
+        break;
+    }
+  } else {
+    std::cout << "Callback data is null" << std::endl;
+  }
+
+  if (global_user_data != nullptr) {
+    std::cout << "Global User Data: " << global_user_data << std::endl;
+  }
+
+  if (instance_user_data != nullptr && *instance_user_data != nullptr) {
+    std::cout << "Instance User Data: " << *instance_user_data << std::endl;
+  }
+
+  std::cout << "=========================" << std::endl;
+}
+
 }  // namespace samples_utils

 #endif
diff --git a/sdk/src/gen_tracing_callbacks.py b/sdk/src/gen_tracing_callbacks.py
index bc119c0..b670a9c 100644
--- a/sdk/src/gen_tracing_callbacks.py
+++ b/sdk/src/gen_tracing_callbacks.py
@@ -575,7 +575,7 @@ def gen_api(
         f.write("    }\n")
         f.write("  }\n")

-    f.write("  else if (options_.kernel_tracing) {\n")
+    f.write("  else if (options_.kernel_tracing || IsAnyCallbackSubscriberActive() ) {\n")
     for func in kfunc_list:
         # if func not in exclude_from_prologue_list:
         #  f.write(
@@ -692,13 +692,14 @@ def gen_enter_callback(f, func, synchronize_func_list_on_enter, hybrid_mode_func
     if func not in hybrid_mode_func_list:
         f.write("  if (collector->options_.hybrid_mode) return;\n")

+    f.write("  ze_instance_data.callback_id_ = " + func + "_id;\n")
     if func in synchronize_func_list_on_enter:
         f.write("  std::vector<uint64_t> kids;\n")
         f.write("\n")

     cb = get_kernel_tracing_callback("OnEnter" + func[2:])
     if cb != "":
-        f.write("  if (collector->options_.kernel_tracing) { \n")
+        f.write("  if (collector->options_.kernel_tracing || collector->IsAnyCallbackSubscriberActive() ) { \n")
         if func in synchronize_func_list_on_enter:
             f.write(
                 "    " + cb + "(params, global_data, instance_user_data, &kids); \n"
@@ -746,6 +747,7 @@ def gen_exit_callback(
     f.write("  [[maybe_unused]] uint64_t end_time_host = 0;\n")
     f.write("  end_time_host = utils::GetTime();\n")
     f.write("  ze_instance_data.end_time_host = end_time_host;\n")
+    f.write("  ze_instance_data.callback_id_ = " + func + "_id;\n")

     cb = get_kernel_tracing_callback("OnExit" + func[2:])

@@ -767,7 +769,7 @@
         f.write("  uint64_t synch_corrid = UniCorrId::GetUniCorrId();\n")

     if cb != "":
-        f.write("  if (collector->options_.kernel_tracing) { \n")
+        f.write("  if (collector->options_.kernel_tracing || collector->IsAnyCallbackSubscriberActive() ) { \n")
         if (func in submission_func_list) or (func in synchronize_func_list_on_exit):
             if func in synchronization_viewkind_api_list:
                 f.write(
diff --git a/sdk/src/levelzero/ze_collector.h b/sdk/src/levelzero/ze_collector.h
index 881b886..33330bc 100644
--- a/sdk/src/levelzero/ze_collector.h
+++ b/sdk/src/levelzero/ze_collector.h
@@ -41,6 +41,7 @@
 #include "pti_api_ids_state_maps.h"
 #include "unikernel.h"
 #include "utils.h"
+#include "ze_collector_cb_helpers.h"
 #include "ze_driver_init.h"
 #include "ze_event_cache.h"
 #include "ze_local_collection_helpers.h"
@@ -78,8 +79,6 @@ struct ZeKernelGroupSize {
   uint32_t z;
 };

-enum class KernelCommandType { kInvalid = 0, kKernel = 1, kMemory = 2, kCommand = 3 };
-
 struct ZeKernelCommandProps {
   std::string name;
   KernelCommandType type = KernelCommandType::kInvalid;
@@ -434,12 +433,184 @@ class ZeCollector {
 #endif
   }

+  const CollectorOptions& GetCollectorOptions() const { return options_; }
+  bool IsTracingOn() const { return startstop_mode_changer.IsTracingOn(); }
+  void SetKernelTracing(bool enable) { options_.kernel_tracing = enable; }
+
   void SetCollectorOptionSynchronization() { options_.lz_enabled_views.synch_enabled = true; }
   void SetCollectorOptionApiCalls() { options_.lz_enabled_views.api_calls_enabled = true; }

   void UnSetCollectorOptionSynchronization() { options_.lz_enabled_views.synch_enabled = false; }
   void UnSetCollectorOptionApiCalls() { options_.lz_enabled_views.api_calls_enabled = false; }

+  // Multiple subscriber support with ID-based management
+  uint32_t AddCallbackSubscriber(pti_callback_function callback, void* user_data) {
+    std::unique_lock lock(subscribers_mutex_);
+    auto subscriber = std::make_unique<ZeCollectorCallbackSubscriber>();
+    subscriber->SetUserData(user_data);
+    subscriber->SetCallback(callback);
+    uint32_t id = subscriber->GetId();
+    callback_subscribers_[id] = std::move(subscriber);
+    return id;
+  }
+
+  pti_result RemoveCallbackSubscriber(uint32_t subscriber_id) {
+    std::unique_lock lock(subscribers_mutex_);
+    auto it = callback_subscribers_.find(subscriber_id);
+    if (it == callback_subscribers_.end()) {
+      return pti_result::PTI_ERROR_BAD_ARGUMENT;
+    }
+    it->second->Clean();
+    callback_subscribers_.erase(it);
+    return pti_result::PTI_SUCCESS;
+  }
+
+  ZeCollectorCallbackSubscriber* GetCallbackSubscriber(uint32_t subscriber_id) {
+    std::shared_lock lock(subscribers_mutex_);
+    auto it = callback_subscribers_.find(subscriber_id);
+    return (it != callback_subscribers_.end()) ? it->second.get() : nullptr;
+  }
+
+  std::vector<uint32_t> GetAllSubscriberIds() const {
+    std::shared_lock lock(subscribers_mutex_);
+    std::vector<uint32_t> ids;
+    ids.reserve(callback_subscribers_.size());
+    for (const auto& [id, subscriber] : callback_subscribers_) {
+      ids.push_back(id);
+    }
+    return ids;
+  }
+
+  pti_result EnableCallbackDomain(uint32_t subscriber_id, pti_callback_domain domain,
+                                  uint32_t enter_cb, uint32_t exit_cb) {
+    auto* subscriber = GetCallbackSubscriber(subscriber_id);
+    if (!subscriber) {
+      return pti_result::PTI_ERROR_BAD_ARGUMENT;
+    }
+    return subscriber->EnableDomain(domain, enter_cb, exit_cb);
+  }
+
+  pti_result DisableCallbackDomain(uint32_t subscriber_id, pti_callback_domain domain) {
+    auto* subscriber = GetCallbackSubscriber(subscriber_id);
+    if (!subscriber) {
+      return pti_result::PTI_ERROR_BAD_ARGUMENT;
+    }
+    return subscriber->DisableDomain(domain);
+  }
+
+  pti_result DisableAllCallbackDomains(uint32_t subscriber_id) {
+    auto* subscriber = GetCallbackSubscriber(subscriber_id);
+    if (!subscriber) {
+      return pti_result::PTI_ERROR_BAD_ARGUMENT;
+    }
+    return subscriber->DisableAllDomains();
+  }
+
+  bool IsCallbackSubscriberActive(uint32_t subscriber_id) const {
+    std::shared_lock lock(subscribers_mutex_);
+    auto it = callback_subscribers_.find(subscriber_id);
+    return (it != callback_subscribers_.end()) ? it->second->IsActive() : false;
+  }
+
+  bool IsCallbackDomainEnabled(uint32_t subscriber_id, pti_callback_domain domain,
+                               uint32_t cb_type) const {
+    std::shared_lock lock(subscribers_mutex_);
+    auto it = callback_subscribers_.find(subscriber_id);
+    return (it != callback_subscribers_.end()) ? it->second->IsEnabled(domain, cb_type) : false;
+  }
+
+  // Check if any subscriber has the given domain enabled
+  bool IsCallbackDomainEnabled(pti_callback_domain domain, uint32_t cb_type) const {
+    std::shared_lock lock(subscribers_mutex_);
+    for (const auto& [id, subscriber] : callback_subscribers_) {
+      if (subscriber->IsEnabled(domain, cb_type)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool IsAnyCallbackSubscriberActive() {
+    std::shared_lock lock(subscribers_mutex_);
+    for (const auto& kv : callback_subscribers_) {
+      if (kv.second && kv.second->IsActive()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  pti_gpu_op_details MakeGPUOpDetails(const ZeKernelCommand& command) {
+    return pti_gpu_op_details{
+        ._operation_kind = ZeCollectorCallbackSubscriber::GetGPUOperationKind(command.props.type),
+        ._operation_id = command.kernel_id,
+        ._kernel_handle = INVALID_KERNEL_HANDLE,  // temp, until modules & kernels in them supported
+        ._name = command.props.name.c_str()};
+  }
+
+  pti_callback_gpu_op_data MakeGPUOpData(const ZeKernelCommand& command, pti_callback_phase phase,
+                                         ze_result_t return_code, pti_gpu_op_details* op_details) {
+    pti_backend_command_list_type cmd_list_props = IsCommandListImmediate(command.command_list)
+                                                       ? PTI_BACKEND_COMMAND_LIST_TYPE_IMMEDIATE
+                                                       : PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN;
+    pti_backend_queue_t queue_handle =
+        (IsCommandListImmediate(command.command_list)) ? command.command_list : nullptr;
+    return pti_callback_gpu_op_data{._domain = PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED,
+                                    ._cmd_list_properties = cmd_list_props,
+                                    ._cmd_list_handle = command.command_list,
+                                    ._queue_handle = queue_handle,
+                                    ._device_handle = command.device,
+                                    ._phase = phase,
+                                    ._return_code = return_code,
+                                    ._correlation_id = command.corr_id_,
+                                    ._operation_count = 1,
+                                    ._operation_details = op_details};
+  }
+
+  void DoCallbackOnGPUOperationCompletion(std::vector<ZeKernelCommandExecutionRecord> kcexecrec) {
+    SPDLOG_TRACE("On {}", __func__);
+    if (IsCallbackDomainEnabled(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1) &&
+        kcexecrec.size() > 0) {
+      // TODO: Optimize - create callback data outside of the loop and synchronization
+
+      // Call the callback with the collected records for all active subscribers
+      std::shared_lock lock(subscribers_mutex_);
+      for (const auto& record : kcexecrec) {
+        for (const auto& [id, subscriber] : callback_subscribers_) {
+          if (subscriber->IsEnabled(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1)) {
+            subscriber->InvokeCallbackGPUOperationCompletion(record, ze_instance_data.callback_id_);
+          }
+        }
+      }
+    }
+  }
+
+  void DoCallbackOnGPUOperationAppended(const ZeKernelCommand* command, pti_callback_phase phase,
+                                        ze_result_t return_code) {
+    SPDLOG_TRACE("On {}", __func__);
+    if (IsCallbackDomainEnabled(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, phase)) {
+      pti_gpu_op_details op_details = MakeGPUOpDetails(*command);
+      pti_callback_gpu_op_data callback_data =
+          MakeGPUOpData(*command, phase, return_code, &op_details);
+
+      // Invoke callbacks for all subscribers with this domain enabled
+      std::shared_lock lock(subscribers_mutex_);
+      // TODO: Make correct order for different phases:
+      // ENTER - forward, EXIT -> backward
+      for (const auto& [id, subscriber] : callback_subscribers_) {
+        if (subscriber->IsEnabled(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, phase) &&
+            subscriber->GetCallback()) {
+          subscriber->GetCallback()(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED,
+                                    PTI_API_GROUP_LEVELZERO, ze_instance_data.callback_id_,
+                                    command->context, &callback_data, subscriber->GetUserData(),
+                                    subscriber->GetPtrForInstanceUserData());
+        }
+      }
+    }
+  }
+
  private:  // Implementation
   ZeCollector(CollectorOptions options, OnZeKernelFinishCallback acallback,
               OnZeApiCallsFinishCallback fcallback, void* callback_data,
@@ -905,7 +1076,7 @@ class ZeCollector {
   //
   // - All times reported by PTI_VIEW in CPU (aka Host) timescale
   // - However GPU "commands" (kernel & memory transfers) start/end reported in GPU timescale
-  // - There is significant time drift between CPU and GPU, so to cope wth it - need to
+  // - There is significant time drift between CPU and GPU, so to cope with it - need to
   //   "sync" often calling zeDeviceGetGlobalTimestamps,
   //   where command->submit_time_device_  comes with GPU time
   //   command->submit_time  comes with CPU time
@@ -1004,6 +1175,7 @@ class ZeCollector {
       rec.implicit_scaling_ = false;
     }

+    rec.command_type_ = command->props.type;
     if (command->props.type == KernelCommandType::kMemory) {
       rec.device_ = command->props.src_device;
       rec.dst_device_ = command->props.dst_device;
@@ -1387,47 +1559,50 @@ class ZeCollector {
     SPDLOG_TRACE("In {} event: {}", __FUNCTION__, (void*)*(params->phEvent));
     ze_result_t status;
     ZeCollector* collector = static_cast<ZeCollector*>(global_data);
-    const std::lock_guard lock(collector->lock_);
-    if (result == ZE_RESULT_SUCCESS) {
-      std::vector<ZeKernelCommandExecutionRecord> kcexec;
-      collector->ProcessCallEvent(*(params->phEvent), kids, &kcexec);
-      if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) {
-        collector->acallback_(collector->callback_data_, kcexec);
+    std::vector<ZeKernelCommandExecutionRecord> kcexec;
+    {
+      const std::lock_guard lock(collector->lock_);
+      if (result == ZE_RESULT_SUCCESS) {
+        collector->ProcessCallEvent(*(params->phEvent), kids, &kcexec);
+        if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) {
+          collector->acallback_(collector->callback_data_, kcexec);
+        }
       }
-    }
-    // Process generation of synch record even if result is not successful.
-    if (collector->cb_enabled_.acallback && collector->options_.lz_enabled_views.synch_enabled &&
-        collector->acallback_ != nullptr) {
-      std::vector<ZeKernelCommandExecutionRecord> kcexec;
-      ZeKernelCommandExecutionRecord rec = {};
-      ze_event_handle_t event_h = *params->phEvent;
-      ze_event_pool_handle_t epool_h = nullptr;
-      ze_context_handle_t ctxt_h = nullptr;
-      rec.context_ = nullptr;
-      if (collector->IsIntrospectionCapable()) {
-        status = collector->l0_wrapper_.w_zeEventGetEventPool(event_h, &epool_h);
-        if (status == ZE_RESULT_SUCCESS) {
-          status = collector->l0_wrapper_.w_zeEventPoolGetContextHandle(epool_h, &ctxt_h);
+      // Process generation of synch record even if result is not successful.
+      if (collector->cb_enabled_.acallback && collector->options_.lz_enabled_views.synch_enabled &&
+          collector->acallback_ != nullptr) {
+        std::vector<ZeKernelCommandExecutionRecord> kcexec1;
+        ZeKernelCommandExecutionRecord rec = {};
+        ze_event_handle_t event_h = *params->phEvent;
+        ze_event_pool_handle_t epool_h = nullptr;
+        ze_context_handle_t ctxt_h = nullptr;
+        rec.context_ = nullptr;
+        if (collector->IsIntrospectionCapable()) {
+          status = collector->l0_wrapper_.w_zeEventGetEventPool(event_h, &epool_h);
           if (status == ZE_RESULT_SUCCESS) {
-            rec.context_ = ctxt_h;
-          } else {
-            SPDLOG_WARN(
-                "\tLevel-Zero Introspection API: zeEventPoolGetContextHandle return unsuccessful "
-                "-- inserting null context handle in synch. record..");
+            status = collector->l0_wrapper_.w_zeEventPoolGetContextHandle(epool_h, &ctxt_h);
+            if (status == ZE_RESULT_SUCCESS) {
+              rec.context_ = ctxt_h;
+            } else {
+              SPDLOG_WARN(
+                  "\tLevel-Zero Introspection API: zeEventPoolGetContextHandle return unsuccessful "
+                  "-- inserting null context handle in synch. record..");
record.."); + } } } + rec.name_ = "zeEventHostSynchronize"; + rec.tid_ = thread_local_pid_tid_info.tid; + rec.start_time_ = ze_instance_data.start_time_host; + rec.end_time_ = utils::GetTime(); + rec.event_ = event_h; + rec.cid_ = synch_corrid; + rec.result_ = result; + rec.callback_id_ = zeEventHostSynchronize_id; + kcexec1.push_back(std::move(rec)); + collector->acallback_(collector->callback_data_, kcexec1); } - rec.name_ = "zeEventHostSynchronize"; - rec.tid_ = thread_local_pid_tid_info.tid; - rec.start_time_ = ze_instance_data.start_time_host; - rec.end_time_ = utils::GetTime(); - rec.event_ = event_h; - rec.cid_ = synch_corrid; - rec.result_ = result; - rec.callback_id_ = zeEventHostSynchronize_id; - kcexec.push_back(std::move(rec)); - collector->acallback_(collector->callback_data_, kcexec); } + collector->DoCallbackOnGPUOperationCompletion(kcexec); } static void OnExitCommandListHostSynchronize( @@ -1436,18 +1611,21 @@ class ZeCollector { [[maybe_unused]] uint64_t synch_corrid) { SPDLOG_TRACE("In {}", __FUNCTION__); ZeCollector* collector = static_cast(global_data); - const std::lock_guard lock(collector->lock_); - if (result == ZE_RESULT_SUCCESS) { - std::vector kcexec; - collector->ProcessCalls(kids, &kcexec); - if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) { - collector->acallback_(collector->callback_data_, kcexec); + std::vector kcexec; + { + const std::lock_guard lock(collector->lock_); + if (result == ZE_RESULT_SUCCESS) { + collector->ProcessCalls(kids, &kcexec); + if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) { + collector->acallback_(collector->callback_data_, kcexec); + } } } + // Process generation of synch record even if result is not successful. if (collector->cb_enabled_.acallback && collector->options_.lz_enabled_views.synch_enabled && collector->acallback_ != nullptr) { - std::vector kcexec; + std::vector kcexec1; ze_result_t status; ZeKernelCommandExecutionRecord rec = {}; ze_command_list_handle_t clist_h = *params->phCommandList; @@ -1472,9 +1650,11 @@ class ZeCollector { rec.cid_ = synch_corrid; rec.result_ = result; rec.callback_id_ = zeCommandListHostSynchronize_id; - kcexec.push_back(std::move(rec)); - collector->acallback_(collector->callback_data_, kcexec); + kcexec1.push_back(std::move(rec)); + collector->acallback_(collector->callback_data_, kcexec1); } + + collector->DoCallbackOnGPUOperationCompletion(kcexec); } static void OnExitEventQueryStatus([[maybe_unused]] ze_event_query_status_params_t* params, @@ -1505,10 +1685,10 @@ class ZeCollector { uint64_t synch_corrid) { SPDLOG_TRACE("In {}, result {} ", __FUNCTION__, static_cast(result)); ZeCollector* collector = static_cast(global_data); - const std::lock_guard lock(collector->lock_); + std::vector kcexec; if (result == ZE_RESULT_SUCCESS) { + const std::lock_guard lock(collector->lock_); PTI_ASSERT(*(params->phFence) != nullptr); - std::vector kcexec; collector->ProcessCallFence(*(params->phFence), kids, &kcexec); if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) { @@ -1518,7 +1698,7 @@ class ZeCollector { // Process generation of synch record even if result is not successful. 
     if (collector->cb_enabled_.acallback && collector->options_.lz_enabled_views.synch_enabled &&
         collector->acallback_ != nullptr) {
-      std::vector<ZeKernelCommandExecutionRecord> kcexec;
+      std::vector<ZeKernelCommandExecutionRecord> kcexec1;
       ZeKernelCommandExecutionRecord rec = {};
       ze_fence_handle_t fence_h = *params->phFence;
       rec.context_ = nullptr;
@@ -1539,9 +1719,10 @@
       rec.cid_ = synch_corrid;
       rec.result_ = result;
       rec.callback_id_ = zeFenceHostSynchronize_id;
-      kcexec.push_back(std::move(rec));
-      collector->acallback_(collector->callback_data_, kcexec);
+      kcexec1.push_back(std::move(rec));
+      collector->acallback_(collector->callback_data_, kcexec1);
     }
+    collector->DoCallbackOnGPUOperationCompletion(kcexec);
   }

   static void OnExitImageCreate(ze_image_create_params_t* params, ze_result_t result,
@@ -1624,6 +1805,32 @@ class ZeCollector {
     command->command_list = command_list;
     command->device = device;
     command->context = context;
+    // Need kernel_id on Enter as it might be needed for Callback API
+    command->kernel_id = UniKernelId::GetKernelId();
+    if (command->props.type == KernelCommandType::kKernel) {
+      if (sycl_data_kview.cid_) {
+        command->corr_id_ = sycl_data_kview.cid_;
+      } else {
+        command->corr_id_ = UniCorrId::GetUniCorrId();
+#if defined(PTI_TRACE_SYCL)
+        if (SyclCollector::Instance().Enabled()) {
+          sycl_data_kview.cid_ = command->corr_id_;
+        }
+#endif
+      }
+    } else if (command->props.type == KernelCommandType::kMemory) {
+      if (sycl_data_mview.cid_) {
+        command->corr_id_ = sycl_data_mview.cid_;
+      } else {
+        command->corr_id_ = UniCorrId::GetUniCorrId();
+#if defined(PTI_TRACE_SYCL)
+        if (SyclCollector::Instance().Enabled()) {
+          sycl_data_mview.cid_ = command->corr_id_;
+        }
+#endif
+      }
+    }
+
     SPDLOG_TRACE("\tcontext: {}, device: {}", (void*)context, (void*)device);

     command->event_swap = nullptr;
@@ -1674,6 +1881,9 @@
       }
     }

+    // Subscriber callback
+    collector->DoCallbackOnGPUOperationAppended(command, PTI_CB_PHASE_API_ENTER, ZE_RESULT_SUCCESS);
+
     uint64_t host_timestamp = 0;
     uint64_t device_timestamp = 0;  // in ticks

@@ -1685,22 +1895,20 @@
   }

   void PostAppendKernelCommandCommon(ZeCollector* /*collector*/, ZeKernelCommand* command,
-                                     ZeKernelCommandProps& props, ze_event_handle_t& signal_event,
+                                     ze_event_handle_t& signal_event,
                                      ZeCommandListInfo& command_list_info,
                                      std::vector<uint64_t>* kids) {
     SPDLOG_TRACE("In {}, command: {}, kernel name {}", __FUNCTION__,
-                 static_cast<void*>(command), props.name.c_str());
+                 static_cast<void*>(command), command->props.name.c_str());
     if (ZeCollectionState::Abnormal == collection_state_) {
       return;
     }
     PTI_ASSERT(command != nullptr);
-    command->props = props;
     PTI_ASSERT(signal_event != nullptr);

     command->tid = thread_local_pid_tid_info.tid;
     uint64_t host_timestamp = ze_instance_data.start_time_host;
     command->append_time = host_timestamp;
-    command->kernel_id = UniKernelId::GetKernelId();
     command->device_timer_frequency_ = device_descriptors_[command->device].device_timer_frequency;
     command->device_timer_mask_ = device_descriptors_[command->device].device_timer_mask;
     if (command->props.type == KernelCommandType::kKernel) {
@@ -1713,33 +1921,9 @@
       sycl_data_kview.tid_ = command->tid;
       command->source_file_name_ = sycl_data_kview.source_file_name_;
       command->source_line_number_ = sycl_data_kview.source_line_number_;
-      if (sycl_data_kview.cid_) {
-        command->corr_id_ = sycl_data_kview.cid_;
-      } else {
-        command->corr_id_ = UniCorrId::GetUniCorrId();
-#if defined(PTI_TRACE_SYCL)
-        if (SyclCollector::Instance().Enabled()) {
-          sycl_data_kview.cid_ = command->corr_id_;
-        }
-#endif
-      }
     } else if (command->props.type == KernelCommandType::kMemory) {
-      command->props.src_device = props.src_device;
-      command->props.dst_device = props.dst_device;
-
       sycl_data_mview.kid_ = command->kernel_id;
       sycl_data_mview.tid_ = command->tid;
-      if (sycl_data_mview.cid_) {
-        command->corr_id_ = sycl_data_mview.cid_;
-      } else {
-        command->corr_id_ = UniCorrId::GetUniCorrId();
-#if defined(PTI_TRACE_SYCL)
-        if (SyclCollector::Instance().Enabled()) {
-          sycl_data_mview.cid_ = command->corr_id_;
-        }
-#endif
-      }
-
       command->sycl_node_id_ = sycl_data_mview.sycl_node_id_;
       command->sycl_queue_id_ = sycl_data_mview.sycl_queue_id_;
       command->sycl_invocation_id_ = sycl_data_mview.sycl_invocation_id_;
@@ -1808,18 +1992,18 @@

   void PostAppendKernel(ZeCollector* collector, ze_kernel_handle_t kernel,
                         const ze_group_count_t* group_count, ze_event_handle_t& signal_event,
-                        ze_command_list_handle_t command_list, void** instance_data,
-                        std::vector<uint64_t>* kids) {
+                        ze_command_list_handle_t command_list, ze_result_t result,
+                        void** instance_data, std::vector<uint64_t>* kids) {
     PTI_ASSERT(command_list != nullptr);
     PTI_ASSERT(kernel != nullptr);
     SPDLOG_TRACE("In {}", __FUNCTION__);
-    ZeKernelCommandProps props{};
+    ZeKernelCommand* command = static_cast<ZeKernelCommand*>(*instance_data);

-    props.name = utils::ze::GetKernelName(kernel, options_.demangle);
-    props.type = KernelCommandType::kKernel;
-    props.simd_width = utils::ze::GetKernelMaxSubgroupSize(kernel);
-    props.bytes_transferred = 0;
+    command->props.name = utils::ze::GetKernelName(kernel, options_.demangle);
+    command->props.type = KernelCommandType::kKernel;
+    command->props.simd_width = utils::ze::GetKernelMaxSubgroupSize(kernel);
+    command->props.bytes_transferred = 0;

     ZeKernelGroupSize group_size{};
     {
@@ -1831,20 +2015,24 @@
       }
     }

-    props.group_size[0] = group_size.x;
-    props.group_size[1] = group_size.y;
-    props.group_size[2] = group_size.z;
+    command->props.group_size[0] = group_size.x;
+    command->props.group_size[1] = group_size.y;
+    command->props.group_size[2] = group_size.z;

     if (group_count != nullptr) {
-      props.group_count[0] = group_count->groupCountX;
-      props.group_count[1] = group_count->groupCountY;
-      props.group_count[2] = group_count->groupCountZ;
+      command->props.group_count[0] = group_count->groupCountX;
+      command->props.group_count[1] = group_count->groupCountY;
+      command->props.group_count[2] = group_count->groupCountZ;
     }

     ZeCommandListInfo& command_list_info = GetCommandListInfo(command_list);

-    PostAppendKernelCommandCommon(collector, static_cast<ZeKernelCommand*>(*instance_data), props,
-                                  signal_event, command_list_info, kids);
+    // Subscriber callback
+    collector->DoCallbackOnGPUOperationAppended(command, PTI_CB_PHASE_API_EXIT, result);
+
+    if (result == ZE_RESULT_SUCCESS) {
+      PostAppendKernelCommandCommon(collector, command, signal_event, command_list_info, kids);
+    }
   }

   void PostAppendMemoryCommand(ZeCollector* collector, std::string command_name,
@@ -1867,15 +2055,15 @@
     ze_context_handle_t context = command_list_info.context;
     PTI_ASSERT(context != nullptr);

-    ZeKernelCommandProps props =
+    ZeKernelCommand* command = static_cast<ZeKernelCommand*>(*instance_data);
+    command->props =
         GetTransferProps(std::move(command_name), bytes_transferred, (src ? context : nullptr),
                          src, (dst ? context : nullptr), dst, pattern_size);

-    PostAppendKernelCommandCommon(collector, static_cast<ZeKernelCommand*>(*instance_data), props,
-                                  signal_event, command_list_info, kids);
+    PostAppendKernelCommandCommon(collector, command, signal_event, command_list_info, kids);
   }

-  void AppendMemoryCommandContext(ZeCollector* collector, std::string command,
+  void AppendMemoryCommandContext(ZeCollector* collector, std::string command_name,
                                   size_t bytes_transferred, ze_context_handle_t src_context,
                                   const void* src, ze_context_handle_t dst_context,
                                   const void* dst, ze_event_handle_t& signal_event,
@@ -1889,15 +2077,14 @@
     ze_context_handle_t context = command_list_info.context;
     PTI_ASSERT(context != nullptr);

-    ZeKernelCommandProps props =
-        GetTransferProps(std::move(command), bytes_transferred, src_context, src,
-                         (dst_context ? dst_context : context), dst);
+    ZeKernelCommand* command = static_cast<ZeKernelCommand*>(*instance_data);
+    command->props = GetTransferProps(std::move(command_name), bytes_transferred, src_context, src,
+                                      (dst_context ? dst_context : context), dst);

-    PostAppendKernelCommandCommon(collector, static_cast<ZeKernelCommand*>(*instance_data), props,
-                                  signal_event, command_list_info, kids);
+    PostAppendKernelCommandCommon(collector, command, signal_event, command_list_info, kids);
   }

-  void AppendImageMemoryCopyCommand(ZeCollector* collector, std::string command,
+  void AppendImageMemoryCopyCommand(ZeCollector* collector, std::string command_name,
                                     ze_image_handle_t image, const void* src, const void* dst,
                                     ze_event_handle_t& signal_event,
                                     ze_command_list_handle_t command_list, void** instance_data,
@@
     size_t bytes_transferred = GetImageSize(image);

-    ZeKernelCommandProps props =
-        GetTransferProps(std::move(command), bytes_transferred, context, src, context, dst);
+    ZeKernelCommand* command = static_cast<ZeKernelCommand*>(*instance_data);
+    command->props =
+        GetTransferProps(std::move(command_name), bytes_transferred, context, src, context, dst);

     // TODO implement image copy support in Local collection model
     if (collector->collection_mode_ != ZeCollectionMode::Local) {
-      PostAppendKernelCommandCommon(collector, static_cast<ZeKernelCommand*>(*instance_data), props,
-                                    signal_event, command_list_info, kids);
+      PostAppendKernelCommandCommon(collector, command, signal_event, command_list_info, kids);
     }
   }

-  void PostAppendCommand(ZeCollector* collector, std::string command,
+  void PostAppendCommand(ZeCollector* collector, std::string command_name,
                          ze_event_handle_t& signal_event, ze_command_list_handle_t command_list,
                          void** instance_data, std::vector<uint64_t>* kids) {
     SPDLOG_TRACE("In {}", __FUNCTION__);
@@ -1935,12 +2122,11 @@
     ze_context_handle_t context = command_list_info.context;
     PTI_ASSERT(context != nullptr);

-    ZeKernelCommandProps props{};
-    props.name = std::move(command);
-    props.type = KernelCommandType::kCommand;
+    ZeKernelCommand* command = static_cast<ZeKernelCommand*>(*instance_data);
+    command->props.name = std::move(command_name);
+    command->props.type = KernelCommandType::kCommand;

-    PostAppendKernelCommandCommon(collector, static_cast<ZeKernelCommand*>(*instance_data), props,
-                                  signal_event, command_list_info, kids);
+    PostAppendKernelCommandCommon(collector, command, signal_event, command_list_info, kids);
   }

   static ZeKernelCommandProps GetTransferProps(std::string name, size_t bytes_transferred,
@@ -2060,11 +2246,10 @@
                                                  void** instance_data, std::vector<uint64_t>* kids) {
     SPDLOG_TRACE("In {}, result: {}", __FUNCTION__, (uint32_t)result);
     ZeCollector* collector = static_cast<ZeCollector*>(global_data);
-    if (result == ZE_RESULT_SUCCESS) {
-      collector->PostAppendKernel(collector, *(params->phKernel), *(params->ppLaunchFuncArgs),
-                                  *(params->phSignalEvent), *(params->phCommandList), instance_data,
-                                  kids);
-    } else {
+    collector->PostAppendKernel(collector, *(params->phKernel), *(params->ppLaunchFuncArgs),
+                                *(params->phSignalEvent), *(params->phCommandList), result,
+                                instance_data, kids);
+    if (result != ZE_RESULT_SUCCESS) {
       collector->event_cache_.ReleaseEvent(*(params->phSignalEvent));
     }
   }
@@ -2083,11 +2268,11 @@
       void* global_data, void** instance_data, std::vector<uint64_t>* kids) {
     SPDLOG_TRACE("In {}, result: {}", __FUNCTION__, (uint32_t)result);
     ZeCollector* collector = static_cast<ZeCollector*>(global_data);
-    if (result == ZE_RESULT_SUCCESS) {
-      collector->PostAppendKernel(collector, *(params->phKernel), *(params->ppLaunchFuncArgs),
-                                  *(params->phSignalEvent), *(params->phCommandList), instance_data,
-                                  kids);
-    } else {
+
+    collector->PostAppendKernel(collector, *(params->phKernel), *(params->ppLaunchFuncArgs),
+                                *(params->phSignalEvent), *(params->phCommandList), result,
+                                instance_data, kids);
+    if (result != ZE_RESULT_SUCCESS) {
       collector->event_cache_.ReleaseEvent(*(params->phSignalEvent));
     }
   }
@@ -2106,11 +2291,10 @@
       void* global_data, void** instance_data, std::vector<uint64_t>* kids) {
     SPDLOG_TRACE("In {}, result: {}", __FUNCTION__, (uint32_t)result);
     ZeCollector* collector = static_cast<ZeCollector*>(global_data);
-    if (result == ZE_RESULT_SUCCESS) {
-      collector->PostAppendKernel(collector, *(params->phKernel),
-                                  *(params->ppLaunchArgumentsBuffer), *(params->phSignalEvent),
-                                  *(params->phCommandList), instance_data, kids);
-    } else {
+    collector->PostAppendKernel(collector, *(params->phKernel), *(params->ppLaunchArgumentsBuffer),
+                                *(params->phSignalEvent), *(params->phCommandList), result,
+                                instance_data, kids);
+    if (result != ZE_RESULT_SUCCESS) {
       collector->event_cache_.ReleaseEvent(*(params->phSignalEvent));
     }
   }
@@ -2549,38 +2733,41 @@
                                                [[maybe_unused]] uint64_t synch_corrid) {
     SPDLOG_TRACE("In {}, result: {}", __FUNCTION__, static_cast<uint32_t>(result));
     ZeCollector* collector = static_cast<ZeCollector*>(global_data);
-    const std::lock_guard lock(collector->lock_);
-    if (result == ZE_RESULT_SUCCESS) {
-      std::vector<ZeKernelCommandExecutionRecord> kcexec;
-      collector->ProcessCalls(kids, &kcexec);
-      if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) {
-        collector->acallback_(collector->callback_data_, kcexec);
+    std::vector<ZeKernelCommandExecutionRecord> kcexec;
+    {
+      const std::lock_guard lock(collector->lock_);
+      if (result == ZE_RESULT_SUCCESS) {
+        collector->ProcessCalls(kids, &kcexec);
+        if (collector->cb_enabled_.acallback && collector->acallback_ != nullptr) {
+          collector->acallback_(collector->callback_data_, kcexec);
+        }
       }
-    }
-    // Process generation of synch record even if result is not successful.
-    if (collector->cb_enabled_.acallback && collector->options_.lz_enabled_views.synch_enabled &&
-        collector->acallback_ != nullptr) {
-      std::vector<ZeKernelCommandExecutionRecord> kcexec;
-      ZeKernelCommandExecutionRecord rec = {};
-      auto it = collector->command_queues_.find(*params->phCommandQueue);
-      rec.context_ = nullptr;
-      if (it != collector->command_queues_.end()) {
-        rec.context_ = it->second.context_;
+      // Process generation of synch record even if result is not successful.
+      if (collector->cb_enabled_.acallback && collector->options_.lz_enabled_views.synch_enabled &&
+          collector->acallback_ != nullptr) {
+        std::vector<ZeKernelCommandExecutionRecord> kcexec1;
+        ZeKernelCommandExecutionRecord rec = {};
+        auto it = collector->command_queues_.find(*params->phCommandQueue);
+        rec.context_ = nullptr;
+        if (it != collector->command_queues_.end()) {
+          rec.context_ = it->second.context_;
+        }
+        rec.name_ = "zeCommandQueueSynchronize";
+        rec.tid_ = thread_local_pid_tid_info.tid;
+        rec.start_time_ = ze_instance_data.start_time_host;
+        rec.end_time_ = ze_instance_data.end_time_host;
+        rec.queue_ = *params->phCommandQueue;
+        rec.event_ = nullptr;
+        rec.cid_ = synch_corrid;
+        rec.callback_id_ = zeCommandQueueSynchronize_id;
+        rec.result_ = result;
+        kcexec1.push_back(std::move(rec));
+        collector->acallback_(collector->callback_data_, kcexec1);
+      }
-      rec.name_ = "zeCommandQueueSynchronize";
-      rec.tid_ = thread_local_pid_tid_info.tid;
-      rec.start_time_ = ze_instance_data.start_time_host;
-      rec.end_time_ = ze_instance_data.end_time_host;
-      rec.context_ = nullptr;
-      rec.queue_ = *params->phCommandQueue;
-      rec.event_ = nullptr;
-      rec.cid_ = synch_corrid;
-      rec.callback_id_ = zeCommandQueueSynchronize_id;
-      rec.result_ = result;
-      kcexec.push_back(std::move(rec));
-      collector->acallback_(collector->callback_data_, kcexec);
     }
+    collector->DoCallbackOnGPUOperationCompletion(kcexec);
   }
 
   static void OnExitCommandQueueCreate(ze_command_queue_create_params_t* params,
@@ -2759,6 +2946,11 @@ class ZeCollector {
   Level0Wrapper l0_wrapper_;
 
+  // Multiple subscriber support with ID-based access.
+  // The container must be ordered: callbacks are invoked in subscription (ID) order.
+  std::map<SubscriberId, std::unique_ptr<ZeCollectorCallbackSubscriber>> callback_subscribers_;
+  mutable std::shared_mutex subscribers_mutex_;
+
   std::atomic<ZeCollectionState> collection_state_ = ZeCollectionState::Normal;
 
   // pointer to state of an object that created ZeCollector
@@ -2824,6 +3016,8 @@ class ZeCollector {
     return ref_count;
   }
 
+  inline bool IsTracingOn() const { return (ref_count > 0); }
+
  private:
   // Track enable/disable tracing layer calls on a global basis - in order to swap apis.
   // zelEnableTracingLayer and zelDisableTracingLayer are not thread specific -- and act globally.
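A note on the new members above: `callback_subscribers_` is an ordered map guarded by `subscribers_mutex_`, and `DoCallbackOnGPUOperationCompletion(kcexec)` (called at the end of the queue-synchronize exit path) fans the completed-operation records out to the subscribers. This hunk does not show that method's body; the following is a minimal sketch of what the dispatch loop could look like, assuming the `ZeCollectorCallbackSubscriber` helpers introduced in the next file (`IsActive`, `IsEnabled`, `InvokeCallbackGPUOperationCompletion`) and a shared (reader) lock so concurrent dispatchers do not serialize:

```cpp
// Hypothetical sketch (not the patch's literal code): a ZeCollector member
// that fans completed GPU-operation records out to all active subscribers.
void DoCallbackOnGPUOperationCompletion(
    const std::vector<ZeKernelCommandExecutionRecord>& kcexec) {
  // Reader lock: dispatch paths only read the subscriber map; only
  // Add/RemoveCallbackSubscriber need the exclusive (writer) lock.
  std::shared_lock lock(subscribers_mutex_);
  // std::map iteration is ordered by SubscriberId, so callbacks fire in
  // subscription order, as the comment on callback_subscribers_ requires.
  for (auto& [id, subscriber] : callback_subscribers_) {
    if (!subscriber->IsActive() ||
        !subscriber->IsEnabled(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED,
                               PTI_CB_PHASE_API_EXIT)) {
      continue;
    }
    for (const auto& rec : kcexec) {
      // zeCommandQueueSynchronize_id is the L0 API id recorded on this path;
      // a real implementation would likely take the api id as a parameter.
      subscriber->InvokeCallbackGPUOperationCompletion(rec, zeCommandQueueSynchronize_id);
    }
  }
}
```

Using an ordered `std::map` rather than `std::unordered_map` is what makes the callback order deterministic across dispatches.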
diff --git a/sdk/src/levelzero/ze_collector_cb_helpers.h b/sdk/src/levelzero/ze_collector_cb_helpers.h
new file mode 100644
index 0000000..0ac85a2
--- /dev/null
+++ b/sdk/src/levelzero/ze_collector_cb_helpers.h
@@ -0,0 +1,172 @@
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef ZE_COLLECTOR_CB_HELPERS_H_
+#define ZE_COLLECTOR_CB_HELPERS_H_
+
+#include <spdlog/spdlog.h>
+
+#include <atomic>
+#include <unordered_map>
+
+#include "pti/pti_sync_callback.h"
+
+using SubscriberId = uint32_t;
+#define INVALID_KERNEL_HANDLE 0
+
+struct CbDomainProperties {
+  pti_callback_domain domain;
+  // covers ENTER/EXIT for APIs, THREAD_START/THREAD_END for internal threads,
+  // and other "events" that come in pairs
+  bool is_enter_enabled;
+  bool is_exit_enabled;
+};
+
+class ZeCollectorCallbackSubscriber {
+ private:
+  static inline std::atomic<SubscriberId> next_id_{1};
+
+  static SubscriberId GenerateUniqueId() { return next_id_.fetch_add(1); }
+
+ public:
+  ZeCollectorCallbackSubscriber()
+      : id_(GenerateUniqueId()), callback_(nullptr), user_data_(nullptr) {}
+
+  ZeCollectorCallbackSubscriber(const ZeCollectorCallbackSubscriber&) = delete;
+  ZeCollectorCallbackSubscriber& operator=(const ZeCollectorCallbackSubscriber&) = delete;
+  ZeCollectorCallbackSubscriber(ZeCollectorCallbackSubscriber&&) = delete;
+  ZeCollectorCallbackSubscriber& operator=(ZeCollectorCallbackSubscriber&&) = delete;
+
+  // Virtual destructor for proper cleanup in derived classes
+  virtual ~ZeCollectorCallbackSubscriber() = default;
+
+  SubscriberId GetId() const { return id_; }
+
+  pti_result EnableDomain(pti_callback_domain domain, uint32_t enter_cb, uint32_t exit_cb) {
+    // for now only a few specific domains are supported
+    if (domain != PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED &&
+        domain != PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED) {
+      return PTI_ERROR_NOT_IMPLEMENTED;
+    }
+    if (enter_cb == 0 && exit_cb == 0) {
+      return PTI_ERROR_BAD_ARGUMENT;  // at least one should be non-zero
+    }
+    if (enabled_domains_.find(domain) != enabled_domains_.end()) {
+      // Domain already enabled: update the flags and return success
+      enabled_domains_[domain].is_enter_enabled = static_cast<bool>(enter_cb);
+      enabled_domains_[domain].is_exit_enabled = static_cast<bool>(exit_cb);
+      return PTI_SUCCESS;
+    }
+    CbDomainProperties properties = {domain, static_cast<bool>(enter_cb),
+                                     static_cast<bool>(exit_cb)};
+    enabled_domains_[domain] = properties;
+    return PTI_SUCCESS;
+  }
+
+  pti_result DisableDomain(pti_callback_domain domain) {
+    auto it = enabled_domains_.find(domain);
+    if (it != enabled_domains_.end()) {
+      enabled_domains_.erase(it);
+      return PTI_SUCCESS;
+    }
+    return PTI_ERROR_BAD_ARGUMENT;  // Domain not found
+  }
+
+  pti_result DisableAllDomains() {
+    SPDLOG_TRACE("In {}", __func__);
+    enabled_domains_.clear();
+    return PTI_SUCCESS;
+  }
+
+  pti_result SetCallback(pti_callback_function callback) {
+    if (callback == nullptr) {
+      return PTI_ERROR_BAD_ARGUMENT;
+    }
+    callback_ = callback;
+    return PTI_SUCCESS;
+  }
+
+  pti_callback_function GetCallback() const { return callback_; }
+
+  bool IsActive() const { return !enabled_domains_.empty() && callback_ != nullptr; }
+
+  void* GetUserData() const { return user_data_; }
+
+  void SetUserData(void* user_data) { user_data_ = user_data; }
+
+  void** GetPtrForInstanceUserData() { return &instance_user_data_; }
+
+  pti_result Clean() {
+    callback_ = nullptr;
+    user_data_ = nullptr;
+    return DisableAllDomains();
+  }
+
+  bool IsEnabled(pti_callback_domain domain, uint32_t cb_type) const {
+    auto it = enabled_domains_.find(domain);
+    if (it != enabled_domains_.end()) {
+      if (cb_type == PTI_CB_PHASE_API_ENTER) {
+        return it->second.is_enter_enabled;
+      } else if (cb_type == PTI_CB_PHASE_API_EXIT) {
+        return it->second.is_exit_enabled;
+      }
+    }
+    return false;
+  }
+
+  void InvokeCallbackGPUOperationCompletion(const ZeKernelCommandExecutionRecord& record,
+                                            uint32_t l0_api_id) {
+    if (callback_ != nullptr) {
+      pti_gpu_op_details gpu_op_details = {
+          ._operation_kind = GetGPUOperationKind(record.command_type_),
+          ._operation_id = record.kid_,
+          // temporary, until modules and the kernels in them are supported
+          ._kernel_handle = INVALID_KERNEL_HANDLE,
+          ._name = record.name_.c_str()};
+      pti_device_handle_t device_handle = record.device_;
+      if (record.command_type_ == KernelCommandType::kMemory && device_handle == nullptr) {
+        device_handle = record.dst_device_;  // for memcpy use dst device
+      }
+      pti_callback_gpu_op_data callback_data = {
+          ._domain = PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED,
+          ._cmd_list_properties = PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN,
+          ._cmd_list_handle = nullptr,
+          ._queue_handle = record.queue_,
+          ._device_handle = device_handle,
+          ._phase = PTI_CB_PHASE_API_EXIT,
+          ._return_code = 0,
+          ._correlation_id = record.cid_,
+          ._operation_count = 1,
+          ._operation_details = &gpu_op_details};
+      callback_(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, PTI_API_GROUP_LEVELZERO, l0_api_id,
+                record.context_, &callback_data, GetUserData(), GetPtrForInstanceUserData());
+    }
+  }
+
+  static pti_gpu_operation_kind GetGPUOperationKind(const KernelCommandType& type) {
+    // Map KernelCommandType to pti_gpu_operation_kind
+    // TODO: consider using the same enum (make it external); then this function is not needed
+    switch (type) {
+      case KernelCommandType::kKernel:
+        return PTI_GPU_OPERATION_KIND_KERNEL;
+      case KernelCommandType::kMemory:
+        return PTI_GPU_OPERATION_KIND_MEMORY;
+      case KernelCommandType::kCommand:
+        return PTI_GPU_OPERATION_KIND_OTHER;
+      default:
+        return PTI_GPU_OPERATION_KIND_INVALID;
+    }
+  }
+
+ private:
+  SubscriberId id_;
+  pti_callback_function callback_ = nullptr;
+  std::unordered_map<pti_callback_domain, CbDomainProperties> enabled_domains_;
+  void* user_data_ = nullptr;
+  void* instance_user_data_ = nullptr;
+};
+
+#endif
diff --git a/sdk/src/pti.cc b/sdk/src/pti.cc
index dd696db..b2a9b2c 100644
--- a/sdk/src/pti.cc
+++ b/sdk/src/pti.cc
@@ -6,12 +6,15 @@
 
 #include "pti/pti.h"
 
+#include "pti/pti_sync_callback.h"
+
 #define PTI_CASE_ENUM_TO_STRING(e) \
   case e:                          \
     return #e;
 
 const char* ptiResultTypeToString(pti_result result_value) {
   switch (result_value) {
+    PTI_CASE_ENUM_TO_STRING(PTI_ERROR_AT_LEAST_ONE_GPU_VIEW_MUST_BE_ENABLED)
     PTI_CASE_ENUM_TO_STRING(PTI_ERROR_BAD_API_ID)
     PTI_CASE_ENUM_TO_STRING(PTI_ERROR_BAD_ARGUMENT)
     PTI_CASE_ENUM_TO_STRING(PTI_ERROR_BAD_TIMESTAMP)
@@ -36,4 +39,37 @@ const char* ptiResultTypeToString(pti_result result_value) {
   return "INVALID";
 }
 
+const char* ptiCallbackDomainTypeToString(pti_callback_domain domain) {
+  switch (domain) {
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_API)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_CONTEXT_CREATED)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_HOST_SYNCHRONIZATION)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_MODULE_LOADED)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_DRIVER_MODULE_UNLOADED)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_INTERNAL_EVENT)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_INTERNAL_THREADS)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_INVALID)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_DOMAIN_MAX)
+  }
+
+  return "INVALID";
+}
+
+const char* ptiCallbackPhaseTypeToString(pti_callback_phase phase) {
+  switch (phase) {
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_API_ENTER)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_API_EXIT)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_INTERNAL_THREAD_START)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_INTERNAL_THREAD_END)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_INTERNAL_EVENT)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_INVALID)
+    PTI_CASE_ENUM_TO_STRING(PTI_CB_PHASE_MAX)
+  }
+
+  return "INVALID";
+}
+
 #undef PTI_CASE_ENUM_TO_STRING
diff --git a/sdk/src/pti_lib_handler.h b/sdk/src/pti_lib_handler.h
index 1f8eaa3..7d67ecf 100644
--- a/sdk/src/pti_lib_handler.h
+++ b/sdk/src/pti_lib_handler.h
@@ -14,6 +14,7 @@
 
 #include "pti/pti.h"
 #include "pti/pti_metrics.h"
+#include "pti/pti_sync_callback.h"
 #include "pti/pti_view.h"
 #include "utils/library_loader.h"
 #include "utils/platform_strings.h"
@@ -99,6 +100,11 @@ class PtiLibHandler {
   decltype(&ptiMetricsGetDevices) ptiMetricsGetDevices_ = nullptr;              // NOLINT
   decltype(&ptiMetricsStartCollection) ptiMetricsStartCollection_ = nullptr;    // NOLINT
   decltype(&ptiMetricGetCalculatedData) ptiMetricGetCalculatedData_ = nullptr;  // NOLINT
+  decltype(&ptiCallbackSubscribe) ptiCallbackSubscribe_ = nullptr;                  // NOLINT
+  decltype(&ptiCallbackUnsubscribe) ptiCallbackUnsubscribe_ = nullptr;              // NOLINT
+  decltype(&ptiCallbackEnableDomain) ptiCallbackEnableDomain_ = nullptr;            // NOLINT
+  decltype(&ptiCallbackDisableDomain) ptiCallbackDisableDomain_ = nullptr;          // NOLINT
+  decltype(&ptiCallbackDisableAllDomains) ptiCallbackDisableAllDomains_ = nullptr;  // NOLINT
   decltype(&PtiSetXPTIEnvironmentDetails) PtiSetXPTIEnvironmentDetails_ = nullptr;  // NOLINT
 
 private:
@@ -167,6 +173,13 @@ class PtiLibHandler {
     PTI_VIEW_GET_SYMBOL(ptiMetricsGetDevices);
     PTI_VIEW_GET_SYMBOL(ptiMetricsStartCollection);
     PTI_VIEW_GET_SYMBOL(ptiMetricGetCalculatedData);
+
+    PTI_VIEW_GET_SYMBOL(ptiCallbackSubscribe);
+    PTI_VIEW_GET_SYMBOL(ptiCallbackUnsubscribe);
+    PTI_VIEW_GET_SYMBOL(ptiCallbackEnableDomain);
+    PTI_VIEW_GET_SYMBOL(ptiCallbackDisableDomain);
+    PTI_VIEW_GET_SYMBOL(ptiCallbackDisableAllDomains);
+
     PTI_VIEW_GET_SYMBOL(PtiSetXPTIEnvironmentDetails);
 #undef PTI_VIEW_GET_SYMBOL
     CommunicateForeignXPTISubscriber();
diff --git a/sdk/src/pti_view.cc b/sdk/src/pti_view.cc
index 21c00c0..fbb687b 100644
--- a/sdk/src/pti_view.cc
+++ b/sdk/src/pti_view.cc
@@ -12,6 +12,7 @@
 #include 
 
 #include "internal_helper.h"
+#include "pti/pti_sync_callback.h"
 #include "tracing_cb_api.gen"
 #include "view_handler.h"
@@ -486,3 +487,77 @@ pti_result ptiViewEnableDriverApiClass(uint32_t enable, pti_api_class pti_class,
   }
   return status;
 }
+
+pti_result ptiCallbackSubscribe(pti_callback_subscriber_handle* subscriber,
+                                pti_callback_function callback, void* user_data) {
+  try {
+    return Instance().CallbackSubscribe(subscriber, callback, user_data);
+  } catch (const std::runtime_error& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (const std::exception& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackUnsubscribe(pti_callback_subscriber_handle subscriber) {
+  try {
+    return Instance().CallbackUnsubscribe(subscriber);
+  } catch (const std::runtime_error& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (const std::exception& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackEnableDomain(pti_callback_subscriber_handle subscriber,
+                                   pti_callback_domain domain, uint32_t enter_cb,
+                                   uint32_t exit_cb) {
+  try {
+    return Instance().CallbackEnableDomain(subscriber, domain, enter_cb, exit_cb);
+  } catch (const std::runtime_error& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (const std::exception& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackDisableDomain(pti_callback_subscriber_handle subscriber,
+                                    pti_callback_domain domain) {
+  try {
+    return Instance().CallbackDisableDomain(subscriber, domain);
+  } catch (const std::runtime_error& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (const std::exception& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackDisableAllDomains(pti_callback_subscriber_handle subscriber) {
+  try {
+    return Instance().CallbackDisableAllDomains(subscriber);
+  } catch (const std::runtime_error& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (const std::exception& e) {
+    LogException(e);
+    return pti_result::PTI_ERROR_INTERNAL;
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
diff --git a/sdk/src/pti_view_load.cc b/sdk/src/pti_view_load.cc
index 9eaca33..f732248 100644
--- a/sdk/src/pti_view_load.cc
+++ b/sdk/src/pti_view_load.cc
@@ -480,4 +480,89 @@ pti_result ptiMetricGetCalculatedData(pti_device_handle_t device_handle,
   } catch (...) {
     return pti_result::PTI_ERROR_INTERNAL;
   }
+}
+
+pti_result ptiCallbackSubscribe(pti_callback_subscriber_handle* subscriber,
+                                pti_callback_function callback, void* user_data) {
+  try {
+    if (!pti::PtiLibHandler::Instance().ViewAvailable()) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (!pti::PtiLibHandler::Instance().ptiCallbackSubscribe_) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    return pti::PtiLibHandler::Instance().ptiCallbackSubscribe_(subscriber, callback, user_data);
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackUnsubscribe(pti_callback_subscriber_handle subscriber) {
+  try {
+    if (!pti::PtiLibHandler::Instance().ViewAvailable()) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (!pti::PtiLibHandler::Instance().ptiCallbackUnsubscribe_) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    return pti::PtiLibHandler::Instance().ptiCallbackUnsubscribe_(subscriber);
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackEnableDomain(pti_callback_subscriber_handle subscriber,
+                                   pti_callback_domain domain, uint32_t enter_cb,
+                                   uint32_t exit_cb) {
+  try {
+    if (!pti::PtiLibHandler::Instance().ViewAvailable()) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (!pti::PtiLibHandler::Instance().ptiCallbackEnableDomain_) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    return pti::PtiLibHandler::Instance().ptiCallbackEnableDomain_(subscriber, domain, enter_cb,
+                                                                   exit_cb);
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackDisableDomain(pti_callback_subscriber_handle subscriber,
+                                    pti_callback_domain domain) {
+  try {
+    if (!pti::PtiLibHandler::Instance().ViewAvailable()) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (!pti::PtiLibHandler::Instance().ptiCallbackDisableDomain_) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    return pti::PtiLibHandler::Instance().ptiCallbackDisableDomain_(subscriber, domain);
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
+
+pti_result ptiCallbackDisableAllDomains(pti_callback_subscriber_handle subscriber) {
+  try {
+    if (!pti::PtiLibHandler::Instance().ViewAvailable()) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if (!pti::PtiLibHandler::Instance().ptiCallbackDisableAllDomains_) {
+      return pti_result::PTI_ERROR_NOT_IMPLEMENTED;
+    }
+
+    return pti::PtiLibHandler::Instance().ptiCallbackDisableAllDomains_(subscriber);
+  } catch (...) {
+    return pti_result::PTI_ERROR_INTERNAL;
+  }
+}
\ No newline at end of file
diff --git a/sdk/src/unikernel.h b/sdk/src/unikernel.h
index 011cd34..0a8366b 100644
--- a/sdk/src/unikernel.h
+++ b/sdk/src/unikernel.h
@@ -97,6 +97,8 @@ struct ZeMemoryCommandRoute {
   }
 };
 
+enum class KernelCommandType { kInvalid = 0, kKernel = 1, kMemory = 2, kCommand = 3 };
+
 struct ZeKernelCommandExecutionRecord {
   uint64_t sycl_node_id_;
   uint64_t sycl_queue_id_ = PTI_INVALID_QUEUE_ID;
@@ -107,8 +109,9 @@ struct ZeKernelCommandExecutionRecord {
   const char* sycl_function_name_ = nullptr;
   uint32_t source_line_number_;
 
-  uint64_t kid_;
-  uint32_t cid_;
+  KernelCommandType command_type_;
+  uint64_t kid_;  // kernel id
+  uint32_t cid_;  // correlation id
   uint32_t tid_;
   uint32_t pid_;
   int32_t tile_;
diff --git a/sdk/src/view_handler.h b/sdk/src/view_handler.h
index b7c8aa5..e89220b 100644
--- a/sdk/src/view_handler.h
+++ b/sdk/src/view_handler.h
@@ -282,6 +282,11 @@ struct PtiViewRecordHandler {
 
     if (!collector_) {
       CollectorOptions collector_options{};
+      // TODO: Implement this better:
+      // this line has been here from the beginning, and it is wrong:
+      // simple API tracing does not need GPU operation tracing (too much overhead).
+      // However, fixing it requires cross-thread synchronization.
       collector_options.kernel_tracing = true;
       collector_ = ZeCollector::Create(&state_, collector_options, ZeChromeKernelStagesCallback,
                                        ZeApiCallsCallback, nullptr);
@@ -775,6 +780,75 @@ struct PtiViewRecordHandler {
     return ts_shift_;
   }
 
+  // Callback API
+  // Multiple subscriber support with ID-based management
+  inline pti_result CallbackSubscribe(pti_callback_subscriber_handle* subscriber,
+                                      pti_callback_function callback, void* user_data) {
+    // Limitation (hopefully temporary): callbacks are only supported when kernel tracing is ON
+    if (collector_ && collector_->IsTracingOn() &&
+        collector_->GetCollectorOptions().kernel_tracing) {
+      uint32_t subscriber_id = collector_->AddCallbackSubscriber(callback, user_data);
+      if (subscriber_id == 0) {
+        SPDLOG_ERROR("Failed to add callback subscriber");
+        return PTI_ERROR_INTERNAL;
+      }
+      *subscriber = subscriber_id;
+      return PTI_SUCCESS;
+    }
+    return PTI_ERROR_AT_LEAST_ONE_GPU_VIEW_MUST_BE_ENABLED;
+  }
+
+  inline pti_result CallbackUnsubscribe(pti_callback_subscriber_handle subscriber) {
+    if (collector_) {
+      auto result = collector_->RemoveCallbackSubscriber(subscriber);
+      if (result != pti_result::PTI_SUCCESS) {
+        SPDLOG_ERROR("Failed to unsubscribe callback: {}", static_cast<int>(result));
+        return result;
+      }
+      return PTI_SUCCESS;
+    }
+    return PTI_ERROR_INTERNAL;
+  }
+
+  inline pti_result CallbackEnableDomain(pti_callback_subscriber_handle subscriber,
+                                         pti_callback_domain domain, uint32_t enter_cb,
+                                         uint32_t exit_cb) {
+    if (collector_) {
+      auto result = collector_->EnableCallbackDomain(subscriber, domain, enter_cb, exit_cb);
+      if (result != pti_result::PTI_SUCCESS) {
+        SPDLOG_ERROR("Failed to enable domain: {}", static_cast<int>(result));
+        return result;
+      }
+      return PTI_SUCCESS;
+    }
+    return PTI_ERROR_INTERNAL;
+  }
+
+  inline pti_result CallbackDisableDomain(pti_callback_subscriber_handle subscriber,
+                                          pti_callback_domain domain) {
+    if (collector_) {
+      auto result = collector_->DisableCallbackDomain(subscriber, domain);
+      if (result != pti_result::PTI_SUCCESS) {
+        SPDLOG_ERROR("Failed to disable domain: {}", static_cast<int>(result));
+        return result;
+      }
+      return PTI_SUCCESS;
+    }
+    return PTI_ERROR_INTERNAL;
+  }
+
+  inline pti_result CallbackDisableAllDomains(pti_callback_subscriber_handle subscriber) {
+    if (collector_) {
+      auto result = collector_->DisableAllCallbackDomains(subscriber);
+      if (result != pti_result::PTI_SUCCESS) {
+        SPDLOG_ERROR("Failed to disable all domains: {}", static_cast<int>(result));
+        return result;
+      }
+      return PTI_SUCCESS;
+    }
+    return PTI_ERROR_INTERNAL;
+  }
+
  private:
   inline void RequestNewBuffer(pti::view::utilities::ViewBuffer& buffer) {
     unsigned char* raw_buffer = nullptr;
diff --git a/sdk/test/CMakeLists.txt b/sdk/test/CMakeLists.txt
index a966013..1922c76 100644
--- a/sdk/test/CMakeLists.txt
+++ b/sdk/test/CMakeLists.txt
@@ -582,12 +582,13 @@ if(PTI_BUILD_SAMPLES AND HAVE_SYCL)
   add_test(NAME run-vec_sqadd COMMAND vec_sqadd)
   add_test(NAME run-iso COMMAND iso3dfd 256 256 256 32 8 64 10 gpu)
   add_test(NAME run-metrics-iso COMMAND metrics_iso3dfd 256 256 256 32 8 64 10 gpu)
+  add_test(NAME run-callback COMMAND callback)
   set_tests_properties(
     run-metrics-iso PROPERTIES ENVIRONMENT "ZET_ENABLE_METRICS=1" LABELS
     "functional;samples;hw-metrics")
   set_tests_properties(
     mt-awk-test mt-awk-test-immediate run-dlworkload run-dpc_gemm run-onemkl_gemm_exe run-vec_sqadd
-    run-iso PROPERTIES LABELS "functional;samples")
+    run-iso run-callback PROPERTIES LABELS "functional;samples")
 endif()
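For reference, this is roughly how the new sample (samples/callback, wired into the test list above) is expected to drive the API end to end. The exact pti_callback_function signature below is an assumption reconstructed from the invocation in ze_collector_cb_helpers.h (domain, API group, API id, backend context, domain-specific data, user data, per-instance user data slot); consult pti/pti_sync_callback.h for the authoritative typedef. Note also that a real program must register view buffer callbacks via ptiViewSetCallbacks before enabling a view.

```cpp
#include <cstdio>

#include "pti/pti_sync_callback.h"  // also pulls in pti/pti_view.h

// Assumed callback signature (see lead-in above).
static void OnGpuOpCompleted(pti_callback_domain domain, pti_api_group_id group,
                             uint32_t api_id, void* backend_context, void* cb_data,
                             void* user_data, void** instance_user_data) {
  if (domain != PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED) return;
  const auto* data = static_cast<const pti_callback_gpu_op_data*>(cb_data);
  for (uint32_t i = 0; i < data->_operation_count; ++i) {
    std::printf("completed: %s (kind %d, correlation id %u)\n",
                data->_operation_details[i]._name,
                static_cast<int>(data->_operation_details[i]._operation_kind),
                data->_correlation_id);
  }
}

int main() {
  // At least one GPU view must be enabled first; otherwise ptiCallbackSubscribe
  // fails with PTI_ERROR_AT_LEAST_ONE_GPU_VIEW_MUST_BE_ENABLED.
  ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL);

  pti_callback_subscriber_handle subscriber = PTI_CALLBACK_SUBSCRIBER_HANDLE_INVALID;
  if (ptiCallbackSubscribe(&subscriber, OnGpuOpCompleted, nullptr) != PTI_SUCCESS) {
    return 1;
  }
  // Completion is reported as the EXIT phase, so enable exit_cb only.
  ptiCallbackEnableDomain(subscriber, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED,
                          /*enter_cb=*/0, /*exit_cb=*/1);

  // ... submit SYCL / Level Zero work and synchronize here ...

  ptiCallbackDisableAllDomains(subscriber);
  ptiCallbackUnsubscribe(subscriber);
  ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL);
  return 0;
}
```

Enabling PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED or any of the other not-yet-implemented domains would return PTI_ERROR_NOT_IMPLEMENTED, matching the checks in ZeCollectorCallbackSubscriber::EnableDomain.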