62 changes: 19 additions & 43 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -153,52 +153,28 @@ BackendManager::BackendManager(SessionContext& session_context,
                                                       model_stream);
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
-      bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
-                                       !session_context_.so_disable_cpu_ep_fallback &&
-                                       !subgraph_context_.is_ep_ctx_graph;
-#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-      eligible_for_cpu_fallback = false;
-#else
-      if (eligible_for_cpu_fallback) {
-        LOGS_DEFAULT(VERBOSE) << exception_str;
-        LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
-                              << "Falling back to OV CPU for execution";
-        session_context_.device_type = "CPU";
-        session_context_.precision = "FP32";
-        try {
-          concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                          session_context_,
-                                                          subgraph_context_,
-                                                          shared_context_,
-                                                          model_stream);
-        } catch (std::string const& msg) {
-          ORT_THROW(msg);
-        }
-      }
-#endif
-      if (!eligible_for_cpu_fallback) {
-        if (device_type.find("NPU") != std::string::npos &&
-            exception_str.find("intel_npu") != std::string::npos) {
-          // Handle NPU device related errors
 
+      if (session_context_.device_type.find("NPU") != std::string::npos &&
+          exception_str.find("intel_npu") != std::string::npos) {
+        // Handle NPU device related errors
 #ifndef NDEBUG
-          ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
+        ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
 #else
-          std::string error_message = "UNKNOWN NPU ERROR";
-          std::string error_code = "code 0x0";
-          std::regex error_message_pattern(R"(\bZE_\w*\b)");
-          std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-          std::smatch matches;
-          if (std::regex_search(exception_str, matches, error_message_pattern)) {
-            error_message = matches[0];
-          }
-          if (std::regex_search(exception_str, matches, error_code_pattern)) {
-            error_code = matches[0];
-          }
-          throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
-#endif
-        } else {
-          ORT_THROW(exception_str);
+        std::string error_message = "UNKNOWN NPU ERROR";
+        std::string error_code = "code 0x0";
+        std::regex error_message_pattern(R"(\bZE_\w*\b)");
+        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
+        std::smatch matches;
+        if (std::regex_search(exception_str, matches, error_message_pattern)) {
+          error_message = matches[0];
+        }
+        if (std::regex_search(exception_str, matches, error_code_pattern)) {
+          error_code = matches[0];
+        }
+        throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
+#endif
+      } else {
+        ORT_THROW(exception_str);
       }
     }
   }
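The release (#else) branch above collapses a raw Level Zero failure into just the error name and code before rethrowing. A minimal standalone sketch of that parsing, using the same two regexes on a made-up exception string (the sample message is illustrative, not real plugin output):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
  // Sample input only; a real message would come from the OpenVINO NPU plugin.
  std::string exception_str =
      "Compilation failed: ZE_RESULT_ERROR_DEVICE_LOST (code 0x70000001)";

  // Same patterns and defaults as the release branch of the diff above.
  std::string error_message = "UNKNOWN NPU ERROR";
  std::string error_code = "code 0x0";
  std::regex error_message_pattern(R"(\bZE_\w*\b)");  // Level Zero error name
  std::regex error_code_pattern("code 0x[0-9a-fA-F]+");  // hex error code
  std::smatch matches;
  if (std::regex_search(exception_str, matches, error_message_pattern)) {
    error_message = matches[0];
  }
  if (std::regex_search(exception_str, matches, error_code_pattern)) {
    error_code = matches[0];
  }

  // Prints: ZE_RESULT_ERROR_DEVICE_LOST, code 0x70000001
  std::cout << error_message << ", " << error_code << "\n";
  return 0;
}
```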
158 changes: 91 additions & 67 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -36,42 +36,14 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   if (ValidateSubgraph(const_outputs_map_))
     return;
 
-  // OV Config
+  // Pre-requisite is provider_option "context" must be set
+  auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
+                               (session_context_.OpenVINO_Version.at(0) >= 2024 &&
+                                session_context_.OpenVINO_Version.at(1) > 2));
   ov::AnyMap device_config;
-  PopulateConfigValue(device_config);
-
-  // Enable caching
-  EnableCaching();
-
-  // Setting OpenCL queue throttling for GPU
-  EnableGPUThrottling(device_config);
-
-  // Enable streams; default=1 unless overridden by user configuration
-  EnableStreams();
-
-  // Set the inference_num_threads property of the CPU
-  SetNumThreads(device_config);
-
-  auto npuw_status =
-      std::any_of(device_config.begin(), device_config.end(), [&](const std::pair<std::string, ov::Any>& pair) {
-        return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is<std::string>()) &&
-               (pair.second.as<std::string>() == "YES");
-      });
-
-  if (npuw_status) {
-    LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation";
-  }
-
-  try {
-    // IO_BUFFER is enabled on GPU HW.
-    // Pre-requisite is provider_option "context" must be set
-    auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
-                                 (session_context_.OpenVINO_Version.at(0) >= 2024 &&
-                                  session_context_.OpenVINO_Version.at(1) > 2));
-    bool disable_cpu_fallback = !(hw_target.find("NPU") != std::string::npos &&
-                                  !session_context_.so_disable_cpu_ep_fallback &&
-                                  !subgraph_context_.is_ep_ctx_graph);
-    if (subgraph_context_.is_ep_ctx_graph) {
+  SetOVDeviceConfiguration(device_config);
+  if (subgraph_context_.is_ep_ctx_graph) {
+    try {
       if (subgraph_context_.is_ep_ctx_ovir_encapsulated) {
         // model_file_path will use so_context_file_path if the onnx_model_path_name is not available,
         // especially in case of CreateSessionFormArray() where user must explicitly
@@ -104,41 +76,67 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
                                                     device_config,
                                                     subgraph_context_.subgraph_name);
       }
-      model_stream.reset();  // Delete stream after it is no longer needed
-    } else if (!session_context_.has_external_weights &&
-               !subgraph_context_.has_dynamic_input_shape &&
-               !session_context_.so_context_enable &&
-               session_context_.reshape.empty() &&
-               !enable_causallm &&
-               auto_unified_compile) {
-      // Unified OV compile_model is efficient when ov model caching is enabled
-      // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
-      // Inputs with static dimensions
-      // Not enabled for models with external weights and when ep context is set.
-      const std::string model = model_proto->SerializeAsString();
-      // we have the serialized string, so we can release model proto to lower the peak memory consumption
-      if (disable_cpu_fallback) model_proto.reset();
-      exe_network_ = OVCore::Get()->CompileModel(model,
-                                                 hw_target,
-                                                 device_config,
-                                                 subgraph_context_.subgraph_name);
-    } else {  // For all other types use ov::ov_core read_model() to generate OV IR
-              // followed by ov::ov_core compile_model()
-      std::string model = model_proto->SerializeAsString();
-      // Reset model proto only when cpu fallback is disabled or when the model has dynamic input shapes.
-      // This is to avoid memory peak usage when the model is large.
-      if (!subgraph_context.has_dynamic_input_shape && disable_cpu_fallback) {
-        model_proto.reset();
+        model_stream.reset();
+      } catch (const char* msg) {
+        ORT_THROW(msg);
+      }  // Delete stream after it is no longer needed
+    } else {
+      std::string model = model_proto->SerializeAsString();
+      if (!subgraph_context.has_dynamic_input_shape) {
+        model_proto.reset();
       }
+      try {
+        // SetOVDeviceConfiguration(device_config);

[Review comment from ankitm3k (Jul 2, 2025) on the line above. Suggested change: remove "// SetOVDeviceConfiguration(device_config);". Reason: "Redundant line"]

+        if (!session_context_.has_external_weights &&
+            !subgraph_context_.has_dynamic_input_shape &&
+            !session_context_.so_context_enable &&
+            session_context_.reshape.empty() &&
+            !enable_causallm &&
+            auto_unified_compile) {
+          // Unified OV compile_model is efficient when ov model caching is enabled
+          // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
+          // Inputs with static dimensions
+          // Not enabled for models with external weights and when ep context is set.
+
+          exe_network_ = OVCore::Get()->CompileModel(model,
+                                                     hw_target,
+                                                     device_config,
+                                                     subgraph_context_.subgraph_name);
+        } else {  // For all other types use ov::ov_core read_model() to generate OV IR
+                  // followed by ov::ov_core compile_model()
+          auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
+          exe_network_ = OVCore::Get()->CompileModel(
+              ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
+        }
+        LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
+      } catch (const OnnxRuntimeException& ex) {
+        std::string exception_str = ex.what();
+        bool eligible_for_cpu_fallback = session_context_.device_type.find("NPU") != std::string::npos &&
+                                         !session_context_.so_disable_cpu_ep_fallback &&
+                                         !subgraph_context_.is_ep_ctx_graph;
+#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
+        eligible_for_cpu_fallback = false;
+#endif
+        if (eligible_for_cpu_fallback) {
+          LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
+                                << "Falling back to OV CPU for execution";
+          session_context_.device_type = "CPU";
+          session_context_.precision = "FP32";
+          device_config.clear();
+          SetOVDeviceConfiguration(device_config);
+          try {
+            // Recreate the model with CPU device type
+            auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
+            exe_network_ = OVCore::Get()->CompileModel(
+                ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
+          } catch (std::string const& msg) {
+            ORT_THROW(msg);
+          }
+        } else {
+          ORT_THROW(ex.what());
+        }
-      auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
-      exe_network_ = OVCore::Get()->CompileModel(
-          ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
       }
-      LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
-    } catch (const char* msg) {
-      ORT_THROW(msg);
     }
 
   int num_infer_req = (session_context_.num_of_threads > 0) ? session_context_.num_of_threads : 1;
   std::function<void(OVInferRequestPtr)> initializer = [](OVInferRequestPtr) {};
   auto metadata = shared_context_.shared_weights.metadata;
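With this change the CPU fallback decision moves next to the compile call it guards. The gate is a single predicate: the target is NPU, session options do not disable CPU EP fallback, and the subgraph is not an EP-context graph; builds defining OPENVINO_DISABLE_NPU_FALLBACK force it off. A minimal sketch of that predicate in isolation (the free function and bool parameters are illustrative, not part of the PR):

```cpp
#include <iostream>
#include <string>

// Hypothetical standalone version of the fallback gate in the catch block
// above; the real code reads these values from session and subgraph state.
bool EligibleForCpuFallback(const std::string& device_type,
                            bool so_disable_cpu_ep_fallback,
                            bool is_ep_ctx_graph) {
  bool eligible = device_type.find("NPU") != std::string::npos &&
                  !so_disable_cpu_ep_fallback &&
                  !is_ep_ctx_graph;
#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
  // Builds configured without NPU fallback never retry on CPU.
  eligible = false;
#endif
  return eligible;
}

int main() {
  // NPU target, fallback allowed, plain graph: eligible.
  std::cout << EligibleForCpuFallback("NPU", false, false) << "\n";  // 1
  // EP-context graphs are never recompiled for CPU.
  std::cout << EligibleForCpuFallback("NPU", false, true) << "\n";   // 0
}
```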
@@ -385,6 +383,32 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
     device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads));
 }
 
+void BasicBackend::SetOVDeviceConfiguration(ov::AnyMap& device_config) {
+  PopulateConfigValue(device_config);
+
+  // Enable caching
+  EnableCaching();
+
+  // Setting OpenCL queue throttling for GPU
+  EnableGPUThrottling(device_config);
+
+  // Enable streams; default=1 unless overridden by user configuration
+  EnableStreams();
+
+  // Set the inference_num_threads property of the CPU
+  SetNumThreads(device_config);
+
+  auto npuw_status =
+      std::any_of(device_config.begin(), device_config.end(), [&](const std::pair<std::string, ov::Any>& pair) {
+        return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is<std::string>()) &&
+               (pair.second.as<std::string>() == "YES");
+      });
+
+  if (npuw_status) {
+    LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation";
+  }
+}
+
 void BasicBackend::ValidateOrtDimsAgainstPartialShape(const std::vector<int64_t>& ort_dims,
                                                       const ov::PartialShape& partial_shape) const {
   // Check if the number of dimensions matches
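The new SetOVDeviceConfiguration helper centralizes the per-device setup the constructor previously did inline, ending with a scan for NPUW. A small self-contained sketch of that scan, with std::map<std::string, std::string> standing in for ov::AnyMap so it builds without OpenVINO headers:

```cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <string>

// Stand-in for ov::AnyMap using plain strings, just to show the scan:
// NPUW counts as enabled if any key containing "NPU_USE_NPUW" maps to "YES".
int main() {
  std::map<std::string, std::string> device_config = {
      {"NPU_USE_NPUW", "YES"},
      {"PERFORMANCE_HINT", "LATENCY"},
  };

  bool npuw_status = std::any_of(
      device_config.begin(), device_config.end(), [](const auto& pair) {
        return pair.first.find("NPU_USE_NPUW") != std::string::npos &&
               pair.second == "YES";
      });

  std::cout << (npuw_status ? "NPUW Enabled during compilation" : "NPUW off")
            << "\n";
}
```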
@@ -146,6 +146,7 @@ class BasicBackend : public IBackend {
   void EnableGPUThrottling(ov::AnyMap& device_config);
   void EnableStreams();
   void SetNumThreads(ov::AnyMap& device_config);
+  void SetOVDeviceConfiguration(ov::AnyMap& device_config);
   void ValidateOrtDimsAgainstPartialShape(const std::vector<int64_t>& ort_dims,
                                           const ov::PartialShape& partial_shape) const;
 