Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions onnxruntime/core/providers/openvino/backend_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
return false;
}

#if ((OPENVINO_VERSION_MAJOR < 2025) || ((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR < 4)))
static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
return type_proto && type_proto->has_tensor_type() &&
Expand Down Expand Up @@ -431,6 +432,7 @@ static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_vi
}
return false;
}
#endif

static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
[[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
Expand Down Expand Up @@ -490,10 +492,6 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
}
#endif

// Check if the graph is QDQ and has int16 or uint16 quantization
// If so, we will apply the QDQ scales fix transformation (for GPU device only)
bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);

const auto& onnx_model_path_name = subgraph.ModelPath();
// QDQ stripping enabled only for the NPU and experimentally on the GPU
if ((session_context_.device_type.find("NPU") != std::string::npos) &&
Expand All @@ -506,8 +504,11 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
is_qdq_graph_uint16_or_int16) {
}
#if ((OPENVINO_VERSION_MAJOR < 2025) || ((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR < 4)))
// Enable OVEP-level QDQ stripping (GPU device, int16/uint16 QDQ graphs) only for OpenVINO versions older than 2025.4, which don't have it
else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
IsQDQGraphWithUint16OrInt16(subgraph)) {
// Create a copy of the model
std::unique_ptr<onnxruntime::Model> model;
Status status = qdq_scales_fix::Transform(subgraph, logger, model);
Expand All @@ -517,7 +518,9 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if (IsModelBF16(subgraph)) {
}
#endif
else if (IsModelBF16(subgraph)) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled";
std::unique_ptr<onnxruntime::Model> model;
Status status = bfloat16_fix::Transform(subgraph, logger, model);
Expand Down