intel · mklimenk · Oct 6, 2025
diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -394,6 +394,7 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }
 
+#if ((OPENVINO_VERSION_MAJOR < 2025) || ((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR < 4)))
 static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
   const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
   return type_proto && type_proto->has_tensor_type() &&
@@ -431,6 +432,7 @@ static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_vi
   }
   return false;
 }
+#endif
 
 static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                 [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
@@ -490,10 +492,6 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
   }
 #endif
 
-  // Check if the graph is QDQ and has int16 or uint16 quantization
-  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
-  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
-
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
   if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -506,8 +504,11 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
-  } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             is_qdq_graph_uint16_or_int16) {
+  }
+#if ((OPENVINO_VERSION_MAJOR < 2025) || ((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR < 4)))
+  // Enable OVEP-level QDQ stripping only for OV versions that don't have it
+  else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
+           IsQDQGraphWithUint16OrInt16(subgraph)) {
     // Create a copy of the model
     std::unique_ptr<onnxruntime::Model> model;
     Status status = qdq_scales_fix::Transform(subgraph, logger, model);
@@ -517,7 +518,9 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
-  } else if (IsModelBF16(subgraph)) {
+  } 
+#endif
+  else if (IsModelBF16(subgraph)) {
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled";
     std::unique_ptr<onnxruntime::Model> model;
     Status status = bfloat16_fix::Transform(subgraph, logger, model);