From 52aa9a8263d9df10e1933309234bfac1a367d42c Mon Sep 17 00:00:00 2001
From: gs-olive <113141689+gs-olive@users.noreply.github.com>
Date: Wed, 22 Mar 2023 18:40:25 -0700
Subject: [PATCH] fix: Bugfix in shape analysis for multi-GPU systems

- Shape analysis code in partitioning defaults dry-run tensors to cuda:0
  despite user-specified devices
- This leads to device-casting errors on internal tensors, which users
  cannot cast themselves
- Add a GPU-ID argument to the tensor-generation functions so new tensors
  are created on the user-specified (or default) device
---
 core/partitioning/partitioning.cpp   | 20 ++++++++++++++++----
 core/partitioning/partitioning.h     |  3 ++-
 core/partitioning/shape_analysis.cpp | 15 +++++++++------
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp
index ee9579f13c..7e13484504 100644
--- a/core/partitioning/partitioning.cpp
+++ b/core/partitioning/partitioning.cpp
@@ -542,14 +542,26 @@ bool isInputDynamic(PartitioningCtx* ctx) {
 void populateInputIValues(PartitioningCtx* ctx) {
   if (isInputDynamic(ctx)) {
     ctx->min_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMIN);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kMIN,
+        ctx->settings.target_device.gpu_id);
     ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kOPT,
+        ctx->settings.target_device.gpu_id);
     ctx->max_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMAX);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kMAX,
+        ctx->settings.target_device.gpu_id);
   } else {
     ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kOPT,
+        ctx->settings.target_device.gpu_id);
   }
 }
 
diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h
index 3315ffa210..8634e764f4 100644
--- a/core/partitioning/partitioning.h
+++ b/core/partitioning/partitioning.h
@@ -34,7 +34,8 @@ const std::unordered_set<c10::Symbol> CollectionNodeKinds = {
 ExampleIValues generateRandomInputs(
     ir::CollectionInputSpecMap& input_ranges,
     ir::CollectionTypeMap& input_types,
-    const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT);
+    const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT,
+    int64_t gpu_id = 0);
 
 void populateInputIValues(PartitioningCtx* ctx);
 
diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp
index 387c9d27bd..4f2de18b7c 100644
--- a/core/partitioning/shape_analysis.cpp
+++ b/core/partitioning/shape_analysis.cpp
@@ -13,7 +13,8 @@ namespace partitioning {
 at::Tensor generateSingleInput(
     ir::Input& input,
     c10::optional<at::ScalarType>& type_opt,
-    const ir::ShapeMode& shape_mode) {
+    const ir::ShapeMode& shape_mode,
+    int64_t gpu_id) {
   nvinfer1::Dims input_shape = input.input_shape;
   if (input.input_is_dynamic) {
     if (shape_mode == ir::ShapeMode::kMIN) {
@@ -42,7 +43,8 @@ at::Tensor generateSingleInput(
 
   // Make the value range for input tensor a uniform (float) distribution
   // over [LoValIncl, HiValExcl), then cast to the desired dtype
-  auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape), {at::kCUDA}) + LoValIncl).to(type);
+  auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape)) + LoValIncl)
+                .to(at::Device(at::kCUDA, gpu_id), type);
 
   return in;
 }
@@ -50,7 +52,8 @@ at::Tensor generateSingleInput(
 std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
     std::unordered_map<const torch::jit::Value*, std::vector<ir::Input>>& inputs,
     std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>>& types,
-    const ir::ShapeMode& shape_mode) {
+    const ir::ShapeMode& shape_mode,
+    int64_t gpu_id) {
   // generate random inputs for running pytorch segments
   std::unordered_map<const torch::jit::Value*, torch::jit::IValue> ivalue_map;
 
@@ -59,7 +62,7 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
       c10::TypePtr elementType = c10::TensorType::get();
       auto generic_list = c10::impl::GenericList(elementType);
       for (size_t i = 0; i < input.second.size(); i++) {
-        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
+        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode, gpu_id);
         generic_list.push_back(in.clone());
       }
       ivalue_map[input.first] = c10::IValue(generic_list);
@@ -67,13 +70,13 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
       // create tuple
       std::vector<torch::jit::IValue> list;
      for (size_t i = 0; i < input.second.size(); i++) {
-        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
+        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode, gpu_id);
         list.push_back(in.clone());
       }
       auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr
       ivalue_map[input.first] = c10::IValue(tuple);
     } else {
-      auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode);
+      auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode, gpu_id);
       ivalue_map[input.first] = in.clone();
     }
   }
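
For illustration, the core of the change is the placement pattern in generateSingleInput: the dry-run tensor is now materialized on the user-specified GPU via at::Device(at::kCUDA, gpu_id) rather than implicitly landing on cuda:0. Below is a minimal, self-contained libtorch sketch of the same pattern; the makeDryRunInput helper, the hard-coded shape, dtype, and gpu_id value are illustrative assumptions, not code from this repository.

#include <torch/torch.h>

#include <iostream>
#include <vector>

// Illustrative helper (hypothetical name): build a random dry-run tensor of the given
// shape and dtype on the GPU identified by gpu_id, mirroring the
// .to(at::Device(at::kCUDA, gpu_id), type) placement used in the patch above.
at::Tensor makeDryRunInput(const std::vector<int64_t>& shape, at::ScalarType type, int64_t gpu_id) {
  const double LoValIncl = 0.0;
  const double HiValExcl = 2.0;
  // Uniform values in [LoValIncl, HiValExcl); the tensor starts on CPU and a single
  // .to() call both moves it to the requested GPU and casts it, so it never touches cuda:0.
  return ((HiValExcl - LoValIncl) * at::rand(shape) + LoValIncl)
      .to(at::Device(at::kCUDA, gpu_id), type);
}

int main() {
  if (!torch::cuda::is_available()) {
    std::cout << "CUDA not available, skipping dry-run example" << std::endl;
    return 0;
  }
  // On a multi-GPU system this index would come from the user-specified target device
  // (e.g. settings.target_device.gpu_id in the patch); 0 is used here only as a placeholder.
  auto in = makeDryRunInput({1, 3, 224, 224}, at::kHalf, /*gpu_id=*/0);
  std::cout << in.device() << " " << in.sizes() << std::endl;
  return 0;
}

Dropping the {at::kCUDA} TensorOptions from at::rand and doing the device move in an explicit .to() call is what keeps the intermediate tensor off cuda:0 when a different GPU is selected.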