From 52aa9a8263d9df10e1933309234bfac1a367d42c Mon Sep 17 00:00:00 2001
From: gs-olive <113141689+gs-olive@users.noreply.github.com>
Date: Wed, 22 Mar 2023 18:40:25 -0700
Subject: [PATCH] fix: Bugfix in shape analysis for multi-GPU systems

- Shape analysis code in partitioning defaults dry-run tensors to cuda:0
  despite user-specified devices
- This leads to device-casting errors on internal tensors, which users
  cannot cast themselves
- Add a GPU-ID argument to the tensor-generation functions so new tensors
  are created on the user-specified (or default) device
---
 core/partitioning/partitioning.cpp   | 20 ++++++++++++++++----
 core/partitioning/partitioning.h     |  3 ++-
 core/partitioning/shape_analysis.cpp | 15 +++++++++------
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp
index ee9579f13c..7e13484504 100644
--- a/core/partitioning/partitioning.cpp
+++ b/core/partitioning/partitioning.cpp
@@ -542,14 +542,26 @@ bool isInputDynamic(PartitioningCtx* ctx) {
 void populateInputIValues(PartitioningCtx* ctx) {
   if (isInputDynamic(ctx)) {
     ctx->min_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMIN);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kMIN,
+        ctx->settings.target_device.gpu_id);
     ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kOPT,
+        ctx->settings.target_device.gpu_id);
     ctx->max_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMAX);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kMAX,
+        ctx->settings.target_device.gpu_id);
   } else {
     ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
-        ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
+        ctx->settings.collection_input_spec_map,
+        ctx->input_types_map,
+        ir::ShapeMode::kOPT,
+        ctx->settings.target_device.gpu_id);
   }
 }
 
diff --git a/core/partitioning/partitioning.h b/core/partitioning/partitioning.h
index 3315ffa210..8634e764f4 100644
--- a/core/partitioning/partitioning.h
+++ b/core/partitioning/partitioning.h
@@ -34,7 +34,8 @@ const std::unordered_set<c10::Symbol> CollectionNodeKinds = {
 ExampleIValues generateRandomInputs(
     ir::CollectionInputSpecMap& input_ranges,
     ir::CollectionTypeMap& input_types,
-    const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT);
+    const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT,
+    int64_t gpu_id = 0);
 
 void populateInputIValues(PartitioningCtx* ctx);
 
diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp
index 387c9d27bd..4f2de18b7c 100644
--- a/core/partitioning/shape_analysis.cpp
+++ b/core/partitioning/shape_analysis.cpp
@@ -13,7 +13,8 @@ namespace partitioning {
 at::Tensor generateSingleInput(
     ir::Input& input,
     c10::optional<at::ScalarType>& type_opt,
-    const ir::ShapeMode& shape_mode) {
+    const ir::ShapeMode& shape_mode,
+    int64_t gpu_id) {
   nvinfer1::Dims input_shape = input.input_shape;
   if (input.input_is_dynamic) {
     if (shape_mode == ir::ShapeMode::kMIN) {
@@ -42,7 +43,8 @@ at::Tensor generateSingleInput(
 
   // Make the value range for input tensor a uniform (float) distribution
   // over [LoValIncl, HiValExcl), then cast to the desired dtype
-  auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape), {at::kCUDA}) + LoValIncl).to(type);
+  auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape)) + LoValIncl)
+                .to(at::Device(at::kCUDA, gpu_id), type);
 
   return in;
 }
@@ -50,7 +52,8 @@ at::Tensor generateSingleInput(
 std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
     std::unordered_map<const torch::jit::Value*, std::vector<ir::Input>>& inputs,
     std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>>& types,
-    const ir::ShapeMode& shape_mode) {
+    const ir::ShapeMode& shape_mode,
+    int64_t gpu_id) {
   // generate random inputs for running pytorch segments
   std::unordered_map<const torch::jit::Value*, torch::jit::IValue> ivalue_map;
 
@@ -59,7 +62,7 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
       c10::TypePtr elementType = c10::TensorType::get();
       auto generic_list = c10::impl::GenericList(elementType);
       for (size_t i = 0; i < input.second.size(); i++) {
-        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
+        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode, gpu_id);
         generic_list.push_back(in.clone());
       }
       ivalue_map[input.first] = c10::IValue(generic_list);
@@ -67,13 +70,13 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
       // create tuple
       std::vector<torch::jit::IValue> list;
      for (size_t i = 0; i < input.second.size(); i++) {
-        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
+        auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode, gpu_id);
         list.push_back(in.clone());
       }
       auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr
       ivalue_map[input.first] = c10::IValue(tuple);
     } else {
-      auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode);
+      auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode, gpu_id);
       ivalue_map[input.first] = in.clone();
     }
   }
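
For illustration, the core of the change is the placement pattern in generateSingleInput: the dry-run tensor is now materialized on the user-specified GPU via at::Device(at::kCUDA, gpu_id) rather than implicitly landing on cuda:0. Below is a minimal, self-contained libtorch sketch of the same pattern; the makeDryRunInput helper, the hard-coded shape, dtype, and gpu_id value are illustrative assumptions, not code from this repository.

#include <torch/torch.h>

#include <iostream>
#include <vector>

// Illustrative helper (hypothetical name): build a random dry-run tensor of the given
// shape and dtype on the GPU identified by gpu_id, mirroring the
// .to(at::Device(at::kCUDA, gpu_id), type) placement used in the patch above.
at::Tensor makeDryRunInput(const std::vector<int64_t>& shape, at::ScalarType type, int64_t gpu_id) {
  const double LoValIncl = 0.0;
  const double HiValExcl = 2.0;
  // Uniform values in [LoValIncl, HiValExcl); the tensor starts on CPU and a single
  // .to() call both moves it to the requested GPU and casts it, so it never touches cuda:0.
  return ((HiValExcl - LoValIncl) * at::rand(shape) + LoValIncl)
      .to(at::Device(at::kCUDA, gpu_id), type);
}

int main() {
  if (!torch::cuda::is_available()) {
    std::cout << "CUDA not available, skipping dry-run example" << std::endl;
    return 0;
  }
  // On a multi-GPU system this index would come from the user-specified target device
  // (e.g. settings.target_device.gpu_id in the patch); 0 is used here only as a placeholder.
  auto in = makeDryRunInput({1, 3, 224, 224}, at::kHalf, /*gpu_id=*/0);
  std::cout << in.device() << " " << in.sizes() << std::endl;
  return 0;
}

Dropping the {at::kCUDA} TensorOptions from at::rand and doing the device move in an explicit .to() call is what keeps the intermediate tensor off cuda:0 when a different GPU is selected.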