Commit 07672d0

[Texture] Add memory scope entity into graph JSON/runtime (#11875)

elvin-ncsullivan and Chris Sullivan authored. This PR is split out of the original PR #11357.
Co-authored-by: Chris Sullivan <[email protected]>

1 parent a81e69a · commit 07672d0

4 files changed: +85, -18 lines

src/relay/backend/graph_executor_codegen.cc (26 additions, 1 deletion)

@@ -326,6 +326,12 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     if (num_unknown_devices == 0) {
       node->attrs_["device_index"] = device_types;
     }
+    // storage scope
+    std::vector<std::string> storage_scope;
+    for (const auto& virtual_device : storage_info->virtual_devices) {
+      storage_scope.push_back(std::string(virtual_device->memory_scope));
+    }
+    node->attrs_["storage_scope"] = std::move(storage_scope);
     auto node_id = nodes_.size();
     nodes_.push_back(node);
     // Tuple return value, flatten as tuple
@@ -442,7 +448,6 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
         return AddNode(node, call);
       }
     } else if (!call_node->attrs.defined()) {  // Call is an extern function
-      std::cout << "call_node: \n" << PrettyPrint(call) << std::endl;
       const auto* func = call_node->op.as<GlobalVarNode>();
       ICHECK(func) << "Expected the operator to be a global var, but got "
                    << call_node->op->GetTypeKey();  // getting a relay fn here, not sure why.
@@ -539,12 +544,15 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     size_t num_entry = 0;
     ShapeVector shapes;
     std::vector<size_t> storage_ids;
+    std::vector<std::string> storage_scopes;
     std::vector<size_t> device_types;
     std::vector<std::string> dltypes;
     std::vector<size_t> node_row_ptr{0};
     for (auto node : nodes_) {
       const auto& shape_vec = dmlc::get<ShapeVector>(node->attrs_["shape"]);
       const auto& storage_id = dmlc::get<std::vector<int64_t>>(node->attrs_["storage_id"]);
+      const auto& storage_scope =
+          dmlc::get<std::vector<std::string>>(node->attrs_["storage_scope"]);
       const auto& dtype_vec = dmlc::get<std::vector<std::string>>(node->attrs_["dtype"]);

       ICHECK_EQ(node->num_outputs_, shape_vec.size());
@@ -553,12 +561,25 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
       shapes.insert(shapes.end(), shape_vec.begin(), shape_vec.end());
       dltypes.insert(dltypes.end(), dtype_vec.begin(), dtype_vec.end());
       storage_ids.insert(storage_ids.end(), storage_id.begin(), storage_id.end());
+      storage_scopes.insert(storage_scopes.end(), storage_scope.begin(), storage_scope.end());
       if (node->attrs_.count("device_index")) {
         const auto& dev_types = dmlc::get<std::vector<int64_t>>(node->attrs_["device_index"]);
         device_types.insert(device_types.end(), dev_types.begin(), dev_types.end());
       }
       node_row_ptr.push_back(num_entry);
     }
+
+    // Check whether storage_scope contains any non-global memory scope;
+    // if it does not, it is better not to write scopes to the JSON at all.
+    bool global_only_scope = true;
+    for (const auto& ss : storage_scopes) {
+      if (!(ss.empty() || ss == "global")) {
+        global_only_scope = false;
+      }
+    }
+    if (global_only_scope) {
+      storage_scopes.clear();
+    }
     writer->BeginObject();
     writer->WriteObjectKeyValue("nodes", nodes_);
     writer->WriteObjectKeyValue("arg_nodes", arg_nodes);
@@ -572,6 +593,10 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
       attrs["device_index"].emplace_back(std::string("list_int"));
       attrs["device_index"].emplace_back(device_types);
     }
+    if (storage_scopes.size()) {
+      attrs["storage_scope"].emplace_back(std::string("list_str"));
+      attrs["storage_scope"].emplace_back(storage_scopes);
+    }
     attrs["dltype"].emplace_back(std::string("list_str"));
     attrs["dltype"].emplace_back(dltypes);
     writer->WriteObjectKeyValue("attrs", attrs);
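
For illustration: when any non-global scope survives the check above, the "attrs" object of the emitted graph JSON gains a "storage_scope" entry encoded like the other list-typed attributes. A hypothetical fragment follows; the node count and scope values are invented, not taken from this PR:

"attrs": {
  "dltype": ["list_str", ["float32", "float32", "float32"]],
  "storage_id": ["list_int", [0, 1, 2]],
  "storage_scope": ["list_str", ["global", "global.texture", "global"]]
}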

src/runtime/graph_executor/graph_executor.cc (44 additions, 14 deletions)

@@ -42,6 +42,7 @@
 #include <vector>

 #include "../file_utils.h"
+#include "../texture.h"

 namespace tvm {
 namespace runtime {
@@ -51,6 +52,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
   if (align < kAllocAlignment) return kAllocAlignment;
   return align;
 }
+constexpr auto Is2DStorage = IsTextureStorage;
 }  // namespace details

 /*!
@@ -361,24 +363,16 @@ void GraphExecutor::SetupStorage() {
   // Find the maximum space size.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    std::string storage_scope = attrs_.storage_scope.empty() ? "" : attrs_.storage_scope[i];
     // Use the fallback device if no device index is available.
     int device_type = static_cast<int>(devices_[0].device_type);
     if (!attrs_.device_index.empty()) {
       device_type = attrs_.device_index[i];
     }
-    size_t size = 1;
-    for (int64_t sz : attrs_.shape[i]) {
-      size *= static_cast<size_t>(sz);
-    }
-    ICHECK_GE(storage_id, 0) << "Do not support runtime shape op";
-    DLDataType t = vtype[i];
-    size_t bits = t.bits * t.lanes;
-    ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
-    size_t bytes = ((bits + 7U) / 8U) * size;

     uint32_t sid = static_cast<uint32_t>(storage_id);
     if (sid >= pool_entry.size()) {
-      pool_entry.resize(sid + 1, {0, -1});
+      pool_entry.resize(sid + 1, {-1, {0}, {}});
     } else {
       ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type)
           << "The same pool entry cannot be assigned to multiple devices";
@@ -395,8 +389,38 @@ void GraphExecutor::SetupStorage() {
       pool_entry[sid].linked_param = lookup_rv;
     }
     pool_entry[sid].param_data_entry = i;
-    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
     pool_entry[sid].device_type = device_type;
+    pool_entry[sid].scope = storage_scope;
+
+    DLDataType t = vtype[i];
+    if (!details::Is2DStorage(storage_scope)) {
+      size_t size = 1;
+      for (int64_t sz : attrs_.shape[i]) {
+        size *= static_cast<size_t>(sz);
+      }
+      size_t bits = t.bits * t.lanes;
+      ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
+      int64_t bytes = ((bits + 7U) / 8U) * size;
+      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes);
+      pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1};
+    } else {
+      if (pool_entry[sid].shape.size() == 1) {
+        pool_entry[sid].shape.resize(3, 0);
+      }
+      size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope);
+      auto shape = ApplyTexture2DFlattening<int64_t>(attrs_.shape[i], attrs_.shape[i].size(), axis);
+      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height);
+      pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width);
+      CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel)
+          << pool_entry[sid].shape[2] << " != " << shape.channel
+          << ", texture channel length must be consistent within a storage pool";
+      pool_entry[sid].shape[2] = shape.channel;
+      CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t))
+          << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t)
+          << ", pool entry for 2d texture allocations must be of the same type;"
+          << " downstream error from memory planner likely";
+      pool_entry[sid].dtype = t;
+    }
   }

   // Allocate the space.
@@ -410,9 +434,15 @@ void GraphExecutor::SetupStorage() {
     if (pit.linked_param.defined()) {
       storage_pool_.push_back(pit.linked_param);
     } else {
-      std::vector<int64_t> shape;
-      shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
-      storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev));
+      std::vector<int64_t> shape = pit.shape;
+      if (shape.size() == 1) {
+        shape[0] = (shape[0] + 3) / 4;
+      }
+      Optional<String> mem_scope;
+      if (!pit.scope.empty()) {
+        mem_scope = String(pit.scope);
+      }
+      storage_pool_.push_back(NDArray::Empty(shape, pit.dtype, dev, mem_scope));
     }
   }
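
To make the 2-D sizing step above concrete, here is a minimal, self-contained sketch of the flattening that ApplyTexture2DFlattening is assumed to perform: collapse an N-D shape into a {height, width, channel} texture extent around a separator axis (which DefaultTextureLayoutSeparator picks per scope; here we pass it in directly). The struct name, helper name, and example shape are illustrative, not TVM's exact API.

#include <cstdint>
#include <iostream>
#include <vector>

// {height, width, channel} extent of a flattened 2-D texture.
struct Texture2DShapeSketch {
  int64_t height;
  int64_t width;
  int64_t channel;
};

// Dims before `axis` fold into height, dims from `axis` up to the
// second-to-last fold into width, and the innermost dim is the channel block.
Texture2DShapeSketch Flatten2D(const std::vector<int64_t>& shape, size_t axis) {
  Texture2DShapeSketch t{1, 1, shape.back()};
  for (size_t i = 0; i + 1 < shape.size(); ++i) {
    (i < axis ? t.height : t.width) *= shape[i];
  }
  return t;
}

int main() {
  // e.g. a hypothetical NCHWc activation [1, 32, 28, 28, 4] with axis = ndim - 2 = 3:
  Texture2DShapeSketch t = Flatten2D({1, 32, 28, 28, 4}, 3);
  std::cout << t.height << " x " << t.width << " x " << t.channel << "\n";  // 896 x 28 x 4
}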

src/runtime/graph_executor/graph_executor.h (13 additions, 1 deletion)

@@ -204,10 +204,12 @@ class TVM_DLL GraphExecutor : public ModuleNode {
  protected:
   // Memory pool entry.
   struct PoolEntry {
-    size_t size;
     int device_type;
+    std::vector<int64_t> shape;
+    DLDataType dtype;
     int param_data_entry;
     NDArray linked_param;
+    std::string scope;
     // PoolEntry(int s, int dev_type, void* pre_linked_param) :
     //          size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {}
   };
@@ -303,6 +305,7 @@ class TVM_DLL GraphExecutor : public ModuleNode {
     std::vector<int> storage_id;
     std::vector<int> device_index;
     std::vector<std::string> dltype;
+    std::vector<std::string> storage_scope;
     std::vector<std::vector<int64_t>> shape;
     // The graph attribute fields.
     void Load(dmlc::JSONReader* reader) {
@@ -328,6 +331,15 @@ class TVM_DLL GraphExecutor : public ModuleNode {
         reader->Read(&storage_id);
         ICHECK(!reader->NextArrayItem());
         bitmask |= 2;
+      } else if (key == "storage_scope") {
+        reader->BeginArray();
+        ICHECK(reader->NextArrayItem());
+        reader->Read(&type);
+        ICHECK_EQ(type, "list_str");
+        ICHECK(reader->NextArrayItem());
+        reader->Read(&storage_scope);
+        ICHECK(!reader->NextArrayItem());
+        bitmask |= 1;
       } else if (key == "shape") {
         reader->BeginArray();
         ICHECK(reader->NextArrayItem());
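
The repurposed PoolEntry::shape replaces the old size field and carries either a 1-D byte count (flat buffers) or a 3-D texture extent. A small sketch of how SetupStorage above interprets the two forms at allocation time; the values are hypothetical:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // 1-D entry: shape[0] is a byte count; allocation packs it into
  // 4-byte float32 words, mirroring (shape[0] + 3) / 4 above.
  std::vector<int64_t> flat = {4096};
  if (flat.size() == 1) flat[0] = (flat[0] + 3) / 4;
  std::cout << "flat pool: " << flat[0] << " float32 words\n";  // 1024

  // 3-D entry: {height, width, channel} for a 2-D texture allocation,
  // allocated with the entry's own dtype and memory scope.
  std::vector<int64_t> texture = {896, 28, 4};
  std::cout << "texture pool: " << texture[0] << " x " << texture[1] << " x "
            << texture[2] << "\n";
}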

src/target/source/codegen_opencl.cc (2 additions, 2 deletions)

@@ -98,7 +98,7 @@ std::string CodeGenOpenCL::Finish() {
         "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n"
         "#else\n"
         "#error \"Half precision floating point not supported"
-        "by OpenCL implementation on your device.\" \n"
+        " by OpenCL implementation on your device.\" \n"
         "#endif\n\n";
   }

@@ -109,7 +109,7 @@ std::string CodeGenOpenCL::Finish() {
         "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
         "#else\n"
         "#error \"Double precision floating point not supported"
-        "by OpenCL implementation on your device.\" \n"
+        " by OpenCL implementation on your device.\" \n"
        "#endif\n\n";
   }

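The one-character fix matters because adjacent C++ string literals concatenate with no separator, so the original #error text rendered as "...not supportedby OpenCL...". A minimal illustration:

#include <iostream>

int main() {
  // Adjacent literals fuse directly: note the missing space in `before`.
  const char* before = "Half precision floating point not supported"
                       "by OpenCL implementation on your device.";
  const char* after = "Half precision floating point not supported"
                      " by OpenCL implementation on your device.";
  std::cout << before << "\n" << after << "\n";
}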
