Commit 0b30034

change into cutlass

1 parent 50a7eb9
3 files changed (+89 -30 lines)

gallery/how_to/work_with_relay/using_with_pipeline_executor.py

Lines changed: 73 additions & 23 deletions
@@ -46,9 +46,9 @@
 def get_network():
     out_channels = 16
     batch_size = 1
-    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float32"))
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float16"))
     dense_weight = relay.var(
-        "data", relay.TensorType((batch_size, 16 * img_size * img_size), "float32")
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16")
     )
     weight = relay.var("weight")
     second_weight = relay.var("second_weight")
@@ -92,47 +92,73 @@ def get_network():
 """
 #subgraphs[0])

-def @main(%data: Tensor[(1, 3, img_size, img_size), float32]) {
-  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */;
-  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1,16, img_size, img_size), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), float16], Tensor[(16), float16], Tensor[(16), float16]) */;
   %2 = %1.0;
-  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
 }

 peline-tutorial

 #subgraphs[1]

-def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
-  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
+def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */
 }
+
 """

 # sphinx_gallery_start_ignore
 from tvm import testing

 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+#########################################
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": 80,
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"):
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, mod_name=mod_name
+    )
+    return lib
+

 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
 # Define a function to do all the codegen and pipeline executor works.
 # To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
-# and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+# and the 'USE_CUTLASS' should set as ON in config.cmake.
 def run_pipeline():
     from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build

     #########################################
     # Create subgraph pipeline configuration.
     # Associate the subgraph module with a target.
     # Using BYOC to set the codegen of the second subgraph module.
-    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+    # To use cutlass the 'USE_CUTLASS' should set as ON.
     mod0, mod1 = subgraphs[0], subgraphs[1]
-    # mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-    # mod0 = relay.transform.AnnotateTarget(["cutlass"])(mod0)
-    # mod0 = relay.transform.MergeCompilerRegions()(mod0)
-    # mod0 = relay.transform.PartitionGraph()(mod0)
+    # Apply cutlass as the codegen.
     mod1 = partition_for_cutlass(mod1)
     #################################################
     # Get the pipeline executor configuration object.
@@ -144,10 +170,13 @@ def run_pipeline():
     ###############################################################################
     # Set the cpu afinity for control flow, for example using cpu 0 for control flow.
     pipe_config[mod1].cpu_affinity = "0"
+    pipe_config[mod1].export_cc = None
     ##############################################################
     # Set the compile target of the second subgraph module as LLVM.
-    pipe_config[mod1].target = "cuda"
+    pipe_config[mod1].target = "cuda" # tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
     pipe_config[mod1].dev = tvm.device("cuda", 0)
+    pipe_config[mod1].build_func = cutlass_build
+    pipe_config[mod1].export_cc = "nvcc"
     #################################################################################
     # Set the cpu afinity for control flow, for example using cpu 1 for control flow.
     pipe_config[mod1].cpu_affinity = "1"
@@ -171,7 +200,7 @@ def run_pipeline():
     # sphinx_gallery_start_ignore
     from tvm import testing

-    testing.utils.install_request_hook(depth=3)
+    # testing.utils.install_request_hook(depth=3)
     # sphinx_gallery_end_ignore
     ##############################
     # Build the pipeline executor.
@@ -195,7 +224,7 @@ def run_pipeline():
     # Run the pipeline executor.
     # --------------------------
     # Allocated a input data.
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+    data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
     pipeline_module.set_input("data", tvm.nd.array(data))
     ##########################################################################
     # Run the two subgraph in pipeline mode and get the output asynchronously.
@@ -209,18 +238,39 @@ def run_pipeline():
     # ------------------------------------
     # Run these two subgraphs in sequence with graph_executor to get the output.
     target = "llvm"
-    dev = tvm.device(target, 0)
+    dev0 = tvm.device(target, 0)
     lib0 = relay.build_module.build(mod0, target, params=params)
-    lib1 = relay.build_module.build(mod1, target, params=params)
-    module0 = runtime.GraphModule(lib0["default"](dev))
-    module1 = runtime.GraphModule(lib1["default"](dev))
+    module0 = runtime.GraphModule(lib0["default"](dev0))
+    cutlass = tvm.target.Target(
+        {
+            "kind": "cutlass",
+            "sm": 75,
+            "use_3xtf32": True,
+            "split_k_slices": [1],
+            "profile_all_alignments": False,
+            "find_first_valid": True,
+            "use_multiprocessing": True,
+            "use_fast_math": False,
+            "tmp_dir": "./tmp",
+        },
+        host=tvm.target.Target("llvm"),
+    )
+    cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+    lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
+    lib1 = finalize_modules(lib1, "compile.so", "./tmp")
+
+    dev1 = tvm.device("cuda", 0)
+
+    module1 = runtime.GraphModule(lib1["default"](dev1))
+
     module0.set_input("data", data)
     module0.run()
     out_shape = (1, 16, img_size, img_size)
-    out = module0.get_output(0, tvm.nd.empty(out_shape))
+    out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
     module1.set_input("data_n_0", out)
     module1.run()
-    out = module1.get_output(0, tvm.nd.empty(out_shape))
+    out_shape = (1, 1)
+    out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
     ####################
     # Verify the result.
     tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
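
Taken together, the tutorial's new path for the second subgraph is: partition for cutlass, build with a composite [cuda, cutlass] target, then finalize the generated kernels. A minimal sketch of that flow, assuming TVM was built with USE_CUTLASS=ON, an sm_80 GPU, and the tutorial's `mod1`/`params` in scope (not a standalone program):

import tvm
from tvm import relay
from tvm.relay.op.contrib.cutlass import partition_for_cutlass
from tvm.contrib.cutlass import finalize_modules

# Offload the supported ops (the dense layer here) to the cutlass codegen.
mod1 = partition_for_cutlass(mod1)

# Composite target: cuda for fallback ops, cutlass for the offloaded ones.
cutlass = tvm.target.Target(
    {"kind": "cutlass", "sm": 80, "find_first_valid": True, "tmp_dir": "./tmp"},
    host=tvm.target.Target("llvm"),
)
cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)

# Compile the generated CUTLASS sources into the final runtime module.
lib1 = finalize_modules(lib1, "compile.so", "./tmp")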

python/tvm/contrib/pipeline_executor.py

Lines changed: 7 additions & 2 deletions
@@ -302,11 +302,16 @@ def export_library(self, directory_path):
                 self.pipeline_mods[lib_index]["dev"].device_type,
                 self.pipeline_mods[lib_index]["dev"].device_id,
             )
-
             # Get the graph, lib, and parameters from GraphExecutorFactoryModule.
             lib = self.pipeline_mods[lib_index]["lib"]
             # Export the lib, graph, and parameters to disk.
-            lib.export_library(mconfig["lib_name"])
+            if self.pipeline_mods[lib_index]["export_cc"]:
+                lib.export_library(
+                    mconfig["lib_name"], cc=self.pipeline_mods[lib_index]["export_cc"]
+                )
+            else:
+                lib.export_library(mconfig["lib_name"])
+
             with open(mconfig["json_name"], "w") as file_handle:
                 file_handle.write(lib.graph_json)
             with open(mconfig["params_name"], "wb") as file_handle:
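
For context, the new branch only changes which compiler `export_library` invokes. A sketch of the two paths (the output path is hypothetical; `lib` is one module's lib as in the loop above):

# export_cc set (e.g. "nvcc"): link with a CUDA-capable compiler, which is
# needed when the module contains generated CUTLASS/CUDA sources.
lib.export_library("/tmp/pipeline/lib1.so", cc="nvcc")

# export_cc unset/None: fall back to the default host compiler, as before.
lib.export_library("/tmp/pipeline/lib1.so")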

python/tvm/contrib/pipeline_executor_build.py

Lines changed: 9 additions & 5 deletions
@@ -86,7 +86,12 @@ def build(pipe_configs):
         # Use "mod_idx" as the key to create a "module_connection" map which is not only
         # for the module index but also for the module connection used to build the pipeline.
         module_string_config[mod_idx] = pipe_config
-        libs[mod_idx] = {"lib": lib, "dev": dev, "fcompile": mod_config["fcompile"]}
+        libs[mod_idx] = {
+            "lib": lib,
+            "dev": dev,
+            "fcompile": mod_config["fcompile"],
+            "export_cc": mod_config["export_cc"],
+        }

     # Creating a text form configuration to record the "input_connection" and the
     # "module_connection" information. The "input_connection" is used to record the
@@ -132,10 +137,7 @@ def export_library(factory, directory_path):
         mconfig["json_name"] = "{}/json{}".format(directory_path, lib_index)
         mconfig["params_name"] = "{}/params{}".format(directory_path, lib_index)
         lib_config = factory.pipeline_mods[lib_index]
-        mconfig["dev"] = "{},{}".format(
-            lib_config["dev"].device_type,
-            lib_config["dev"].device_id,
-        )
+        mconfig["dev"] = "{},{}".format(lib_config["dev"].device_type, lib_config["dev"].device_id)
         fcompile = lib_config["fcompile"]
         if not fcompile:
             fcompile = False
@@ -413,6 +415,7 @@ def __init__(self, mod=None):
             self.fcompile = None
             self.name = None
             self.dev = None
+            self.export_cc = None
             self.cpu_affinity = ""
             self.idx = None
             self.mod = mod
@@ -601,6 +604,7 @@ def get_config(self):
                 "target": module.target,
                 "fcompile": module.fcompile,
                 "dev": module.dev,
+                "export_cc": module.export_cc,
             }

         # Creating a map including pipeline inputs and subgraph inputs.
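
End to end, `export_cc` travels from the user-facing config to the exported library: set on the per-module config, copied into the module config by `get_config()`, stored next to the built lib in `build()`, and finally consumed by `export_library()`. A short usage sketch, assuming the tutorial's `pipe_config` and `cutlass_build` from the first file and a hypothetical output directory:

pipe_config[mod1].build_func = cutlass_build  # build with the extra cutlass target
pipe_config[mod1].export_cc = "nvcc"          # recorded by get_config() and build()

with tvm.transform.PassContext(opt_level=3):
    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)

# export_library() now compiles this module's lib with nvcc.
pipeline_mod_factory.export_library("./pipeline_export")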
