4646def get_network ():
4747 out_channels = 16
4848 batch_size = 1
49- data = relay .var ("data" , relay .TensorType ((batch_size , 3 , img_size , img_size ), "float32 " ))
49+ data = relay .var ("data" , relay .TensorType ((batch_size , 3 , img_size , img_size ), "float16 " ))
5050 dense_weight = relay .var (
51- "data " , relay .TensorType ((batch_size , 16 * img_size * img_size ), "float32 " )
51+ "dweight " , relay .TensorType ((batch_size , 16 * img_size * img_size ), "float16 " )
5252 )
5353 weight = relay .var ("weight" )
5454 second_weight = relay .var ("second_weight" )
@@ -92,47 +92,73 @@ def get_network():
9292"""
9393#subgraphs[0])
9494
95- def @main(%data: Tensor[(1, 3, img_size, img_size), float32 ]) {
96- %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32 ] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32 ] */;
97- %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32 ] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32 ]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float32 ] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32 ] */) /* ty=(Tensor[(1,16, img_size, img_size), float32 ], Tensor[(16), float32 ], Tensor[(16), float32 ]) */;
95+ def @main(%data: Tensor[(1, 3, img_size, img_size), float16 ]) {
96+ %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16 ] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16 ] */;
97+ %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16 ] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16 ]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float16 ] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16 ] */) /* ty=(Tensor[(1,16, img_size, img_size), float16 ], Tensor[(16), float16 ], Tensor[(16), float16 ]) */;
9898 %2 = %1.0;
99- nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float32 ] */
99+ nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16 ] */
100100 }
101101
102102peline-tutorial
103103
104104#subgraphs[1]
105105
106- def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
107- nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
106+ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
107+ %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
108+ nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */
108109 }
110+
109111"""
110112
111113# sphinx_gallery_start_ignore
112114from tvm import testing
113115
114116testing .utils .install_request_hook (depth = 3 )
115117# sphinx_gallery_end_ignore
118+ #########################################
119+ # Build the subgraph with cutlass target.
120+ # ---------------------------------------
121+ #########################################
122+ cutlass = tvm .target .Target (
123+ {
124+ "kind" : "cutlass" ,
125+ "sm" : 80 ,
126+ "use_3xtf32" : True ,
127+ "split_k_slices" : [1 ],
128+ "profile_all_alignments" : False ,
129+ "find_first_valid" : True ,
130+ "use_multiprocessing" : True ,
131+ "use_fast_math" : False ,
132+ "tmp_dir" : "./tmp" ,
133+ },
134+ host = tvm .target .Target ("llvm" ),
135+ )
136+
137+
def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"):
    """Build *mod* with the module-level ``cutlass`` target appended to *target*.

    Drop-in replacement for ``relay.build`` used as the pipeline executor's
    ``build_func`` so the partitioned subgraph is compiled by the CUTLASS BYOC
    codegen. Returns the built runtime library.
    """
    # Compose the device target with the global CUTLASS codegen target.
    composite_target = [target, cutlass]
    return relay.build_module.build(
        mod,
        target=composite_target,
        params=params,
        target_host=target_host,
        mod_name=mod_name,
    )
144+
116145
117146###########################################################
118147# Run the two subgraphs in pipeline with pipeline executor.
119148# ---------------------------------------------------------
120149# Define a function to do all the codegen and pipeline executor works.
121150 # To run the pipeline executor with cutlass, USE_PIPELINE_EXECUTOR needs to be set to ON,
122- # and the 'USE_DNNL_CODEGEN ' should set as ON in config.cmake and installing MKL-DNN .
151+ # and 'USE_CUTLASS' should be set to ON in config.cmake.
123152def run_pipeline ():
124153 from tvm .contrib import graph_executor , pipeline_executor , pipeline_executor_build
125154
126155 #########################################
127156 # Create subgraph pipeline configuration.
128157 # Associate the subgraph module with a target.
129158 # Using BYOC to set the codegen of the second subgraph module.
130- # To use dnnl the 'USE_DNNL_CODEGEN ' should set as ON in config.cmake and installing MKL-DNN .
159+ # To use cutlass, 'USE_CUTLASS' should be set to ON in config.cmake.
131160 mod0 , mod1 = subgraphs [0 ], subgraphs [1 ]
132- # mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
133- # mod0 = relay.transform.AnnotateTarget(["cutlass"])(mod0)
134- # mod0 = relay.transform.MergeCompilerRegions()(mod0)
135- # mod0 = relay.transform.PartitionGraph()(mod0)
161+ # Apply cutlass as the codegen.
136162 mod1 = partition_for_cutlass (mod1 )
137163 #################################################
138164 # Get the pipeline executor configuration object.
@@ -144,10 +170,13 @@ def run_pipeline():
144170 ###############################################################################
145171 # Set the CPU affinity for control flow, for example using cpu 0 for control flow.
146172 pipe_config [mod1 ].cpu_affinity = "0"
173+ pipe_config [mod1 ].export_cc = None
147174 ##############################################################
148175 # Set the compile target of the second subgraph module to CUDA.
149- pipe_config [mod1 ].target = "cuda"
176+ pipe_config [mod1 ].target = "cuda" # tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
150177 pipe_config [mod1 ].dev = tvm .device ("cuda" , 0 )
178+ pipe_config [mod1 ].build_func = cutlass_build
179+ pipe_config [mod1 ].export_cc = "nvcc"
151180 #################################################################################
152181 # Set the CPU affinity for control flow, for example using cpu 1 for control flow.
153182 pipe_config [mod1 ].cpu_affinity = "1"
@@ -171,7 +200,7 @@ def run_pipeline():
171200 # sphinx_gallery_start_ignore
172201 from tvm import testing
173202
174- testing .utils .install_request_hook (depth = 3 )
203+ # testing.utils.install_request_hook(depth=3)
175204 # sphinx_gallery_end_ignore
176205 ##############################
177206 # Build the pipeline executor.
@@ -195,7 +224,7 @@ def run_pipeline():
195224 # Run the pipeline executor.
196225 # --------------------------
197226 # Allocated a input data.
198- data = np .random .uniform (- 1 , 1 , size = data_shape ).astype ("float32 " )
227+ data = np .random .uniform (- 1 , 1 , size = data_shape ).astype ("float16 " )
199228 pipeline_module .set_input ("data" , tvm .nd .array (data ))
200229 ##########################################################################
201230 # Run the two subgraph in pipeline mode and get the output asynchronously.
@@ -209,18 +238,39 @@ def run_pipeline():
209238 # ------------------------------------
210239 # Run these two subgraphs in sequence with graph_executor to get the output.
211240 target = "llvm"
212- dev = tvm .device (target , 0 )
241+ dev0 = tvm .device (target , 0 )
213242 lib0 = relay .build_module .build (mod0 , target , params = params )
214- lib1 = relay .build_module .build (mod1 , target , params = params )
215- module0 = runtime .GraphModule (lib0 ["default" ](dev ))
216- module1 = runtime .GraphModule (lib1 ["default" ](dev ))
243+ module0 = runtime .GraphModule (lib0 ["default" ](dev0 ))
244+ cutlass = tvm .target .Target (
245+ {
246+ "kind" : "cutlass" ,
247+ "sm" : 75 ,
248+ "use_3xtf32" : True ,
249+ "split_k_slices" : [1 ],
250+ "profile_all_alignments" : False ,
251+ "find_first_valid" : True ,
252+ "use_multiprocessing" : True ,
253+ "use_fast_math" : False ,
254+ "tmp_dir" : "./tmp" ,
255+ },
256+ host = tvm .target .Target ("llvm" ),
257+ )
258+ cuda = tvm .target .Target ("cuda" , host = tvm .target .Target ("llvm" ))
259+ lib1 = relay .build_module .build (mod1 , [cuda , cutlass ], params = params )
260+ lib1 = finalize_modules (lib1 , "compile.so" , "./tmp" )
261+
262+ dev1 = tvm .device ("cuda" , 0 )
263+
264+ module1 = runtime .GraphModule (lib1 ["default" ](dev1 ))
265+
217266 module0 .set_input ("data" , data )
218267 module0 .run ()
219268 out_shape = (1 , 16 , img_size , img_size )
220- out = module0 .get_output (0 , tvm .nd .empty (out_shape ))
269+ out = module0 .get_output (0 , tvm .nd .empty (out_shape , "float16" ))
221270 module1 .set_input ("data_n_0" , out )
222271 module1 .run ()
223- out = module1 .get_output (0 , tvm .nd .empty (out_shape ))
272+ out_shape = (1 , 1 )
273+ out = module1 .get_output (0 , tvm .nd .empty (out_shape , "float16" ))
224274 ####################
225275 # Verify the result.
226276 tvm .testing .assert_allclose (outputs [0 ].numpy (), out .numpy ())
0 commit comments