@@ -411,20 +411,20 @@ def tile_wmma_fragment(block_read, height):
411 411
412 412  f = tvm.build(sch.mod["main"], target="cuda", name="dense")
413 413
414- # dev = tvm.device("cuda", 0)
415- # a_np = np.random.uniform(size=(N, K)).astype("float16")
416- # b_np = np.random.uniform(size=(K, M)).astype("float16")
417- # c_np = np.dot(a_np.astype("float32"), b_np.astype("float32"))
418- # a = tvm.nd.array(a_np, dev)
419- # b = tvm.nd.array(b_np, dev)
420- # c = tvm.nd.array(np.zeros((M, N), dtype="float32"), dev)
421-
422- # print(f.imported_modules[0].get_source())
423- # f(a, b, c)
414+ dev = tvm.device("cuda", 0)
415+ a_np = np.random.uniform(size=(N, K)).astype("float16")
416+ b_np = np.random.uniform(size=(K, M)).astype("float16")
417+ c_np = np.dot(a_np.astype("float32"), b_np.astype("float32"))
418+ a = tvm.nd.array(a_np, dev)
419+ b = tvm.nd.array(b_np, dev)
420+ c = tvm.nd.array(np.zeros((M, N), dtype="float32"), dev)
421+
422+ print(f.imported_modules[0].get_source())
423+ f(a, b, c)
424 424  # tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
425 425  # print("ok")
426 426
427- # evaluator = f.time_evaluator(f.entry_name, dev, number=1000 )
428- # gflops = (N * M * K) * 2 / 1e9
429- # time_ms = evaluator(a, b, c).mean * 1e3
430- # print("matmul with tensor core: %f ms, %f GFLOPS" % (time_ms, gflops / (time_ms / 1e3)))
427+ evaluator = f.time_evaluator(f.entry_name, dev, number=10)
428+ gflops = (N * M * K) * 2 / 1e9
429+ time_ms = evaluator(a, b, c).mean * 1e3
430+ print("matmul with tensor core: %f ms, %f GFLOPS" % (time_ms, gflops / (time_ms / 1e3)))
0 commit comments